In [28]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

import sys; sys.path.append('../../src/helpers')
from data_manipulation import data
import pickle

In [2]:
books = pd.read_csv('../../data/booksummaries/books_Porter_True.csv')

In [3]:
books.head(3)

Unnamed: 0,bookGenre,plotSum,bookTitle,plotSum2vec
0,"['comedy', 'children', 'speculative_fiction', ...",old old boar manor farm call anim farm meet co...,Animal Farm,[ 0.1988665 -0.07466647 -0.14304179 -0.318821...
1,"['science_fiction', 'fiction', 'speculative_fi...",teenag live nearfutur england lead gang nightl...,A Clockwork Orange,[ 1.83506605e-01 6.06400681e-02 -1.32821921e-...
2,"['existential_philosophy', 'fiction', 'absurdi...",text plagu divid five part town oran thousand ...,The Plague,[ 0.437975 0.0055958 -0.09298557 -0.316397...


In [53]:
corpus = books.iloc[:,1]
corpus.shape

(12671,)

In [54]:
corpus.head(3)

0    old old boar manor farm call anim farm meet co...
1    teenag live nearfutur england lead gang nightl...
2    text plagu divid five part town oran thousand ...
Name: plotSum, dtype: object

In [55]:
# get our documents into BOW style.
tf_vectorizer = CountVectorizer(max_df=0.90, max_features=500)
tf = tf_vectorizer.fit_transform(corpus)
tf_feature_names = tf_vectorizer.get_feature_names()

In [56]:
tf_feature_names[:10]

['abandon',
 'abil',
 'abl',
 'accept',
 'across',
 'act',
 'action',
 'actual',
 'adventur',
 'affair']

In [57]:
tf.shape

(12671, 500)

In [58]:
# Run LDA
lda = LatentDirichletAllocation(n_topics=20, max_iter=5, learning_method='online', learning_offset=50.,random_state=123)
lda.fit(tf)



LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=50.0,
             max_doc_update_iter=100, max_iter=5, mean_change_tol=0.001,
             n_components=10, n_jobs=None, n_topics=20, perp_tol=0.1,
             random_state=123, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [59]:
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic: "+ str(topic_idx))
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [60]:
display_topics(lda, tf_feature_names, 20)

Topic: 0
dog anim cat hunt human day name food old live find home one kill take leader hous becom two walk
Topic: 1
vampir british england sir uncl blood becom de kill turn return also lord friend take help set queen boy young
Topic: 2
mother father famili children school boy girl home sister parent hous live brother friend find year becom take child away
Topic: 3
life famili becom love son new wife live young daughter marri year novel begin husband death relationship work father die
Topic: 4
love tell leav return meet take day see ask marri one make visit two father back give come find later
Topic: 5
kill power battl dragon forc attack fight lord use armi help find return citi dark take one magic defeat death
Topic: 6
kill agent team investig murder attempt discov plan new escap member inform reveal use assassin work secret one help also
Topic: 7
ship island captain sea command take board rescu return one escap find captur two back arriv leav attack water land
Topic: 8
human earth pla

In [49]:
corpus = ['''the magic man once gave me 100 dollars and then the dragon came and
                swept me magically with magic off to the swords sword wizard''', 
         '''my best friend had my back growing up and my family was always 
             there and i loved love them''']

X_test = tf_vectorizer.transform(corpus)
doc_topic_dist_unnormalized = np.matrix(lda.transform(X_test))

# normalize the distribution (only needed if you want to work with the probabilities)
doc_topic_dist = doc_topic_dist_unnormalized/doc_topic_dist_unnormalized.sum(axis=1)

In [50]:
X_test

<2x10000 sparse matrix of type '<class 'numpy.int64'>'
	with 18 stored elements in Compressed Sparse Row format>

In [51]:
doc_topic_dist_unnormalized

matrix([[0.00294118, 0.00294118, 0.16247232, 0.00294118, 0.35172802,
         0.00294118, 0.00294118, 0.00294118, 0.00294118, 0.43579966,
         0.00294118, 0.00294118, 0.00294118, 0.00294118, 0.00294118,
         0.00294118, 0.00294118, 0.00294118, 0.00294118, 0.00294118],
        [0.00625   , 0.00625   , 0.00625   , 0.00625   , 0.00625   ,
         0.00625   , 0.00625   , 0.88125   , 0.00625   , 0.00625   ,
         0.00625   , 0.00625   , 0.00625   , 0.00625   , 0.00625   ,
         0.00625   , 0.00625   , 0.00625   , 0.00625   , 0.00625   ]])

In [52]:
doc_topic_dist

matrix([[0.00294118, 0.00294118, 0.16247232, 0.00294118, 0.35172802,
         0.00294118, 0.00294118, 0.00294118, 0.00294118, 0.43579966,
         0.00294118, 0.00294118, 0.00294118, 0.00294118, 0.00294118,
         0.00294118, 0.00294118, 0.00294118, 0.00294118, 0.00294118],
        [0.00625   , 0.00625   , 0.00625   , 0.00625   , 0.00625   ,
         0.00625   , 0.00625   , 0.88125   , 0.00625   , 0.00625   ,
         0.00625   , 0.00625   , 0.00625   , 0.00625   , 0.00625   ,
         0.00625   , 0.00625   , 0.00625   , 0.00625   , 0.00625   ]])

So now you've gotten it to where you've "found" topics based on your big corpus, by training your count_vectorizer and LDA model on the original big corpus, and then by entering new text you can get apply the same vectorization transform to them and then get an LDA score for each sentence in terms of the topics you've already found!