In [1]:
import sys; sys.path.append('../../src/helpers')
from data_manipulation import data
import numpy as np
import pandas as pd
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [2]:
books = pd.read_csv('../../data/booksummaries/books_Porter_True.csv')

In [3]:
books.head(3)

Unnamed: 0,bookGenre,plotSum,Drop,plotSum2vec
0,"['comedy', 'children', 'speculative_fiction', ...",old old boar manor farm call anim farm meet co...,False,"[0.200577126220817, -0.07933575619857114, -0.1..."
1,"['science_fiction', 'fiction', 'speculative_fi...",teenag live nearfutur england lead gang nightl...,False,"[0.18350660521234127, 0.06064006807343155, -0...."
2,"['existential_philosophy', 'fiction']",text plagu divid five part town oran thousand ...,False,"[0.4374694909215338, 0.004273443157695797, -0...."


In [4]:
corpus = books.iloc[:,1]
corpus.shape

(12667,)

In [5]:
corpus.head(3)

0    old old boar manor farm call anim farm meet co...
1    teenag live nearfutur england lead gang nightl...
2    text plagu divid five part town oran thousand ...
Name: plotSum, dtype: object

In [6]:
# get our documents into BOW style.
tf_vectorizer = CountVectorizer(max_df=0.90, max_features=500)
tf = tf_vectorizer.fit_transform(corpus)
tf_feature_names = tf_vectorizer.get_feature_names()

In [7]:
tf_feature_names[:10]

['abandon',
 'abil',
 'abl',
 'accept',
 'across',
 'act',
 'action',
 'actual',
 'adventur',
 'affair']

In [8]:
tf.shape

(12667, 500)

In [9]:
# Run LDA
lda = LatentDirichletAllocation(n_topics=20, max_iter=5, learning_method='online', learning_offset=50.,random_state=123)
lda.fit(tf)



LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=50.0,
             max_doc_update_iter=100, max_iter=5, mean_change_tol=0.001,
             n_components=10, n_jobs=None, n_topics=20, perp_tol=0.1,
             random_state=123, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [10]:
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic: "+ str(topic_idx))
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [11]:
display_topics(lda, tf_feature_names, 20)

Topic: 0
dog anim cat hunt human day name old food live home find one take kill walk two hous becom sleep
Topic: 1
vampir british england uncl blood boy kill becom turn return human set also friend soldier take help discov queen young
Topic: 2
mother father famili children school girl boy home sister parent hous live brother friend find year becom take child away
Topic: 3
life famili becom love new son wife live daughter young marri year novel begin husband death relationship father work die
Topic: 4
love leav tell return meet take day see ask marri one make visit father two give back come find later
Topic: 5
kill power battl dragon forc fight attack armi use help lord citi return find dark take one defeat death destroy
Topic: 6
kill agent team investig murder attempt discov new plan member escap inform reveal use assassin secret work one oper help
Topic: 7
ship island captain sea command take board rescu return one find escap captur two arriv back leav land attack water
Topic: 8
human

In [12]:
corpus = ['''the magic man once gave me 100 dollars and then the dragon came and
                swept me magically with magic off to the swords sword wizard'''] 
         #'''my best friend had my back growing up and my family was always 
             #there and i loved love them''']

X_test = tf_vectorizer.transform(corpus)
doc_topic_dist_unnormalized = np.matrix(lda.transform(X_test))

# normalize the distribution (only needed if you want to work with the probabilities)
doc_topic_dist = doc_topic_dist_unnormalized/doc_topic_dist_unnormalized.sum(axis=1)

In [13]:
X_test

<1x500 sparse matrix of type '<class 'numpy.int64'>'
	with 3 stored elements in Compressed Sparse Row format>

In [14]:
doc_topic_dist_unnormalized

matrix([[0.01      , 0.01      , 0.01      , 0.01      , 0.01      ,
         0.24977069, 0.01      , 0.01      , 0.01      , 0.01      ,
         0.01      , 0.01      , 0.01      , 0.01      , 0.17066925,
         0.01      , 0.01      , 0.40956006, 0.01      , 0.01      ]])

So now you've gotten it to where you've "found" topics based on your big corpus, by training your count_vectorizer and LDA model on the original big corpus, and then by entering new text you can get apply the same vectorization transform to them and then get an LDA score for each sentence in terms of the topics you've already found!