In [33]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

import sys; sys.path.append('../../src/helpers')
from data_manipulation import data

In [34]:
books = pd.read_csv('../../data/booksummaries/books_Porter_True.csv')

In [35]:
books.head(3)

Unnamed: 0,bookGenre,plotSum,bookTitle
0,"['roman_à_clef', 'satire', 'childrens_literatu...",old old boar manor farm call anim farm meet co...,Animal Farm
1,"['science_fiction', 'novella', 'speculative_fi...",teenag live nearfutur england lead gang nightl...,A Clockwork Orange
2,"['existentialism', 'fiction', 'absurdist_ficti...",text plagu divid five part town oran thousand ...,The Plague


In [36]:
corpus = books.iloc[:,1]
corpus.shape

(12841,)

In [37]:
# get our documents into BOW style.
tf_vectorizer = CountVectorizer(max_df=0.90, max_features=1000)
tf = tf_vectorizer.fit_transform(corpus)
tf_feature_names = tf_vectorizer.get_feature_names()

In [38]:
tf_feature_names[:3]

['34', '39', 'abandon']

In [39]:
tf.shape

(12841, 1000)

In [40]:
# Run LDA
lda = LatentDirichletAllocation(n_topics=5, max_iter=5, learning_method='online', learning_offset=50.,random_state=123)
lda.fit(tf)



LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=50.0,
             max_doc_update_iter=100, max_iter=5, mean_change_tol=0.001,
             n_components=10, n_jobs=None, n_topics=5, perp_tol=0.1,
             random_state=123, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [41]:
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic: "+ str(topic_idx))
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [42]:
display_topics(lda, tf_feature_names, 20)

Topic: 0
find kill one return take back help power use magic escap tell attack leav come name make two citi way
Topic: 1
ship war forc kill attack take armi order new plan escap captain attempt command one use two return state unit
Topic: 2
find get tell murder one mr hous go day back take kill man leav see call say ask meet tri
Topic: 3
human book world novel one time earth stori use peopl also planet first new charact life year work becom power
Topic: 4
father famili mother love life becom live friend year school stori one new marri young begin home take time girl


In [43]:
corpus = ['''the magic man once gave me 100 dollars and then the dragon came and
                swept me magically with magic off to the swords sword wizard''', 
         '''my best friend had my back growing up and my family was always 
             there and i loved love them''']

X_test = tf_vectorizer.transform(corpus)
doc_topic_dist_unnormalized = np.matrix(lda.transform(X_test))

# normalize the distribution (only needed if you want to work with the probabilities)
doc_topic_dist = doc_topic_dist_unnormalized/doc_topic_dist_unnormalized.sum(axis=1)

In [44]:
X_test

<2x1000 sparse matrix of type '<class 'numpy.int64'>'
	with 10 stored elements in Compressed Sparse Row format>

In [45]:
doc_topic_dist_unnormalized

matrix([[0.89906603, 0.02507761, 0.02544386, 0.02517808, 0.02523442],
        [0.04095923, 0.040429  , 0.04102727, 0.04022157, 0.83736293]])

In [46]:
doc_topic_dist

matrix([[0.89906603, 0.02507761, 0.02544386, 0.02517808, 0.02523442],
        [0.04095923, 0.040429  , 0.04102727, 0.04022157, 0.83736293]])

So now you've gotten it to where you've "found" topics based on your big corpus, by training your count_vectorizer and LDA model on the original big corpus, and then by entering new text you can get apply the same vectorization transform to them and then get an LDA score for each sentence in terms of the topics you've already found!