In [15]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

import sys; sys.path.append('../../src/helpers')
from data_manipulation import data

In [16]:
books = pd.read_csv('../../data/booksummaries/books_Porter_True.csv')

In [17]:
books.head(3)

Unnamed: 0,bookGenre,plotSum,bookTitle
0,"['roman_à_clef', 'satire', 'childrens_literatu...",old old boar manor farm call anim farm meet co...,Animal Farm
1,"['science_fiction', 'novella', 'speculative_fi...",teenag live nearfutur england lead gang nightl...,A Clockwork Orange
2,"['existentialism', 'fiction', 'absurdist_ficti...",text plagu divid five part town oran thousand ...,The Plague


In [18]:
corpus = books.iloc[:,1]
corpus.shape

(12841,)

In [19]:
# get our documents into BOW style.
tf_vectorizer = CountVectorizer(max_df=0.90, max_features=1000)
tf = tf_vectorizer.fit_transform(corpus)
tf_feature_names = tf_vectorizer.get_feature_names()

In [20]:
tf_feature_names[:3]

['34', '39', 'abandon']

In [21]:
tf.shape

(12841, 1000)

In [22]:
# Run LDA
lda = LatentDirichletAllocation(n_topics=8, max_iter=5, learning_method='online', learning_offset=50.,random_state=123)
lda.fit(tf)



LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=50.0,
             max_doc_update_iter=100, max_iter=5, mean_change_tol=0.001,
             n_components=10, n_jobs=None, n_topics=8, perp_tol=0.1,
             random_state=123, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [23]:
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic: "+ str(topic_idx))
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [24]:
display_topics(lda, tf_feature_names, 20)

Topic: 0
find kill one return power take help magic escap use attack back citi two way name leav dragon howev fight
Topic: 1
kill war forc attack ship offic order take armi plan state agent captain command use unit escap new attempt one
Topic: 2
mr return marri murder wife love de visit man hous take young sir letter father meet ladi leav arriv son
Topic: 3
human earth ship planet time world one alien space use year destroy race new system doctor discov find travel technolog
Topic: 4
famili father mother life becom love live year children son begin friend young new daughter child girl home die stori
Topic: 5
book novel stori charact first also life narrat one time chapter end work includ describ new set world part begin
Topic: 6
find get tell go one back day hous take see leav come tri say call friend meet make time goe
Topic: 7
peopl would one world tom wolf use also even human power make time way work state like could chang need


In [29]:
corpus = ['''the magic man once gave me 100 dollars and then the dragon came and
                swept me magically with magic off to the swords sword wizard''', 
         '''my best friend had my back growing up and my family was always 
             there and i loved love them''']

X_test = tf_vectorizer.transform(corpus)
doc_topic_dist_unnormalized = np.matrix(lda.transform(X_test))

# normalize the distribution (only needed if you want to work with the probabilities)
doc_topic_dist = doc_topic_dist_unnormalized/doc_topic_dist_unnormalized.sum(axis=1)

In [30]:
X_test

<2x1000 sparse matrix of type '<class 'numpy.int64'>'
	with 10 stored elements in Compressed Sparse Row format>

In [31]:
doc_topic_dist_unnormalized

matrix([[0.8905413 , 0.01562893, 0.01564237, 0.01563046, 0.01563336,
         0.01563696, 0.01564651, 0.01564011],
        [0.02505473, 0.02502553, 0.02503071, 0.02502919, 0.82473257,
         0.02501223, 0.0250976 , 0.02501744]])

In [32]:
doc_topic_dist

matrix([[0.8905413 , 0.01562893, 0.01564237, 0.01563046, 0.01563336,
         0.01563696, 0.01564651, 0.01564011],
        [0.02505473, 0.02502553, 0.02503071, 0.02502919, 0.82473257,
         0.02501223, 0.0250976 , 0.02501744]])

So now you've gotten it to where you've "found" topics based on your big corpus, by training your count_vectorizer and LDA model on the original big corpus, and then by entering new text you can get apply the same vectorization transform to them and then get an LDA score for each sentence in terms of the topics you've already found!