In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

import sys; sys.path.append('../custom_python_packages')
from data_manipulation import data

In [2]:
books = data.loadAndClean('../../data/booksummaries/booksummaries.txt')
books = books[['bookGenre', 'plotSum', 'bookTitle']]

In [3]:
books.head(3)

Unnamed: 0,bookGenre,plotSum,bookTitle
0,"[roman_à_clef, satire, childrens_literature, s...","Old Major, the old boar on the Manor Farm, ca...",Animal Farm
1,"[science_fiction, novella, speculative_fiction...","Alex, a teenager living in near-future Englan...",A Clockwork Orange
2,"[existentialism, fiction, absurdist_fiction, n...",The text of The Plague is divided into five p...,The Plague


In [20]:
corpus = books.iloc[:,3]
corpus.shape

(12841,)

In [21]:
# get our documents into BOW style.
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=1000, stop_words='english')
tf = tf_vectorizer.fit_transform(corpus)
tf_feature_names = tf_vectorizer.get_feature_names()

In [29]:
tf.shape

(12841, 1000)

In [30]:
# Run LDA
lda = LatentDirichletAllocation(n_topics=8, max_iter=5, learning_method='online', learning_offset=50.,random_state=0)
lda.fit(tf)



LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=50.0,
             max_doc_update_iter=100, max_iter=5, mean_change_tol=0.001,
             n_components=10, n_jobs=None, n_topics=8, perp_tol=0.1,
             random_state=0, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [33]:
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic: "+ str(topic_idx))
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

display_topics(lda, tf_feature_names, 10)

Topic: 0
world time new people earth human book planet life years
Topic: 1
father mother family school home life children new time house
Topic: 2
king help city magic battle way tells kill return escape
Topic: 3
police man house murder doctor finds tells killed room case
Topic: 4
war tom george british states american army united michael new
Topic: 5
novel book story life love young family wife characters richard
Topic: 6
jack john narrator thomas alex james chapter anna wolfe bond
Topic: 7
ship crew island captain team rachel escape sea group mission


In [16]:
import nltk.chunk

In [46]:
example_corp = books.iloc[0:10,1][:300]

In [48]:
tokenized_example_corp = [word_tokenize(example_sent) for example_sent in example_corp]

In [49]:
chunked_example_sentences = [nltk.chunk.ne_chunk(pos_tag(tokenized_example_sent)) for tokenized_example_sent in tokenized_example_corp]

In [51]:
names=[]
for sentence in chunked_example_sentences:
    for chunk in sentence:
        if type(chunk) == nltk.tree.Tree:
            if chunk.label() == 'PERSON':
                names.append(' '.join([c[0] for c in chunk]))
names

['Old',
 'Snowball',
 'Napoleon',
 'Napoleon',
 'Napoleon',
 'Snowball',
 'Napoleon',
 'Squealer',
 'Napoleon',
 'Napoleon',
 'Squealer',
 'Napoleon',
 'Napoleon',
 'Squealer',
 'Napoleon',
 'Napoleon',
 'Jones',
 'Squealer',
 'Frederick',
 'Boxer',
 'Napoleon',
 'Boxer',
 'Benjamin',
 'Squealer',
 'Napoleon',
 'Napoleon',
 'Napoleon',
 'Napoleon',
 'Pilkington',
 'Squealer',
 'Old Major',
 'Animalism',
 'Napoleon',
 'Squealer',
 'Squealer',
 'Napoleon',
 'Animal Farm',
 'Alex',
 'Alex',
 'Pete',
 'Alex',
 'Ludwig Van',
 'Alex',
 'Alex',
 'Dim',
 'Alex',
 'Alex',
 'Deltoid',
 'Alex',
 'Alex',
 'Georgie',
 'Alex',
 'Alex',
 'Dim',
 'Georgie',
 'Alex',
 'Georgie',
 'Alex',
 'Dim',
 'Alex',
 'Alex',
 'Alex',
 'Alex',
 'Alex',
 'Alex',
 'Fifth',
 'Alex',
 'Alex',
 'Alex',
 'Alex',
 'Alex',
 'Alex',
 'Dim',
 'Billyboy',
 'Alex',
 'Dazed',
 'Alex',
 'Alex',
 'Alexander',
 'Alex',
 'Alexander',
 'Alexander',
 'Alex',
 'Alex',
 'Alexander',
 'Alex',
 'Alexander',
 'Alex',
 'Alex',
 'Alexander'

In [52]:
from collections import Counter

In [53]:
names_count = Counter(names)

In [54]:
names_count

Counter({'Old': 1,
         'Snowball': 2,
         'Napoleon': 17,
         'Squealer': 8,
         'Jones': 1,
         'Frederick': 1,
         'Boxer': 2,
         'Benjamin': 1,
         'Pilkington': 1,
         'Old Major': 1,
         'Animalism': 1,
         'Animal Farm': 1,
         'Alex': 39,
         'Pete': 2,
         'Ludwig Van': 1,
         'Dim': 4,
         'Deltoid': 1,
         'Georgie': 3,
         'Fifth': 1,
         'Billyboy': 1,
         'Dazed': 1,
         'Alexander': 6,
         'Bernard Rieux': 1,
         'Michel': 1,
         'Rieux': 7,
         'Raymond Rambert': 1,
         'Father Paneloux': 1,
         'Cottard': 5,
         'Jean Tarrou': 1,
         'Joseph Grand': 1,
         'Rambert': 5,
         'Tarrou': 6,
         'Gluck': 1,
         'Orpheus': 2,
         'Castel': 1,
         'Othon': 4,
         'Paneloux': 3,
         'Grand': 2,
         'Milky Way': 1,
         'Zones': 1,
         'Slow Zone': 1,
         'Suspicious': 1,
     