In [1]:
import pandas as pd
import pickle
import seaborn as sns 
import matplotlib.pyplot as plt
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from corextopic import corextopic as ct
from corextopic import vis_topic as vt
from wordcloud import STOPWORDS

import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize   
from nltk.stem import WordNetLemmatizer 

In [2]:
class LemmaTokenizer(object):
    """Callable that tokenizes a text and lemmatizes every resulting token.

    An instance is handed to ``CountVectorizer`` through its ``tokenizer``
    argument further down in this notebook.
    """

    def __init__(self):
        # A single WordNet lemmatizer is created once and reused on each call.
        self.wnl = WordNetLemmatizer()

    def __call__(self, articles):
        """Return the list of lemmatized tokens of ``articles``, in token order."""
        lemmas = []
        for token in word_tokenize(articles):
            lemmas.append(self.wnl.lemmatize(token))
        return lemmas

In [3]:
# Read the pickled table of Leslie Knope's lines and preview it.
# NOTE(review): pickle.load can run arbitrary code while unpickling, so this
# file must come from a trusted source.
with open('Leslie_all.pickle', mode='rb') as read_file:
    leslie_episodes = pickle.load(read_file)

# First five rows (the default for head()).
leslie_episodes.head(n=5)

Unnamed: 0,Character,Episode,Episode_Split,Episode_Text
0,Leslie Knope,s1e01,s1e01,"Hello. Hi. My name is Leslie Knope, and I work..."
1,Leslie Knope,s1e02,s1e02,"Well, one of the funner things that we do here..."
2,Leslie Knope,s1e03,s1e03,The Parks Department has so many programs. Jer...
3,Leslie Knope,s1e04,s1e04,"I don't believe it. Oh, my God. It's real. Hey..."
4,Leslie Knope,s1e05,s1e05,"In a town as old as Pawnee, there's a lot of h..."


In [5]:
# Clean Leslie's episode text with three regex passes.
#
# regex=True is passed explicitly: from pandas 2.0 on, Series.str.replace
# treats the pattern as a literal string by default, which would turn every
# pass below into a silent no-op. Raw strings keep the backslash escapes
# intact and avoid invalid-escape warnings.
#
# The pass order is kept as-is: short words are removed before punctuation,
# so a contraction such as "don't" ends up as "don".
leslie_episodes['Episode_Text'] = (
    leslie_episodes['Episode_Text']
    .str.replace(r'\d+', '', regex=True)             # digits
    .str.replace(r'(\b\w{1,2}\b)', '', regex=True)   # words of one or two characters
    .str.replace(r'[^\w\s]', '', regex=True)         # punctuation
)
leslie_episodes.head()

Unnamed: 0,Character,Episode,Episode_Split,Episode_Text
0,Leslie Knope,s1e01,s1e01,Hello name Leslie Knope and work for the P...
1,Leslie Knope,s1e02,s1e02,Well one the funner things that here Pawne...
2,Leslie Knope,s1e03,s1e03,The Parks Department has many programs Jerry ...
3,Leslie Knope,s1e04,s1e04,don believe God real Hey Hey Hello Boys ...
4,Leslie Knope,s1e05,s1e05,town old Pawnee there lot history every...


In [6]:
# Read the pickled table of Ron Swanson's lines and preview it.
# NOTE(review): pickle.load can run arbitrary code while unpickling, so this
# file must come from a trusted source.
with open('Ron_all.pickle', mode='rb') as read_file:
    ron_episodes = pickle.load(read_file)

# First five rows (the default for head()).
ron_episodes.head(n=5)

Unnamed: 0,Character,Episode,Episode_Split,Episode_Text
0,Ron Swanson,s1e01,s1e01,Tonight is our next monthly community outreach...
1,Ron Swanson,s1e02,s1e02,"Uh, sure, Paul. What can I do for you? Yeah, a..."
2,Ron Swanson,s1e03,s1e03,"No comment. Hey, Haverford, maybe one day you'..."
3,Ron Swanson,s1e04,s1e04,Go to jail? What's going on? Put it in an emai...
4,Ron Swanson,s1e05,s1e05,The only reason anybody's going to this thing ...


In [7]:
# Clean Ron's episode text with three regex passes.
#
# regex=True is passed explicitly: from pandas 2.0 on, Series.str.replace
# treats the pattern as a literal string by default, which would turn every
# pass below into a silent no-op. Raw strings keep the backslash escapes
# intact and avoid invalid-escape warnings.
#
# The pass order is kept as-is: short words are removed before punctuation,
# so a contraction such as "What's" ends up as "What".
ron_episodes['Episode_Text'] = (
    ron_episodes['Episode_Text']
    .str.replace(r'\d+', '', regex=True)             # digits
    .str.replace(r'(\b\w{1,2}\b)', '', regex=True)   # words of one or two characters
    .str.replace(r'[^\w\s]', '', regex=True)         # punctuation
)
ron_episodes.head()

Unnamed: 0,Character,Episode,Episode_Split,Episode_Text
0,Ron Swanson,s1e01,s1e01,Tonight our next monthly community outreach p...
1,Ron Swanson,s1e02,s1e02,sure Paul What can for you Yeah absolutely ...
2,Ron Swanson,s1e03,s1e03,comment Hey Haverford maybe one day you figur...
3,Ron Swanson,s1e04,s1e04,jail What going Put email Let not blow t...
4,Ron Swanson,s1e05,s1e05,The only reason anybody going this thing bec...


In [180]:
# Hand-picked filler words to drop from the vocabulary.
# Earlier candidates: ['use', 'someday', 'own'], plus 'own', 'went'.
commonwords = ['use', 'own', 'barely', 'bottom']

# Character first and last names, so topics are not dominated by who is
# being addressed.
names = [
    'leslie', 'knope', 'ron', 'swanson', 'ben', 'wyatt', 'april', 'ludgate',
    'tom', 'haverford', 'ann', 'perkins', 'andy', 'dwyer', 'jerry', 'gergich',
    'donna', 'meagle', 'tommy', 'mark', 'brendanawicz',
]

# NLTK's English stop words are loaded here but are not merged into
# stop_words below.
nltk_stopwords = stopwords.words('english')

# De-duplicated union of the two custom lists.
stop_words = list(set([*commonwords, *names]))

In [354]:
# Hand-picked filler words to drop from the vocabulary.
# Earlier candidates: ['use', 'someday', 'own'], plus 'own', 'went'.
commonwords = ['use', 'own', 'barely', 'bottom']

# Character first and last names, so topics are not dominated by who is
# being addressed.
names = [
    'leslie', 'knope', 'ron', 'swanson', 'ben', 'wyatt', 'april', 'ludgate',
    'tom', 'haverford', 'ann', 'perkins', 'andy', 'dwyer', 'jerry', 'gergich',
    'donna', 'meagle', 'tommy', 'mark', 'brendanawicz',
]

# NLTK's English stop words are not part of this list; the line loading them
# is left disabled:
#nltk_stopwords =  stopwords.words('english')

# De-duplicated union of the two custom lists.
stop_words = list(set([*commonwords, *names]))

In [355]:
# Binary bag-of-words over Leslie's episodes: each entry records whether a
# lemmatized token occurs in an episode, with the vocabulary capped at 20000
# features and the custom stop words removed.
vectorizer = CountVectorizer(max_features=20000,
                             stop_words=stop_words,
                             tokenizer=LemmaTokenizer(),
                             binary=True)

doc_word = vectorizer.fit_transform(leslie_episodes.Episode_Text)

# get_feature_names() was removed in scikit-learn 1.2. Prefer its replacement
# get_feature_names_out() and fall back only on older releases that lack it.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
words = list(np.asarray(feature_names))

In [356]:
# Anchored CorEx model with two topics, one seeded with 'park' and one with
# 'government'. The seed is fixed so the run is repeatable.
topic_model = ct.Corex(n_hidden=2, words=words,
                       max_iter=200, verbose=False, seed=1)

topic_model.fit(doc_word, words=words, docs=leslie_episodes.Episode_Text,
                anchors=[['park'],
                         ['government']],
                anchor_strength=2)

# Print all topics from the CorEx topic model.
# Only the first element of each topic entry (the word) is used. Indexing it
# directly, instead of unpacking zip(*topic) into exactly two names, keeps this
# working when entries carry extra fields (newer corextopic releases return
# (word, mi, sign) triples) and when a topic is empty.
topics = topic_model.get_topics()
for n, topic in enumerate(topics):
    topic_words = [entry[0] for entry in topic]
    print('{}: '.format(n) + ','.join(topic_words))

0: park,department,director,recreation,live,choice,summer,further,subcommittee,neighborhood
1: government,federal,wheel,turning,intern,ready,brainstorming,themselves,moved,truck


In [344]:
# Hand-picked filler words to drop from Ron's vocabulary.
commonwords = ['become', 'new', 'even']
# Earlier candidate lists, kept for reference:
#['such', 'new', 'wasn', 'also', 'even', 'will', 'this', 'should', 'must', 'out', 'consider', 'old',
#  'really', 'tomorrow', 'ever', 'not', 'recently', 'offering', 'made', 'room']
#['new', 'should', 'old', 'such', 'out', 'even', 'while', 'them', 'tomorrow']
#['new', 'should', 'old', 'such', 'even', 'seemed', 'while', 'them']
#['even', 'new', 'should', 'wasn', 'such', 'made', 'seemed', 'old']

# Character first and last names, so topics are not dominated by who is
# being addressed.
names = [
    'leslie', 'knope', 'ron', 'swanson', 'ben', 'wyatt', 'april', 'ludgate',
    'tom', 'haverford', 'ann', 'perkins', 'andy', 'dwyer', 'jerry', 'gergich',
    'donna', 'meagle', 'tommy', 'mark', 'brendanawicz',
]

# NLTK's English stop-word list; here it is merged into stop_words.
nltk_stopwords = stopwords.words('english')

# De-duplicated union of names, custom words and the NLTK list.
stop_words = list(set([*names, *commonwords, *nltk_stopwords]))

In [346]:
# Binary bag-of-words over Ron's episodes: each entry records whether a
# lemmatized token occurs in an episode, with the vocabulary capped at 20000
# features and the stop words removed.
vectorizer = CountVectorizer(max_features=20000,
                             stop_words=stop_words,
                             tokenizer=LemmaTokenizer(),
                             binary=True)

doc_word = vectorizer.fit_transform(ron_episodes.Episode_Text)

# get_feature_names() was removed in scikit-learn 1.2. Prefer its replacement
# get_feature_names_out() and fall back only on older releases that lack it.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
words = list(np.asarray(feature_names))

In [347]:
# Anchored CorEx model with two topics, one seeded with 'park' and one with
# 'government'. The seed is fixed so the run is repeatable.
topic_model = ct.Corex(n_hidden=2, words=words,
                       max_iter=200, verbose=False, seed=1)

topic_model.fit(doc_word, words=words, docs=ron_episodes.Episode_Text,
                anchors=[['park'],
                         ['government']],
                anchor_strength=2)

# Print all topics from the CorEx topic model.
# Only the first element of each topic entry (the word) is used. Indexing it
# directly, instead of unpacking zip(*topic) into exactly two names, keeps this
# working when entries carry extra fields (newer corextopic releases return
# (word, mi, sign) triples) and when a topic is empty.
topics = topic_model.get_topics()
for n, topic in enumerate(topics):
    topic_words = [entry[0] for entry in topic]
    print('{}: '.format(n) + ','.join(topic_words))

0: park,drive,want,job,killed,nothing,girl,better,person,would
1: government,office,taxpayer,meet,large,crap,stand,dollar,hired,wasting
