In [113]:
import pandas as pd
import pickle
import seaborn as sns 
import matplotlib.pyplot as plt
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from corextopic import corextopic as ct
from corextopic import vis_topic as vt
from wordcloud import STOPWORDS

import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize   
from nltk.stem import WordNetLemmatizer 


In [100]:
class LemmaTokenizer:
    """Callable tokenizer: split text with NLTK and lemmatize every token.

    Instances are meant to be handed to ``CountVectorizer(tokenizer=...)``,
    which calls the instance once per document.
    """

    def __init__(self):
        # A single WordNet lemmatizer is created once and reused for all calls.
        self.wnl = WordNetLemmatizer()

    def __call__(self, articles):
        """Return the list of lemmatized tokens for the text in ``articles``."""
        lemmatize = self.wnl.lemmatize
        lemmas = []
        for token in word_tokenize(articles):
            lemmas.append(lemmatize(token))
        return lemmas
        

In [120]:
# Load the pickled per-episode dialogue for Ron Swanson and preview it.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only load pickle files that come from a trusted source.
with open('Ron_all.pickle', 'rb') as fh:
    ron_episodes = pickle.load(fh)

ron_episodes.head()

Unnamed: 0,Character,Episode,Episode_Split,Episode_Text
0,Ron Swanson,s1e01,s1e01,Tonight is our next monthly community outreach...
1,Ron Swanson,s1e02,s1e02,"Uh, sure, Paul. What can I do for you? Yeah, a..."
2,Ron Swanson,s1e03,s1e03,"No comment. Hey, Haverford, maybe one day you'..."
3,Ron Swanson,s1e04,s1e04,Go to jail? What's going on? Put it in an emai...
4,Ron Swanson,s1e05,s1e05,The only reason anybody's going to this thing ...


In [121]:
# Strip noise from the episode text before vectorizing.
# `regex=True` is passed explicitly: since pandas 2.0 `Series.str.replace`
# treats the pattern as a literal string by default, which would make all
# three substitutions silent no-ops. Patterns are raw strings so that `\d`,
# `\b`, `\w` and `\s` reach the regex engine unchanged.
ron_episodes['Episode_Text'] = (
    ron_episodes['Episode_Text']
    .str.replace(r'\d+', '', regex=True)               # for digits
    .str.replace(r'(\b\w{1,2}\b)', '', regex=True)     # for words of 1-2 characters
    .str.replace(r'[^\w\s]', '', regex=True)           # for punctuation
)

In [125]:
# Hand-picked high-frequency filler words.
# NOTE: in this cell `commonwords` is defined but NOT merged into `stop_words`
# below; only the character names and the NLTK English list are combined.
commonwords = """
    sorry guys did be get he on been in not are so one to at
    for but the me your is this if just that of my do was have
    it and with what like want all gonna we you re there here okay
    no up yeah don they now go well hey uh can who how know as
    out would really her about look ll am ve let good an from has
    going oh she or got our take when then will some need had say
    why could him come should were think might actually them his hi
    thanks more because please thank make see any every by after back
    very away being way long else most said too other each new into
    than still something everything happening start whole talking only anything
    us tell talk much through thing pretty sir two little doing guy
    does mean ever yes same put over call day their off these where
    stop man maybe people down even god first last next old didn
    capsule having name find ask again lot before must wasn use
""".split()

# Character first/last names, removed so that topics are not dominated by them.
names = [
    'leslie', 'knope',
    'ron', 'swanson',
    'ben', 'wyatt',
    'april', 'ludgate',
    'tom', 'haverford',
    'ann', 'perkins',
    'andy', 'dwyer',
    'jerry', 'gergich',
    'donna', 'meagle',
    'tommy', 'mark',
]

# English stop-word list shipped with NLTK.
nltk_stopwords = stopwords.words('english')

# De-duplicated union of the name list and the NLTK list.
stop_words = list(set([*names, *nltk_stopwords]))

In [181]:
# Extra words excluded for the Ron Swanson run, on top of names and NLTK stop words.
commonwords = "cream cheese want new large made small throat old feel must".split()

# Character first/last names, removed so that topics are not dominated by them.
names = [
    'leslie', 'knope',
    'ron', 'swanson',
    'ben', 'wyatt',
    'april', 'ludgate',
    'tom', 'haverford',
    'ann', 'perkins',
    'andy', 'dwyer',
    'jerry', 'gergich',
    'donna', 'meagle',
    'tommy', 'mark',
]

# English stop-word list shipped with NLTK.
nltk_stopwords = stopwords.words('english')

# De-duplicated union of names, NLTK stop words and the extra words above.
stop_words = list(set([*names, *nltk_stopwords, *commonwords]))

In [182]:
# Build a binary document-term matrix for Ron's episodes: tokens come from
# LemmaTokenizer, the words in `stop_words` are dropped, and the vocabulary
# is capped at 20000 features.
vectorizer = CountVectorizer(
    max_features=20000,
    stop_words=stop_words,
    tokenizer=LemmaTokenizer(),
    binary=True,
)

doc_word = vectorizer.fit_transform(ron_episodes.Episode_Text)

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out()` and fall back only on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    words = list(np.asarray(vectorizer.get_feature_names_out()))
else:
    words = list(np.asarray(vectorizer.get_feature_names()))

In [177]:
# Fit an anchored CorEx topic model with two topics on Ron's document-term
# matrix; one anchor list is supplied per topic ('park' and 'government').
topic_model = ct.Corex(n_hidden=2, words=words,
                       max_iter=200, verbose=False, seed=1)

topic_model.fit(
    doc_word,
    words=words,
    docs=ron_episodes.Episode_Text,
    anchors=[['park'], ['government']],
    anchor_strength=2,
)

# Print all topics from the CorEx topic model
topics = topic_model.get_topics()
for n, topic in enumerate(topics):
    # Each entry is a tuple whose first element is the word. The tuple length
    # differs between corextopic releases ((word, mi) vs (word, mi, sign)),
    # so take element 0 instead of unpacking a fixed number of fields. This
    # also copes with an empty topic, where `zip(*topic)` yields nothing.
    topic_words = [entry[0] for entry in topic]
    print('{}: '.format(n) + ','.join(topic_words))

0: park,leave,took,adult,rest,fine,honestly,shotgun,two,hello
1: government,taxpayer,meet,project,crap,ridiculous,office,win,right,drink


In [183]:
# Re-fit the anchored two-topic CorEx model on Ron's document-term matrix
# (same settings as the preceding fit cell: anchors 'park' / 'government').
topic_model = ct.Corex(n_hidden=2, words=words,
                       max_iter=200, verbose=False, seed=1)

topic_model.fit(
    doc_word,
    words=words,
    docs=ron_episodes.Episode_Text,
    anchors=[['park'], ['government']],
    anchor_strength=2,
)

# Print all topics from the CorEx topic model
topics = topic_model.get_topics()
for n, topic in enumerate(topics):
    # Each entry is a tuple whose first element is the word. The tuple length
    # differs between corextopic releases ((word, mi) vs (word, mi, sign)),
    # so take element 0 instead of unpacking a fixed number of fields. This
    # also copes with an empty topic, where `zip(*topic)` yields nothing.
    topic_words = [entry[0] for entry in topic]
    print('{}: '.format(n) + ','.join(topic_words))

0: park,leave,took,adult,rest,fine,honestly,shotgun,two,hello
1: government,taxpayer,meet,project,crap,ridiculous,office,win,right,drink


In [41]:
# Load the pickled per-episode dialogue for Leslie Knope and preview it.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only load pickle files that come from a trusted source.
with open('Leslie_all.pickle', 'rb') as fh:
    leslie_episodes = pickle.load(fh)

leslie_episodes.head()

Unnamed: 0,Character,Episode,Episode_Split,Episode_Text
0,Leslie Knope,s1e01,s1e01,"Hello. Hi. My name is Leslie Knope, and I work..."
1,Leslie Knope,s1e02,s1e02,"Well, one of the funner things that we do here..."
2,Leslie Knope,s1e03,s1e03,The Parks Department has so many programs. Jer...
3,Leslie Knope,s1e04,s1e04,"I don't believe it. Oh, my God. It's real. Hey..."
4,Leslie Knope,s1e05,s1e05,"In a town as old as Pawnee, there's a lot of h..."


In [235]:
# Extra words excluded for the Leslie Knope run, on top of names and NLTK stop words.
commonwords = "wheel turning use barely bottom truck bird page".split()

# Character first/last names, removed so that topics are not dominated by them.
names = [
    'leslie', 'knope',
    'ron', 'swanson',
    'ben', 'wyatt',
    'april', 'ludgate',
    'tom', 'haverford',
    'ann', 'perkins',
    'andy', 'dwyer',
    'jerry', 'gergich',
    'donna', 'meagle',
    'tommy', 'mark',
]

# English stop-word list shipped with NLTK.
nltk_stopwords = stopwords.words('english')

# De-duplicated union of NLTK stop words, names and the extra words above.
stop_words = list(set([*nltk_stopwords, *names, *commonwords]))

In [233]:
# Build a binary document-term matrix for Leslie's episodes: tokens come from
# LemmaTokenizer, the words in `stop_words` are dropped, and the vocabulary
# is capped at 20000 features.
vectorizer = CountVectorizer(
    max_features=20000,
    stop_words=stop_words,
    tokenizer=LemmaTokenizer(),
    binary=True,
)

doc_word = vectorizer.fit_transform(leslie_episodes.Episode_Text)

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out()` and fall back only on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    words = list(np.asarray(vectorizer.get_feature_names_out()))
else:
    words = list(np.asarray(vectorizer.get_feature_names()))

In [237]:
# Fit an anchored CorEx topic model with two topics on Leslie's document-term
# matrix; one anchor list is supplied per topic ('park' and 'government').
topic_model = ct.Corex(n_hidden=2, words=words,
                       max_iter=200, verbose=False, seed=2)

topic_model.fit(
    doc_word,
    words=words,
    docs=leslie_episodes.Episode_Text,
    anchors=[['park'], ['government']],
    anchor_strength=2,
)

# Print all topics from the CorEx topic model
topics = topic_model.get_topics()
for n, topic in enumerate(topics):
    # Each entry is a tuple whose first element is the word. The tuple length
    # differs between corextopic releases ((word, mi) vs (word, mi, sign)),
    # so take element 0 instead of unpacking a fixed number of fields. This
    # also copes with an empty topic, where `zip(*topic)` yields nothing.
    topic_words = [entry[0] for entry in topic]
    print('{}: '.format(n) + ','.join(topic_words))

0: park,director,recreation,department,choice,year,system,sell,neighborhood,summer
1: government,ready,gun,48,intern,staff,except,notice,people,budget


In [227]:
# Ask the fitted model for 2 documents for topic index 1; the displayed result
# is a list of tuples whose first element is the document text.
# NOTE(review): presumably these are the documents most strongly associated
# with the topic — confirm against the corextopic documentation.
topic_model.get_top_docs(topic=1, n_docs=2)

[('Hello. Hi. My name is Leslie Knope, and I work for the Parks and Recreation Department. Can I ask you a few questions? Would you say that you are, "Enjoying yourself and having fun, having a moderate amount of fun and somewhat enjoying yourself, or having no fun and no enjoyment?" I\'m gonna put a lot of fun. Sir, this is a children\'s slide. You\'re not allowed to sleep in here. You know, when I first tell people that I work in the government, they say, "Oh." "The government." "The government stinks." "The lines are too long at the DMV." But now things have changed. People need our help. And it feels good to be needed. Could you put your arms to your side? And that might help you slide down a little easier. Do you want to come this way? Okay, we\'re gonna need you to get out. Get out of the slide. Okay? Here we go! Okay, wake up. Here we go. Out of the slide. You know, government isn\'t just a boys\' club anymore. Women are everywhere. It\'s a great time to be a woman in politics. 

In [206]:
# Fit an anchored CorEx topic model with two topics on Leslie's document-term
# matrix. Both anchor lists contain the same word, 'government'.
# NOTE(review): anchoring both topics to one word looks like an experiment to
# split that theme in two — confirm this is intentional.
topic_model = ct.Corex(n_hidden=2, words=words,
                       max_iter=200, verbose=False, seed=1)

topic_model.fit(
    doc_word,
    words=words,
    docs=leslie_episodes.Episode_Text,
    anchors=[['government'], ['government']],
    anchor_strength=2,
)

# Print all topics from the CorEx topic model
topics = topic_model.get_topics()
for n, topic in enumerate(topics):
    # Each entry is a tuple whose first element is the word. The tuple length
    # differs between corextopic releases ((word, mi) vs (word, mi, sign)),
    # so take element 0 instead of unpacking a fixed number of fields. This
    # also copes with an empty topic, where `zip(*topic)` yields nothing.
    topic_words = [entry[0] for entry in topic]
    print('{}: '.format(n) + ','.join(topic_words))

0: government,month,losing,taken,people,judge,somewhere,chapter,monster,emailing
1: government,intern,bird,park,community,prepared,vendor,priority,term,director


In [94]:
# Build a binary document-term matrix for Ron's episodes using
# CountVectorizer's default tokenization and NLTK's English stop words.
vectorizer = CountVectorizer(
    max_features=200000,
    # `stopwords` is the NLTK corpus reader imported at the top of the
    # notebook, not a list of words; CountVectorizer needs the word list.
    # NOTE(review): if a custom list (e.g. `stop_words`) was intended here,
    # swap it in.
    stop_words=stopwords.words('english'),
    # token_pattern="\\b[a-z][a-z]+\\b",
    binary=True,
)

doc_word = vectorizer.fit_transform(ron_episodes.Episode_Text)

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out()` and fall back only on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    words = list(np.asarray(vectorizer.get_feature_names_out()))
else:
    words = list(np.asarray(vectorizer.get_feature_names()))

In [97]:
# Fit an anchored CorEx topic model with two topics on Ron's document-term
# matrix; one anchor list is supplied per topic ('parks' and 'government').
topic_model = ct.Corex(n_hidden=2, words=words,
                       max_iter=200, verbose=False, seed=5)

topic_model.fit(
    doc_word,
    words=words,
    docs=ron_episodes.Episode_Text,
    anchors=[['parks'], ['government']],
    anchor_strength=2,
)

# Print all topics from the CorEx topic model
topics = topic_model.get_topics()
for n, topic in enumerate(topics):
    # Each entry is a tuple whose first element is the word. The tuple length
    # differs between corextopic releases ((word, mi) vs (word, mi, sign)),
    # so take element 0 instead of unpacking a fixed number of fields. This
    # also copes with an empty topic, where `zip(*topic)` yields nothing.
    topic_words = [entry[0] for entry in topic]
    print('{}: '.format(n) + ','.join(topic_words))

0: parks,pounds,waste,face,three,citizen,remember,seven,hole,self
1: government,right,person,dollars,stand,tax,wasting,speech,offering,40
