### Import Libraries and Data Set

In [6]:
import pandas as pd
import pickle
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.decomposition import NMF

In [7]:
# NOTE(review): pickle.load can execute arbitrary code while deserialising, so
# only open this file if it was produced by a trusted step of this project.
with open('all_scripts.pickle','rb') as read_file:
    all_scripts = pickle.load(read_file)

# Preview the first rows as a quick check on what was loaded.
all_scripts.head()

Unnamed: 0,Character,Line,Line_Number,Episode
0,Leslie Knope,Hello.,0,s1e01
1,Leslie Knope,Hi.,1,s1e01
2,Leslie Knope,"My name is Leslie Knope, and I work for the Pa...",2,s1e01
3,Leslie Knope,Can I ask you a few questions?,3,s1e01
4,Leslie Knope,"Would you say that you are, ""Enjoying yourself...",4,s1e01


### Topic Modeling

#### Initial Model Without Stop-Word Removal

In [88]:
# Bag-of-words counts for the Line column using CountVectorizer's defaults
# (nothing is filtered out as a stop word at this stage).
vectorizer = CountVectorizer()
doc_word = vectorizer.fit_transform(all_scripts['Line'])
doc_word.shape

(65942, 17378)

In [89]:
# Factor the document-term counts into 3 topics.
# n_components is passed by keyword for clarity, and random_state is pinned
# because NMF's initialisation draws on it; without a seed the topics listed
# below can differ from one run to the next.
nmf_model = NMF(n_components=3, random_state=42)
# doc_topic has one row per document and one column per topic.
doc_topic = nmf_model.fit_transform(doc_word)
doc_topic.shape

(65942, 3)

In [90]:
# components_ holds the fitted topic-by-term weights: one row per topic, one
# column per vocabulary entry of the vectorizer.
topic_word = nmf_model.components_
topic_word.shape

(3, 17378)

In [91]:
# Vocabulary terms, indexed by column of the document-term matrix.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installs.
# list() keeps `words` a plain list of str, as it was before.
if hasattr(vectorizer, 'get_feature_names_out'):
    words = list(vectorizer.get_feature_names_out())
else:
    words = vectorizer.get_feature_names()

# argsort is ascending, so the reversed slice -1:-8:-1 yields the column
# indices of the 7 largest weights in each topic, heaviest first.
t = nmf_model.components_.argsort(axis=1)[:, -1:-8:-1]
topic_words = [[words[idx] for idx in row] for row in t]
topic_words

[['the', 'of', 'in', 'is', 'and', 'that', 'for'],
 ['you', 're', 'know', 'what', 'are', 'that', 'do'],
 ['to', 'and', 'it', 'we', 'that', 'my', 'this']]

#### Topic Modeling With English Stop Words

In [84]:
# Same bag-of-words counts, this time discarding scikit-learn's built-in
# English stop-word list before building the vocabulary.
vectorizer = CountVectorizer(stop_words='english')
doc_word = vectorizer.fit_transform(all_scripts['Line'])
doc_word.shape

(65942, 17089)

In [85]:
# Factor the stop-word-filtered counts into 3 topics.
# n_components is passed by keyword for clarity, and random_state is pinned
# because NMF's initialisation draws on it; without a seed the topics listed
# below can differ from one run to the next.
nmf_model = NMF(n_components=3, random_state=42)
# doc_topic has one row per document and one column per topic.
doc_topic = nmf_model.fit_transform(doc_word)
doc_topic.shape

(65942, 3)

In [86]:
# components_ holds the fitted topic-by-term weights: one row per topic, one
# column per vocabulary entry of the vectorizer.
topic_word = nmf_model.components_
topic_word.shape

(3, 17089)

In [87]:
# Vocabulary terms, indexed by column of the document-term matrix.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installs.
# list() keeps `words` a plain list of str, as it was before.
if hasattr(vectorizer, 'get_feature_names_out'):
    words = list(vectorizer.get_feature_names_out())
else:
    words = vectorizer.get_feature_names()

# argsort is ascending, so the reversed slice -1:-8:-1 yields the column
# indices of the 7 largest weights in each topic, heaviest first.
t = nmf_model.components_.argsort(axis=1)[:, -1:-8:-1]
topic_words = [[words[idx] for idx in row] for row in t]
topic_words

[['know', 'don', 'like', 'want', 'think', 'really', 'leslie'],
 ['just', 'okay', 'like', 'gonna', 'let', 'say', 'll'],
 ['oh', 'god', 'yeah', 'okay', 'hey', 'uh', 'really']]

#### Topic Modeling With Manually Chosen Stop Words

In [131]:
# Hand-picked filler words to drop from the vocabulary. The contraction
# fragments ('don', 'll', 've', 're', 'didn') are listed as separate entries.
commonwords = [
    'sorry', 'guys', 'did', 'be', 'get', 'he', 'on', 'been', 'in', 'not', 'are', 'so', 'one', 'to', 'at',
    'for', 'but', 'the', 'me', 'your', 'is', 'this', 'if', 'just', 'that', 'of', 'my', 'do', 'was', 'have',
    'it', 'and', 'with', 'what', 'like', 'want', 'all', 'gonna', 'we', 'you', 're', 'there', 'here', 'okay',
    'no', 'up', 'yeah', 'don', 'they', 'now', 'go', 'well', 'hey', 'uh', 'can', 'who', 'how', 'know', 'as',
    'out', 'would', 'really', 'her', 'about', 'look', 'll', 'am', 've', 'let', 'good', 'an', 'from', 'has',
    'going', 'oh', 'she', 'or', 'got', 'our', 'take', 'when', 'then', 'will', 'some', 'need', 'had', 'say',
    'why', 'could', 'him', 'come', 'should', 'were', 'think', 'might', 'actually', 'them', 'his', 'hi',
    'thanks', 'more', 'because', 'please', 'thank', 'make', 'see', 'any', 'every', 'by', 'after', 'back',
    'very', 'away', 'being', 'way', 'long', 'else', 'most', 'said', 'too', 'other', 'each', 'new', 'into',
    'than', 'still', 'something', 'everything', 'happening', 'start', 'whole', 'talking', 'only', 'anything',
    'us', 'tell', 'talk', 'much', 'through', 'thing', 'pretty', 'sir', 'two', 'little', 'doing', 'guy',
    'does', 'mean', 'ever', 'yes', 'same', 'put', 'over', 'call', 'day', 'their', 'off', 'these', 'where',
    'stop', 'man', 'maybe', 'people', 'down', 'even', 'god', 'first', 'last', 'next', 'old', 'didn',
    'capsule', 'having', 'name', 'find', 'ask', 'again', 'lot', 'before',
]

# Currently empty and not part of the combined list below.
borderline = []

# Lower-cased character first and last names, excluded from the vocabulary too.
names = [
    'leslie', 'knope', 'ron', 'swanson', 'ben', 'wyatt', 'april', 'ludgate', 'tom', 'haverford',
    'ann', 'perkins', 'andy', 'dwyer', 'jerry', 'gergich', 'donna', 'meagle',
]

# Combined stop-word list: filler words first, then the names.
stopwords = [*commonwords, *names]
# Bag-of-words counts with the hand-built stop-word list removed from the
# vocabulary instead of scikit-learn's built-in English list.
vectorizer = CountVectorizer(stop_words=stopwords)
doc_word = vectorizer.fit_transform(all_scripts['Line'])
doc_word.shape

(65942, 17181)

In [132]:
# Factor the custom-stop-word counts into 4 topics.
# n_components is passed by keyword for clarity, and random_state is pinned
# because NMF's initialisation draws on it; without a seed the topics listed
# below can differ from one run to the next.
nmf_model = NMF(n_components=4, random_state=42)
# doc_topic has one row per document and one column per topic.
doc_topic = nmf_model.fit_transform(doc_word)
doc_topic.shape

(65942, 4)

In [133]:
# components_ holds the fitted topic-by-term weights: one row per topic, one
# column per vocabulary entry of the vectorizer.
topic_word = nmf_model.components_
topic_word.shape

(4, 17181)

In [134]:
# Vocabulary terms, indexed by column of the document-term matrix.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installs.
# list() keeps `words` a plain list of str, as it was before.
if hasattr(vectorizer, 'get_feature_names_out'):
    words = list(vectorizer.get_feature_names_out())
else:
    words = vectorizer.get_feature_names()

# argsort is ascending, so the reversed slice -1:-9:-1 yields the column
# indices of the 8 largest weights in each topic, heaviest first.
t = nmf_model.components_.argsort(axis=1)[:, -1:-9:-1]
topic_words = [[words[idx] for idx in row] for row in t]
topic_words

[['right', 'kind', 'life', 'feel', 'give', 'park', 'home', 'work'],
 ['time', 'spend', 'lost', 'together', 'thought', 'job', 'catch', 'show'],
 ['pawnee', 'love', 'city', 'work', 'town', 'parks', 'eagleton', 'department'],
 ['great', 'job', 'idea', 'work', 'news', 'city', 'story', 'place']]

#### Topic Modeling With a Maximum Document-Frequency (`max_df`) Cutoff

In [80]:
# A float max_df is read as a proportion of documents: any term that occurs in
# more than 0.09% of the lines is left out of the vocabulary.
vectorizer = CountVectorizer(max_df=0.0009)
doc_word = vectorizer.fit_transform(all_scripts['Line'])
doc_word.shape

(65942, 16693)

In [81]:
# Factor the max_df-filtered counts into 3 topics.
# n_components is passed by keyword for clarity, and random_state is pinned
# because NMF's initialisation draws on it; without a seed the topics listed
# below can differ from one run to the next.
nmf_model = NMF(n_components=3, random_state=42)
# doc_topic has one row per document and one column per topic.
doc_topic = nmf_model.fit_transform(doc_word)
doc_topic.shape

(65942, 3)

In [82]:
# components_ holds the fitted topic-by-term weights: one row per topic, one
# column per vocabulary entry of the vectorizer.
topic_word = nmf_model.components_
topic_word.shape

(3, 16693)

In [83]:
# Vocabulary terms, indexed by column of the document-term matrix.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installs.
# list() keeps `words` a plain list of str, as it was before.
if hasattr(vectorizer, 'get_feature_names_out'):
    words = list(vectorizer.get_feature_names_out())
else:
    words = vectorizer.get_feature_names()

# argsort is ascending, so the reversed slice -1:-8:-1 yields the column
# indices of the 7 largest weights in each topic, heaviest first.
t = nmf_model.components_.argsort(axis=1)[:, -1:-8:-1]
topic_words = [[words[idx] for idx in row] for row in t]
topic_words

[['kick', 'chop', 'master', 'throat', 'tighten', 'punch', 'drop'],
 ['ha', 'nose', 'classic', 'gross', 'feels', 'jerk', 'whoo'],
 ['blah', 'poker', 'calls', 'learn', 'basic', 'medical', 'records']]