### Import Libraries and Data Set

In [6]:
import pandas as pd
import pickle
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.decomposition import NMF

In [7]:
# Load the pickled episode table (columns seen below: Episode, Episode_Text).
# NOTE(review): pickle.load can run arbitrary code while deserializing --
# only open pickle files that come from a trusted source.
with open('episodes.pickle', mode='rb') as pickle_fh:
    episodes = pickle.load(pickle_fh)

episodes.head()

Unnamed: 0,Episode,Episode_Text
0,s1e01,"Hello. Hi. My name is Leslie Knope, and I work..."
1,s1e02,"Well, one of the funner things that we do here..."
2,s1e03,"Okay, now, see, here's a good example of a pla..."
3,s1e04,"So, we've been called out to this hiking trail..."
4,s1e05,"In a town as old as Pawnee, there's a lot of h..."


### Topic Modeling

#### Initial Model With No Stop Words

In [8]:
# Baseline bag-of-words counts: default CountVectorizer settings, no stop-word filtering.
episode_texts = episodes['Episode_Text']
vectorizer = CountVectorizer()
doc_word = vectorizer.fit_transform(episode_texts)
doc_word.shape

(122, 17378)

In [9]:
# Factorize the document-term counts into 3 topics.
# random_state is pinned so the factorization is repeatable across kernel restarts;
# without it, the randomized parts of NMF initialization are left unseeded.
nmf_model = NMF(n_components=3, random_state=42)
doc_topic = nmf_model.fit_transform(doc_word)
doc_topic.shape



(122, 3)

In [10]:
# components_ holds the fitted topic-term weights: one row per topic,
# one column per vocabulary term.
topic_word = nmf_model.components_
n_topics, n_terms = topic_word.shape
(n_topics, n_terms)

(3, 17378)

In [11]:
# Vocabulary terms, in column order of the document-term matrix.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when it exists and fall back on older installs.
if hasattr(vectorizer, "get_feature_names_out"):
    words = list(vectorizer.get_feature_names_out())
else:
    words = vectorizer.get_feature_names()

n_top_words = 7
# argsort is ascending, so walk each row backwards to get the heaviest terms first.
top_term_idx = nmf_model.components_.argsort(axis=1)[:, :-(n_top_words + 1):-1]
topic_words = [[words[term_id] for term_id in topic_row] for topic_row in top_term_idx]
topic_words

[['the', 'you', 'to', 'and', 'it', 'that', 'we'],
 ['you', 'that', 'it', 'to', 'the', 'my', 'and'],
 ['you', 'to', 'and', 'me', 'is', 'what', 'it']]

#### Topic Modeling With English Stop Words

In [12]:
# Same counts as before, but with scikit-learn's built-in English stop-word list removed.
episode_texts = episodes['Episode_Text']
vectorizer = CountVectorizer(stop_words='english')
doc_word = vectorizer.fit_transform(episode_texts)
doc_word.shape

(122, 17089)

In [13]:
# Refit a 3-topic NMF on the English-stop-word-filtered counts.
# random_state is pinned so re-running the notebook reproduces the same factorization.
nmf_model = NMF(n_components=3, random_state=42)
doc_topic = nmf_model.fit_transform(doc_word)
doc_topic.shape



(122, 3)

In [14]:
# Topic-term weight matrix from the refit model (topics x vocabulary terms).
topic_word = nmf_model.components_
n_topics, n_terms = topic_word.shape
(n_topics, n_terms)

(3, 17089)

In [15]:
# Vocabulary terms, in column order of the document-term matrix.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when it exists and fall back on older installs.
if hasattr(vectorizer, "get_feature_names_out"):
    words = list(vectorizer.get_feature_names_out())
else:
    words = vectorizer.get_feature_names()

n_top_words = 7
# argsort is ascending, so walk each row backwards to get the heaviest terms first.
top_term_idx = nmf_model.components_.argsort(axis=1)[:, :-(n_top_words + 1):-1]
topic_words = [[words[term_id] for term_id in topic_row] for topic_row in top_term_idx]
topic_words

[['know', 'just', 'don', 'okay', 'yeah', 'oh', 'like'],
 ['going', 'okay', 'just', 'oh', 'know', 'don', 'like'],
 ['like', 'just', 'oh', 'pawnee', 'know', 'gonna', 'don']]

#### Topic Modeling With Manually Chosen Stop Words

In [16]:
# Hand-picked filler / conversational words to exclude from the vocabulary.
commonwords = [
    'sorry', 'guys', 'did', 'be', 'get', 'he', 'on', 'been', 'in', 'not', 'are', 'so', 'one', 'to', 'at',
    'for', 'but', 'the', 'me', 'your', 'is', 'this', 'if', 'just', 'that', 'of', 'my', 'do', 'was', 'have',
    'it', 'and', 'with', 'what', 'like', 'want', 'all', 'gonna', 'we', 'you', 're', 'there', 'here', 'okay',
    'no', 'up', 'yeah', 'don', 'they', 'now', 'go', 'well', 'hey', 'uh', 'can', 'who', 'how', 'know', 'as',
    'out', 'would', 'really', 'her', 'about', 'look', 'll', 'am', 've', 'let', 'good', 'an', 'from', 'has',
    'going', 'oh', 'she', 'or', 'got', 'our', 'take', 'when', 'then', 'will', 'some', 'need', 'had', 'say',
    'why', 'could', 'him', 'come', 'should', 'were', 'think', 'might', 'actually', 'them', 'his', 'hi',
    'thanks', 'more', 'because', 'please', 'thank', 'make', 'see', 'any', 'every', 'by', 'after', 'back',
    'very', 'away', 'being', 'way', 'long', 'else', 'most', 'said', 'too', 'other', 'each', 'new', 'into',
    'than', 'still', 'something', 'everything', 'happening', 'start', 'whole', 'talking', 'only', 'anything',
    'us', 'tell', 'talk', 'much', 'through', 'thing', 'pretty', 'sir', 'two', 'little', 'doing', 'guy',
    'does', 'mean', 'ever', 'yes', 'same', 'put', 'over', 'call', 'day', 'their', 'off', 'these', 'where',
    'stop', 'man', 'maybe', 'people', 'down', 'even', 'god', 'first', 'last', 'next', 'old', 'didn',
    'capsule', 'having', 'name', 'find', 'ask', 'again', 'lot', 'before',
]
# Word kept in its own list so it can be toggled separately from the main list.
borderline = ['great']
# Character first and last names, excluded from the vocabulary.
names = [
    'leslie', 'knope', 'ron', 'swanson', 'ben', 'wyatt', 'april', 'ludgate', 'tom', 'haverford',
    'ann', 'perkins', 'andy', 'dwyer', 'jerry', 'gergich', 'donna', 'meagle',
]

# Combined custom stop-word list handed to the vectorizer.
stopwords = [*commonwords, *names, *borderline]
vectorizer = CountVectorizer(stop_words=stopwords)
doc_word = vectorizer.fit_transform(episodes['Episode_Text'])
doc_word.shape

(122, 17181)

In [17]:
# Fit a 4-topic NMF on the counts built with the custom stop-word list.
# random_state is pinned so re-running the notebook reproduces the same factorization.
nmf_model = NMF(n_components=4, random_state=42)
doc_topic = nmf_model.fit_transform(doc_word)
doc_topic.shape

(122, 4)

In [18]:
# Topic-term weight matrix from the 4-topic model (topics x vocabulary terms).
topic_word = nmf_model.components_
n_topics, n_terms = topic_word.shape
(n_topics, n_terms)

(4, 17181)

In [19]:
# Vocabulary terms, in column order of the document-term matrix.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when it exists and fall back on older installs.
if hasattr(vectorizer, "get_feature_names_out"):
    words = list(vectorizer.get_feature_names_out())
else:
    words = vectorizer.get_feature_names()

# This cell lists 8 terms per topic (the other top-word cells list 7).
n_top_words = 8
# argsort is ascending, so walk each row backwards to get the heaviest terms first.
top_term_idx = nmf_model.components_.argsort(axis=1)[:, :-(n_top_words + 1):-1]
topic_words = [[words[term_id] for term_id in topic_row] for topic_row in top_term_idx]
topic_words

[['right', 'time', 'love', 'great', 'sure', 'wait', 'never', 'work'],
 ['pawnee', 'time', 'right', 'love', 'eagleton', 'town', 'work', 'great'],
 ['newport', 'bobby', 'campaign', 'city', 'idea', 'great', 'right', 'job'],
 ['park', 'right', 'great', 'parks', 'pit', 'government', 'department', 'job']]

#### Topic Modeling With Max Adjustments (Custom Stop Words Plus min_df / max_df Filtering)

In [49]:
# Reuse the `commonwords` and `names` lists defined in the "Manually Chosen Stop Words"
# cell above instead of pasting a second verbatim copy of both lists here.
# The `borderline` list is not included in this model's stop words.
stopwords = commonwords + names

# min_df=2    -> ignore terms that appear in fewer than 2 episodes
# max_df=0.95 -> ignore terms that appear in more than 95% of episodes
vectorizer = CountVectorizer(min_df=2, max_df=0.95, stop_words=stopwords)
doc_word = vectorizer.fit_transform(episodes.Episode_Text)
doc_word.shape

(122, 8654)

In [53]:
# Fit a 5-topic NMF on the document-frequency-filtered counts.
# random_state is pinned so re-running the notebook reproduces the same factorization.
nmf_model = NMF(n_components=5, random_state=42)
doc_topic = nmf_model.fit_transform(doc_word)
doc_topic.shape

(122, 5)

In [54]:
# Topic-term weight matrix from the 5-topic model (topics x vocabulary terms).
topic_word = nmf_model.components_
n_topics, n_terms = topic_word.shape
(n_topics, n_terms)

(5, 8654)

In [55]:
# Vocabulary terms, in column order of the document-term matrix.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when it exists and fall back on older installs.
if hasattr(vectorizer, "get_feature_names_out"):
    words = list(vectorizer.get_feature_names_out())
else:
    words = vectorizer.get_feature_names()

n_top_words = 7
# argsort is ascending, so walk each row backwards to get the heaviest terms first.
top_term_idx = nmf_model.components_.argsort(axis=1)[:, :-(n_top_words + 1):-1]
topic_words = [[words[term_id] for term_id in topic_row] for topic_row in top_term_idx]
topic_words

[['wait', 'night', 'chris', 'work', 'fine', 'help', 'thought'],
 ['newport', 'bobby', 'campaign', 'city', 'idea', 'job', 'better'],
 ['pawnee', 'eagleton', 'town', 'work', 'book', 'everyone', 'joan'],
 ['park', 'parks', 'pit', 'government', 'department', 'mark', 'meeting'],
 ['work', 'kids', 'job', 'karate', 'pawnee', 'show', 'life']]