### Import Libraries and Data Set

In [60]:
import pandas as pd
import pickle
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.decomposition import NMF

In [61]:
with open('episodes.pickle','rb') as read_file:
    episodes = pickle.load(read_file)

episodes.head()

Unnamed: 0,Episode,Episode_Text
0,s1e01,"Hello. Hi. My name is Leslie Knope, and I work..."
1,s1e02,"Well, one of the funner things that we do here..."
2,s1e03,"Okay, now, see, here's a good example of a pla..."
3,s1e04,"So, we've been called out to this hiking trail..."
4,s1e05,"In a town as old as Pawnee, there's a lot of h..."


### Topic Modeling

#### Initial Model With No Stop Words

In [62]:
vectorizer = CountVectorizer()
doc_word = vectorizer.fit_transform(episodes.Episode_Text)
doc_word.shape

(125, 17378)

In [63]:
nmf_model = NMF(3)
doc_topic = nmf_model.fit_transform(doc_word)
doc_topic



array([[3.31863112, 2.20387707, 2.23150465],
       [3.93961158, 2.12028603, 1.60596938],
       [3.53329894, 4.2841771 , 0.        ],
       [3.17919686, 3.00471073, 1.77920504],
       [3.8900341 , 2.13706108, 1.41127637],
       [2.24097932, 5.36423713, 0.91918698],
       [3.71074046, 4.14868157, 0.27137849],
       [2.80173364, 3.04694323, 2.47517043],
       [2.70828457, 2.72104876, 1.76220089],
       [1.68399457, 6.41058152, 1.31152712],
       [4.47995263, 0.3132251 , 1.58648019],
       [3.1619122 , 3.8043182 , 0.32944369],
       [2.97338922, 3.8399051 , 1.13420041],
       [2.88381695, 3.68085148, 2.06972667],
       [4.8443509 , 0.9932011 , 0.16594705],
       [3.03300647, 4.58524029, 0.        ],
       [1.92372338, 3.98909553, 2.46052461],
       [3.46854751, 2.85772844, 1.34651549],
       [2.34234696, 7.76571527, 2.46021902],
       [3.00334972, 3.27202199, 2.11998497],
       [3.13093088, 3.52596364, 0.6858832 ],
       [1.70824317, 4.61641608, 1.48074252],
       [3.

In [64]:
topic_word = nmf_model.components_
#topic_word.shape
topic_word

array([[2.07296713e-02, 2.53667046e-01, 0.00000000e+00, ...,
        1.85052941e-03, 8.60877358e-05, 2.82908147e-03],
       [5.12774178e-02, 0.00000000e+00, 0.00000000e+00, ...,
        5.00628750e-04, 9.08554631e-03, 0.00000000e+00],
       [2.20278804e-01, 1.27841962e-02, 3.92571938e-03, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00]])

In [65]:
words = vectorizer.get_feature_names()
t = nmf_model.components_.argsort(axis=1)[:,-1:-8:-1]
topic_words = [[words[e] for e in l] for l in t]
topic_words

[['the', 'you', 'to', 'and', 'it', 'that', 'we'],
 ['you', 'that', 'it', 'the', 'my', 'and', 'to'],
 ['you', 'to', 'and', 'is', 'me', 'what', 'it']]

#### Topic Modeling With English Stop Words

In [66]:
vectorizer = CountVectorizer(stop_words = 'english' )
doc_word = vectorizer.fit_transform(episodes.Episode_Text)
doc_word.shape

(125, 17089)

In [67]:
nmf_model = NMF(3)
doc_topic = nmf_model.fit_transform(doc_word)
doc_topic.shape



(125, 3)

In [69]:
topic_word = nmf_model.components_
topic_word.shape

(3, 17089)

In [70]:
words = vectorizer.get_feature_names()
t = nmf_model.components_.argsort(axis=1)[:,-1:-11:-1]
topic_words = [[words[e] for e in l] for l in t]
topic_words

[['know',
  'just',
  'don',
  'yeah',
  'okay',
  'oh',
  'gonna',
  'like',
  'hey',
  'right'],
 ['like',
  'just',
  'oh',
  'pawnee',
  'know',
  'leslie',
  'don',
  'gonna',
  've',
  'time'],
 ['okay',
  'going',
  'just',
  'oh',
  'know',
  'don',
  'like',
  'yeah',
  'right',
  'leslie']]

#### Topic Modeling With Manually Chosen Stop Words

In [71]:
commonwords = ['sorry', 'guys', 'did', 'be', 'get', 'he', 'on', 'been', 'in', 'not', 'are', 'so', 'one', 'to', 'at',
            'for', 'but', 'the', 'me', 'your', 'is', 'this', 'if', 'just', 'that', 'of', 'my', 'do', 'was', 'have',
            'it', 'and', 'with', 'what', 'like', 'want', 'all', 'gonna', 'we', 'you', 're', 'there', 'here', 'okay',
            'no', 'up', 'yeah', 'don', 'they', 'now', 'go', 'well', 'hey', 'uh', 'can', 'who', 'how', 'know', 'as',
            'out', 'would', 'really', 'her', 'about', 'look', 'll', 'am', 've', 'let', 'good', 'an', 'from', 'has',
            'going', 'oh', 'she', 'or', 'got', 'our', 'take', 'when', 'then', 'will', 'some', 'need', 'had', 'say',
            'why', 'could', 'him', 'come', 'should', 'were', 'think', 'might', 'actually', 'them', 'his', 'hi',
            'thanks', 'more', 'because', 'please', 'thank', 'make', 'see', 'any', 'every', 'by', 'after', 'back',
            'very', 'away', 'being', 'way', 'long', 'else', 'most', 'said', 'too', 'other', 'each', 'new', 'into',
            'than', 'still', 'something', 'everything', 'happening', 'start', 'whole', 'talking', 'only', 'anything',
            'us', 'tell', 'talk', 'much', 'through', 'thing', 'pretty', 'sir', 'two', 'little', 'doing', 'guy',
              'does', 'mean', 'ever', 'yes', 'same', 'put', 'over', 'call', 'day', 'their', 'off', 'these', 'where',
              'stop', 'man', 'maybe', 'people', 'down', 'even', 'god', 'first', 'last', 'next', 'old', 'didn', 
               'capsule', 'having', 'name', 'find', 'ask', 'again', 'lot', 'before']
borderline = ['great']
names = ['leslie', 'knope', 'ron', 'swanson', 'ben', 'wyatt', 'april', 'ludgate', 'tom', 'haverford', 
         'ann', 'perkins', 'andy', 'dwyer', 'jerry', 'gergich', 'donna', 'meagle'] 
stopwords = commonwords + names + borderline
vectorizer = CountVectorizer(stop_words = stopwords)
doc_word = vectorizer.fit_transform(episodes.Episode_Text)
doc_word.shape

(125, 17180)

In [72]:
nmf_model = NMF(4)
doc_topic = nmf_model.fit_transform(doc_word)
doc_topic.shape

(125, 4)

In [73]:
topic_word = nmf_model.components_
topic_word.shape

(4, 17180)

In [74]:
words = vectorizer.get_feature_names()
t = nmf_model.components_.argsort(axis=1)[:,-1:-9:-1]
topic_words = [[words[e] for e in l] for l in t]
topic_words

[['right', 'time', 'love', 'sure', 'work', 'wait', 'never', 'night'],
 ['pawnee', 'time', 'eagleton', 'right', 'town', 'work', 'love', 'best'],
 ['newport', 'bobby', 'campaign', 'city', 'idea', 'right', 'job', 'love'],
 ['park', 'right', 'parks', 'pit', 'job', 'government', 'department', 'work']]

#### Topic Modeling With Max Adjustments

In [49]:
commonwords = ['sorry', 'guys', 'did', 'be', 'get', 'he', 'on', 'been', 'in', 'not', 'are', 'so', 'one', 'to', 'at',
            'for', 'but', 'the', 'me', 'your', 'is', 'this', 'if', 'just', 'that', 'of', 'my', 'do', 'was', 'have',
            'it', 'and', 'with', 'what', 'like', 'want', 'all', 'gonna', 'we', 'you', 're', 'there', 'here', 'okay',
            'no', 'up', 'yeah', 'don', 'they', 'now', 'go', 'well', 'hey', 'uh', 'can', 'who', 'how', 'know', 'as',
            'out', 'would', 'really', 'her', 'about', 'look', 'll', 'am', 've', 'let', 'good', 'an', 'from', 'has',
            'going', 'oh', 'she', 'or', 'got', 'our', 'take', 'when', 'then', 'will', 'some', 'need', 'had', 'say',
            'why', 'could', 'him', 'come', 'should', 'were', 'think', 'might', 'actually', 'them', 'his', 'hi',
            'thanks', 'more', 'because', 'please', 'thank', 'make', 'see', 'any', 'every', 'by', 'after', 'back',
            'very', 'away', 'being', 'way', 'long', 'else', 'most', 'said', 'too', 'other', 'each', 'new', 'into',
            'than', 'still', 'something', 'everything', 'happening', 'start', 'whole', 'talking', 'only', 'anything',
            'us', 'tell', 'talk', 'much', 'through', 'thing', 'pretty', 'sir', 'two', 'little', 'doing', 'guy',
              'does', 'mean', 'ever', 'yes', 'same', 'put', 'over', 'call', 'day', 'their', 'off', 'these', 'where',
              'stop', 'man', 'maybe', 'people', 'down', 'even', 'god', 'first', 'last', 'next', 'old', 'didn', 
               'capsule', 'having', 'name', 'find', 'ask', 'again', 'lot', 'before']
names = ['leslie', 'knope', 'ron', 'swanson', 'ben', 'wyatt', 'april', 'ludgate', 'tom', 'haverford', 
         'ann', 'perkins', 'andy', 'dwyer', 'jerry', 'gergich', 'donna', 'meagle'] 
stopwords = commonwords + names 
vectorizer = CountVectorizer(min_df = 2, max_df = 0.95, stop_words = stopwords)
doc_word = vectorizer.fit_transform(episodes.Episode_Text)
doc_word.shape

(122, 8654)

In [53]:
nmf_model = NMF(5)
doc_topic = nmf_model.fit_transform(doc_word)
doc_topic.shape

(122, 5)

In [54]:
topic_word = nmf_model.components_
topic_word.shape

(5, 8654)

In [55]:
words = vectorizer.get_feature_names()
t = nmf_model.components_.argsort(axis=1)[:,-1:-8:-1]
topic_words = [[words[e] for e in l] for l in t]
topic_words

[['wait', 'night', 'chris', 'work', 'fine', 'help', 'thought'],
 ['newport', 'bobby', 'campaign', 'city', 'idea', 'job', 'better'],
 ['pawnee', 'eagleton', 'town', 'work', 'book', 'everyone', 'joan'],
 ['park', 'parks', 'pit', 'government', 'department', 'mark', 'meeting'],
 ['work', 'kids', 'job', 'karate', 'pawnee', 'show', 'life']]