### Import Libraries and Data Set

In [28]:
import pandas as pd
import pickle
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer

<br>

## Topic Modeling With Individual Line Data Set

In [3]:
# Read the line-level script table from disk and preview it.
# pickle executes code on load, so only open files produced by this project.
with open('all_scripts.pickle', mode='rb') as read_file:
    all_scripts = pickle.load(read_file)

all_scripts.head(5)

Unnamed: 0,Character,Line,Line_Number,Episode,Episode_Split
0,Leslie Knope,Hello.,0,s1e01,s1e01
1,Leslie Knope,Hi.,1,s1e01,s1e01
2,Leslie Knope,"My name is Leslie Knope, and I work for the Pa...",2,s1e01,s1e01
3,Leslie Knope,Can I ask you a few questions?,3,s1e01,s1e01
4,Leslie Knope,"Would you say that you are, ""Enjoying yourself...",4,s1e01,s1e01


<br>

### Topic Modeling - CountVectorizer and NMF

#### Initial Model With No Stop Words

In [88]:
# Baseline bag-of-words: every token is kept, one document per script line.
vectorizer = CountVectorizer()
doc_word = vectorizer.fit_transform(all_scripts["Line"])
doc_word.shape

(65942, 17378)

In [89]:
# Factorise the document-term matrix into 3 topics.
# NMF relies on a random number generator (initialisation / coordinate
# descent), so the seed is pinned to make the topics repeatable across runs.
nmf_model = NMF(n_components=3, random_state=42)
doc_topic = nmf_model.fit_transform(doc_word)
doc_topic.shape

(65942, 3)

In [90]:
# Topic-by-term weight matrix of the fitted NMF model (one row per topic).
topic_word = nmf_model.components_
topic_word.shape

(3, 17378)

In [91]:
# Vocabulary in column order. get_feature_names() was deprecated in
# scikit-learn 1.0 and removed in 1.2, so prefer get_feature_names_out()
# and fall back only on older installs.
if hasattr(vectorizer, "get_feature_names_out"):
    words = vectorizer.get_feature_names_out()
else:
    words = vectorizer.get_feature_names()
# Column indices of the 7 heaviest terms per topic, largest weight first.
t = nmf_model.components_.argsort(axis=1)[:, -1:-8:-1]
topic_words = [[words[idx] for idx in row] for row in t]
topic_words

[['the', 'of', 'in', 'is', 'and', 'that', 'for'],
 ['you', 're', 'know', 'what', 'are', 'that', 'do'],
 ['to', 'and', 'it', 'we', 'that', 'my', 'this']]

#### Topic Modeling With English Stop Words

In [84]:
# Same line-level counts, minus scikit-learn's built-in English stop list.
vectorizer = CountVectorizer(stop_words="english")
doc_word = vectorizer.fit_transform(all_scripts["Line"])
doc_word.shape

(65942, 17089)

In [85]:
# 3-topic NMF on the stop-word-filtered counts.
# The seed is fixed because NMF draws on a random number generator; without
# it the topics are not guaranteed to repeat between runs.
nmf_model = NMF(n_components=3, random_state=42)
doc_topic = nmf_model.fit_transform(doc_word)
doc_topic.shape

(65942, 3)

In [86]:
# Topic-by-term weights from the model fitted in the previous cell.
topic_word = nmf_model.components_
topic_word.shape

(3, 17089)

In [87]:
# get_feature_names() no longer exists in scikit-learn >= 1.2; use the
# replacement when available and keep the old call for legacy installs.
if hasattr(vectorizer, "get_feature_names_out"):
    words = vectorizer.get_feature_names_out()
else:
    words = vectorizer.get_feature_names()
# Top 7 term indices per topic, ordered from highest to lowest weight.
t = nmf_model.components_.argsort(axis=1)[:, -1:-8:-1]
topic_words = [[words[idx] for idx in row] for row in t]
topic_words

[['know', 'don', 'like', 'want', 'think', 'really', 'leslie'],
 ['just', 'okay', 'like', 'gonna', 'let', 'say', 'll'],
 ['oh', 'god', 'yeah', 'okay', 'hey', 'uh', 'really']]

#### Topic Modeling With Manually Chosen Stop Words

In [131]:
# Hand-picked filler words to drop from the vocabulary.
commonwords = [
    'sorry', 'guys', 'did', 'be', 'get', 'he', 'on', 'been', 'in', 'not', 'are', 'so', 'one', 'to', 'at',
    'for', 'but', 'the', 'me', 'your', 'is', 'this', 'if', 'just', 'that', 'of', 'my', 'do', 'was', 'have',
    'it', 'and', 'with', 'what', 'like', 'want', 'all', 'gonna', 'we', 'you', 're', 'there', 'here', 'okay',
    'no', 'up', 'yeah', 'don', 'they', 'now', 'go', 'well', 'hey', 'uh', 'can', 'who', 'how', 'know', 'as',
    'out', 'would', 'really', 'her', 'about', 'look', 'll', 'am', 've', 'let', 'good', 'an', 'from', 'has',
    'going', 'oh', 'she', 'or', 'got', 'our', 'take', 'when', 'then', 'will', 'some', 'need', 'had', 'say',
    'why', 'could', 'him', 'come', 'should', 'were', 'think', 'might', 'actually', 'them', 'his', 'hi',
    'thanks', 'more', 'because', 'please', 'thank', 'make', 'see', 'any', 'every', 'by', 'after', 'back',
    'very', 'away', 'being', 'way', 'long', 'else', 'most', 'said', 'too', 'other', 'each', 'new', 'into',
    'than', 'still', 'something', 'everything', 'happening', 'start', 'whole', 'talking', 'only', 'anything',
    'us', 'tell', 'talk', 'much', 'through', 'thing', 'pretty', 'sir', 'two', 'little', 'doing', 'guy',
    'does', 'mean', 'ever', 'yes', 'same', 'put', 'over', 'call', 'day', 'their', 'off', 'these', 'where',
    'stop', 'man', 'maybe', 'people', 'down', 'even', 'god', 'first', 'last', 'next', 'old', 'didn',
    'capsule', 'having', 'name', 'find', 'ask', 'again', 'lot', 'before',
]
# Defined but not added to the stop list in this cell.
borderline = []
# Names are excluded as well.
names = [
    'leslie', 'knope', 'ron', 'swanson', 'ben', 'wyatt', 'april', 'ludgate', 'tom', 'haverford',
    'ann', 'perkins', 'andy', 'dwyer', 'jerry', 'gergich', 'donna', 'meagle',
]
stopwords = [*commonwords, *names]
vectorizer = CountVectorizer(stop_words=stopwords)
doc_word = vectorizer.fit_transform(all_scripts["Line"])
doc_word.shape

(65942, 17181)

In [132]:
# 4-topic NMF on the manually filtered counts.
# random_state pins NMF's random number generator so reruns can reproduce
# the same topics.
nmf_model = NMF(n_components=4, random_state=42)
doc_topic = nmf_model.fit_transform(doc_word)
doc_topic.shape

(65942, 4)

In [133]:
# Topic-by-term weight matrix for the 4-topic model.
topic_word = nmf_model.components_
topic_word.shape

(4, 17181)

In [134]:
# Prefer get_feature_names_out(): the older get_feature_names() was
# deprecated in scikit-learn 1.0 and removed in 1.2.
if hasattr(vectorizer, "get_feature_names_out"):
    words = vectorizer.get_feature_names_out()
else:
    words = vectorizer.get_feature_names()
# Top 8 term indices per topic, heaviest first.
t = nmf_model.components_.argsort(axis=1)[:, -1:-9:-1]
topic_words = [[words[idx] for idx in row] for row in t]
topic_words

[['right', 'kind', 'life', 'feel', 'give', 'park', 'home', 'work'],
 ['time', 'spend', 'lost', 'together', 'thought', 'job', 'catch', 'show'],
 ['pawnee', 'love', 'city', 'work', 'town', 'parks', 'eagleton', 'department'],
 ['great', 'job', 'idea', 'work', 'news', 'city', 'story', 'place']]

#### Topic Modeling With Document-Frequency Limits (max_df / min_df)

In [4]:
# Frequency-based pruning instead of a stop list: keep terms found in at
# least 2 lines (min_df) and in no more than 0.09% of all lines (max_df).
vectorizer = CountVectorizer(min_df=2, max_df=0.0009)
doc_word = vectorizer.fit_transform(all_scripts["Line"])
doc_word.shape

(65942, 9053)

In [5]:
# 4-topic NMF on the frequency-pruned counts.
# Seeded so that reruns can reproduce the same factorisation.
nmf_model = NMF(n_components=4, random_state=42)
doc_topic = nmf_model.fit_transform(doc_word)
doc_topic.shape

(65942, 4)

In [6]:
# Topic-by-term weights of the model fitted on the pruned vocabulary.
topic_word = nmf_model.components_
topic_word.shape

(4, 9053)

In [7]:
# get_feature_names() was removed in scikit-learn 1.2; fall back to it only
# when the replacement API is missing.
if hasattr(vectorizer, "get_feature_names_out"):
    words = vectorizer.get_feature_names_out()
else:
    words = vectorizer.get_feature_names()
# Top 7 term indices per topic, heaviest first.
t = nmf_model.components_.argsort(axis=1)[:, -1:-8:-1]
topic_words = [[words[idx] for idx in row] for row in t]
topic_words

[['kick', 'chop', 'master', 'throat', 'tighten', 'punch', 'drop'],
 ['ha', 'nose', 'classic', 'gross', 'feels', 'jerk', 'whoo'],
 ['blah', 'poker', 'calls', 'learn', 'basic', 'medical', 'records'],
 ['grid', 'entirely', 'pictures', 'early', 'completely', 'places', 'dope']]

<br>

## Topic Modeling With Episode Data Set

In [9]:
# Read the episode-level table (one row of text per episode) and preview it.
# pickle executes code on load, so only open files produced by this project.
with open('episodes.pickle', mode='rb') as read_file:
    episodes = pickle.load(read_file)

episodes.head(5)

Unnamed: 0,Episode,Episode_Split,Episode_Text
0,s1e01,s1e01,"Hello. Hi. My name is Leslie Knope, and I work..."
1,s1e02,s1e02,"Well, one of the funner things that we do here..."
2,s1e03,s1e03,"Okay, now, see, here's a good example of a pla..."
3,s1e04,s1e04,"So, we've been called out to this hiking trail..."
4,s1e05,s1e05,"In a town as old as Pawnee, there's a lot of h..."


<br>

### Topic Modeling - CountVectorizer and NMF

#### Initial Model With No Stop Words

In [10]:
# Baseline bag-of-words with whole episodes as documents; no filtering.
vectorizer = CountVectorizer()
doc_word = vectorizer.fit_transform(episodes["Episode_Text"])
doc_word.shape

(125, 17378)

In [11]:
# 3-topic NMF on the unfiltered episode counts.
# Seeded because NMF draws on a random number generator; the shape is
# displayed to match the other NMF cells in this notebook.
nmf_model = NMF(n_components=3, random_state=42)
doc_topic = nmf_model.fit_transform(doc_word)
doc_topic.shape



In [13]:
# Topic-by-term weight matrix of the episode-level baseline model.
topic_word = nmf_model.components_
topic_word.shape

(3, 17378)

In [14]:
# Use get_feature_names_out() where it exists; get_feature_names() was
# deprecated in scikit-learn 1.0 and removed in 1.2.
if hasattr(vectorizer, "get_feature_names_out"):
    words = vectorizer.get_feature_names_out()
else:
    words = vectorizer.get_feature_names()
# Top 7 term indices per topic, heaviest first.
t = nmf_model.components_.argsort(axis=1)[:, -1:-8:-1]
topic_words = [[words[idx] for idx in row] for row in t]
topic_words

[['the', 'you', 'to', 'and', 'it', 'that', 'we'],
 ['you', 'that', 'it', 'the', 'my', 'and', 'to'],
 ['you', 'to', 'and', 'is', 'me', 'what', 'it']]

#### Topic Modeling With English Stop Words

In [15]:
# Episode-level counts with scikit-learn's built-in English stop list applied.
vectorizer = CountVectorizer(stop_words="english")
doc_word = vectorizer.fit_transform(episodes["Episode_Text"])
doc_word.shape

(125, 17089)

In [16]:
# 3-topic NMF on the English-stop-word-filtered episode counts.
# random_state is set so reruns can reproduce the same topics.
nmf_model = NMF(n_components=3, random_state=42)
doc_topic = nmf_model.fit_transform(doc_word)
doc_topic.shape



(125, 3)

In [17]:
# Topic-by-term weights for the stop-word-filtered episode model.
topic_word = nmf_model.components_
topic_word.shape

(3, 17089)

In [18]:
# get_feature_names() is gone from scikit-learn >= 1.2; prefer the
# replacement and keep the legacy call as a fallback.
if hasattr(vectorizer, "get_feature_names_out"):
    words = vectorizer.get_feature_names_out()
else:
    words = vectorizer.get_feature_names()
# Top 10 term indices per topic, heaviest first.
t = nmf_model.components_.argsort(axis=1)[:, -1:-11:-1]
topic_words = [[words[idx] for idx in row] for row in t]
topic_words

[['know',
  'just',
  'don',
  'yeah',
  'okay',
  'oh',
  'gonna',
  'like',
  'hey',
  'right'],
 ['like',
  'just',
  'oh',
  'pawnee',
  'know',
  'leslie',
  'don',
  'gonna',
  've',
  'time'],
 ['okay',
  'going',
  'just',
  'oh',
  'know',
  'don',
  'like',
  'yeah',
  'right',
  'leslie']]

#### Topic Modeling With Manually Chosen Stop Words

In [19]:
# Hand-picked filler words to drop from the vocabulary.
commonwords = [
    'sorry', 'guys', 'did', 'be', 'get', 'he', 'on', 'been', 'in', 'not', 'are', 'so', 'one', 'to', 'at',
    'for', 'but', 'the', 'me', 'your', 'is', 'this', 'if', 'just', 'that', 'of', 'my', 'do', 'was', 'have',
    'it', 'and', 'with', 'what', 'like', 'want', 'all', 'gonna', 'we', 'you', 're', 'there', 'here', 'okay',
    'no', 'up', 'yeah', 'don', 'they', 'now', 'go', 'well', 'hey', 'uh', 'can', 'who', 'how', 'know', 'as',
    'out', 'would', 'really', 'her', 'about', 'look', 'll', 'am', 've', 'let', 'good', 'an', 'from', 'has',
    'going', 'oh', 'she', 'or', 'got', 'our', 'take', 'when', 'then', 'will', 'some', 'need', 'had', 'say',
    'why', 'could', 'him', 'come', 'should', 'were', 'think', 'might', 'actually', 'them', 'his', 'hi',
    'thanks', 'more', 'because', 'please', 'thank', 'make', 'see', 'any', 'every', 'by', 'after', 'back',
    'very', 'away', 'being', 'way', 'long', 'else', 'most', 'said', 'too', 'other', 'each', 'new', 'into',
    'than', 'still', 'something', 'everything', 'happening', 'start', 'whole', 'talking', 'only', 'anything',
    'us', 'tell', 'talk', 'much', 'through', 'thing', 'pretty', 'sir', 'two', 'little', 'doing', 'guy',
    'does', 'mean', 'ever', 'yes', 'same', 'put', 'over', 'call', 'day', 'their', 'off', 'these', 'where',
    'stop', 'man', 'maybe', 'people', 'down', 'even', 'god', 'first', 'last', 'next', 'old', 'didn',
    'capsule', 'having', 'name', 'find', 'ask', 'again', 'lot', 'before',
]
# Extra word excluded in this run on top of the common list.
borderline = ['great']
# Names are excluded as well.
names = [
    'leslie', 'knope', 'ron', 'swanson', 'ben', 'wyatt', 'april', 'ludgate', 'tom', 'haverford',
    'ann', 'perkins', 'andy', 'dwyer', 'jerry', 'gergich', 'donna', 'meagle',
]
stopwords = [*commonwords, *names, *borderline]
vectorizer = CountVectorizer(stop_words=stopwords)
doc_word = vectorizer.fit_transform(episodes["Episode_Text"])
doc_word.shape

(125, 17180)

In [20]:
# 4-topic NMF on the manually filtered episode counts.
# Seeded so that reruns can reproduce the same factorisation.
nmf_model = NMF(n_components=4, random_state=42)
doc_topic = nmf_model.fit_transform(doc_word)
doc_topic.shape

(125, 4)

In [21]:
# Topic-by-term weight matrix of the 4-topic episode model.
topic_word = nmf_model.components_
topic_word.shape

(4, 17180)

In [22]:
# Prefer get_feature_names_out(); get_feature_names() was deprecated in
# scikit-learn 1.0 and removed in 1.2.
if hasattr(vectorizer, "get_feature_names_out"):
    words = vectorizer.get_feature_names_out()
else:
    words = vectorizer.get_feature_names()
# Top 8 term indices per topic, heaviest first.
t = nmf_model.components_.argsort(axis=1)[:, -1:-9:-1]
topic_words = [[words[idx] for idx in row] for row in t]
topic_words

[['right', 'time', 'love', 'sure', 'work', 'wait', 'never', 'night'],
 ['pawnee', 'time', 'eagleton', 'right', 'town', 'work', 'love', 'best'],
 ['newport', 'bobby', 'campaign', 'city', 'idea', 'right', 'job', 'love'],
 ['park', 'right', 'parks', 'pit', 'job', 'government', 'department', 'work']]

#### Topic Modeling With Adjustments and Stopwords

In [23]:
# Hand-picked filler words to drop from the vocabulary.
commonwords = [
    'sorry', 'guys', 'did', 'be', 'get', 'he', 'on', 'been', 'in', 'not', 'are', 'so', 'one', 'to', 'at',
    'for', 'but', 'the', 'me', 'your', 'is', 'this', 'if', 'just', 'that', 'of', 'my', 'do', 'was', 'have',
    'it', 'and', 'with', 'what', 'like', 'want', 'all', 'gonna', 'we', 'you', 're', 'there', 'here', 'okay',
    'no', 'up', 'yeah', 'don', 'they', 'now', 'go', 'well', 'hey', 'uh', 'can', 'who', 'how', 'know', 'as',
    'out', 'would', 'really', 'her', 'about', 'look', 'll', 'am', 've', 'let', 'good', 'an', 'from', 'has',
    'going', 'oh', 'she', 'or', 'got', 'our', 'take', 'when', 'then', 'will', 'some', 'need', 'had', 'say',
    'why', 'could', 'him', 'come', 'should', 'were', 'think', 'might', 'actually', 'them', 'his', 'hi',
    'thanks', 'more', 'because', 'please', 'thank', 'make', 'see', 'any', 'every', 'by', 'after', 'back',
    'very', 'away', 'being', 'way', 'long', 'else', 'most', 'said', 'too', 'other', 'each', 'new', 'into',
    'than', 'still', 'something', 'everything', 'happening', 'start', 'whole', 'talking', 'only', 'anything',
    'us', 'tell', 'talk', 'much', 'through', 'thing', 'pretty', 'sir', 'two', 'little', 'doing', 'guy',
    'does', 'mean', 'ever', 'yes', 'same', 'put', 'over', 'call', 'day', 'their', 'off', 'these', 'where',
    'stop', 'man', 'maybe', 'people', 'down', 'even', 'god', 'first', 'last', 'next', 'old', 'didn',
    'capsule', 'having', 'name', 'find', 'ask', 'again', 'lot', 'before',
]
# Names are excluded as well.
names = [
    'leslie', 'knope', 'ron', 'swanson', 'ben', 'wyatt', 'april', 'ludgate', 'tom', 'haverford',
    'ann', 'perkins', 'andy', 'dwyer', 'jerry', 'gergich', 'donna', 'meagle',
]
stopwords = [*commonwords, *names]
# On top of the stop list, keep only terms present in at least 2 episodes
# (min_df) and in at most 95% of episodes (max_df).
vectorizer = CountVectorizer(stop_words=stopwords, min_df=2, max_df=0.95)
doc_word = vectorizer.fit_transform(episodes["Episode_Text"])
doc_word.shape

(125, 8659)

In [24]:
# 5-topic NMF on the filtered and frequency-pruned episode counts.
# random_state is set so reruns can reproduce the same topics.
nmf_model = NMF(n_components=5, random_state=42)
doc_topic = nmf_model.fit_transform(doc_word)
doc_topic.shape

(125, 5)

In [25]:
# Topic-by-term weight matrix of the 5-topic episode model.
topic_word = nmf_model.components_
topic_word.shape

(5, 8659)

In [26]:
# get_feature_names() was removed in scikit-learn 1.2; use the replacement
# when present and the legacy call otherwise.
if hasattr(vectorizer, "get_feature_names_out"):
    words = vectorizer.get_feature_names_out()
else:
    words = vectorizer.get_feature_names()
# Top 7 term indices per topic, heaviest first.
t = nmf_model.components_.argsort(axis=1)[:, -1:-8:-1]
topic_words = [[words[idx] for idx in row] for row in t]
topic_words

[['work', 'wait', 'night', 'chris', 'fine', 'help', 'thought'],
 ['newport', 'bobby', 'campaign', 'city', 'idea', 'job', 'better'],
 ['pawnee', 'eagleton', 'town', 'work', 'book', 'help', 'best'],
 ['park', 'parks', 'pit', 'job', 'department', 'government', 'work'],
 ['karate', 'show', 'johnny', 'gryzzl', 'kids', 'work', 'land']]

<br>

### Topic Modeling - TF-IDF and NMF

#### Initial Model With No Stop Words

In [29]:
# TF-IDF weights over unigrams and bigrams, whole episodes as documents.
tfidf = TfidfVectorizer(ngram_range=(1, 2))
doc_word = tfidf.fit_transform(episodes["Episode_Text"])
doc_word.shape

(125, 180851)

In [30]:
# 3-topic NMF on the TF-IDF matrix.
# Seeded because NMF draws on a random number generator; without it the
# topics are not guaranteed to repeat between runs.
nmf_model = NMF(n_components=3, random_state=42)
doc_topic = nmf_model.fit_transform(doc_word)
doc_topic.shape



(125, 3)

In [14]:
# Topic-by-term weight matrix of the TF-IDF baseline model.
topic_word = nmf_model.components_
topic_word.shape

(3, 180851)

In [31]:
# Prefer get_feature_names_out(); get_feature_names() was deprecated in
# scikit-learn 1.0 and removed in 1.2.
if hasattr(tfidf, "get_feature_names_out"):
    words = tfidf.get_feature_names_out()
else:
    words = tfidf.get_feature_names()
# Top 7 term (unigram or bigram) indices per topic, heaviest first.
t = nmf_model.components_.argsort(axis=1)[:, -1:-8:-1]
topic_words = [[words[idx] for idx in row] for row in t]
topic_words

[['you', 'to', 'the', 'it', 'and', 'that', 'is'],
 ['the', 'you', 'to', 'and', 'it', 'that', 'is'],
 ['you', 'the', 'to', 'newport', 'and', 'bobby', 'it']]

#### Topic Modeling With English Stop Words

In [32]:
# Unigram + bigram TF-IDF with scikit-learn's built-in English stop list.
tfidf = TfidfVectorizer(stop_words="english", ngram_range=(1, 2))
doc_word = tfidf.fit_transform(episodes["Episode_Text"])
doc_word.shape

(125, 158091)

In [33]:
# 3-topic NMF on the stop-word-filtered TF-IDF matrix.
# random_state is set so reruns can reproduce the same topics.
nmf_model = NMF(n_components=3, random_state=42)
doc_topic = nmf_model.fit_transform(doc_word)
doc_topic.shape



(125, 3)

In [34]:
# Topic-by-term weights for the stop-word-filtered TF-IDF model.
topic_word = nmf_model.components_
topic_word.shape

(3, 158091)

In [35]:
# get_feature_names() is gone from scikit-learn >= 1.2; prefer the
# replacement and keep the legacy call as a fallback.
if hasattr(tfidf, "get_feature_names_out"):
    words = tfidf.get_feature_names_out()
else:
    words = tfidf.get_feature_names()
# Top 7 term indices per topic, heaviest first.
t = nmf_model.components_.argsort(axis=1)[:, -1:-8:-1]
topic_words = [[words[idx] for idx in row] for row in t]
topic_words

[['just', 'okay', 'know', 'oh', 'don', 'like', 'yeah'],
 ['newport', 'bobby', 'bobby newport', 'campaign', 'just', 'leslie', 'like'],
 ['just', 'like', 'eagleton', 'oh', 'know', 'pawnee', 'don']]

#### Topic Modeling With Manually Chosen Stop Words

In [36]:
# Hand-picked filler words to drop from the vocabulary.
commonwords = [
    'sorry', 'guys', 'did', 'be', 'get', 'he', 'on', 'been', 'in', 'not', 'are', 'so', 'one', 'to', 'at',
    'for', 'but', 'the', 'me', 'your', 'is', 'this', 'if', 'just', 'that', 'of', 'my', 'do', 'was', 'have',
    'it', 'and', 'with', 'what', 'like', 'want', 'all', 'gonna', 'we', 'you', 're', 'there', 'here', 'okay',
    'no', 'up', 'yeah', 'don', 'they', 'now', 'go', 'well', 'hey', 'uh', 'can', 'who', 'how', 'know', 'as',
    'out', 'would', 'really', 'her', 'about', 'look', 'll', 'am', 've', 'let', 'good', 'an', 'from', 'has',
    'going', 'oh', 'she', 'or', 'got', 'our', 'take', 'when', 'then', 'will', 'some', 'need', 'had', 'say',
    'why', 'could', 'him', 'come', 'should', 'were', 'think', 'might', 'actually', 'them', 'his', 'hi',
    'thanks', 'more', 'because', 'please', 'thank', 'make', 'see', 'any', 'every', 'by', 'after', 'back',
    'very', 'away', 'being', 'way', 'long', 'else', 'most', 'said', 'too', 'other', 'each', 'new', 'into',
    'than', 'still', 'something', 'everything', 'happening', 'start', 'whole', 'talking', 'only', 'anything',
    'us', 'tell', 'talk', 'much', 'through', 'thing', 'pretty', 'sir', 'two', 'little', 'doing', 'guy',
    'does', 'mean', 'ever', 'yes', 'same', 'put', 'over', 'call', 'day', 'their', 'off', 'these', 'where',
    'stop', 'man', 'maybe', 'people', 'down', 'even', 'god', 'first', 'last', 'next', 'old', 'didn',
    'capsule', 'having', 'name', 'find', 'ask', 'again', 'lot', 'before',
]
# Extra word excluded in this run on top of the common list.
borderline = ['great']
# Names are excluded as well; this cell uses a longer name list than the
# CountVectorizer runs.
names = [
    'leslie', 'knope', 'ron', 'swanson', 'ben', 'wyatt', 'april', 'ludgate', 'tom', 'haverford',
    'ann', 'perkins', 'andy', 'dwyer', 'jerry', 'gergich', 'donna', 'meagle', 'tammy', 'justin',
    'bobby', 'newport', 'chris', 'trager', 'mark', 'larry',
]
stopwords = [*commonwords, *names, *borderline]
tfidf = TfidfVectorizer(stop_words=stopwords, ngram_range=(1, 2))
doc_word = tfidf.fit_transform(episodes["Episode_Text"])
doc_word.shape

(125, 145292)

In [37]:
# 3-topic NMF on the manually filtered TF-IDF matrix.
# Seeded so that reruns can reproduce the same factorisation.
nmf_model = NMF(n_components=3, random_state=42)
doc_topic = nmf_model.fit_transform(doc_word)
doc_topic.shape

(125, 3)

In [38]:
# Topic-by-term weight matrix of the manually filtered TF-IDF model.
topic_word = nmf_model.components_
topic_word.shape

(3, 145292)

In [39]:
# Prefer get_feature_names_out(); get_feature_names() was deprecated in
# scikit-learn 1.0 and removed in 1.2.
if hasattr(tfidf, "get_feature_names_out"):
    words = tfidf.get_feature_names_out()
else:
    words = tfidf.get_feature_names()
# Top 8 term indices per topic, heaviest first.
t = nmf_model.components_.argsort(axis=1)[:, -1:-9:-1]
topic_words = [[words[idx] for idx in row] for row in t]
topic_words

[['right', 'time', 'love', 'pawnee', 'work', 'city', 'never', 'campaign'],
 ['pit',
  'park',
  'right',
  'fell pit',
  'fell',
  'kaboom',
  'pit fell',
  'government'],
 ['eagleton',
  'pawnee',
  'gryzzl',
  'karate',
  'right',
  'merger',
  'concert',
  'johnny']]

#### Topic Modeling With Adjustments and Stopwords

In [51]:
# Hand-picked filler words to drop from the vocabulary.
commonwords = [
    'sorry', 'guys', 'did', 'be', 'get', 'he', 'on', 'been', 'in', 'not', 'are', 'so', 'one', 'to', 'at',
    'for', 'but', 'the', 'me', 'your', 'is', 'this', 'if', 'just', 'that', 'of', 'my', 'do', 'was', 'have',
    'it', 'and', 'with', 'what', 'like', 'want', 'all', 'gonna', 'we', 'you', 're', 'there', 'here', 'okay',
    'no', 'up', 'yeah', 'don', 'they', 'now', 'go', 'well', 'hey', 'uh', 'can', 'who', 'how', 'know', 'as',
    'out', 'would', 'really', 'her', 'about', 'look', 'll', 'am', 've', 'let', 'good', 'an', 'from', 'has',
    'going', 'oh', 'she', 'or', 'got', 'our', 'take', 'when', 'then', 'will', 'some', 'need', 'had', 'say',
    'why', 'could', 'him', 'come', 'should', 'were', 'think', 'might', 'actually', 'them', 'his', 'hi',
    'thanks', 'more', 'because', 'please', 'thank', 'make', 'see', 'any', 'every', 'by', 'after', 'back',
    'very', 'away', 'being', 'way', 'long', 'else', 'most', 'said', 'too', 'other', 'each', 'new', 'into',
    'than', 'still', 'something', 'everything', 'happening', 'start', 'whole', 'talking', 'only', 'anything',
    'us', 'tell', 'talk', 'much', 'through', 'thing', 'pretty', 'sir', 'two', 'little', 'doing', 'guy',
    'does', 'mean', 'ever', 'yes', 'same', 'put', 'over', 'call', 'day', 'their', 'off', 'these', 'where',
    'stop', 'man', 'maybe', 'people', 'down', 'even', 'god', 'first', 'last', 'next', 'old', 'didn',
    'capsule', 'having', 'name', 'find', 'ask', 'again', 'lot', 'before',
]
# Names are excluded as well.
names = [
    'leslie', 'knope', 'ron', 'swanson', 'ben', 'wyatt', 'april', 'ludgate', 'tom', 'haverford',
    'ann', 'perkins', 'andy', 'dwyer', 'jerry', 'gergich', 'donna', 'meagle',
]
stopwords = [*commonwords, *names]
# Unigram + bigram TF-IDF restricted to terms present in at least 2 episodes
# (min_df) and in at most 95% of episodes (max_df).
tfidf = TfidfVectorizer(stop_words=stopwords, ngram_range=(1, 2), min_df=2, max_df=0.95)
doc_word = tfidf.fit_transform(episodes["Episode_Text"])
doc_word.shape

(125, 17092)

In [52]:
# 5-topic NMF on the filtered and frequency-pruned TF-IDF matrix.
# random_state is set so reruns can reproduce the same topics.
nmf_model = NMF(n_components=5, random_state=42)
doc_topic = nmf_model.fit_transform(doc_word)
doc_topic.shape

(125, 5)

In [53]:
# Topic-by-term weight matrix of the 5-topic TF-IDF model.
topic_word = nmf_model.components_
topic_word.shape

(5, 17092)

In [54]:
# get_feature_names() was removed in scikit-learn 1.2; use the replacement
# when present and the legacy call otherwise.
if hasattr(tfidf, "get_feature_names_out"):
    words = tfidf.get_feature_names_out()
else:
    words = tfidf.get_feature_names()
# Top 7 term indices per topic, heaviest first.
t = nmf_model.components_.argsort(axis=1)[:, -1:-8:-1]
topic_words = [[words[idx] for idx in row] for row in t]
topic_words

[['chris', 'pawnee', 'work', 'party', 'wait', 'city', 'wanna'],
 ['newport', 'bobby', 'bobby newport', 'campaign', 'vote', 'vans', 'city'],
 ['pit', 'park', 'mark', 'fell', 'fell pit', 'parks', 'government'],
 ['tammy', 'library', 'wanna', 'diane', 'ex', 'vote', 'jamm'],
 ['eagleton', 'pawnee', 'gryzzl', 'karate', 'larry', 'merger', 'concert']]