# The CBS Secret Sauce Investigation: NCIS:NLP -- Topic Modeling

#### Creator: Mitch Brinkman

### Imports

In [18]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf
import patsy
import re
import pickle
import string
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import word_tokenize, TreebankWordTokenizer
from nltk.stem import PorterStemmer, WordNetLemmatizer 
from nltk.util import ngrams
from nltk import pos_tag
from gensim import matutils, models
import scipy.sparse
# The `from nltk... import ...` lines above bind only the imported names, not the
# top-level package, so `nltk` itself must be imported before calling `nltk.download`.
import nltk

# Fetch the 'punkt' and 'wordnet' NLTK data packages.
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /Users/user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/user/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Topic Modeling - NCIS

In [45]:
# Load the two pickled NCIS document-term matrices and flip each one into
# term-document orientation (terms as rows, one column per episode).
ncis_data = pd.read_pickle('./data/dtm/ncis_dtm_stopmm.pkl')
ncis_tdm = ncis_data.T

ncis_full_data = pd.read_pickle('./data/dtm/ncis_dtm.pkl')
ncis_full_tdm = ncis_full_data.T

# Preview the first rows of the `_stopmm` term-document matrix.
ncis_tdm.head()

ep_id,ncis_season-1/episode-1-Yankee_White,ncis_season-1/episode-2-Hung_Out_to_Dry,ncis_season-1/episode-3-Seadog,ncis_season-1/episode-4-The_Immortals,ncis_season-1/episode-5-The_Curse,ncis_season-1/episode-6-High_Seas,ncis_season-1/episode-7-Sub_Rosa,ncis_season-1/episode-8-Minimum_Security,ncis_season-1/episode-9-Marine_Down,ncis_season-1/episode-10-Left_for_Dead,...,ncis_season-17/episode-6-Institutionalized,ncis_season-17/episode-8-No_Vacancy,ncis_season-17/episode-9-IRL,ncis_season-17/episode-10-The_North_Pole,ncis_season-17/episode-11-In_the_Wind,ncis_season-17/episode-12-Flight_Plan,ncis_season-17/episode-13-Sound_Off,ncis_season-17/episode-14-On_Fire,ncis_season-17/episode-15-Lonely_Hearts,ncis_season-17/episode-16-Ephemera
aa,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
aah,0,0,0,1,0,0,0,0,0,0,...,0,1,2,0,0,0,0,0,0,0
ab,0,0,0,0,0,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
abandoned,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
abb,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [46]:
# Convert the NCIS term-document matrix into the inputs gensim's LDA needs.

# Sparse2Corpus treats each column of the sparse matrix as one document by default,
# which is why the transposed (term x episode) frame is passed in here.
ncis_sparse_counts = scipy.sparse.csr_matrix(ncis_tdm)
ncis_corpus = matutils.Sparse2Corpus(ncis_sparse_counts)

# Open the pickle in a context manager so the file handle is closed after loading.
# NOTE: unpickling can execute arbitrary code -- only load pickles this project produced.
# Presumably this is the fitted CountVectorizer behind ncis_dtm_stopmm.pkl -- TODO confirm.
with open("./data/cross_vec/ncis_cv_stopmm.pkl", "rb") as cv_file:
    ncis_cv = pickle.load(cv_file)

# vocabulary_ maps term -> column index; gensim's id2word needs index -> term, so invert it.
id2word = {index: term for term, index in ncis_cv.vocabulary_.items()}

In [None]:
# Gensim also requires a dictionary of all the terms and their respective locations in the term-document matrix
# cv = pickle.load(open("shield_cv_stopmm.pkl", "rb"))
# id2word = dict((v, k) for k, v in cv.vocabulary_.items())

In [49]:
# With the corpus (term-document matrix) and id2word (index -> term mapping) in place,
# fit a 4-topic LDA model with 80 passes over the corpus and print its topics.

# random_state fixes gensim's random initialisation so a Restart & Run All gives the
# same topics every time; without it each run can print different topics.
ncis_lda = models.LdaModel(
    corpus=ncis_corpus,
    id2word=id2word,
    num_topics=4,
    passes=80,
    random_state=42,
)
ncis_lda.print_topics()

[(0,
  '0.005*"talk" + 0.005*"bishop" + 0.004*"night" + 0.004*"life" + 0.004*"wait" + 0.004*"shes" + 0.004*"car" + 0.004*"friend" + 0.004*"kid" + 0.004*"home"'),
 (1,
  '0.006*"marine" + 0.005*"officer" + 0.005*"sergeant" + 0.005*"body" + 0.004*"kate" + 0.004*"mc" + 0.004*"mr" + 0.004*"ducky" + 0.004*"car" + 0.004*"petty"'),
 (2,
  '0.007*"ziva" + 0.007*"director" + 0.004*"talk" + 0.004*"shes" + 0.003*"phone" + 0.003*"mc" + 0.003*"theyre" + 0.003*"bad" + 0.003*"security" + 0.003*"better"'),
 (3,
  '0.014*"lieutenant" + 0.011*"commander" + 0.010*"sir" + 0.007*"officer" + 0.006*"captain" + 0.005*"ship" + 0.005*"navy" + 0.004*"body" + 0.004*"petty" + 0.004*"week"')]

In [None]:
# LDA for num_topics = 8 (10 passes)
# random_state fixes gensim's random initialisation so re-runs print the same topics.
ncis_lda = models.LdaModel(
    corpus=ncis_corpus,
    id2word=id2word,
    num_topics=8,
    passes=10,
    random_state=42,
)
ncis_lda.print_topics()

In [None]:
# LDA for num_topics = 5 (10 passes)
# random_state fixes gensim's random initialisation so re-runs print the same topics.
ncis_lda = models.LdaModel(
    corpus=ncis_corpus,
    id2word=id2word,
    num_topics=5,
    passes=10,
    random_state=42,
)
ncis_lda.print_topics()

In [46]:
# Convert the full NCIS term-document matrix (ncis_dtm.pkl) to gensim format.
# This rebinds `ncis_corpus` and `id2word`, so every LDA cell executed after this
# one is fitted on the full matrix rather than the `_stopmm` one.

# Sparse2Corpus treats each column of the sparse matrix as one document by default,
# which is why the transposed (term x episode) frame is passed in here.
ncis_sparse_counts = scipy.sparse.csr_matrix(ncis_full_tdm)
ncis_corpus = matutils.Sparse2Corpus(ncis_sparse_counts)

# Open the pickle in a context manager so the file handle is closed after loading.
# NOTE: unpickling can execute arbitrary code -- only load pickles this project produced.
# Presumably this is the fitted CountVectorizer behind ncis_dtm.pkl -- TODO confirm.
with open("./data/cross_vec/ncis_cv.pkl", "rb") as cv_file:
    ncis_cv = pickle.load(cv_file)

# vocabulary_ maps term -> column index; gensim's id2word needs index -> term, so invert it.
id2word = {index: term for term, index in ncis_cv.vocabulary_.items()}

In [49]:
# With the corpus (term-document matrix) and id2word (index -> term mapping) in place,
# fit a 4-topic LDA model with 80 passes over the corpus and print its topics.
# NOTE(review): the output stored under this cell is identical to the output of the
# earlier 4-topic / 80-pass NCIS cell -- re-run to confirm it reflects the current corpus.

# random_state fixes gensim's random initialisation so a Restart & Run All gives the
# same topics every time; without it each run can print different topics.
ncis_lda = models.LdaModel(
    corpus=ncis_corpus,
    id2word=id2word,
    num_topics=4,
    passes=80,
    random_state=42,
)
ncis_lda.print_topics()

[(0,
  '0.005*"talk" + 0.005*"bishop" + 0.004*"night" + 0.004*"life" + 0.004*"wait" + 0.004*"shes" + 0.004*"car" + 0.004*"friend" + 0.004*"kid" + 0.004*"home"'),
 (1,
  '0.006*"marine" + 0.005*"officer" + 0.005*"sergeant" + 0.005*"body" + 0.004*"kate" + 0.004*"mc" + 0.004*"mr" + 0.004*"ducky" + 0.004*"car" + 0.004*"petty"'),
 (2,
  '0.007*"ziva" + 0.007*"director" + 0.004*"talk" + 0.004*"shes" + 0.003*"phone" + 0.003*"mc" + 0.003*"theyre" + 0.003*"bad" + 0.003*"security" + 0.003*"better"'),
 (3,
  '0.014*"lieutenant" + 0.011*"commander" + 0.010*"sir" + 0.007*"officer" + 0.006*"captain" + 0.005*"ship" + 0.005*"navy" + 0.004*"body" + 0.004*"petty" + 0.004*"week"')]

In [None]:
# LDA for num_topics = 8 (10 passes)
# random_state fixes gensim's random initialisation so re-runs print the same topics.
ncis_lda = models.LdaModel(
    corpus=ncis_corpus,
    id2word=id2word,
    num_topics=8,
    passes=10,
    random_state=42,
)
ncis_lda.print_topics()

In [None]:
# LDA for num_topics = 5 (10 passes)
# random_state fixes gensim's random initialisation so re-runs print the same topics.
ncis_lda = models.LdaModel(
    corpus=ncis_corpus,
    id2word=id2word,
    num_topics=5,
    passes=10,
    random_state=42,
)
ncis_lda.print_topics()

## Topic Modeling - The Shield

In [40]:
# Load the two pickled Shield document-term matrices and flip each one into
# term-document orientation (terms as rows, one column per episode).
shield_data = pd.read_pickle('./data/dtm/shield_dtm_stopmm.pkl')
shield_tdm = shield_data.T

shield_full_data = pd.read_pickle('./data/dtm/shield_dtm.pkl')
shield_full_tdm = shield_full_data.T

# Preview the first rows of the `_stopmm` term-document matrix.
shield_tdm.head()

ep_id,the_shield_season-1/episode-1-Pilot,the_shield_season-1/episode-2-Our_Gang,the_shield_season-1/episode-3-The_Spread,the_shield_season-1/episode-4-Dawg_Days,the_shield_season-1/episode-5-Blowback,the_shield_season-1/episode-6-Cherrypoppers,the_shield_season-1/episode-7-Pay_in_Pain,the_shield_season-1/episode-8-Cupid__Psycho,the_shield_season-1/episode-9-Throwaway,the_shield_season-1/episode-10-Dragonchasers,...,the_shield_season-7/episode-4-Genocide,the_shield_season-7/episode-5-Game_Face,the_shield_season-7/episode-6-Animal_Control,the_shield_season-7/episode-7-Bitches_Brew,the_shield_season-7/episode-8-Parricide,the_shield_season-7/episode-9-Moving_Day,the_shield_season-7/episode-10-Party_Line,the_shield_season-7/episode-11-Petty_Cash,the_shield_season-7/episode-12-Possible_Kill_Screen,the_shield_season-7/episode-13-Family_Meeting
aah,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,4
abandoned,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
abetting,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,2,1
ability,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
able,1,0,1,1,1,2,1,1,0,1,...,2,1,1,3,0,3,0,0,1,2


In [41]:
# Convert the Shield term-document matrix into the inputs gensim's LDA needs.

# Sparse2Corpus treats each column of the sparse matrix as one document by default,
# which is why the transposed (term x episode) frame is passed in here.
sparse_counts = scipy.sparse.csr_matrix(shield_tdm)
corpus = matutils.Sparse2Corpus(sparse_counts)

# Open the pickle in a context manager so the file handle is closed after loading.
# NOTE: unpickling can execute arbitrary code -- only load pickles this project produced.
# Presumably this is the fitted CountVectorizer behind shield_dtm_stopmm.pkl -- TODO confirm.
with open("./data/cross_vec/shield_cv_stopmm.pkl", "rb") as cv_file:
    cv = pickle.load(cv_file)

# vocabulary_ maps term -> column index; gensim's id2word needs index -> term, so invert it.
# This rebinds the `id2word` name that the NCIS cells also assign.
id2word = {index: term for term, index in cv.vocabulary_.items()}

In [44]:
# LDA modeling w/ different numbers of topics & passes: 5 topics, 80 passes.

# random_state fixes gensim's random initialisation so a Restart & Run All gives the
# same topics every time; without it each run can print different topics.
shield_lda = models.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=5,
    passes=80,
    random_state=42,
)
shield_lda.print_topics()

[(0,
  '0.014*"shane" + 0.009*"mara" + 0.007*"beltran" + 0.005*"ronnie" + 0.005*"deal" + 0.005*"wan" + 0.005*"ta" + 0.005*"baby" + 0.004*"shot" + 0.004*"margos"'),
 (1,
  '0.007*"lem" + 0.005*"wan" + 0.005*"killed" + 0.005*"ta" + 0.005*"antwon" + 0.005*"girl" + 0.005*"people" + 0.004*"deal" + 0.004*"mackey" + 0.004*"house"'),
 (2,
  '0.013*"antwon" + 0.006*"deal" + 0.005*"uh" + 0.005*"seizure" + 0.004*"mitchell" + 0.004*"obrien" + 0.004*"week" + 0.004*"antwons" + 0.004*"ta" + 0.004*"wan"'),
 (3,
  '0.007*"shane" + 0.006*"mexican" + 0.006*"armenian" + 0.005*"pezuela" + 0.005*"rezian" + 0.005*"gun" + 0.005*"people" + 0.004*"lat" + 0.004*"box" + 0.004*"murder"'),
 (4,
  '0.006*"wan" + 0.005*"ta" + 0.005*"uh" + 0.005*"car" + 0.004*"people" + 0.004*"night" + 0.004*"saw" + 0.004*"goddamn" + 0.004*"house" + 0.004*"year"')]

In [None]:
# LDA for num_topics = 10 (10 passes)
# random_state fixes gensim's random initialisation so re-runs print the same topics.
shield_lda = models.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=10,
    passes=10,
    random_state=42,
)
shield_lda.print_topics()

In [None]:
# LDA for num_topics = 4 (10 passes)
# random_state fixes gensim's random initialisation so re-runs print the same topics.
shield_lda = models.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=4,
    passes=10,
    random_state=42,
)
# Print the topics of the model fitted just above; the bare name `lda` is never
# assigned in this notebook, so calling it would raise NameError on a fresh kernel.
shield_lda.print_topics()

In [41]:
# Convert the full Shield term-document matrix (shield_dtm.pkl) to gensim format.
# This rebinds `corpus` and `id2word`, so every LDA cell executed after this one
# is fitted on the full matrix rather than the `_stopmm` one.

# The frame is named `shield_full_tdm` (single underscore) where it is created;
# Sparse2Corpus treats each column of the sparse matrix as one document by default.
sparse_counts = scipy.sparse.csr_matrix(shield_full_tdm)
corpus = matutils.Sparse2Corpus(sparse_counts)

# Open the pickle in a context manager so the file handle is closed after loading.
# NOTE: unpickling can execute arbitrary code -- only load pickles this project produced.
# NOTE(review): this filename does not follow the `<show>_cv.pkl` pattern used by the
# NCIS full-vocabulary cell -- confirm it is the vectorizer fitted for shield_dtm.pkl.
with open("./data/cross_vec/shield_big_vec_lib.pkl", "rb") as cv_file:
    cv = pickle.load(cv_file)

# vocabulary_ maps term -> column index; gensim's id2word needs index -> term, so invert it.
id2word = {index: term for term, index in cv.vocabulary_.items()}

In [44]:
# LDA modeling w/ different numbers of topics & passes: 5 topics, 80 passes.
# NOTE(review): the output stored under this cell is identical to the output of the
# earlier 5-topic / 80-pass Shield cell -- re-run to confirm it reflects the current corpus.

# random_state fixes gensim's random initialisation so a Restart & Run All gives the
# same topics every time; without it each run can print different topics.
shield_lda = models.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=5,
    passes=80,
    random_state=42,
)
shield_lda.print_topics()

[(0,
  '0.014*"shane" + 0.009*"mara" + 0.007*"beltran" + 0.005*"ronnie" + 0.005*"deal" + 0.005*"wan" + 0.005*"ta" + 0.005*"baby" + 0.004*"shot" + 0.004*"margos"'),
 (1,
  '0.007*"lem" + 0.005*"wan" + 0.005*"killed" + 0.005*"ta" + 0.005*"antwon" + 0.005*"girl" + 0.005*"people" + 0.004*"deal" + 0.004*"mackey" + 0.004*"house"'),
 (2,
  '0.013*"antwon" + 0.006*"deal" + 0.005*"uh" + 0.005*"seizure" + 0.004*"mitchell" + 0.004*"obrien" + 0.004*"week" + 0.004*"antwons" + 0.004*"ta" + 0.004*"wan"'),
 (3,
  '0.007*"shane" + 0.006*"mexican" + 0.006*"armenian" + 0.005*"pezuela" + 0.005*"rezian" + 0.005*"gun" + 0.005*"people" + 0.004*"lat" + 0.004*"box" + 0.004*"murder"'),
 (4,
  '0.006*"wan" + 0.005*"ta" + 0.005*"uh" + 0.005*"car" + 0.004*"people" + 0.004*"night" + 0.004*"saw" + 0.004*"goddamn" + 0.004*"house" + 0.004*"year"')]

In [None]:
# LDA for num_topics = 10 (10 passes)
# random_state fixes gensim's random initialisation so re-runs print the same topics.
shield_lda = models.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=10,
    passes=10,
    random_state=42,
)
shield_lda.print_topics()

In [None]:
# LDA for num_topics = 4 (10 passes)
# random_state fixes gensim's random initialisation so re-runs print the same topics.
shield_lda = models.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=4,
    passes=10,
    random_state=42,
)
# Print the topics of the model fitted just above; the bare name `lda` is never
# assigned in this notebook, so calling it would raise NameError on a fresh kernel.
shield_lda.print_topics()

## LEFTOVERS - scratch area

In [None]:
# # Read in the cleaned data, before the CountVectorizer step
# data_clean = pd.read_pickle('data_clean.pkl')
# data_clean

In [None]:
# # Apply the nouns function to the transcripts to filter only on nouns
# data_nouns = pd.DataFrame(data_clean.transcript.apply(nouns))
# data_nouns

In [1]:
# # Create a new document-term matrix using only nouns
# from sklearn.feature_extraction import text
# from sklearn.feature_extraction.text import CountVectorizer

# # Re-add the additional stop words since we are recreating the document-term matrix
# add_stop_words = ['like', 'im', 'know', 'just', 'dont', 'thats', 'right', 'people',
#                   'youre', 'got', 'gonna', 'time', 'think', 'yeah', 'said']
# stop_words = text.ENGLISH_STOP_WORDS.union(add_stop_words)

# # Recreate a document-term matrix with only nouns
# cvn = CountVectorizer(stop_words=stop_words)
# data_cvn = cvn.fit_transform(data_nouns.transcript)
# data_dtmn = pd.DataFrame(data_cvn.toarray(), columns=cvn.get_feature_names())
# data_dtmn.index = data_nouns.index
# data_dtmn

In [2]:
# # Create the gensim corpus
# corpusn = matutils.Sparse2Corpus(scipy.sparse.csr_matrix(data_dtmn.transpose()))

# # Create the vocabulary dictionary
# id2wordn = dict((v, k) for k, v in cvn.vocabulary_.items())

In [3]:
# # Let's start with 2 topics
# ldan = models.LdaModel(corpus=corpusn, num_topics=2, id2word=id2wordn, passes=10)
# ldan.print_topics()

In [4]:
# # Let's try topics = 3
# ldan = models.LdaModel(corpus=corpusn, num_topics=3, id2word=id2wordn, passes=10)
# ldan.print_topics()

In [5]:
# # Let's try 4 topics
# ldan = models.LdaModel(corpus=corpusn, num_topics=4, id2word=id2wordn, passes=10)
# ldan.print_topics()

In [6]:
# shield_clean = pd.read_pickle('./pickles/clean_shield_series.pkl')

In [7]:
# # Let's create a function to pull out nouns and adjectives from a string of text
# def nouns_adj(text):
#     '''Given a string of text, tokenize the text and pull out only the nouns and adjectives.'''
#     is_noun_adj = lambda pos: pos[:2] == 'NN' or pos[:2] == 'JJ'
#     tokenized = word_tokenize(text)
#     nouns_adj = [word for (word, pos) in pos_tag(tokenized) if is_noun_adj(pos)] 
#     return ' '.join(nouns_adj)

In [8]:
# # Apply the nouns_adj function to the transcripts to keep only nouns and adjectives
# data_nouns_adj = pd.DataFrame(shield_clean.dialogue.apply(nouns_adj))
# data_nouns_adj

In [9]:
# # Create a new document-term matrix using only nouns and adjectives, also remove common words with max_df
# cvna = CountVectorizer(stop_words='english', max_df=.95)
# data_cvna = cvna.fit_transform(data_nouns_adj.dialogue)
# data_dtmna = pd.DataFrame(data_cvna.toarray(), columns=cvna.get_feature_names())
# data_dtmna.index = data_nouns_adj.index
# data_dtmna

In [11]:
# # Create the gensim corpus
# corpusna = matutils.Sparse2Corpus(scipy.sparse.csr_matrix(data_dtmna.transpose()))

# # Create the vocabulary dictionary
# id2wordna = dict((v, k) for k, v in cvna.vocabulary_.items())

In [12]:
# # Let's start with 5 topics
# ldana = models.LdaModel(corpus=corpusna, num_topics=5, id2word=id2wordna, passes=10)
# ldana.print_topics()