# Topic Modeling


Topic modeling finds the most common topics in each corpus. One common method of topic modeling is Latent Dirichlet Allocation (LDA). LDA picks up topics from a text given a document term matrix and the number of topics in the text.

I/O:

* Input: Document Term  Matrix 
* Output:
* Packages: gensim, scipy


Next Meeting:
* Figure out what LDA does (figuring out topics)
* Figure out what LDA needs (corpus, id2word)
* Figure out next rounds (N, V+N, A+N)

Helpful Links:
* https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/
* https://towardsdatascience.com/end-to-end-topic-modeling-in-python-latent-dirichlet-allocation-lda-35ce4ed6b3e0
* https://www.geeksforgeeks.org/removing-stop-words-nltk-python/

In [5]:
import pandas as pd
import pickle 

# corpus
corpus = pd.read_pickle('corpus.pkl')
corpus

# document term matrix: documents are the rows
dtm = pd.read_pickle('dtm.pkl')

# term document matrix: terms are the rows
tdm = dtm.transpose() 
tdm.head(10)

Unnamed: 0,nyt,fox,cnn,ac
aapi,0,0,1,0
aaron,1,1,1,0
aberrant,1,0,0,0
abolishing,0,1,0,0
absolutely,0,1,1,1
absurd,0,1,0,0
abundance,0,0,1,0
academics,0,0,0,1
accidentnnnew,0,1,0,0
according,4,4,1,0


In [6]:
# corpus
corpus = pd.read_pickle('corpus.pkl')
corpus

Unnamed: 0,article
nyt,among the first things i did upon learning abo...
fox,on the afternoon of march police say a man c...
cnn,at an office where i worked some years ago a m...
ac,it is striking to see how quickly our media ha...


In [21]:
from gensim import matutils, models
import gensim.corpora as corpora
import scipy.sparse

# import stop words
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

stop_words = stopwords.words('english')
print(stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to /Users/maxin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
# create list of words
word_list = []
for i in corpus.index:
    word_list.append(corpus.loc[i][0].split())
print(word_list)



In [24]:
# create id2word
id2word = corpora.Dictionary(word_list)
id2word

<gensim.corpora.dictionary.Dictionary at 0x7fce083515b0>

In [22]:
# change term-document matrix into a format for gensim, from df --> sparse matrix --> gensim corpus
sparse_counts = scipy.sparse.csr_matrix(tdm)
corpus = matutils.Sparse2Corpus(sparse_counts)

In [26]:
num_topics = 2
lda_model = gensim.models.LdaMulticore(corpus=corpus, id2word=id2word, num_topics=num_topics)
# Print the Keyword in the 10 topics
doc_lda = lda_model[corpus]
lda_model.print_topics()

[(0,
  '0.007*"word" + 0.006*"welsh" + 0.006*"but" + 0.005*"completely" + 0.005*"describe" + 0.004*"cha" + 0.004*"thousand" + 0.004*"believe" + 0.004*"obsessive" + 0.004*"might"'),
 (1,
  '0.009*"but" + 0.008*"describe" + 0.007*"completely" + 0.007*"word" + 0.006*"welsh" + 0.005*"believe" + 0.004*"tyler" + 0.004*"march" + 0.004*"cha" + 0.004*"wrong"')]

In [None]:
# BUNCH OF CODE IM COPY PASTING

In [27]:
# Let's create a function to pull out nouns from a string of text
from nltk import word_tokenize, pos_tag

def nouns(text):
    '''Given a string of text, tokenize the text and pull out only the nouns.'''
    is_noun = lambda pos: pos[:2] == 'NN'
    tokenized = word_tokenize(text)
    all_nouns = [word for (word, pos) in pos_tag(tokenized) if is_noun(pos)] 
    return ' '.join(all_nouns)

# Read in the cleaned data, before the CountVectorizer step
data_clean = pd.read_pickle('data_clean.pkl')
data_clean

# Apply the nouns function to the transcripts to filter only on nouns
data_nouns = pd.DataFrame(data_clean.transcript.apply(nouns))
data_nouns

# Create a new document-term matrix using only nouns
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer

# Re-add the additional stop words since we are recreating the document-term matrix
add_stop_words = ['like', 'im', 'know', 'just', 'dont', 'thats', 'right', 'people',
                  'youre', 'got', 'gonna', 'time', 'think', 'yeah', 'said']
stop_words = text.ENGLISH_STOP_WORDS.union(add_stop_words)

# Recreate a document-term matrix with only nouns
cvn = CountVectorizer(stop_words=stop_words)
data_cvn = cvn.fit_transform(data_nouns.transcript)
data_dtmn = pd.DataFrame(data_cvn.toarray(), columns=cvn.get_feature_names())
data_dtmn.index = data_nouns.index
data_dtmn

# Create the gensim corpus
corpusn = matutils.Sparse2Corpus(scipy.sparse.csr_matrix(data_dtmn.transpose()))

# Create the vocabulary dictionary
id2wordn = dict((v, k) for k, v in cvn.vocabulary_.items())

FileNotFoundError: [Errno 2] No such file or directory: 'data_clean.pkl'

In [None]:
# Let's start with 2 topics
ldan = models.LdaModel(corpus=corpusn, num_topics=2, id2word=id2wordn, passes=10)
ldan.print_topics()