## Setup

This notebook was adapted from the following article: https://towardsdatascience.com/improving-the-interpretation-of-topic-models-87fd2ee3847d

In [1]:
reset -fs

In [2]:
import numpy as np
import pandas as pd
import re
import string

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [3]:
RANDOM_STATE = 28

## Loading Data

In [13]:
comments = '/Users/Nina/Documents/classes/msds621/nina_nyt/nyt-comments/CommentsApril2018.csv'
articles = '/Users/Nina/Documents/classes/msds621/nina_nyt/nyt-comments/ArticlesApril2018.csv'

In [14]:
comm = pd.read_csv(comments)
art = pd.read_csv(articles)

  interactivity=interactivity, compiler=compiler, result=result)


Column "userTitle" has both NaN and string data types. Replace Nan with "Unknown" to have one uniform data type in the column

In [15]:
comm.iloc[:, 32] = comm.iloc[:, 32].replace(np.nan, 'Unknown', regex=True); comm.head()

Unnamed: 0,approveDate,articleID,articleWordCount,commentBody,commentID,commentSequence,commentTitle,commentType,createDate,depth,...,status,timespeople,trusted,typeOfMaterial,updateDate,userDisplayName,userID,userLocation,userTitle,userURL
0,1524594282,5adf6684068401528a2aa69b,781.0,How could the league possibly refuse this offe...,26853969.0,26853969.0,<br/>,comment,1524594011,1.0,...,approved,1,0,News,1524594282,Christopher Rillo,46566740.0,San Francisco,Unknown,
1,1524594252,5adf6684068401528a2aa69b,781.0,"So then the execs can be like ""yeah...we will ...",26853699.0,26853699.0,<br/>,comment,1524593146,1.0,...,approved,1,0,News,1524594252,Matt Brand,64324866.0,"Williamsburg, Brooklyn",Unknown,
2,1524594250,5adf6684068401528a2aa69b,781.0,I would not want to play chess against these c...,26853677.0,26853677.0,<br/>,comment,1524593032,1.0,...,approved,1,0,News,1524594250,Joseph,78105093.0,"Fayetteville, AR",Unknown,
3,1524593431,5adf6684068401528a2aa69b,781.0,Could the cheerleaders join the Actors' Equity...,26853784.0,26853784.0,<br/>,comment,1524593426,1.0,...,approved,0,0,News,1524593431,Stephen,81939618.0,"Phoenix, AZ",Unknown,
4,1524595048,5adf653f068401528a2aa697,656.0,Seeking conclusions which support preconceived...,26854236.0,26854236.0,<br/>,comment,1524595043,1.0,...,approved,1,0,News,1524595048,Paul Zorsky,58642997.0,Texas,Unknown,


Keeping the comments with an 'unknown' section name

In [28]:
# keeping comments with unknown section name 
unknown_filt = (comm.sectionName == "Unknown") & (comm.commentType == "comment")
comm_filt = comm[unknown_filt].copy().reset_index()

In [38]:
# keeping articles that match the unknown section name 
art_filt = art.loc[art['articleID'].isin(set(comm_filt['articleID']))]

In [50]:
df_articles = art_filt.loc[:,['articleID', 'headline']]

In [51]:
df_comments = comm_filt.loc[:, ['articleID', 'commentBody']]

In [62]:
data = df_articles.merge(df_comments, how='inner')

In [64]:
data.head()

Unnamed: 0,articleID,headline,commentBody
0,5adf653f068401528a2aa697,E.P.A. to Unveil a New Rule. Its Effect: Less ...,Seeking conclusions which support preconceived...
1,5adf653f068401528a2aa697,E.P.A. to Unveil a New Rule. Its Effect: Less ...,Pruitt: First eliminate the scientists and no...
2,5adf653f068401528a2aa697,E.P.A. to Unveil a New Rule. Its Effect: Less ...,This is a shameful attack on the environment a...
3,5adf653f068401528a2aa697,E.P.A. to Unveil a New Rule. Its Effect: Less ...,“It gives people the opportunity in real time ...
4,5adf653f068401528a2aa697,E.P.A. to Unveil a New Rule. Its Effect: Less ...,Given recent revelations about Mr. Pruitt's mo...


In [65]:
documents = list(data['commentBody'])

In [67]:
no_features = 1000

In [68]:
# LDA can only use raw term counts for LDA because it is a probabilistic graphical model
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tf = tf_vectorizer.fit_transform(documents)
tf_feature_names = tf_vectorizer.get_feature_names()

In [69]:
# there are 35 sectionName types 
no_topics = 35 

In [91]:
def display_topics(H, W, feature_names, data, documents, no_top_words, no_top_documents):
    """
    https://towardsdatascience.com/improving-the-interpretation-of-topic-models-87fd2ee3847d
    
    outputs a 
    - numerical index as a topic name 
    - top words of a topic 
    - adjusted code to print the top documents (article titles) within the topic 
        
    """
    for topic_idx, topic in enumerate(H):
        print(f"Topic %d:{topic_idx}")
        print(" ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))
        top_doc_indices = np.argsort( W[:,topic_idx] )[::-1][0:no_top_documents]
        for doc_index in top_doc_indices:
            print(data.loc[data['commentBody'] == documents[doc_index]]['headline'].to_string(header=False, index=False))

In [122]:
lda_comm = LatentDirichletAllocation(n_components=no_topics,random_state=RANDOM_STATE)
lda_model = lda_comm.fit(tf)
lda_W = lda_model.transform(tf)
lda_H = lda_model.components_

KeyboardInterrupt: 

In [92]:
no_top_words = 5
no_top_documents = 2
display_topics(lda_H, lda_W, tf_feature_names, data, documents, no_top_words, no_top_documents)

Topic %d:0
party republican republicans congress vote
Yipes! They Canned the Chaplain
Where’s the Tea Party of the Left?
Topic %d:1
years gun ago racism guns
No More Hasty Scrawls: The End of Credit Card ...
Trump Wants America to Revert to the Queens of...
Topic %d:2
best voters mind ve respect
‘Roseanne’ Is Funny. And Scary.
Queen Lear  Moments
Topic %d:3
state government national paid federal
Cuomo vs. de Blasio: One of the Country’s Ugli...
Earth, Wind And Liars
Topic %d:4
women elected different fact male
Like to Go Out in Sweatpants?  Don’t Be an N.F...
Like to Go Out in Sweatpants?  Don’t Be an N.F...
Topic %d:5
job trying important bush guy
E.P.A. to Unveil a New Rule. Its Effect: Less ...
Bush 41, Trump, and American Decline
Topic %d:6
police people story control knew
At the Crossroads
Sea Turtles Use Magnetic Fields to Find Their ...
Topic %d:7
white children black stop americans
How to Win An Argument About Guns
Why Does Trump Treat Immigrant Kids Cruelly? B...
Topic %d:8
tr

## Using Grid Search

In [97]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [118]:
grid_params = dict(vec__ngram_range=[(1,1), (1,2), (1,3)],
                   clf__learning_method=['online', 'batch'])

In [119]:
lda_gs = Pipeline([('vec', CountVectorizer(max_df=0.95, min_df=2, max_features=no_features,stop_words='english')),
                       ('clf', LatentDirichletAllocation(n_components=no_topics, max_iter=5, random_state=RANDOM_STATE))])


In [120]:
gs = GridSearchCV(estimator=lda_gs,
                 param_grid=grid_params,
                 cv=5,
                 n_jobs=-1)

In [121]:
gs.fit(documents)

KeyboardInterrupt: 