## Setup

This notebook was adapted from the following article: https://towardsdatascience.com/improving-the-interpretation-of-topic-models-87fd2ee3847d

In [1]:
# IPython `reset` magic, invoked through automagic (no leading %):
# -f skips the confirmation prompt, -s does a soft reset that clears the
# namespace but leaves the history intact.
reset -fs

In [2]:
import numpy as np
import pandas as pd
import re
import string

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [3]:
# Seed handed to the LDA estimators below as `random_state` so repeated runs
# give the same topics.
RANDOM_STATE = 28

## Loading Data

In [4]:
# Relative paths of the two CSV inputs (comments and articles) that are read
# with pd.read_csv in the next cell.
comments = 'Data/CommentsApril2018.csv'
articles = 'Data/ArticlesApril2018.csv'

In [5]:
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, so columns that mix missing values and strings
# (such as "userTitle") are typed consistently and no mixed-dtype warning is
# emitted while loading.
comm = pd.read_csv(comments, low_memory=False)
art = pd.read_csv(articles, low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


Column "userTitle" contains both NaN and string values. Replace NaN with "Unknown" so that the column has a single, uniform data type.

In [6]:
# Address the column by name instead of by position (it was column 32), so the
# cell keeps working if the CSV column order changes, and use fillna, which is
# the direct way to substitute missing values (no regex matching is involved).
comm['userTitle'] = comm['userTitle'].fillna('Unknown')
comm.head()

Unnamed: 0,approveDate,articleID,articleWordCount,commentBody,commentID,commentSequence,commentTitle,commentType,createDate,depth,...,status,timespeople,trusted,typeOfMaterial,updateDate,userDisplayName,userID,userLocation,userTitle,userURL
0,1524594282,5adf6684068401528a2aa69b,781.0,How could the league possibly refuse this offe...,26853969.0,26853969.0,<br/>,comment,1524594011,1.0,...,approved,1,0,News,1524594282,Christopher Rillo,46566740.0,San Francisco,Unknown,
1,1524594252,5adf6684068401528a2aa69b,781.0,"So then the execs can be like ""yeah...we will ...",26853699.0,26853699.0,<br/>,comment,1524593146,1.0,...,approved,1,0,News,1524594252,Matt Brand,64324866.0,"Williamsburg, Brooklyn",Unknown,
2,1524594250,5adf6684068401528a2aa69b,781.0,I would not want to play chess against these c...,26853677.0,26853677.0,<br/>,comment,1524593032,1.0,...,approved,1,0,News,1524594250,Joseph,78105093.0,"Fayetteville, AR",Unknown,
3,1524593431,5adf6684068401528a2aa69b,781.0,Could the cheerleaders join the Actors' Equity...,26853784.0,26853784.0,<br/>,comment,1524593426,1.0,...,approved,0,0,News,1524593431,Stephen,81939618.0,"Phoenix, AZ",Unknown,
4,1524595048,5adf653f068401528a2aa697,656.0,Seeking conclusions which support preconceived...,26854236.0,26854236.0,<br/>,comment,1524595043,1.0,...,approved,1,0,News,1524595048,Paul Zorsky,58642997.0,Texas,Unknown,


Keeping the comments with an 'unknown' section name

In [7]:
# Keep only rows whose sectionName is "Unknown" and whose commentType is
# "comment"; work on a copy and renumber the rows (the old index is kept as an
# "index" column by reset_index()).
section_is_unknown = comm['sectionName'].eq("Unknown")
type_is_comment = comm['commentType'].eq("comment")
unknown_filt = section_is_unknown & type_is_comment
comm_filt = comm.loc[unknown_filt].copy().reset_index()

In [8]:
# Keep only the articles whose articleID occurs among the retained comments.
unknown_article_ids = set(comm_filt['articleID'])
art_filt = art[art['articleID'].isin(unknown_article_ids)]

In [9]:
# Only the join key and the headline are needed from the articles.
df_articles = art_filt[['articleID', 'headline']]

In [10]:
# Only the join key and the comment text are needed from the comments.
df_comments = comm_filt[['articleID', 'commentBody']]

In [11]:
# Attach each comment to its article's headline. "articleID" is the only
# column the two frames share, so it is the join key.
data = pd.merge(df_articles, df_comments, on='articleID', how='inner')

In [12]:
# Preview the first rows of the merged headline/comment frame.
data.head()

Unnamed: 0,articleID,headline,commentBody
0,5adf653f068401528a2aa697,E.P.A. to Unveil a New Rule. Its Effect: Less ...,Seeking conclusions which support preconceived...
1,5adf653f068401528a2aa697,E.P.A. to Unveil a New Rule. Its Effect: Less ...,Pruitt: First eliminate the scientists and no...
2,5adf653f068401528a2aa697,E.P.A. to Unveil a New Rule. Its Effect: Less ...,This is a shameful attack on the environment a...
3,5adf653f068401528a2aa697,E.P.A. to Unveil a New Rule. Its Effect: Less ...,“It gives people the opportunity in real time ...
4,5adf653f068401528a2aa697,E.P.A. to Unveil a New Rule. Its Effect: Less ...,Given recent revelations about Mr. Pruitt's mo...


In [13]:
# Plain Python list of the comment texts, in the row order of `data`.
documents = data['commentBody'].tolist()

In [14]:
# Vocabulary size cap: passed as `max_features` to the CountVectorizer
# instances below.
no_features = 1000

In [15]:
# LDA is fit on raw term counts (not TF-IDF weights) because it is a
# probabilistic graphical model.
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tf = tf_vectorizer.fit_transform(documents)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() when it exists and fall back to the old
# method on older releases. The result is kept as a plain list either way.
if hasattr(tf_vectorizer, 'get_feature_names_out'):
    tf_feature_names = list(tf_vectorizer.get_feature_names_out())
else:
    tf_feature_names = tf_vectorizer.get_feature_names()

In [16]:
# there are 35 sectionName types 
# Used as `n_components` (the number of topics) for the LDA models below.
no_topics = 35 

In [17]:
def display_topics(H, W, feature_names, data, documents, no_top_words, no_top_documents):
    """
    Print a short summary of every topic of a fitted topic model.

    Adapted from
    https://towardsdatascience.com/improving-the-interpretation-of-topic-models-87fd2ee3847d

    For each topic this prints
    - a header line with the topic's numerical index (used as the topic name)
    - the top words of the topic, highest weight first
    - the headline(s) of the article(s) whose comment text matches each of the
      topic's top documents

    Parameters
    ----------
    H : 2-D array
        One row per topic, one column per entry of ``feature_names``.
    W : 2-D array
        Indexed as ``W[document, topic]``.
    feature_names : sequence
        Maps a column index of ``H`` to the term it stands for.
    data : pandas.DataFrame
        Must contain the columns ``commentBody`` and ``headline``.
    documents : sequence
        Comment texts; position ``i`` corresponds to row ``i`` of ``W``.
    no_top_words : int
        Number of words printed per topic.
    no_top_documents : int
        Number of top documents looked up per topic.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    for topic_idx, topic in enumerate(H):
        # The header used to be f"Topic %d:{topic_idx}", a leftover from
        # %-formatting that printed a literal "%d" ("Topic %d:0").
        print(f"Topic {topic_idx}:")
        # argsort is ascending, so walk it backwards to get the largest weights first.
        top_word_indices = topic.argsort()[:-no_top_words - 1:-1]
        print(" ".join([feature_names[i] for i in top_word_indices]))
        top_doc_indices = np.argsort(W[:, topic_idx])[::-1][0:no_top_documents]
        for doc_index in top_doc_indices:
            # Headlines are looked up by comment text, so a comment body that
            # occurs in several rows prints one headline per matching row.
            matching_headlines = data.loc[data['commentBody'] == documents[doc_index], 'headline']
            print(matching_headlines.to_string(header=False, index=False))

In [18]:
# Fit an LDA model with `no_topics` components on the term-count matrix.
lda_comm = LatentDirichletAllocation(
    n_components=no_topics,
    random_state=RANDOM_STATE,
)
lda_model = lda_comm.fit(tf)
# H: the fitted components_ matrix; W: the transform of the same count matrix.
lda_H = lda_model.components_
lda_W = lda_model.transform(tf)



In [19]:
# Show 5 words and 2 top documents for every topic.
no_top_words = 5
no_top_documents = 2
display_topics(
    H=lda_H,
    W=lda_W,
    feature_names=tf_feature_names,
    data=data,
    documents=documents,
    no_top_words=no_top_words,
    no_top_documents=no_top_documents,
)

Topic %d:0
political behavior certainly god tv
Why Not Mike Pence?
Is It Real Policy, or Is It Just Reality TV?
Topic %d:1
gun class guns middle racism
Fighting Over Chores? Spend Some Money, Save t...
Trump Wants America to Revert to the Queens of...
Topic %d:2
best mind respect expect cohen
Unknown
Unknown
Topic %d:3
ll paid federal buy based
Orlando Gunman’s Wife Is Acquitted in Shootings
Photos of Gynecological Tools From Centuries Past
Topic %d:4
party republican republicans democrats voters
Unknown
Happy Birthday, Karl Marx. You Were Right!
Topic %d:5
care job trying important child
Painfully Sick From Marijuana
E.P.A. to Unveil a New Rule. Its Effect: Less ...
Topic %d:6
society police story control days
An Overdue Thanks
A Prodigy Redeemed
Topic %d:7
children black pruitt congress citizens
How to Win An Argument About Guns
Unknown
Topic %d:8
trump white president comey house
Patron Who Wrested Rifle Says He Was ‘Just Try...
The Tragedy of James Comey
Topic %d:9
years time work 

## Using Grid Search

In [20]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [21]:
# Search space: the "vec__"/"clf__" prefixes route each parameter to the
# pipeline step of that name.
grid_params = {
    'vec__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'clf__learning_method': ['online', 'batch'],
}

In [22]:
# Two-step pipeline: term counting ("vec") followed by LDA ("clf"). The step
# names are the prefixes used by the keys of grid_params.
count_step = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
lda_step = LatentDirichletAllocation(n_components=no_topics, max_iter=5, random_state=RANDOM_STATE)
lda_gs = Pipeline(steps=[('vec', count_step), ('clf', lda_step)])


In [23]:
# 5-fold grid search over grid_params, using every available core (n_jobs=-1).
gs = GridSearchCV(lda_gs, grid_params, cv=5, n_jobs=-1)

In [24]:
# Run the grid search on the raw comment strings. No target is passed and
# `scoring` is not set on gs, so candidates are presumably ranked with the
# pipeline's own score method (confirm against the GridSearchCV docs).
gs.fit(documents)

GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('vec', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.95, max_features=1000, min_df=2,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        ..._tol=0.1, random_state=28,
             topic_word_prior=None, total_samples=1000000.0, verbose=0))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'vec__ngram_range': [(1, 1), (1, 2), (1, 3)], 'clf__learning_method': ['online', 'batch']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [25]:
# Show the pipeline selected by the grid search.
gs.best_estimator_

Pipeline(memory=None,
     steps=[('vec', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.95, max_features=1000, min_df=2,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        ...          random_state=28, topic_word_prior=None,
             total_samples=1000000.0, verbose=0))])

In [26]:
# Score of the selected parameter combination. NOTE(review): for an LDA
# pipeline this is presumably an approximate log-likelihood, which would
# explain the large negative value. Confirm against the scikit-learn docs.
gs.best_score_

-2814310.2509819125

<br>
<br>
<br>

## Save Model

In [30]:
import os
from joblib import dump, load

# Persist the pipeline selected by the grid search. The target directory is
# created first so the dump does not fail on a fresh checkout where "Models/"
# does not exist yet.
model_path = 'Models/LDA_Unknowns.joblib'
os.makedirs(os.path.dirname(model_path), exist_ok=True)
dump(gs.best_estimator_, model_path)

['Models/LDA_Unknowns.joblib']

## Save Notebook

In [31]:
import os
import dill

# Snapshot the interpreter session to disk. The target directory is created
# first so the dump does not fail when "Notebook_Saves/" does not exist yet.
session_path = 'Notebook_Saves/LDA_Unknown_Exploration.db'
os.makedirs(os.path.dirname(session_path), exist_ok=True)
dill.dump_session(session_path)