In [8]:
import numpy as np
import pandas as pd

In [9]:
df = pd.read_csv('data/coup.csv')

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 413600 entries, 0 to 413599
Data columns (total 37 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   id                            413600 non-null  int64  
 1   tweet_url                     413600 non-null  object 
 2   created_at                    413600 non-null  object 
 3   parsed_created_at             413600 non-null  object 
 4   user_screen_name              413600 non-null  object 
 5   text                          413600 non-null  object 
 6   tweet_type                    413600 non-null  object 
 7   coordinates                   7 non-null       object 
 8   hashtags                      13057 non-null   object 
 9   media                         11642 non-null   object 
 10  urls                          52890 non-null   object 
 11  favorite_count                413600 non-null  int64  
 12  in_reply_to_screen_name       31057 non-null

In [11]:
# count number of different languages in data set
count_lang = df['lang'].unique()
print(len(count_lang), count_lang)

44 ['en' 'und' 'fr' 'de' 'nl' 'in' 'th' 'ja' 'pt' 'es' 'et' 'ca' 'fa' 'it'
 'tr' 'pl' 'ht' 'cy' 'ko' 'ro' 'tl' 'eu' 'zh' 'ar' 'no' 'ru' 'fi' 'pa'
 'el' 'vi' 'sv' 'iw' 'sr' 'ur' 'hu' 'lt' 'cs' 'lv' 'da' 'hi' 'is' 'sl'
 'ta' 'dv']


In [12]:
# tweets are in 44 different languages. 
# I'll be working only with tweets in English
# so, I'll drop tweets in all other languages
df = df[df.lang == 'en']
df.shape

(396586, 37)

In [25]:
# drop unnecessary columns 
df = df.drop(['tweet_url', 'created_at', 'media', 'urls','in_reply_to_screen_name',
       'in_reply_to_status_id', 'in_reply_to_user_id', 'retweet_or_quote_id',
       'retweet_or_quote_screen_name', 'retweet_or_quote_user_id', 'source',
       'user_created_at', 'user_name', 'user_verified', 'user_friends_count', 'user_listed_count',
       'user_statuses_count', 'user_default_profile_image', 'user_description',
       'user_favourites_count', 'user_followers_count', 'coordinates', 'lang'], axis=1)

In [26]:
# tokenize, remove stopwords, remove urls, lowercase, remove punctuation, remove numbers

import string
import re

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk import pos_tag

stop = stopwords.words('english')

punc = list(set(string.punctuation))

def tokenizer(text):
    tokenizer = TweetTokenizer()
    tokens = tokenizer.tokenize(text)
    return tokens

def remove_url(text):
    url = re.compile(r"https?://\S+|www\.\S+")
    return url.sub(r'', text)

def process_text(text):
    text = remove_url(text)
    text = tokenizer(text)
    text = [word.lower() for word in text]
    text = [re.sub('[0-9]+', '', word) for word in text]
    text = [word for word in text if word not in punc]
    text = [word for word in text if word not in stop]
    text = [each for each in text if len(each) > 1]
    text = [word for word in text if ' ' not in word]
     
    return text

In [27]:
df['processed_text'] = df['text'].apply(process_text)

In [28]:
pd.set_option('display.max_colwidth', -1)
df['processed_text'][:20]

  """Entry point for launching an IPython kernel.


0     [assault, democracy, sen, josh, hawley, blood, hands, capitol, coup, attempt]                                                                                                                                              
1     [call, old-fashioned, armed, insurgents, breach, capitol, building, request, president, i'd, call, attempted, coup]                                                                                                        
2     [@vritrite, @dreddersart, oh, yeah, like, politics, bad, im, like, ignore, coup, attempt, happening]                                                                                                                       
3     [ridiculously, divisive, statement, btw, gun, violence, continues, baltimore, oh, still, picked, trash]                                                                                                                    
4     [assault, democracy, sen, josh, hawley, blood, hands, capitol, coup, attempt]             

In [29]:
ready_for_pos = df['processed_text']

In [30]:
# part-of-speech tagging 
def pos_tagging(text):
    pos_tag = [pos_tag(word) for word in ready_for_pos]

df['pos_tagged'] = df.processed_text.apply(lambda x: pos_tag(x))

In [31]:
pos_tagged = df['pos_tagged']

In [32]:
# lemmatizing

from nltk.stem import WordNetLemmatizer 
wordnet = WordNetLemmatizer() 

lemmatized = [[wordnet.lemmatize(word[0]) for word in words] for words in pos_tagged]

In [33]:
df['lemmatized'] = lemmatized

In [34]:
df['final_docs'] = df['lemmatized'].apply(lambda x: " ".join(x))

In [35]:
df['final_docs'][30000:30020]

31488    call old-fashioned armed insurgent breach capitol building request president i'd call attempted coup                                                                                                
31489    anyone elected house senate challenging result presidential election congress part attempted coup inspired president resign insurrection ...                                                        
31490    twitter ball facebook none                                                                                                                                                                          
31491    almost permanent ban inciting fucking coup attempt ridiculous even ballpark jack twitter fucking disgrace complicit point blank #trumpcoupattempt                                                   
31492    @jtylerconway president u staged coup disrupt certification election cannot believe happening one always remember                                                      

In [36]:
final_docs = df['final_docs']

In [37]:
#create document term matrix with TFIDF

from sklearn.feature_extraction.text import TfidfVectorizer
# initial tuning of parameters
#set max_features to 2000 (specifies the number of most frequently occurring words for which we want to create feature vectors)
# set min_df to 5 (word must occur in at least 5 documents)
# set max_df to 0.85 (word must not occur in more than 85 percent of the documents) 

tfidfconverter = TfidfVectorizer(max_features=2000, min_df=5, max_df=0.85, ngram_range=(1, 2), stop_words='english')  
doc_term_matrix_1 = tfidfconverter.fit_transform(df['final_docs'].values.astype('U'))

In [38]:
doc_term_matrix_1.shape

(396586, 2000)

In [39]:
from sklearn.decomposition import NMF

nmf_model = NMF(n_components=10)
nmf_Z = nmf_model.fit_transform(doc_term_matrix_1)

In [40]:
from sklearn.decomposition import TruncatedSVD

lsi_model = TruncatedSVD(n_components=10)
lsi_Z = lsi_model.fit_transform(doc_term_matrix_1)

In [41]:
from sklearn.decomposition import LatentDirichletAllocation
#  LDA model 
lda_model = LatentDirichletAllocation(n_components = 10, max_iter=10, learning_method='online')
lda_Z = lda_model.fit_transform(doc_term_matrix_1)

In [42]:
def print_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
                        for i in topic.argsort()[:-top_n - 1:-1]])
 
print("LDA Model:")
print_topics(lda_model, tfidfconverter )
print("=" * 20)
 
print("NMF Model:")
print_topics(nmf_model, tfidfconverter )
print("=" * 20)
 
print("LSI Model:")
print_topics(lsi_model, tfidfconverter )
print("=" * 20)

LDA Model:
Topic 0:
[('working', 2185.4006462079947), ('working coup', 1889.4091325613183), ('cruz', 1858.2633569414795), ('hawley', 1774.1264162549417), ('heard', 1595.7936825920865), ('said', 1520.6920394268275), ('thing', 1495.327102673753), ('supposed', 1477.8844474692698), ('ve heard', 1472.034690421095), ('friend', 1469.0647063998829)]
Topic 1:
[('supporter', 2868.837388799131), ('trump supporter', 2622.9135802725505), ('trump', 2366.6144313051036), ('trying', 2177.7993926581657), ('white', 2116.90602026502), ('nationalist', 2096.0121682476274), ('white nationalist', 2086.262346959691), ('nationalist trump', 1864.6670661495457), ('tweet', 1716.3302006698655), ('coup today', 1699.976821326038)]
Topic 2:
[('black', 5810.502666063768), ('white', 5647.793057453274), ('ago', 5336.362002103651), ('image', 5141.540752064805), ('cleaning', 5115.875670233713), ('couple', 5083.929740489027), ('supremacist', 5058.537666708727), ('supremacist thug', 5058.111263591265), ('coup couple', 5057.4

In [43]:
import pyLDAvis.sklearn

In [44]:
pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(lda_model, doc_term_matrix_1, tfidfconverter, mds='tsne')
panel

In [45]:
# Log Likelyhood: Higher the better
print("Log Likelihood: ", lda_model.score(doc_term_matrix_1))

# Perplexity: Lower the better. Perplexity = exp(-1. * log-likelihood per word)
print("Perplexity: ", lda_model.perplexity(doc_term_matrix_1))

Log Likelihood:  -10212790.256827
Perplexity:  1228.8310123398423


In [None]:
from sklearn.model_selection import GridSearchCV

# Define Search Param
search_params = {'n_components': [5, 8, 10, 12], 'learning_decay': [.5, .7, .9]}

# Init the Model
lda = LatentDirichletAllocation()

# Init Grid Search Class
model = GridSearchCV(lda, param_grid=search_params)

# Do the Grid Search
model.fit(doc_term_matrix_1)

In [None]:
# Best Model
best_lda_model = model.best_estimator_

# Model Parameters
print("Best Model's Params: ", model.best_params_)

# Log Likelihood Score
print("Best Log Likelihood Score: ", model.best_score_)

# Perplexity
print("Model Perplexity: ", best_lda_model.perplexity(doc_term_matrix_1))