In [1]:
import pandas as pd
import numpy as np


In [2]:
df = pd.read_csv('data/abrams.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6824 entries, 0 to 6823
Data columns (total 37 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   id                            6824 non-null   int64  
 1   tweet_url                     6824 non-null   object 
 2   created_at                    6824 non-null   object 
 3   parsed_created_at             6824 non-null   object 
 4   user_screen_name              6824 non-null   object 
 5   text                          6824 non-null   object 
 6   tweet_type                    6824 non-null   object 
 7   coordinates                   1 non-null      object 
 8   hashtags                      261 non-null    object 
 9   media                         570 non-null    object 
 10  urls                          626 non-null    object 
 11  favorite_count                6824 non-null   int64  
 12  in_reply_to_screen_name       470 non-null    object 
 13  in_

In [4]:
# count number of different languages in data set
count_lang = df['lang'].unique()
print(len(count_lang), count_lang)

23 ['en' 'und' 'es' 'fr' 'pt' 'ca' 'sv' 'ro' 'cy' 'de' 'zh' 'no' 'in' 'tl'
 'iw' 'ht' 'da' 'cs' 'ar' 'fi' 'tr' 'pl' 'it']


In [5]:
# tweets are in 23 different languages. 
# I'll be working only with tweets in English
# so, I'll drop tweets in all other languages
df = df[df.lang == 'en']
df.shape

(6659, 37)

In [6]:
# drop unnecessary columns 
df = df.drop(['tweet_url', 'created_at', 'media', 'urls','in_reply_to_screen_name',
       'in_reply_to_status_id', 'in_reply_to_user_id', 'retweet_or_quote_id',
       'retweet_or_quote_screen_name', 'retweet_or_quote_user_id', 'source',
       'user_created_at', 'user_name', 'user_verified', 'user_friends_count', 'user_listed_count',
       'user_statuses_count', 'user_default_profile_image', 'user_description',
       'user_favourites_count', 'user_followers_count', 'coordinates', 'lang', 'user_location', 'user_time_zone', 'user_urls', 'place'], axis=1)

In [7]:
text = df['text'] 

In [8]:
# tokenize, remove stopwords, remove urls, lowercase, remove punctuation, remove numbers

import string
import re

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk import pos_tag

stop = stopwords.words('english')

punc = list(set(string.punctuation))

def tokenizer(text):
    tokenizer = TweetTokenizer()
    tokens = tokenizer.tokenize(text)
    return tokens

def remove_url(text):
    url = re.compile(r"https?://\S+|www\.\S+")
    return url.sub(r'', text)

def process_text(text):
    text = remove_url(text)
    text = tokenizer(text)
    text = [word.lower() for word in text]
    text = [re.sub('[0-9]+', '', word) for word in text]
    text = [word for word in text if word not in punc]
    text = [word for word in text if word not in stop]
    text = [each for each in text if len(each) > 1]
    text = [word for word in text if ' ' not in word]
     
    return text

In [9]:
df['processed_text'] = df['text'].apply(process_text)

In [10]:
pd.set_option('display.max_colwidth', -1)
df['processed_text'][:20]

  """Entry point for launching an IPython kernel.


0     [stacey, abrams, brags, allowing, fake, voters, signature, verification]                                                                                                                                                   
1     [stacey, abrams, damn, thing]                                                                                                                                                                                              
2     [house, cards, shit, republicans, stole, election, abrams, abrams, plotted, took, washington]                                                                                                                              
3     [actually, happens, actually, win, discussion, stacey, abrams, saved, america]                                                                                                                                             
4     [chuck, schumer, stacey, abrams, years, team, many, women's, groups, black, women's, group

In [11]:
ready_for_pos = df['processed_text']

In [12]:
# part-of-speech tagging 
def pos_tagging(text):
    pos_tag = [pos_tag(word) for word in ready_for_pos]

df['pos_tagged'] = df.processed_text.apply(lambda x: pos_tag(x))

In [13]:
pos_tagged = df['pos_tagged']

In [14]:
# lemmatizing

from nltk.stem import WordNetLemmatizer 
wordnet = WordNetLemmatizer() 

lemmatized = [[wordnet.lemmatize(word[0]) for word in words] for words in pos_tagged]

In [15]:
df['lemmatized'] = lemmatized

In [16]:
df['final_docs'] = df['lemmatized'].apply(lambda x: " ".join(x))

In [17]:
df['final_docs'][3000:3020]

3083    good morning @staceyabrams stacey abrams                                                                                                                                                               
3084    guess good idea stacey abrams run senate instead helped biden two senator win revolutionized organizing put stake gop                                                                                  
3085    give stacey abrams fair fight new georgia project every single organizer made possible credit done georgia remarkable                                                                                  
3086    stacey abrams spent decade building democratic infrastructure georgia democrat move closer flipping georgia senate seat many see person responsible shifting political landscape                       
3087    stacey abrams destroyed election integrity georgia brian kemp sat watched happen georgia blue state stolen state                                                

In [18]:
final_docs = df['final_docs']

In [19]:
#create document term matrix with TFIDF

from sklearn.feature_extraction.text import TfidfVectorizer
# initial tuning of parameters
#set max_features to 2000 (specifies the number of most frequently occurring words for which we want to create feature vectors)
# set min_df to 5 (word must occur in at least 5 documents)
# set max_df to 0.85 (word must not occur in more than 85 percent of the documents) 

tfidfconverter = TfidfVectorizer(max_features=2000, min_df=5, max_df=0.85, ngram_range=(1, 2), stop_words='english')  
doc_term_matrix_1 = tfidfconverter.fit_transform(df['final_docs'].values.astype('U'))

In [20]:
from sklearn.decomposition import NMF

nmf_model = NMF(n_components=10)
nmf_Z = nmf_model.fit_transform(doc_term_matrix_1)


In [21]:
from sklearn.decomposition import LatentDirichletAllocation
#  LDA model 
lda_model = LatentDirichletAllocation(n_components = 10, max_iter=10, learning_method='online')
lda_Z = lda_model.fit_transform(doc_term_matrix_1)

In [22]:
def print_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
                        for i in topic.argsort()[:-top_n - 1:-1]])
 
print("LDA Model:")
print_topics(lda_model, tfidfconverter )
print("=" * 20)
 
print("NMF Model:")
print_topics(nmf_model, tfidfconverter )
print("=" * 20)
 


LDA Model:
Topic 0:
[('president', 31.99579936668867), ('mcconnell', 31.31803817465682), ('biden', 27.569885149012975), ('mitch mcconnell', 27.452868502004083), ('mitch', 26.77208849354789), ('american', 24.375810467677073), ('america', 22.14469200099454), ('power', 21.894561031410518), ('fact', 20.734354662031322), ('medal', 19.553887173266823)]
Topic 1:
[('stacey', 136.58723361525503), ('stacey abrams', 135.7136584841917), ('tweet', 125.5378676184316), ('carrying', 122.02203094684948), ('abrams tweet', 121.46089611237687), ('carrying democracy', 121.05831502880673), ('abrams carrying', 121.05831496535741), ('democracy', 104.42116132059934), ('absolutely', 36.64458886360368), ('day', 36.05101868999824)]
Topic 2:
[('state', 141.90970979301954), ('lead', 90.5573367919726), ('follow', 88.03895374755996), ('abrams state', 87.14168257983332), ('follow lead', 85.77980609457458), ('invest', 85.5181462702445), ('state stacey', 85.38181613985952), ('trust', 85.3647244065772), ('invest follow',

In [23]:
import pyLDAvis.sklearn

In [24]:
pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(lda_model, doc_term_matrix_1, tfidfconverter, mds='tsne')
panel

In [25]:
# Log Likelyhood: Higher the better
print("Log Likelihood: ", lda_model.score(doc_term_matrix_1))

# Perplexity: Lower the better. Perplexity = exp(-1. * log-likelihood per word)
print("Perplexity: ", lda_model.perplexity(doc_term_matrix_1))

Log Likelihood:  -167336.67622249882
Perplexity:  901.0673248447437


In [26]:
from sklearn.model_selection import GridSearchCV

# Define Search Param
search_params = {'n_components': [5, 8, 10, 12], 'learning_decay': [.5, .7, .9]}

# Init the Model
lda = LatentDirichletAllocation()

# Init Grid Search Class
model = GridSearchCV(lda, param_grid=search_params)

# Do the Grid Search
model.fit(doc_term_matrix_1)

GridSearchCV(estimator=LatentDirichletAllocation(),
             param_grid={'learning_decay': [0.5, 0.7, 0.9],
                         'n_components': [5, 8, 10, 12]})

In [27]:
# Best Model
best_lda_model = model.best_estimator_

# Model Parameters
print("Best Model's Params: ", model.best_params_)

# Log Likelihood Score
print("Best Log Likelihood Score: ", model.best_score_)

# Perplexity
print("Model Perplexity: ", best_lda_model.perplexity(doc_term_matrix_1))

Best Model's Params:  {'learning_decay': 0.9, 'n_components': 5}
Best Log Likelihood Score:  -39593.25449893281
Model Perplexity:  1000.9217320781232


In [30]:
# 2nd LDA model
lda_model_2 = LatentDirichletAllocation(n_components = 5, max_iter=10, learning_method='online', learning_decay=0.9)
lda_Z_2 = lda_model_2.fit_transform(doc_term_matrix_1)


pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(lda_model_2, doc_term_matrix_1, tfidfconverter, mds='tsne')
panel

pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(lda_model_2, doc_term_matrix_1, tfidfconverter, mds='tsne')
panel