# Part 1: Data cleaning 


In [1]:
#import initial libraries

import pandas as pd
import numpy as np


In [2]:
#import data 

df = pd.read_csv('data/abrams.csv')

In [3]:
# look at basic info about data

df.info()
# this data set consists of 6824 Tweets

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6824 entries, 0 to 6823
Data columns (total 37 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   id                            6824 non-null   int64  
 1   tweet_url                     6824 non-null   object 
 2   created_at                    6824 non-null   object 
 3   parsed_created_at             6824 non-null   object 
 4   user_screen_name              6824 non-null   object 
 5   text                          6824 non-null   object 
 6   tweet_type                    6824 non-null   object 
 7   coordinates                   1 non-null      object 
 8   hashtags                      261 non-null    object 
 9   media                         570 non-null    object 
 10  urls                          626 non-null    object 
 11  favorite_count                6824 non-null   int64  
 12  in_reply_to_screen_name       470 non-null    object 
 13  in_

In [4]:
# count number of languages in data set

count_lang = df['lang'].unique()
print(len(count_lang), count_lang)

23 ['en' 'und' 'es' 'fr' 'pt' 'ca' 'sv' 'ro' 'cy' 'de' 'zh' 'no' 'in' 'tl'
 'iw' 'ht' 'da' 'cs' 'ar' 'fi' 'tr' 'pl' 'it']


In [5]:
# tweets are in 23 different languages

# I'll be working only with tweets in English
# drop tweets in all other languages
# now working with 6659 Tweets 

df = df[df.lang == 'en']
df.shape

(6659, 37)

In [6]:
# drop unnecessary columns 
df = df.drop(['tweet_url', 'created_at', 'media', 'urls','in_reply_to_screen_name',
       'in_reply_to_status_id', 'in_reply_to_user_id', 'retweet_or_quote_id',
       'retweet_or_quote_screen_name', 'retweet_or_quote_user_id', 'source',
       'user_created_at', 'user_name', 'user_verified', 'user_friends_count', 'user_listed_count',
       'user_statuses_count', 'user_default_profile_image', 'user_description',
       'user_favourites_count', 'user_followers_count', 'coordinates', 'lang', 'user_location', 'user_time_zone', 'user_urls', 'place'], axis=1)

In [7]:
# check start time & date of data

df.iloc[0]

# first Tweet downloaded Jan 6, 2021 at 18:42:31

id                                                  1346889904639381509
parsed_created_at                             2021-01-06 18:42:31+00:00
user_screen_name                                            dragonwick2
text                  Stacey Abrams brags about allowing fake voters...
tweet_type                                                      retweet
hashtags                                                            NaN
favorite_count                                                     3071
possibly_sensitive                                                False
retweet_count                                                         0
user_id                                                       828492452
Name: 0, dtype: object

In [8]:
# check end time & date of data 

df.iloc[-1]

# last Tweet on Jan 6, 2021 at 19:02:01

id                                                  1346894809219420160
parsed_created_at                             2021-01-06 19:02:01+00:00
user_screen_name                                           schultziepie
text                  Stacy Abrams is a QUEEN!! We all owe her for s...
tweet_type                                                     original
hashtags                          StacyAbramsSavedAmerica thankyouqueen
favorite_count                                                        0
possibly_sensitive                                                False
retweet_count                                                         0
user_id                                                        17032152
Name: 6823, dtype: object

# Part 2: Text processing for NLP 

In [9]:
# create variable for "text" column 
text = df['text'] 

In [10]:
# tokenize, remove stopwords, remove urls, lowercase, remove punctuation, remove numbers
# import necessary libraries: ntlk etc.

import string
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer 

stop = stopwords.words('english')

punc = list(set(string.punctuation))

def tokenizer(text):
    tokenizer = TweetTokenizer()
    tokens = tokenizer.tokenize(text)
    return tokens

def remove_url(text):
    url = re.compile(r"https?://\S+|www\.\S+")
    return url.sub(r'', text)

def process_text(text):
    text = remove_url(text)
    text = tokenizer(text)
    text = [word.lower() for word in text]
    text = [re.sub('[0-9]+', '', word) for word in text]
    text = [word for word in text if word not in punc]
    text = [word for word in text if word not in stop]
    text = [each for each in text if len(each) > 1]
    text = [word for word in text if ' ' not in word]
     
    return text

In [11]:
# apply text processing functions to text
df['processed_text'] = df['text'].apply(process_text)

In [12]:
# look at some of processed text
pd.set_option('display.max_colwidth', -1)
df['processed_text'][:20]

  


0     [stacey, abrams, brags, allowing, fake, voters, signature, verification]                                                                                                                                                   
1     [stacey, abrams, damn, thing]                                                                                                                                                                                              
2     [house, cards, shit, republicans, stole, election, abrams, abrams, plotted, took, washington]                                                                                                                              
3     [actually, happens, actually, win, discussion, stacey, abrams, saved, america]                                                                                                                                             
4     [chuck, schumer, stacey, abrams, years, team, many, women's, groups, black, women's, group

In [13]:
# part-of-speech tagging 

ready_for_pos = df['processed_text']

def pos_tagging(text):
    pos_tag = [pos_tag(word) for word in ready_for_pos]

df['pos_tagged'] = df.processed_text.apply(lambda x: pos_tag(x))

In [14]:
# lemmatizing

pos_tagged = df['pos_tagged']

wordnet = WordNetLemmatizer() 

lemmatized = [[wordnet.lemmatize(word[0]) for word in words] for words in pos_tagged]

In [15]:
# look at lemmatized text

df['lemmatized'] = lemmatized
lemmatized[:20]

[['stacey',
  'abrams',
  'brag',
  'allowing',
  'fake',
  'voter',
  'signature',
  'verification'],
 ['stacey', 'abrams', 'damn', 'thing'],
 ['house',
  'card',
  'shit',
  'republican',
  'stole',
  'election',
  'abrams',
  'abrams',
  'plotted',
  'took',
  'washington'],
 ['actually',
  'happens',
  'actually',
  'win',
  'discussion',
  'stacey',
  'abrams',
  'saved',
  'america'],
 ['chuck',
  'schumer',
  'stacey',
  'abrams',
  'year',
  'team',
  'many',
  "women's",
  'group',
  'black',
  "women's",
  'group',
  'worked',
  'worked',
  'worked',
  'change',
  'georgia',
  'stacey',
  'would',
  'first',
  'tell',
  'alone',
  'huge',
  'thank',
  'many',
  'organization'],
 ['stacey',
  'abrams',
  'spent',
  'decade',
  'building',
  'democratic',
  'infrastructure',
  'georgia',
  'jon',
  'ossoff',
  'rev',
  'raphael',
  'warnock',
  'look',
  'flip',
  'georgia',
  'two',
  'senate',
  'seat',
  'many',
  'see',
  'person',
  'responsible',
  'shifting',
  'politica

In [16]:
# before vectorizing, cast lists of words back into strings

df['final_docs'] = df['lemmatized'].apply(lambda x: " ".join(x))
pd.set_option('display.max_colwidth', -1)
final_docs = df['final_docs']
final_docs[3000:3020]

  after removing the cwd from sys.path.


3083    good morning @staceyabrams stacey abrams                                                                                                                                                               
3084    guess good idea stacey abrams run senate instead helped biden two senator win revolutionized organizing put stake gop                                                                                  
3085    give stacey abrams fair fight new georgia project every single organizer made possible credit done georgia remarkable                                                                                  
3086    stacey abrams spent decade building democratic infrastructure georgia democrat move closer flipping georgia senate seat many see person responsible shifting political landscape                       
3087    stacey abrams destroyed election integrity georgia brian kemp sat watched happen georgia blue state stolen state                                                

In [17]:
#create document term matrix with TFIDF

#import vectorizing tool (usee TFIDF)
from sklearn.feature_extraction.text import TfidfVectorizer
# set max_features to 2000 (specifies the number of most frequently occurring words for which we want to create feature vectors)
# set min_df to 5 (word must occur in at least 5 documents)
# set max_df to 0.85 (word must not occur in more than 85 percent of the documents) 

tfidfconverter = TfidfVectorizer(max_features=2000, min_df=5, max_df=0.85, ngram_range=(1, 2), stop_words='english')  
doc_term_matrix_1 = tfidfconverter.fit_transform(df['final_docs'].values.astype('U'))

# Part 3: run NMF and LDA models, for topic modeling

In [18]:
#run NMF model 

#import NMF tool 
from sklearn.decomposition import NMF

nmf_model = NMF(n_components=6)
nmf_Z = nmf_model.fit_transform(doc_term_matrix_1)


In [19]:
# run LDA model

#import LDA tool 
from sklearn.decomposition import LatentDirichletAllocation

lda_model = LatentDirichletAllocation(n_components = 6, max_iter=10, learning_method='online', learning_decay=.9)
lda_Z = lda_model.fit_transform(doc_term_matrix_1)

In [20]:
def print_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
                        for i in topic.argsort()[:-top_n - 1:-1]])
 
print("LDA Model:")
print_topics(lda_model, tfidfconverter )
print("=" * 20)
 
print("NMF Model:")
print_topics(nmf_model, tfidfconverter )
print("=" * 20)
 

LDA Model:
Topic 0:
[('warnock', 51.087305554155506), ('georgia', 49.31615812431324), ('raphael warnock', 48.96188815239803), ('raphael', 48.961643856104324), ('stacey', 47.34381725930895), ('goddess', 44.87749260452265), ('abrams goddess', 44.43981031139755), ('ossoff', 44.33475644730052), ('fucking', 43.885129461171374), ('stacey abrams', 42.92092915455827)]
Topic 1:
[('state', 134.21048737564496), ('lead', 91.0051543315015), ('follow', 88.32754492515112), ('abrams state', 87.62138801343116), ('follow lead', 86.24750837088325), ('invest', 86.07149693856188), ('trust', 85.94635602063347), ('state stacey', 85.90964134970554), ('invest follow', 85.7311290070955), ('trust invest', 85.73096996237854)]
Topic 2:
[('stacey abrams', 153.13208162255603), ('stacey', 148.81733179518542), ('carrying', 119.58444173924158), ('carrying democracy', 118.47411411486952), ('abrams carrying', 118.47376024136989), ('democracy', 106.36655630207743), ('election stacey', 89.98597294449864), ('block', 89.4782

# Part 4: Run visualization and testing of LDA model

In [21]:
# visualization of LDA model 
import pyLDAvis.sklearn

pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(lda_model, doc_term_matrix_1, tfidfconverter, mds='tsne')
panel

In [22]:
# test LDA model

# log likelihood (higher score is better)
print("Log Likelihood: ", lda_model.score(doc_term_matrix_1))

# perplexity (lower score is better)
print("Perplexity: ", lda_model.perplexity(doc_term_matrix_1))

Log Likelihood:  -169071.29047048374
Perplexity:  966.9105409189247


In [23]:
# cross-validation to find best parameters for LDA model

#import cross-validation tool
from sklearn.model_selection import GridSearchCV

# define search parameters
search_params = {'n_components': [5, 8, 10, 12], 'learning_decay': [.5, .7, .9]}

# initialize model for cross-validation
lda = LatentDirichletAllocation()

# initialize grid search class 
model = GridSearchCV(lda, param_grid=search_params)

# run grid search 
model.fit(doc_term_matrix_1)

GridSearchCV(estimator=LatentDirichletAllocation(),
             param_grid={'learning_decay': [0.5, 0.7, 0.9],
                         'n_components': [5, 8, 10, 12]})

In [24]:
# what is best model? 
best_lda_model = model.best_estimator_

# print parameters for best model
print("Parameters for best model: ", model.best_params_)

# print log likelihood score
print("Log likelihood score of best model: ", model.best_score_)

# print perplexity score
print("Perplexity score of best model: ", best_lda_model.perplexity(doc_term_matrix_1))

Parameters for best model:  {'learning_decay': 0.9, 'n_components': 5}
Log likelihood score of best model:  -39683.51244595625
Perplexity score of best model:  976.449507372202


In [27]:
# top topics of 2nd LDA model

def print_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
                        for i in topic.argsort()[:-top_n - 1:-1]])
 
print("LDA Model:")
print_topics(lda_model_2, tfidfconverter )
print("=" * 20)

LDA Model:
Topic 0:
[('stacey', 204.96052899778172), ('stacey abrams', 192.35578751666122), ('carrying', 128.46987865355473), ('carrying democracy', 127.81259238491126), ('abrams carrying', 127.81259238491126), ('democracy', 125.15240996785563), ('thank', 75.95232604573981), ('saving', 69.37350716714845), ('black', 67.34770941130847), ('woman', 65.14144808575594)]
Topic 1:
[('stole', 95.84988889636267), ('stole election', 93.13224689581352), ('election stacey', 91.2018483521999), ('block', 90.26344415827228), ('spun', 90.04666809958727), ('spun block', 90.04666809958727), ('crazy', 89.49133612848495), ('block em', 89.0559831649791), ('em crazy', 89.0559831649791), ('em', 89.0559831649791)]
Topic 2:
[('day', 53.70175758723745), ('really', 49.11880549392248), ('good', 41.04132302026038), ('stacey abrams', 38.258305347578755), ('absolutely', 38.100134941319226), ('warnock', 37.420676240449524), ('history', 36.43702872163451), ('stacey', 36.33228624978522), ('georgia', 34.8499043765373), (

In [26]:
# 2nd LDA model: running new "best model" parameters

lda_model_2 = LatentDirichletAllocation(n_components = 5, max_iter=10, learning_method='online', learning_decay=0.5)
lda_Z_2 = lda_model_2.fit_transform(doc_term_matrix_1)

# visualization of 2nd LDA model 
pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(lda_model_2, doc_term_matrix_1, tfidfconverter, mds='tsne')
panel

pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(lda_model_2, doc_term_matrix_1, tfidfconverter, mds='tsne')
panel