## Part 1: Data import and cleaning

In [1]:
#import initial libraries

import pandas as pd
import numpy as np

In [2]:
#import data 

df_0 = pd.read_csv("data/ashli.csv")
df = pd.read_csv("data/ashli_NR.csv")

In [3]:
# look at basic info about data

df.info()
# this data set consists of 9965  Tweets
# Twarc filter tracking keyword "ashli" started within half-hour after name of "ashli babbitt" was published
# (Ashley Babbit is the protester who was killed in the capitol during the riots on Jan. 6)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9965 entries, 0 to 9964
Data columns (total 37 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   id                            9965 non-null   int64  
 1   tweet_url                     9965 non-null   object 
 2   created_at                    9965 non-null   object 
 3   parsed_created_at             9965 non-null   object 
 4   user_screen_name              9965 non-null   object 
 5   text                          9965 non-null   object 
 6   tweet_type                    9965 non-null   object 
 7   coordinates                   0 non-null      float64
 8   hashtags                      677 non-null    object 
 9   media                         684 non-null    object 
 10  urls                          3663 non-null   object 
 11  favorite_count                9965 non-null   int64  
 12  in_reply_to_screen_name       3216 non-null   object 
 13  in_

In [4]:
# count number of languages in data set

count_lang = df['lang'].unique()
print(len(count_lang), count_lang)

34 ['en' 'tl' 'und' 'lt' 'it' 'es' 'fr' 'de' 'pt' 'th' 'ja' 'in' 'zh' 'ht'
 'ca' 'is' 'pl' 'nl' 'hu' 'ko' 'ar' 'cy' 'et' 'no' 'tr' 'lv' 'sv' 'ro'
 'sl' 'eu' 'fa' 'fi' 'cs' 'da']


In [5]:
# tweets are in 34 different languages

# I'll be working only with Tweets in English
# drop tweets in all other languages
# now working with 38884 Tweets 

df = df[df.lang == 'en']
df.shape

(8277, 37)

In [6]:
# drop unnecessary columns 

df = df.drop(['tweet_url', 'created_at', 'media', 'urls','in_reply_to_screen_name',
       'in_reply_to_status_id', 'in_reply_to_user_id', 'retweet_or_quote_id',
       'retweet_or_quote_screen_name', 'retweet_or_quote_user_id', 'source',
       'user_created_at', 'user_name', 'user_verified', 'user_friends_count', 'user_listed_count',
       'user_statuses_count', 'user_default_profile_image', 'user_description',
       'user_favourites_count', 'user_followers_count', 'coordinates', 'lang', 'user_location', 'user_time_zone', 'user_urls', 'place'], axis=1)

In [7]:
# check start time & date of data

df.iloc[0]

# first Tweet downloaded Jan 7, 2021 at 03:27:44

id                                                  1347022077807652866
parsed_created_at                             2021-01-07 03:27:44+00:00
user_screen_name                                            creatcburst
text                  Her name was Ashli Babbit, a 14-year veteran, ...
tweet_type                                                        quote
hashtags                                                            NaN
favorite_count                                                        0
possibly_sensitive                                                  NaN
retweet_count                                                         0
user_id                                                       765234992
Name: 0, dtype: object

In [8]:
# check end time & date of data 

df.iloc[-1]

# last Tweet on Jan 7, 2021 at 06:15:58

id                              1347064412721405957
parsed_created_at         2021-01-07 06:15:57+00:00
user_screen_name                         JASpencer1
text                  Sorry for what democrats did.
tweet_type                                    quote
hashtags                                        NaN
favorite_count                                    0
possibly_sensitive                              NaN
retweet_count                                     0
user_id                                   222195944
Name: 9964, dtype: object

# Part 2: Text processing for NLP 

In [9]:
# create variable for "text" column 
text = df['text'] 

In [10]:
# tokenize, remove stopwords, remove urls, lowercase, remove punctuation, remove numbers

# import necessary libraries: ntlk etc.

import string
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer 


stop = stopwords.words('english')

punc = list(set(string.punctuation))

def tokenizer(text):
    tokenizer = TweetTokenizer()
    tokens = tokenizer.tokenize(text)
    return tokens

def remove_url(text):
    url = re.compile(r"https?://\S+|www\.\S+")
    return url.sub(r'', text)

def process_text(text):
    text = remove_url(text)
    text = tokenizer(text)
    text = [word.lower() for word in text]
    text = [re.sub('[0-9]+', '', word) for word in text]
    text = [word for word in text if word not in punc]
    text = [word for word in text if word not in stop]
    text = [each for each in text if len(each) > 1]
    text = [word for word in text if ' ' not in word]
     
    return text

In [11]:
# apply text processing functions to text

df['processed_text'] = df['text'].apply(process_text)

In [12]:
# look at some of processed text

pd.set_option('display.max_colwidth', -1)
df['processed_text'][:20]

  This is separate from the ipykernel package so we can avoid doing imports until


0     [name, ashli, babbit, year, veteran, served, four, tours, us, air, force, high, level, security, official, throughout, time, service]                                                                                                   
1     [ashli, babbit, rednèck, ass, family, friends, online, trying, paint, like, oluwatoyin, salau, fuck, outta]                                                                                                                             
2     [ashli, babbit, going, remembered, terrorist, martyr]                                                                                                                                                                                   
3     [people, celebrating, death, ashli, babbit, twitter, think, think, transpired, summer]                                                                                                                                                  
4     [tammy, duckworth, needs, oscar, fake,

In [13]:
# part-of-speech tagging 

ready_for_pos = df['processed_text']

def pos_tagging(text):
    pos_tag = [pos_tag(word) for word in ready_for_pos]

df['pos_tagged'] = df.processed_text.apply(lambda x: pos_tag(x))

In [14]:
# lemmatizing

pos_tagged = df['pos_tagged']

wordnet = WordNetLemmatizer() 

lemmatized = [[wordnet.lemmatize(word[0]) for word in words] for words in pos_tagged]

In [16]:
# lemmatizing
pos_tagged = df['pos_tagged']

from nltk.stem import WordNetLemmatizer 
wordnet = WordNetLemmatizer() 

lemmatized = [[wordnet.lemmatize(word[0]) for word in words] for words in pos_tagged]

In [17]:
# look at lemmatized text

df['lemmatized'] = lemmatized
lemmatized[:20]

[['name',
  'ashli',
  'babbit',
  'year',
  'veteran',
  'served',
  'four',
  'tour',
  'u',
  'air',
  'force',
  'high',
  'level',
  'security',
  'official',
  'throughout',
  'time',
  'service'],
 ['ashli',
  'babbit',
  'rednèck',
  'as',
  'family',
  'friend',
  'online',
  'trying',
  'paint',
  'like',
  'oluwatoyin',
  'salau',
  'fuck',
  'outta'],
 ['ashli', 'babbit', 'going', 'remembered', 'terrorist', 'martyr'],
 ['people',
  'celebrating',
  'death',
  'ashli',
  'babbit',
  'twitter',
  'think',
  'think',
  'transpired',
  'summer'],
 ['tammy',
  'duckworth',
  'need',
  'oscar',
  'fake',
  'performance',
  'meanwhile',
  'ashli',
  'babbit',
  'true',
  'patriot',
  'air',
  'force',
  'veteran',
  'murdered',
  'capitol',
  'simply',
  'protesting',
  'medium',
  'even',
  'cover',
  'murder',
  'instead',
  'attack',
  'peaceful',
  'protest',
  'capitol',
  'smh'],
 ['ashli',
  'babbit',
  'die',
  'trump',
  'man',
  'care',
  'piss',
  'supporter',
  'fire']

In [18]:
# before vectorizing, cast lists of words back into strings

df['final_docs'] = df['lemmatized'].apply(lambda x: " ".join(x))
pd.set_option('display.max_colwidth', -1)
final_docs = df['final_docs']
final_docs[3000:3020]

  after removing the cwd from sys.path.


3609    sn even name lmao name ashli                                                                                                                                            
3610    storing federal building like result something protected first amendment                                                                                                
3611    remember correctly play stupid game win stupid prize summer                                                                                                             
3612    ashli babbitt protester killed capitol air force vet california via @nypost                                                                                             
3613    breonna taylor                                                                                                                                                          
3614    exactly                                                                                                    

In [20]:
#create document term matrix with TFIDF

#import vectorizing tool (usee TFIDF)
from sklearn.feature_extraction.text import TfidfVectorizer
# set max_features to 2000 (specifies the number of most frequently occurring words for which we want to create feature vectors)
# set min_df to 5 (word must occur in at least 5 documents)
# set max_df to 0.85 (word must not occur in more than 85 percent of the documents) 

tfidfconverter = TfidfVectorizer(max_features=2000, min_df=5, max_df=0.85, ngram_range=(1, 2), stop_words='english')  
doc_term_matrix_1 = tfidfconverter.fit_transform(df['final_docs'].values.astype('U'))

In [22]:
#run NMF model 

#import NMF tool 
from sklearn.decomposition import NMF

nmf_model = NMF(n_components=6)
nmf_Z = nmf_model.fit_transform(doc_term_matrix_1)


In [21]:
# run LDA model

#import LDA tool 
from sklearn.decomposition import LatentDirichletAllocation

lda_model = LatentDirichletAllocation(n_components = 6, max_iter=10, learning_method='online', learning_decay=.7)
lda_Z = lda_model.fit_transform(doc_term_matrix_1)

In [23]:
def print_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
                        for i in topic.argsort()[:-top_n - 1:-1]])
 
print("LDA Model:")
print_topics(lda_model, tfidfconverter )
print("=" * 30)
 
print("NMF Model:")
print_topics(nmf_model, tfidfconverter )
print("=" * 30)

LDA Model:
Topic 0:
[('vain', 11.289388952660351), ('lmfao', 10.651157716070793), ('fascist', 10.381222888601966), ('man', 10.32020459480206), ('shame', 10.103833843917283), ('cop', 8.922432228109768), ('street', 8.5530972236664), ('welp', 8.096762441807371), ('comply', 7.75952429928336), ('let', 7.697567349794917)]
Topic 1:
[('pack', 64.7147670853554), ('ashli', 62.28008556187086), ('veteran', 61.41158511926153), ('shot', 59.96785957007005), ('ashli pack', 48.85795285424942), ('woman', 47.552953027630494), ('year', 46.84266976046674), ('air', 45.045113294280064), ('force', 44.340472382095555), ('air force', 42.75781152020286)]
Topic 2:
[('federal', 27.478078319452322), ('federal building', 26.88920885038435), ('lol', 22.059476779731217), ('building', 19.022303670099546), ('amendment', 17.321446371356327), ('right', 16.429401925139473), ('friend', 16.374303964911444), ('imagine', 15.384959975702536), ('hear', 13.845775230584039), ('wrong', 13.121843612305906)]
Topic 3:
[('shot', 48.781

# Part 4: Run visualization and testing of LDA model

In [23]:
# visualization of LDA model 
import pyLDAvis.sklearn

pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(lda_model, doc_term_matrix_1, tfidfconverter, mds='tsne')
panel

In [24]:
# test LDA model

# log likelihood (higher score is better)
print("Log likelihood: ", lda_model.score(doc_term_matrix_1))


# perplexity (lower score is better)
print("Perplexity: ", lda_model.perplexity(doc_term_matrix_1))

Log likelihood:  -971121.5064836859
Perplexity:  553.5152790163494


In [None]:
# # cross-validation to find best parameters for LDA model

# #import cross-validation tool
# from sklearn.model_selection import GridSearchCV

# # define search parameters
# search_params = {'n_components': [5, 8, 10, 12], 'learning_decay': [.5, .7, .9]}

# # initialize model for cross-validation
# lda = LatentDirichletAllocation()

# # initialize grid search class 
# model = GridSearchCV(lda, param_grid=search_params)

# # run grid search 
# model.fit(doc_term_matrix_1)

In [None]:
# # what is best model? 
# best_lda_model = model.best_estimator_

# # print parameters for best model
# print("Parameters for best model: ", model.best_params_)

# # print log likelihood score
# print("Log likelihood score of best model: ", model.best_score_)

# # print perplexity score
# print("Perplexity score of best model: ", best_lda_model.perplexity(doc_term_matrix_1))

In [27]:
# run LDA model with best parameters

#import LDA tool 
from sklearn.decomposition import LatentDirichletAllocation

lda_model_2 = LatentDirichletAllocation(n_components = 12, max_iter=10, learning_method='online', learning_decay=.9)
lda_Z_2 = lda_model_2.fit_transform(doc_term_matrix_1)

In [28]:
# # what is best model? 
# best_lda_model = model.best_estimator_

# # print parameters for best model
# print("Parameters for best model: ", model.best_params_)

# # print log likelihood score
# print("Log likelihood score of best model: ", model.best_score_)

# # print perplexity score
# print("Perplexity score of best model: ", best_lda_model.perplexity(doc_term_matrix_1))

Parameters for best model:  {'learning_decay': 0.9, 'n_components': 10}
Log likelihood score of best model:  -205986.77421867714
Perplexity score of best model:  449.5455606822447


In [None]:
# # 2nd LDA model: running new "best model" parameters

# lda_model_2 = LatentDirichletAllocation(n_components = 12, max_iter=10, learning_method='online', learning_decay=0.9)
# lda_Z_2 = lda_model_2.fit_transform(doc_term_matrix_1)

# # visualization of 2nd LDA model 
# pyLDAvis.enable_notebook()
# panel = pyLDAvis.sklearn.prepare(lda_model_2, doc_term_matrix_1, tfidfconverter, mds='tsne')
# panel

# pyLDAvis.enable_notebook()
# panel = pyLDAvis.sklearn.prepare(lda_model_2, doc_term_matrix_1, tfidfconverter, mds='tsne')
# panel

In [None]:
# # top topics of 2nd LDA model

# def print_topics(model, vectorizer, top_n=10):
#     for idx, topic in enumerate(model.components_):
#         print("Topic %d:" % (idx))
#         print([(vectorizer.get_feature_names()[i], topic[i])
#                         for i in topic.argsort()[:-top_n - 1:-1]])
 
# print("LDA Model 2:")
# print_topics(lda_model_2, tfidfconverter )
# print("=" * 20)

# Part 5: Plug results of topic modeling back into dataframe


In [None]:
# make results of NMF model a column in dataframe

topic_values = nmf_model.transform(doc_term_matrix_1)
df['NMF_topic'] = topic_values.argmax(axis=1)


In [None]:
# # make results of LDA2 model a column in dataframe

# topic_values = lda_model_2.transform(doc_term_matrix_1)
# df['LDA2_topic'] = topic_values.argmax(axis=1)

In [24]:
# make results of LDA2 model a column in dataframe

topic_values = lda_model.transform(doc_term_matrix_1)
df['LDA1_topic'] = topic_values.argmax(axis=1)

In [25]:
df.head(1)

Unnamed: 0,id,parsed_created_at,user_screen_name,text,tweet_type,hashtags,favorite_count,possibly_sensitive,retweet_count,user_id,processed_text,pos_tagged,lemmatized,final_docs,LDA1_topic
0,1347022077807652866,2021-01-07 03:27:44+00:00,creatcburst,"Her name was Ashli Babbit, a 14-year veteran, who served four tours with the US Air Force, and was a high level security official throughout her time in service.",quote,,0,,0,765234992,"[name, ashli, babbit, year, veteran, served, four, tours, us, air, force, high, level, security, official, throughout, time, service]","[(name, NN), (ashli, NN), (babbit, NN), (year, NN), (veteran, NN), (served, VBD), (four, CD), (tours, NNS), (us, PRP), (air, VBP), (force, JJ), (high, JJ), (level, NN), (security, NN), (official, NN), (throughout, IN), (time, NN), (service, NN)]","[name, ashli, babbit, year, veteran, served, four, tour, u, air, force, high, level, security, official, throughout, time, service]",name ashli babbit year veteran served four tour u air force high level security official throughout time service,5


In [27]:
df['text'][df.LDA1_topic ==0][:20]

5      Ashli Babbit didn't have to die.  But she did so for Trump.  For a man who cares only about himself, and who wouldn't piss on his supporters if they were on fire.                                                                                          
35     @debmcleod13 @1lucyhannah Ashli IS a hero.\n\nPlease do not let  Ashli Babbit die in vain ...\n\nSend the electoral votes back to the states whose legislatures admitted possible fraud and request a forensic audit ...\n\nThere is no downside to this ...
48     Ashli Babbits death falls directly on your shoulders Mr Trumph, you should be charged with her murder, shame on you sir, shame on you!                                                                                                                      
56     THIS🤯🤯🤯🤯🤯🤯👇                                                                                                                                                                                                          

In [29]:
df['text'][df.LDA1_topic ==1][20:40]

242    A woman who was shot and killed during pro-Trump supporters' storming of U.S. Capitol has been identified as 14-year Air Force veteran, Ashli Babbit, of San Diego.​ https://t.co/OH0BUyLAef                                                                                            
244    Who the f cares it was six years ago ...get on with life...                                                                                                                                                                                                                             
254    @Its_kvon *Ashli pack                                                                                                                                                                                                                                                                   
264    Ashli Babbit, an Air Force veteran, had her life taken at a protest at the Capitol. Ashli, a true patriot, gave up more than her 

In [31]:
df['text'][df.LDA1_topic ==2][20:40]

792     She was an advocate for treason. The price was heavy                                                                                                                                                       
823     @samanthamarika1 Ashli Babbit was an insurgent &amp; seditionist &amp; was treated accordingly.\n\nShe should be stripped of all rank. No benefits should be paid to her survivors. https://t.co/KhoV4Z3Nzb
892     Condolences to her friends and family she displayed courage and conviction and payed the ultimate price for believing RIP LOVELY PATRIOT 🇺🇸😇                                                               
979     Lmao #ripbozo don’t storm a federal building 💀 she should’ve just listened to the officers orders ugly ass bitch https://t.co/Kq1dYHjQbQ                                                                   
1015    wrong 😑                                                                                                                                         

In [34]:
df['text'][df.LDA1_topic ==3][40:70]

439    I don't think it was the police. I think she was targeted by someone in the crowd. She looked to be climbing out the window, not in and then fell backward in to the room.                                                                                                                  
442    You spell BREONNA TAYLOR wrong my guy. She was storming the capitol building &amp; rioting BRE was sleep in her bed.                                                                                                                                                                        
483    @Ladyy__C And this isn’t correct. Her name was Ashli Babbit but she should’ve complied &amp; backed the blue by not storming the Capitol.                                                                                                                                                   
539    You killed a un armed woman                                                                                          

In [35]:
df['text'][df.LDA1_topic ==4][40:70]

875     Her name is Ashli Babbitt                                                                                                                                                                                                                                                                                                
880     Ashli Babbit!!!   Rest in peace, enfolded in the arms of patriot souls that welcome into their ranks in heaven.                                                                                                                                                                                                          
897     @mdc_dsa @DemSocialists Her name is Ashli Babbitt                                                                                                                                                                                                                                                                        
906     @kayleighmcenany EXACTLY! 

In [38]:
df['text'][df.LDA1_topic ==5][140:170]

1613    she was a domestic terrorist                                                                                                                                                                                                                                                                                              
1620    @RepMattGaetz Matt I live in Navarre. You should bring Ashli Bobbits name up on the floor. A moment of silence for her death at the hands of the Chamber Guards. She was unarmed and was shot in the chest and died a Patriot doing what she thought was her right. https://t.co/DCwSr2jDqV                               
1622    @NBCNews Ashli wanted to vote. https://t.co/V00k3FUYEs                                                                                                                                                                                                                                                                    
1635    Ashli Babbitt died figh

In [39]:
df['text'][df.LDA1_topic ==6][140:170]

856     @Mike_Pence And what’s the name of the mother fucker who killed Ashli Babbit. You can’t even say her name. Your a pawn mike. Go home...you’re useless...all you had to do was your job                                                                                     
871     Did Donnie Drumpf and his cult members learn “how to coup” from Woody Allen’s Bananas? BIG PLAN: Break into the Capitol in order to...sit in chairs and take selfies?! Worth every bit of Ashli Babbit’s death. Really, no. https://t.co/a7DhcjUu1h                        
878     The Death of Ashli Babbitt https://t.co/2VtfpE7OXM                                                                                                                                                                                                                         
887     🥺 what did she know?                                                                                                                                                

In [40]:
df['text'][df.LDA1_topic ==7][140:170]

7129    RIP bozo                                                                                                                                                                                                                                                
7178    #SayHerName\nAshli - RIP #Patriot                                                                                                                                                                                                                       
7219    Her name was Ashli Babbit\n\n#SayHerName                                                                                                                                                                                                                
7351    Ashli Babbit will NOT be forgotten.                                                                                                                                                                                          

In [41]:
df['text'][df.LDA1_topic ==8][140:170]

3316    @SecretSunBlog The woman killed by police is Ashli Babbitt.\nAshli comes from Ashley, which means ash tree meadow.\nAsherah was the consort of Ba al.\nShe had a sacred tree pole.\nBabbitt is a form of Robert (splendor, fame) and perhaps Middle English form of 'baby.'\nAsh tree baby sacrificed.
3325    @RealWayneRoot Ashli Babbitt is our modern day Crispus Attucks May her memory forever be a blessing. She was a true patriot.                                                                                                                                                                          
3347    Ashli Babbit identified as Air Force vet killed at Capitol https://t.co/5YcwLdVOFU                                                                                                                                                                                                                    
3365    Ashli Babbit identified as woman killed at DC protest in Capitol https://t.co/R3l7n

In [None]:
#rich /poor