# Topic modeling of Tweets w/ keyword "ashli" on Jan. 6

## Part 1: Data import and cleaning

In [1]:
# import initial libraries 
import pandas as pd
import numpy as np

In [3]:
# import Twitter data, which has been converted from json to csv 
df = pd.read_csv("data/ashli.csv")

In [4]:
# basic info about data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42996 entries, 0 to 42995
Data columns (total 37 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   id                            42996 non-null  int64  
 1   tweet_url                     42996 non-null  object 
 2   created_at                    42996 non-null  object 
 3   parsed_created_at             42996 non-null  object 
 4   user_screen_name              42996 non-null  object 
 5   text                          42996 non-null  object 
 6   tweet_type                    42996 non-null  object 
 7   coordinates                   0 non-null      float64
 8   hashtags                      1926 non-null   object 
 9   media                         2423 non-null   object 
 10  urls                          6191 non-null   object 
 11  favorite_count                42996 non-null  int64  
 12  in_reply_to_screen_name       3216 non-null   object 
 13  i

In [5]:
# count number of different languages in data set
count_lang = df['lang'].unique()
print(len(count_lang), count_lang)

35 ['en' 'th' 'tl' 'und' 'pt' 'es' 'lt' 'it' 'fr' 'ja' 'de' 'ht' 'zh' 'in'
 'el' 'ca' 'is' 'pl' 'nl' 'ko' 'fa' 'hu' 'ar' 'cy' 'cs' 'et' 'no' 'tr'
 'lv' 'sv' 'ro' 'sl' 'eu' 'fi' 'da']


In [6]:
# tweets are in 35 different languages. 
# I'll be working only with tweets in English
# so, I'll drop tweets in all other languages
df = df[df.lang == 'en']
df.shape

(38884, 37)

In [7]:
# drop unnecessary columns 
df = df.drop(['tweet_url', 'created_at', 'media', 'urls','in_reply_to_screen_name',
       'in_reply_to_status_id', 'in_reply_to_user_id', 'retweet_or_quote_id',
       'retweet_or_quote_screen_name', 'retweet_or_quote_user_id', 'source',
       'user_created_at', 'user_name', 'user_verified', 'user_friends_count', 'user_listed_count',
       'user_statuses_count', 'user_default_profile_image', 'user_description',
       'user_favourites_count', 'user_followers_count', 'coordinates', 'lang', 'user_location', 'user_time_zone', 'user_urls', 'place'], axis=1)

## Part 2: Process Tweet text for NLP analysis... 

In [8]:
text = df['text'] 

In [16]:
# tokenize, remove stopwords, remove urls, lowercase, remove punctuation, remove numbers

import string
import re

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk import pos_tag

stop = stopwords.words('english')

punc = list(set(string.punctuation))

def tokenizer(text):
    tokenizer = TweetTokenizer()
    tokens = tokenizer.tokenize(text)
    return tokens

def remove_url(text):
    url = re.compile(r"https?://\S+|www\.\S+")
    return url.sub(r'', text)

def process_text(text):
    text = remove_url(text)
    text = tokenizer(text)
    text = [word.lower() for word in text]
    text = [re.sub('[0-9]+', '', word) for word in text]
    text = [word for word in text if word not in punc]
    text = [word for word in text if word not in stop]
    text = [each for each in text if len(each) > 1]
    text = [word for word in text if ' ' not in word]
     
    return text

In [17]:
df['processed_text'] = df['text'].apply(process_text)

In [12]:
pd.set_option('display.max_colwidth', -1)
df['processed_text'][:20]

  """Entry point for launching an IPython kernel.


0     [name, ashli, babbit]                                                                                                                                                                                                             
1     [police, officer, shot, killed, ashli, babbitt]                                                                                                                                                                                   
2     [name, ashli, babbit, year, veteran, served, four, tours, us, air, force, high, level, security, official, throughout, time, service]                                                                                             
3     [@the_real_fly, believe, young, lady, ashli, babbit]                                                                                                                                                                              
4     [living, history, right, beginning, american, revolution]     

In [18]:
ready_for_pos = df['processed_text']

In [19]:
# part-of-speech tagging 
def pos_tagging(text):
    pos_tag = [pos_tag(word) for word in ready_for_pos]

df['pos_tagged'] = df.processed_text.apply(lambda x: pos_tag(x))

In [20]:
pos_tagged = df['pos_tagged']

In [21]:
# lemmatizing

from nltk.stem import WordNetLemmatizer 
wordnet = WordNetLemmatizer() 

lemmatized = [[wordnet.lemmatize(word[0]) for word in words] for words in pos_tagged]

In [22]:
df['lemmatized'] = lemmatized

In [23]:
df['final_docs'] = df['lemmatized'].apply(lambda x: " ".join(x))

In [24]:
df['final_docs'][30000:30020]

33259    see please retweet share keep prayer follow @melissa update                                                                                                                   
33260    @simplethings_d @chickensith protest defense criminal ashli babbit criminal record storming capitol building involve destruction small business result large scale damage like
33261    name ashli babbitt name ashli babbitt name ashli babbitt say name                                                                                                             
33262    woman shot killed peaceful protest air force veteran ashli babbit                                                                                                             
33263    dc police murdered veteran cold blood today vigil like george floyd get plaza dedicated even name mentioned house floor given moment silence never forget ashli babbit        
33264    @mike_pence oh really kept everyone safe tell ashli bobbitt family sure

In [25]:
final_docs = df['final_docs']

## Part 3: Modeling...

In [26]:
#create document term matrix with TFIDF

from sklearn.feature_extraction.text import TfidfVectorizer
# initial tuning of parameters
#set max_features to 2000 (specifies the number of most frequently occurring words for which we want to create feature vectors)
# set min_df to 5 (word must occur in at least 5 documents)
# set max_df to 0.85 (word must not occur in more than 85 percent of the documents) 

tfidfconverter = TfidfVectorizer(max_features=2000, min_df=5, max_df=0.85, ngram_range=(1, 2), stop_words='english')  
doc_term_matrix_1 = tfidfconverter.fit_transform(df['final_docs'].values.astype('U'))

In [27]:
doc_term_matrix_1.shape

(38884, 2000)

In [29]:
from sklearn.decomposition import NMF

nmf_model = NMF(n_components=10)
nmf_Z = nmf_model.fit_transform(doc_term_matrix_1)



In [31]:
from sklearn.decomposition import TruncatedSVD

lsi_model = TruncatedSVD(n_components=10)
lsi_Z = lsi_model.fit_transform(doc_term_matrix_1)


In [32]:
from sklearn.decomposition import LatentDirichletAllocation
#  LDA model 
lda_model = LatentDirichletAllocation(n_components = 10, max_iter=10, learning_method='online')
lda_Z = lda_model.fit_transform(doc_term_matrix_1)

In [33]:
def print_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
                        for i in topic.argsort()[:-top_n - 1:-1]])
 
print("LDA Model:")
print_topics(lda_model, tfidfconverter )
print("=" * 20)
 
print("NMF Model:")
print_topics(nmf_model, tfidfconverter )
print("=" * 20)
 
print("LSI Model:")
print_topics(lsi_model, tfidfconverter )
print("=" * 20)

LDA Model:
Topic 0:
[('miss', 632.015825091519), ('miss ashli', 628.0128012361246), ('terrorist', 276.7763399787428), ('capitol', 209.10496352326243), ('fuck', 197.07499206044477), ('federal', 194.88387558259024), ('babbit', 187.79203407455498), ('ashli babbit', 185.2420585089263), ('building', 172.57111599430425), ('ashli', 172.36241132800882)]
Topic 1:
[('say', 238.96067045359484), ('killed capitol', 218.4341922496529), ('capitol', 179.62150535022883), ('killed', 168.40133273961837), ('say ashli', 168.35182120387103), ('vet', 166.308958400202), ('air', 163.15635343898356), ('air force', 161.87444472384712), ('force', 159.92311205731946), ('force vet', 148.5196665206032)]
Topic 2:
[('ashli babbit', 1850.9890749928886), ('babbit', 1845.7408853482368), ('ashli', 1681.3828392789362), ('rip ashli', 652.8576436590427), ('rip', 648.2794080476688), ('killed ashli', 586.5536276597438), ('babbitt', 571.7971662018638), ('ashli babbitt', 570.0119602883248), ('officer', 552.697388196293), ('polic

In [47]:
topic_values = nmf_model.transform(doc_term_matrix_1)


In [49]:
df['NMF_topic'] = topic_values.argmax(axis=1)

In [53]:
df = df.drop(['nmf_topic'], axis=1)

In [54]:
df.head()

Unnamed: 0,id,parsed_created_at,user_screen_name,text,tweet_type,hashtags,favorite_count,possibly_sensitive,retweet_count,user_id,processed_text,pos_tagged,lemmatized,final_docs,NMF_topic
0,1347022076096307201,2021-01-07 03:27:44+00:00,unabashedlycri1,Her name was Ashli Babbit.,retweet,,5602,,0,1279137538104426496,"[name, ashli, babbit]","[(name, NN), (ashli, VBZ), (babbit, NN)]","[name, ashli, babbit]",name ashli babbit,0
1,1347022076172005378,2021-01-07 03:27:44+00:00,polaroptics,Where is the police officer who shot &amp; killed Ashli Babbitt!?,retweet,,2685,,0,533711654,"[police, officer, shot, killed, ashli, babbitt]","[(police, NNS), (officer, NN), (shot, NN), (killed, VBD), (ashli, JJ), (babbitt, NN)]","[police, officer, shot, killed, ashli, babbitt]",police officer shot killed ashli babbitt,3
2,1347022077807652866,2021-01-07 03:27:44+00:00,creatcburst,"Her name was Ashli Babbit, a 14-year veteran, who served four tours with the US Air Force, and was a high level security official throughout her time in service.",quote,,0,,0,765234992,"[name, ashli, babbit, year, veteran, served, four, tours, us, air, force, high, level, security, official, throughout, time, service]","[(name, NN), (ashli, NN), (babbit, NN), (year, NN), (veteran, NN), (served, VBD), (four, CD), (tours, NNS), (us, PRP), (air, VBP), (force, JJ), (high, JJ), (level, NN), (security, NN), (official, NN), (throughout, IN), (time, NN), (service, NN)]","[name, ashli, babbit, year, veteran, served, four, tour, u, air, force, high, level, security, official, throughout, time, service]",name ashli babbit year veteran served four tour u air force high level security official throughout time service,9
3,1347022078164283393,2021-01-07 03:27:44+00:00,EkohawkDonna,@The_Real_Fly Believe this is the young lady Ashli Babbit \nhttps://t.co/7ZeHm1eBpg,retweet,,215,False,0,740927412331028484,"[@the_real_fly, believe, young, lady, ashli, babbit]","[(@the_real_fly, NN), (believe, VBP), (young, JJ), (lady, NN), (ashli, JJ), (babbit, NN)]","[@the_real_fly, believe, young, lady, ashli, babbit]",@the_real_fly believe young lady ashli babbit,0
4,1347022078302679040,2021-01-07 03:27:44+00:00,JanetMarks20,We are living history right now. The beginning of an American Revolution.,retweet,,38,,0,3439143621,"[living, history, right, beginning, american, revolution]","[(living, VBG), (history, NN), (right, RB), (beginning, VBG), (american, JJ), (revolution, NN)]","[living, history, right, beginning, american, revolution]",living history right beginning american revolution,9


In [34]:
import pyLDAvis.sklearn

In [35]:
pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(lda_model, doc_term_matrix_1, tfidfconverter, mds='tsne')
panel

In [36]:

import pandas as pd
from bokeh.io import push_notebook, show, output_notebook
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, LabelSet
output_notebook()

In [37]:
svd = TruncatedSVD(n_components=2)
documents_2d = svd.fit_transform(doc_term_matrix_1)

df_new = pd.DataFrame(columns=['x', 'y', 'document'])
df_new['x'], df_new['y'], df_new['document'] = documents_2d[:,0], documents_2d[:,1], range(len(final_docs))
 
source = ColumnDataSource(ColumnDataSource.from_df(df_new))
labels = LabelSet(x="x", y="y", text="document", y_offset=8,
                  text_font_size="8pt", text_color="#555555",
                  source=source, text_align='center')
 
plot = figure(plot_width=600, plot_height=600)
plot.circle("x", "y", size=12, source=source, line_color="black", fill_alpha=0.8)
plot.add_layout(labels)
show(plot, notebook_handle=True)

In [38]:
svd_2 = TruncatedSVD(n_components=2)
words_2d = svd.fit_transform(doc_term_matrix_1.T)
 
df_new2 = pd.DataFrame(columns=['x', 'y', 'word'])
df_new2['x'], df_new2['y'], df_new2['word'] = words_2d[:,0], words_2d[:,1], tfidfconverter.get_feature_names()
 
source = ColumnDataSource(ColumnDataSource.from_df(df_new2))
labels = LabelSet(x="x", y="y", text="word", y_offset=8,
                  text_font_size="8pt", text_color="#555555",
                  source=source, text_align='center')
 
plot = figure(plot_width=600, plot_height=600)
plot.circle("x", "y", size=12, source=source, line_color="black", fill_alpha=0.8)
plot.add_layout(labels)
show(plot, notebook_handle=True)