# Topic modeling of Tweets w/ keyword "ashli" on Jan. 6

## Part 1: Data import and cleaning

In [2]:
# import initial libraries 
import pandas as pd
import numpy as np

In [3]:
# import Twitter data, which has been converted from json to csv 
df = pd.read_csv("data/ashli.csv")

In [4]:
# basic info about data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42996 entries, 0 to 42995
Data columns (total 37 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   id                            42996 non-null  int64  
 1   tweet_url                     42996 non-null  object 
 2   created_at                    42996 non-null  object 
 3   parsed_created_at             42996 non-null  object 
 4   user_screen_name              42996 non-null  object 
 5   text                          42996 non-null  object 
 6   tweet_type                    42996 non-null  object 
 7   coordinates                   0 non-null      float64
 8   hashtags                      1926 non-null   object 
 9   media                         2423 non-null   object 
 10  urls                          6191 non-null   object 
 11  favorite_count                42996 non-null  int64  
 12  in_reply_to_screen_name       3216 non-null   object 
 13  i

In [5]:
# count number of different languages in data set
count_lang = df['lang'].unique()
print(len(count_lang), count_lang)

35 ['en' 'th' 'tl' 'und' 'pt' 'es' 'lt' 'it' 'fr' 'ja' 'de' 'ht' 'zh' 'in'
 'el' 'ca' 'is' 'pl' 'nl' 'ko' 'fa' 'hu' 'ar' 'cy' 'cs' 'et' 'no' 'tr'
 'lv' 'sv' 'ro' 'sl' 'eu' 'fi' 'da']


In [6]:
# tweets are in 35 different languages. 
# I'll be working only with tweets in English
# so, I'll drop tweets in all other languages
df = df[df.lang == 'en']
df.shape

(38884, 37)

In [7]:
# drop unnecessary columns 
df = df.drop(['tweet_url', 'created_at', 'media', 'urls','in_reply_to_screen_name',
       'in_reply_to_status_id', 'in_reply_to_user_id', 'retweet_or_quote_id',
       'retweet_or_quote_screen_name', 'retweet_or_quote_user_id', 'source',
       'user_created_at', 'user_name', 'user_verified', 'user_friends_count', 'user_listed_count',
       'user_statuses_count', 'user_default_profile_image', 'user_description',
       'user_favourites_count', 'user_followers_count', 'coordinates', 'lang', 'user_location', 'user_time_zone', 'user_urls', 'place'], axis=1)

## Part 2: Process Tweet text for NLP analysis... 

In [8]:
text = df['text'] 

In [9]:
# tokenize, remove stopwords, remove urls, lowercase, remove punctuation, remove numbers

import string
import re

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk import pos_tag

stop = stopwords.words('english')

punc = list(set(string.punctuation))

def tokenizer(text):
    tokenizer = TweetTokenizer()
    tokens = tokenizer.tokenize(text)
    return tokens

def remove_url(text):
    url = re.compile(r"https?://\S+|www\.\S+")
    return url.sub(r'', text)

def process_text(text):
    text = remove_url(text)
    text = tokenizer(text)
    text = [word.lower() for word in text]
    text = [re.sub('[0-9]+', '', word) for word in text]
    text = [word for word in text if word not in punc]
    text = [word for word in text if word not in stop]
    text = [each for each in text if len(each) > 1]
    text = [word for word in text if ' ' not in word]
     
    return text

In [10]:
df['processed_text'] = df['text'].apply(process_text)

In [11]:
pd.set_option('display.max_colwidth', -1)
df['processed_text'][:20]

  """Entry point for launching an IPython kernel.


0     [name, ashli, babbit]                                                                                                                                                                                                             
1     [police, officer, shot, killed, ashli, babbitt]                                                                                                                                                                                   
2     [name, ashli, babbit, year, veteran, served, four, tours, us, air, force, high, level, security, official, throughout, time, service]                                                                                             
3     [@the_real_fly, believe, young, lady, ashli, babbit]                                                                                                                                                                              
4     [living, history, right, beginning, american, revolution]     

In [12]:
ready_for_pos = df['processed_text']

In [13]:
# part-of-speech tagging 
def pos_tagging(text):
    pos_tag = [pos_tag(word) for word in ready_for_pos]

df['pos_tagged'] = df.processed_text.apply(lambda x: pos_tag(x))

In [14]:
pos_tagged = df['pos_tagged']

In [15]:
# lemmatizing

from nltk.stem import WordNetLemmatizer 
wordnet = WordNetLemmatizer() 

lemmatized = [[wordnet.lemmatize(word[0]) for word in words] for words in pos_tagged]

In [16]:
df['lemmatized'] = lemmatized

In [17]:
df['final_docs'] = df['lemmatized'].apply(lambda x: " ".join(x))

In [18]:
df['final_docs'][30000:30020]

33259    see please retweet share keep prayer follow @melissa update                                                                                                                   
33260    @simplethings_d @chickensith protest defense criminal ashli babbit criminal record storming capitol building involve destruction small business result large scale damage like
33261    name ashli babbitt name ashli babbitt name ashli babbitt say name                                                                                                             
33262    woman shot killed peaceful protest air force veteran ashli babbit                                                                                                             
33263    dc police murdered veteran cold blood today vigil like george floyd get plaza dedicated even name mentioned house floor given moment silence never forget ashli babbit        
33264    @mike_pence oh really kept everyone safe tell ashli bobbitt family sure

In [19]:
final_docs = df['final_docs']

## Part 3: Modeling...

In [20]:
#create document term matrix with TFIDF

from sklearn.feature_extraction.text import TfidfVectorizer
# initial tuning of parameters
#set max_features to 2000 (specifies the number of most frequently occurring words for which we want to create feature vectors)
# set min_df to 5 (word must occur in at least 5 documents)
# set max_df to 0.85 (word must not occur in more than 85 percent of the documents) 

tfidfconverter = TfidfVectorizer(max_features=2000, min_df=5, max_df=0.85, ngram_range=(1, 2), stop_words='english')  
doc_term_matrix_1 = tfidfconverter.fit_transform(df['final_docs'].values.astype('U'))

In [21]:
doc_term_matrix_1.shape

(38884, 2000)

In [22]:
from sklearn.decomposition import NMF

nmf_model = NMF(n_components=10)
nmf_Z = nmf_model.fit_transform(doc_term_matrix_1)



In [23]:
from sklearn.decomposition import TruncatedSVD

lsi_model = TruncatedSVD(n_components=10)
lsi_Z = lsi_model.fit_transform(doc_term_matrix_1)


In [24]:
from sklearn.decomposition import LatentDirichletAllocation
#  LDA model 
lda_model = LatentDirichletAllocation(n_components = 10, max_iter=10, learning_method='online')
lda_Z = lda_model.fit_transform(doc_term_matrix_1)

In [25]:
def print_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
                        for i in topic.argsort()[:-top_n - 1:-1]])
 
print("LDA Model:")
print_topics(lda_model, tfidfconverter )
print("=" * 20)
 
print("NMF Model:")
print_topics(nmf_model, tfidfconverter )
print("=" * 20)
 
print("LSI Model:")
print_topics(lsi_model, tfidfconverter )
print("=" * 20)

LDA Model:
Topic 0:
[('prayer', 682.116563078855), ('follow', 633.5425875154655), ('daughter law', 627.0622566511838), ('share prayer', 626.1560932248085), ('retweet share', 626.1560932247144), ('wonderful', 626.0854285133433), ('know daughter', 625.9949365299208), ('prayer follow', 625.9642820560556), ('woman strong', 625.9089298500515), ('stop miss', 625.8422552225545)]
Topic 1:
[('rip ashli', 652.8575535278704), ('rip', 637.2621657537067), ('today', 402.3536645080285), ('want', 374.579433339226), ('force veteran', 361.42247646859994), ('told', 336.1708021050973), ('want know', 323.71572544367785), ('america', 323.34501687140573), ('killed air', 318.62070546581293), ('congress today', 317.56876939634407)]
Topic 2:
[('kind', 1254.8535018470361), ('law', 654.4445467868667), ('stop', 645.2098255312441), ('smart', 626.8727085507037), ('person', 622.0053448281112), ('know', 587.3684158211906), ('say', 287.7567359544381), ('ashli', 248.52894163498917), ('life', 235.41395177988218), ('matte

In [26]:
topic_values = nmf_model.transform(doc_term_matrix_1)


In [27]:
df['NMF_topic'] = topic_values.argmax(axis=1)

In [28]:
df = df.drop(['nmf_topic'], axis=1)

KeyError: "['nmf_topic'] not found in axis"

In [None]:
df.head()

In [None]:
import pyLDAvis.sklearn

In [None]:

from bokeh.io import push_notebook, show, output_notebook
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, LabelSet
output_notebook()

In [None]:
pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(lda_model, doc_term_matrix_1, tfidfconverter, mds='tsne')
panel

In [None]:
svd = TruncatedSVD(n_components=2)
documents_2d = svd.fit_transform(doc_term_matrix_1)

df_new = pd.DataFrame(columns=['x', 'y', 'document'])
df_new['x'], df_new['y'], df_new['document'] = documents_2d[:,0], documents_2d[:,1], range(len(final_docs))
 
source = ColumnDataSource(ColumnDataSource.from_df(df_new))
labels = LabelSet(x="x", y="y", text="document", y_offset=8,
                  text_font_size="8pt", text_color="#555555",
                  source=source, text_align='center')
 
plot = figure(plot_width=600, plot_height=600)
plot.circle("x", "y", size=12, source=source, line_color="black", fill_alpha=0.8)
plot.add_layout(labels)
show(plot, notebook_handle=True)

In [None]:
svd_2 = TruncatedSVD(n_components=2)
words_2d = svd.fit_transform(doc_term_matrix_1.T)
 
df_new2 = pd.DataFrame(columns=['x', 'y', 'word'])
df_new2['x'], df_new2['y'], df_new2['word'] = words_2d[:,0], words_2d[:,1], tfidfconverter.get_feature_names()
 
source = ColumnDataSource(ColumnDataSource.from_df(df_new2))
labels = LabelSet(x="x", y="y", text="word", y_offset=8,
                  text_font_size="8pt", text_color="#555555",
                  source=source, text_align='center')
 
plot = figure(plot_width=600, plot_height=600)
plot.circle("x", "y", size=12, source=source, line_color="black", fill_alpha=0.8)
plot.add_layout(labels)
show(plot, notebook_handle=True)

In [29]:
# Log Likelyhood: Higher the better
print("Log Likelihood: ", lda_model.score(doc_term_matrix_1))

# Perplexity: Lower the better. Perplexity = exp(-1. * log-likelihood per word)
print("Perplexity: ", lda_model.perplexity(doc_term_matrix_1))

Log Likelihood:  -994994.0551055479
Perplexity:  646.4909768990364


In [31]:
from sklearn.model_selection import GridSearchCV

# Define Search Param
search_params = {'n_components': [5, 8, 10, 12], 'learning_decay': [.5, .7, .9]}

# Init the Model
lda = LatentDirichletAllocation()

# Init Grid Search Class
model = GridSearchCV(lda, param_grid=search_params)

# Do the Grid Search
model.fit(doc_term_matrix_1)

GridSearchCV(estimator=LatentDirichletAllocation(),
             param_grid={'learning_decay': [0.5, 0.7, 0.9],
                         'n_components': [5, 8, 10, 12]})

In [33]:
# Best Model
best_lda_model = model.best_estimator_

# Model Parameters
print("Best Model's Params: ", model.best_params_)

# Log Likelihood Score
print("Best Log Likelihood Score: ", model.best_score_)

# Perplexity
print("Model Perplexity: ", best_lda_model.perplexity(doc_term_matrix_1))

Best Model's Params:  {'learning_decay': 0.5, 'n_components': 12}
Best Log Likelihood Score:  -207088.62880388106
Model Perplexity:  432.01850588083096


In [34]:
# 2nd LDA model
lda_model_2 = LatentDirichletAllocation(n_components = 12, max_iter=10, learning_method='online', learning_decay=0.5)
lda_Z_2 = lda_model_2.fit_transform(doc_term_matrix_1)

In [35]:
import pyLDAvis.sklearn

In [36]:
pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(lda_model_2, doc_term_matrix_1, tfidfconverter, mds='tsne')
panel