# Part 1: Data cleaning 

## Part 1: Data import and cleaning

In [2]:
#import initial libraries

import pandas as pd
import numpy as np

In [3]:
#import data 

df = pd.read_csv("data/ashli.csv")

In [4]:
# look at basic info about data

df.info()
# this data set consists of 42996 Tweets
# Twarc filter tracking keyword "ashli" started within half-hour after name of "ashli babbitt" was published
# (Ashley Babbit is the protester who was killed in the capitol during the riots on Jan. 6)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42996 entries, 0 to 42995
Data columns (total 37 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   id                            42996 non-null  int64  
 1   tweet_url                     42996 non-null  object 
 2   created_at                    42996 non-null  object 
 3   parsed_created_at             42996 non-null  object 
 4   user_screen_name              42996 non-null  object 
 5   text                          42996 non-null  object 
 6   tweet_type                    42996 non-null  object 
 7   coordinates                   0 non-null      float64
 8   hashtags                      1926 non-null   object 
 9   media                         2423 non-null   object 
 10  urls                          6191 non-null   object 
 11  favorite_count                42996 non-null  int64  
 12  in_reply_to_screen_name       3216 non-null   object 
 13  i

In [5]:
# count number of languages in data set

count_lang = df['lang'].unique()
print(len(count_lang), count_lang)

35 ['en' 'th' 'tl' 'und' 'pt' 'es' 'lt' 'it' 'fr' 'ja' 'de' 'ht' 'zh' 'in'
 'el' 'ca' 'is' 'pl' 'nl' 'ko' 'fa' 'hu' 'ar' 'cy' 'cs' 'et' 'no' 'tr'
 'lv' 'sv' 'ro' 'sl' 'eu' 'fi' 'da']


In [7]:
# tweets are in 35 different languages

# I'll be working only with Tweets in English
# drop tweets in all other languages
# now working with 38884 Tweets 

df = df[df.lang == 'en']
df.shape

(38884, 37)

In [8]:
# drop unnecessary columns 

df = df.drop(['tweet_url', 'created_at', 'media', 'urls','in_reply_to_screen_name',
       'in_reply_to_status_id', 'in_reply_to_user_id', 'retweet_or_quote_id',
       'retweet_or_quote_screen_name', 'retweet_or_quote_user_id', 'source',
       'user_created_at', 'user_name', 'user_verified', 'user_friends_count', 'user_listed_count',
       'user_statuses_count', 'user_default_profile_image', 'user_description',
       'user_favourites_count', 'user_followers_count', 'coordinates', 'lang', 'user_location', 'user_time_zone', 'user_urls', 'place'], axis=1)

In [10]:
# check start time & date of data

df.iloc[0]

# first Tweet downloaded Jan 7, 2021 at 03:27:44

id                           1347022076096307201
parsed_created_at      2021-01-07 03:27:44+00:00
user_screen_name                 unabashedlycri1
text                  Her name was Ashli Babbit.
tweet_type                               retweet
hashtags                                     NaN
favorite_count                              5602
possibly_sensitive                           NaN
retweet_count                                  0
user_id                      1279137538104426496
Name: 0, dtype: object

In [12]:
# check end time & date of data 

df.iloc[-1]

# last Tweet on Jan 7, 2021 at 06:15:58

id                                                  1347064414789271558
parsed_created_at                             2021-01-07 06:15:58+00:00
user_screen_name                                            Ben18629860
text                  Not one of these worthless fuckers have given ...
tweet_type                                                      retweet
hashtags                                                            NaN
favorite_count                                                     1986
possibly_sensitive                                                  NaN
retweet_count                                                         0
user_id                                             1193375422752256002
Name: 42995, dtype: object

# Part 2: Text processing for NLP 

In [13]:
# create variable for "text" column 
text = df['text'] 

In [14]:
# tokenize, remove stopwords, remove urls, lowercase, remove punctuation, remove numbers
# import necessary libraries: ntlk etc.

import string
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer 


stop = stopwords.words('english')

punc = list(set(string.punctuation))

def tokenizer(text):
    tokenizer = TweetTokenizer()
    tokens = tokenizer.tokenize(text)
    return tokens

def remove_url(text):
    url = re.compile(r"https?://\S+|www\.\S+")
    return url.sub(r'', text)

def process_text(text):
    text = remove_url(text)
    text = tokenizer(text)
    text = [word.lower() for word in text]
    text = [re.sub('[0-9]+', '', word) for word in text]
    text = [word for word in text if word not in punc]
    text = [word for word in text if word not in stop]
    text = [each for each in text if len(each) > 1]
    text = [word for word in text if ' ' not in word]
     
    return text

In [15]:
# apply text processing functions to text

df['processed_text'] = df['text'].apply(process_text)

In [16]:
# look at some of processed text

pd.set_option('display.max_colwidth', -1)
df['processed_text'][:20]

  This is separate from the ipykernel package so we can avoid doing imports until


0     [name, ashli, babbit]                                                                                                                                                                                                             
1     [police, officer, shot, killed, ashli, babbitt]                                                                                                                                                                                   
2     [name, ashli, babbit, year, veteran, served, four, tours, us, air, force, high, level, security, official, throughout, time, service]                                                                                             
3     [@the_real_fly, believe, young, lady, ashli, babbit]                                                                                                                                                                              
4     [living, history, right, beginning, american, revolution]     

In [26]:
# part-of-speech tagging 

ready_for_pos = df['processed_text']

def pos_tagging(text):
    pos_tag = [pos_tag(word) for word in ready_for_pos]

df['pos_tagged'] = df.processed_text.apply(lambda x: pos_tag(x))

In [27]:
# lemmatizing

pos_tagged = df['pos_tagged']

wordnet = WordNetLemmatizer() 

lemmatized = [[wordnet.lemmatize(word[0]) for word in words] for words in pos_tagged]

In [23]:
pos_tagged = df['pos_tagged']

In [30]:
# lemmatizing

from nltk.stem import WordNetLemmatizer 
wordnet = WordNetLemmatizer() 

lemmatized = [[wordnet.lemmatize(word[0]) for word in words] for words in pos_tagged]

In [31]:
# look at lemmatized text

df['lemmatized'] = lemmatized
lemmatized[:20]

[['name', 'ashli', 'babbit'],
 ['police', 'officer', 'shot', 'killed', 'ashli', 'babbitt'],
 ['name',
  'ashli',
  'babbit',
  'year',
  'veteran',
  'served',
  'four',
  'tour',
  'u',
  'air',
  'force',
  'high',
  'level',
  'security',
  'official',
  'throughout',
  'time',
  'service'],
 ['@the_real_fly', 'believe', 'young', 'lady', 'ashli', 'babbit'],
 ['living', 'history', 'right', 'beginning', 'american', 'revolution'],
 ['police', 'officer', 'shot', 'killed', 'ashli', 'babbitt'],
 ['garrison',
  'critical',
  'thread',
  'insurgent',
  'killed',
  'today',
  'storming',
  'capitol'],
 ['ashli',
  'babbit',
  'rednèck',
  'as',
  'family',
  'friend',
  'online',
  'trying',
  'paint',
  'like',
  'oluwatoyin',
  'salau',
  'fuck',
  'outta'],
 ['ashli', 'babbit', 'going', 'remembered', 'terrorist', 'martyr'],
 ['ashli',
  'babbit',
  'stormed',
  'capitol',
  'tried',
  'breach',
  'window',
  'speaker',
  'lobby',
  'killed',
  'peacefully',
  'protest',
  'outside',
  'in

In [32]:
# before vectorizing, cast lists of words back into strings

df['final_docs'] = df['lemmatized'].apply(lambda x: " ".join(x))
pd.set_option('display.max_colwidth', -1)
final_docs = df['final_docs']
final_docs[3000:3020]

  after removing the cwd from sys.path.


3280    police officer shot killed ashli babbitt                                                                                                                                                                    
3281    woman murdered federal government cold blood ashli babbit antifa stop lying                                                                                                                                 
3282    understood senator appreciate dedication service country however feel compelled point unarmed protester shot killed today year veteran air force completed tour duty name ashli babbit                      
3284    dc police murdered veteran cold blood today vigil like george floyd get plaza dedicated even name mentioned house floor given moment silence never forget ashli babbit                                      
3285    @daysnights know waiting looked ashli bobbit's profile retweeted lin non-stop know shot yet                                                 

In [33]:
#create document term matrix with TFIDF

#import vectorizing tool (usee TFIDF)
from sklearn.feature_extraction.text import TfidfVectorizer
# set max_features to 2000 (specifies the number of most frequently occurring words for which we want to create feature vectors)
# set min_df to 5 (word must occur in at least 5 documents)
# set max_df to 0.85 (word must not occur in more than 85 percent of the documents) 

tfidfconverter = TfidfVectorizer(max_features=2000, min_df=5, max_df=0.85, ngram_range=(1, 2), stop_words='english')  
doc_term_matrix_1 = tfidfconverter.fit_transform(df['final_docs'].values.astype('U'))

In [34]:
#run NMF model 

#import NMF tool 
from sklearn.decomposition import NMF

nmf_model = NMF(n_components=6)
nmf_Z = nmf_model.fit_transform(doc_term_matrix_1)


In [35]:
# run LDA model

#import LDA tool 
from sklearn.decomposition import LatentDirichletAllocation

lda_model = LatentDirichletAllocation(n_components = 6, max_iter=10, learning_method='online', learning_decay=.9)
lda_Z = lda_model.fit_transform(doc_term_matrix_1)

In [36]:
def print_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
                        for i in topic.argsort()[:-top_n - 1:-1]])
 
print("LDA Model:")
print_topics(lda_model, tfidfconverter )
print("=" * 20)
 
print("NMF Model:")
print_topics(nmf_model, tfidfconverter )
print("=" * 20)

LDA Model:
Topic 0:
[('rip ashli', 650.242695136725), ('rip', 643.4811119216748), ('babbit', 587.6304078436988), ('ashli babbit', 583.3517211686361), ('ashli', 520.570899720083), ('country', 453.76511418423127), ('served', 399.18885495960217), ('year', 322.3316977161545), ('died', 307.7789794858823), ('right', 268.7014363009379)]
Topic 1:
[('babbitt', 873.318541529854), ('ashli babbitt', 868.8433192963549), ('officer', 658.3679489873323), ('killed ashli', 653.4270002806168), ('police officer', 633.4332985341136), ('officer shot', 604.2520561818841), ('shot', 582.7710237403085), ('killed', 575.7002517079698), ('ashli', 545.9840974105908), ('police', 525.7396173628672)]
Topic 2:
[('floyd', 392.41609082999145), ('george', 391.98289819310804), ('george floyd', 388.0911842705059), ('cold', 384.9161335812381), ('cold blood', 382.4214392822945), ('house', 370.08749652434705), ('like', 366.76503396305264), ('moment', 362.8946004644259), ('forget', 360.4158092291095), ('blood', 358.808323799408

# Part 4: Run visualization and testing of LDA model

In [37]:
# visualization of LDA model 
import pyLDAvis.sklearn

pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(lda_model, doc_term_matrix_1, tfidfconverter, mds='tsne')
panel

In [38]:
# test LDA model

# log likelihood (higher score is better)
print("Log likelihood: ", lda_model.score(doc_term_matrix_1))


# perplexity (lower score is better)
print("Perplexity: ", lda_model.perplexity(doc_term_matrix_1))

Log likelihood:  -967384.3109877736
Perplexity:  540.2230953463446


In [None]:
# cross-validation to find best parameters for LDA model

#import cross-validation tool
from sklearn.model_selection import GridSearchCV

# define search parameters
search_params = {'n_components': [5, 8, 10, 12], 'learning_decay': [.5, .7, .9]}

# initialize model for cross-validation
lda = LatentDirichletAllocation()

# initialize grid search class 
model = GridSearchCV(lda, param_grid=search_params)

# run grid search 
model.fit(doc_term_matrix_1)

In [None]:
# what is best model? 
best_lda_model = model.best_estimator_

# print parameters for best model
print("Parameters for best model: ", model.best_params_)

# print log likelihood score
print("Log likelihood score of best model: ", model.best_score_)

# print perplexity score
print("Perplexity score of best model: ", best_lda_model.perplexity(doc_term_matrix_1))

In [35]:
import pyLDAvis.sklearn

In [36]:
pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(lda_model_2, doc_term_matrix_1, tfidfconverter, mds='tsne')
panel