# Part 1: Data cleaning

In [1]:
#import initial libraries

import numpy as np
import pandas as pd

In [2]:
#import data (retweets removed in Twarc)

df = pd.read_csv('data/jack_NR.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# look at basic info about data

df.info()
# this data set consists of 32802 Tweets

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32802 entries, 0 to 32801
Data columns (total 37 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   id                            32802 non-null  int64  
 1   tweet_url                     32802 non-null  object 
 2   created_at                    32802 non-null  object 
 3   parsed_created_at             32802 non-null  object 
 4   user_screen_name              32802 non-null  object 
 5   text                          32802 non-null  object 
 6   tweet_type                    32802 non-null  object 
 7   coordinates                   5 non-null      object 
 8   hashtags                      3685 non-null   object 
 9   media                         1982 non-null   object 
 10  urls                          9883 non-null   object 
 11  favorite_count                32802 non-null  int64  
 12  in_reply_to_screen_name       32802 non-null  object 
 13  i

In [4]:
# count number of languages in data set

count_lang = df['lang'].unique()
print(len(count_lang), count_lang)

43 ['und' 'en' 'es' 'bg' 'it' 'in' 'ar' 'pt' 'et' 'tr' 'tl' 'pl' 'sv' 'fa'
 'fr' 'eu' 'no' 'de' 'hi' 'ht' 'fi' 'lv' 'ja' 'ta' 'vi' 'cy' 'nl' 'da'
 'ro' 'sl' 'ml' 'ru' 'ur' 'hu' 'ca' 'iw' 'bn' 'zh' 'ko' 'cs' 'sd' 'lt'
 'ne']


In [5]:
# tweets are in 43 different languages

# I'll be working only with tweets in English
# drop tweets in all other languages
# now working with 64965 Tweets 

df = df[df.lang == 'en']
df.shape

(25925, 37)

In [6]:
# drop unnecessary columns 
df = df.drop(['tweet_url', 'created_at', 'media', 'urls','in_reply_to_screen_name',
       'in_reply_to_status_id', 'in_reply_to_user_id', 'retweet_or_quote_id',
       'retweet_or_quote_screen_name', 'retweet_or_quote_user_id', 'source',
       'user_created_at', 'user_name', 'user_verified', 'user_friends_count', 'user_listed_count',
       'user_statuses_count', 'user_default_profile_image', 'user_description',
       'user_favourites_count', 'user_followers_count', 'coordinates', 'lang'], axis=1)

In [7]:
# check start time & date of data

df.iloc[0]

# first Tweet 2021-01-08 07:05:11

id                                                  1347439190196314114
parsed_created_at                             2021-01-08 07:05:11+00:00
user_screen_name                                             SrslySteve
text                  @jack You have more in common with the Cuecat ...
tweet_type                                                        reply
hashtags                                                            NaN
favorite_count                                                        0
place                                                               NaN
possibly_sensitive                                                  NaN
retweet_count                                                         0
user_id                                                      4707712217
user_location                                         Between the Lines
user_time_zone                                                      NaN
user_urls                                                       

In [8]:
# check end time & date of data 

df.iloc[-1]

# last Tweet  2020-12-31 22:12:51

id                                                  1344768509704810496
parsed_created_at                             2020-12-31 22:12:51+00:00
user_screen_name                                             bobinfanti
text                  @jack \nPresident-Elect Kristie  and 102 other...
tweet_type                                                     original
hashtags                                                            NaN
favorite_count                                                        0
place                                                               NaN
possibly_sensitive                                                  NaN
retweet_count                                                         0
user_id                                                        42769580
user_location                                              USA  / ITALY
user_time_zone                                                      NaN
user_urls                                                       

# Part 2: Text processing for NLP 

In [9]:
# tokenize, remove stopwords, remove urls, lowercase, remove punctuation, remove numbers
# import necessary libraries: ntlk etc.

import string
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer 

stop = stopwords.words('english')

punc = list(set(string.punctuation))

def tokenizer(text):
    tokenizer = TweetTokenizer()
    tokens = tokenizer.tokenize(text)
    return tokens

def remove_url(text):
    url = re.compile(r"https?://\S+|www\.\S+")
    return url.sub(r'', text)

def process_text(text):
    text = remove_url(text)
    text = tokenizer(text)
    text = [word.lower() for word in text]
    text = [re.sub('[0-9]+', '', word) for word in text]
    text = [word for word in text if word not in punc]
    text = [word for word in text if word not in stop]
    text = [each for each in text if len(each) > 1]
    text = [word for word in text if ' ' not in word]
     
    return text

In [10]:
# apply text processing functions to text
df['processed_text'] = df['text'].apply(process_text)

In [11]:
# look at some of processed text
pd.set_option('display.max_colwidth', -1)
df['processed_text'][:20]

  


3     [@jack, common, cuecat, guy, woz, know, hope, sleep, well, mountain, gold]                                                                                                                            
4     [@jack, time, act, responsible]                                                                                                                                                                       
7     [@jack, awesome, great, job, fb, walmart, twitter, target, smaller, always, bad]                                                                                                                      
8     [@jack, twitter, apparently, give, shit, post, porn, sexually, explicit, media, platform, god, forbid, share, unpopular, opinion, election, pathetic]                                                 
11    [@jack, right, thing, trump, clear, present, danger, america, cannot, allowed, incite, widespread, #seditionism, via, social, media, #deletetrumpstwitter, #deletehisaccount, 

In [12]:
# part-of-speech tagging 

ready_for_pos = df['processed_text']

def pos_tagging(text):
    pos_tag = [pos_tag(word) for word in ready_for_pos]

df['pos_tagged'] = df.processed_text.apply(lambda x: pos_tag(x))

In [13]:
# lemmatizing

pos_tagged = df['pos_tagged']

wordnet = WordNetLemmatizer() 

lemmatized = [[wordnet.lemmatize(word[0]) for word in words] for words in pos_tagged]

In [14]:


df['lemmatized'] = lemmatized

In [15]:
# before vectorizing, cast lists of words back into strings

df['final_docs'] = df['lemmatized'].apply(lambda x: " ".join(x))
pd.set_option('display.max_colwidth', -1)
final_docs = df['final_docs']
final_docs[3000:3020]

  after removing the cwd from sys.path.


3824    @jack come                                                                                                                                                                       
3825    @jack come jack right thing please                                                                                                                                               
3826    @jack please nation people great danger would-be dictator tried stage coup yesterday please part keep u safe #removetrumpnow                                                     
3827    @jack block twitter zuck                                                                                                                                                         
3828    @jack @twitter @twittersafety suspend account @realdonaldtrump @donaldjtrumpjr @ivankatrump @erictrump @potus @flotus january #countryovertrump                                  
3830    @jack @twittersupport facebook stepped suspend trump two week 

 # Part 3: run NMF and LDA models, for topic modeling

In [16]:
#create document term matrix with TFIDF

#import vectorizing tool (usee TFIDF)
from sklearn.feature_extraction.text import TfidfVectorizer
# set max_features to 2000 (specifies the number of most frequently occurring words for which we want to create feature vectors)
# set min_df to 5 (word must occur in at least 5 documents)
# set max_df to 0.85 (word must not occur in more than 85 percent of the documents) 

tfidfconverter = TfidfVectorizer(max_features=2000, min_df=5, max_df=0.85, ngram_range=(1, 2), stop_words='english')  
doc_term_matrix_1 = tfidfconverter.fit_transform(df['final_docs'].values.astype('U'))

In [17]:
doc_term_matrix_1.shape

(25925, 2000)

In [18]:
#run NMF model 

#import NMF tool 
from sklearn.decomposition import NMF

nmf_model = NMF(n_components=6, max_iter=400)
nmf = nmf_model.fit_transform(doc_term_matrix_1)

In [19]:
# run LDA model

#import LDA tool 
from sklearn.decomposition import LatentDirichletAllocation

lda_model = LatentDirichletAllocation(n_components = 6, max_iter=10, learning_method='online', learning_decay=.9)
lda = lda_model.fit_transform(doc_term_matrix_1)

In [20]:
def print_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
                        for i in topic.argsort()[:-top_n - 1:-1]])
 
print("LDA Model:")
print_topics(lda_model, tfidfconverter )
print("=" * 10)
 
print("NMF Model:")
print_topics(nmf_model, tfidfconverter )
print("=" * 10)

LDA Model:
Topic 0:
[('delete', 415.3768662907499), ('account', 373.5986854209935), ('ban', 301.24222779798055), ('time', 290.3529316656378), ('jack delete', 263.74682475237586), ('delete account', 253.03268002008082), ('jack time', 244.48991161925292), ('trump', 225.59821848107546), ('realdonaldtrump', 206.69486436767954), ('deactivate', 203.60509729121716)]
Topic 1:
[('suspend', 319.189506137023), ('account', 258.7504896738571), ('trump', 247.64415366689792), ('twitter', 231.1296949029342), ('jack suspend', 221.5304092264962), ('remove', 195.9903021684501), ('violence', 142.8376833681999), ('suspend trump', 138.45943120056697), ('trump account', 125.21794966960168), ('realdonaldtrump', 124.05452125673328)]
Topic 2:
[('verify', 350.8352355528546), ('bitcoin', 334.3093323438503), ('jack jack', 262.0338954083388), ('jack verify', 233.43101826716253), ('thank', 201.24704705741448), ('ericanlewedim', 184.2056320289114), ('suspend account', 151.27411047362298), ('verify ericanlewedim', 151

# Part 4: Run visualization and testing of LDA model

In [21]:
# visualization of LDA model 
import pyLDAvis.sklearn

pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(lda_model, doc_term_matrix_1, tfidfconverter, mds='tsne')
panel

In [22]:
# test LDA model

# log likelihood (higher score is better)
print("Log likelihood: ", lda_model.score(doc_term_matrix_1))


# perplexity (lower score is better)
print("Perplexity: ", lda_model.perplexity(doc_term_matrix_1))

Log likelihood:  -456546.9100393462
Perplexity:  1673.4211522474343


In [23]:
# cross-validation to find best parameters for LDA model

#import cross-validation tool
from sklearn.model_selection import GridSearchCV

# define search parameters
search_params = {'n_components': [5, 8, 10, 12], 'learning_decay': [.5, .7, .9]}

# initialize model for cross-validation
lda = LatentDirichletAllocation()

# initialize grid search class 
model = GridSearchCV(lda, param_grid=search_params)

# run grid search 
model.fit(doc_term_matrix_1)

GridSearchCV(estimator=LatentDirichletAllocation(),
             param_grid={'learning_decay': [0.5, 0.7, 0.9],
                         'n_components': [5, 8, 10, 12]})

In [24]:
# what is best model? 
best_lda_model = model.best_estimator_

# print parameters for best model
print("Parameters for best model: ", model.best_params_)

# print log likelihood score
print("Log likelihood score of best model: ", model.best_score_)

# print perplexity score
print("Perplexity score of best model: ", best_lda_model.perplexity(doc_term_matrix_1))

Parameters for best model:  {'learning_decay': 0.9, 'n_components': 5}
Log likelihood score of best model:  -101517.11492313529
Perplexity score of best model:  1454.57391921786


In [25]:
# run LDA model with best parameters

#import LDA tool 
from sklearn.decomposition import LatentDirichletAllocation

lda_model_2 = LatentDirichletAllocation(n_components = 5, max_iter=10, learning_method='online', learning_decay=.5)
lda_2 = lda_model_2.fit_transform(doc_term_matrix_1)

In [52]:
# top topics of 2nd LDA model

def print_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):
        #print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
                        for i in topic.argsort()[:-top_n - 1:-1]])
 
print("Latent Dirichlet Allocation optimized model learning decay 0.9, number of components 5")
print_topics(lda_model_2, tfidfconverter )


Latent Dirichlet Allocation optimized model learning decay 0.9, number of components 5
[('twitter', 327.35265576724504), ('jack twitter', 287.489294835327), ('account', 256.37230808106847), ('suspend', 218.47068595765597), ('need', 203.8870768673765), ('trump', 164.93087012228347), ('shut', 163.28108202395663), ('right', 149.28769429662367), ('jack need', 147.32683418116244), ('twittersupport', 143.5739860872805)]
[('verify', 780.9799108961281), ('jack jack', 618.747886671667), ('jack verify', 541.4606373819423), ('ericanlewedim', 334.57359622150597), ('verify ericanlewedim', 288.47759438335976), ('jack shut', 272.6735238875097), ('shut', 254.5639772526102), ('btc', 235.4786782340303), ('verified', 226.4888431936203), ('mazinnamdikanu', 168.23524630448733)]
[('follow', 255.4689467984964), ('bitcoin', 253.3606899474227), ('jack bitcoin', 244.47172457591978), ('account', 213.8017084471193), ('pls', 191.68104665557055), ('delete', 165.75396235515373), ('jack follow', 155.5742550084303), (

In [29]:
# visualization of 2nd LDA model 
pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(lda_model_2, doc_term_matrix_1, tfidfconverter, mds='tsne')
panel

pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(lda_model_2, doc_term_matrix_1, tfidfconverter, mds='tsne')
panel

In [30]:
# make results of LDA2 model a column in dataframe

topic_values = lda_model_2.transform(doc_term_matrix_1)
df['LDA2_topic'] = topic_values.argmax(axis=1)

In [119]:
df['text'][df.LDA2_topic==1][3100:3150]

24699    @jack The Genesis Block.                                                                                                                                                                                                                                                                         
24700    @jack #Digibyte\n#Target\n#DGB\n\n"Genesis Block Hash: "USA Today: 10/Jan/2014, Target: Data stolen from up to 110M customers" https://t.co/9ubW5KMFT0                                                                                                                                           
24702    @jack @neo @morpheus where you at? What’s this mean?                                                                                                                                                                                                                                             
24714    @jack mama enti Followers count fluctuations , Accounts ni ethestunnava                       

In [80]:
# make results of NMF model a column in dataframe

topic_values = lda_model.transform(doc_term_matrix_1)
df['LDA1_topic'] = topic_values.argmax(axis=1)

In [87]:
df['text'][df.LDA1_topic==1][300:330]

2310    @jack Suspend Trump’s account indefinitely. \n\n#banTrumpforlife #BanTrumpsTwitter                                                                                                                                                                            
2316    @jack, where are your courage, morals, and patriotism? We’re waiting to see them. https://t.co/IT12qstvEc                                                                                                                                                     
2322    @jack Out of an abundance of caution Twitter  should lock @realDonaldTrump  account until January 20th. #Stopthetweet                                                                                                                                         
2337    @jack permanent ban for DJT, his family, and all their staff—he’s been posting on one of his staff’s accounts                                                                                              

In [77]:
# make results of NMF model a column in dataframe

topic_values = nmf_model.transform(doc_term_matrix_1)
df['NMF_topic'] = topic_values.argmax(axis=1)

In [86]:
df['text'][df.NMF_topic==3][300:330]

6383    @jack she on yo ass lil boy! https://t.co/dSQFo6fF6Z                                                                                                                                                                                                                  
6386    @jack Thank you. Thank you.                                                                                                                                                                                                                                           
6392    @jack Thanks Jack, you are super human.  Long live:)                                                                                                                                                                                                                  
6399    @jack Can you please make this guys famous! @DCPoliceDept https://t.co/bQf8JXuJiQ                                                                                                  