# Part 1: Data cleaning

In [1]:
#import initial libraries

import numpy as np
import pandas as pd

In [2]:
# load the tweet data set from CSV

df = pd.read_csv('data/coup.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# look at basic info about data

df.info()
# this data set consists of 413,600 Tweets

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 413600 entries, 0 to 413599
Data columns (total 37 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   id                            413600 non-null  int64  
 1   tweet_url                     413600 non-null  object 
 2   created_at                    413600 non-null  object 
 3   parsed_created_at             413600 non-null  object 
 4   user_screen_name              413600 non-null  object 
 5   text                          413600 non-null  object 
 6   tweet_type                    413600 non-null  object 
 7   coordinates                   7 non-null       object 
 8   hashtags                      13057 non-null   object 
 9   media                         11642 non-null   object 
 10  urls                          52890 non-null   object 
 11  favorite_count                413600 non-null  int64  
 12  in_reply_to_screen_name       31057 non-null

In [4]:
# collect the distinct language codes and report how many there are

count_lang = df.lang.unique()
print(count_lang.size, count_lang)

44 ['en' 'und' 'fr' 'de' 'nl' 'in' 'th' 'ja' 'pt' 'es' 'et' 'ca' 'fa' 'it'
 'tr' 'pl' 'ht' 'cy' 'ko' 'ro' 'tl' 'eu' 'zh' 'ar' 'no' 'ru' 'fi' 'pa'
 'el' 'vi' 'sv' 'iw' 'sr' 'ur' 'hu' 'lt' 'cs' 'lv' 'da' 'hi' 'is' 'sl'
 'ta' 'dv']


In [5]:
# the tweets span 44 different languages; only the English ones are kept,
# which leaves 396,586 tweets to work with

df = df.loc[df['lang'] == 'en']
df.shape

(396586, 37)

In [6]:
# drop the columns that are not needed for the analysis
df = df.drop(columns=[
    'tweet_url', 'created_at', 'media', 'urls',
    'in_reply_to_screen_name', 'in_reply_to_status_id', 'in_reply_to_user_id',
    'retweet_or_quote_id', 'retweet_or_quote_screen_name', 'retweet_or_quote_user_id',
    'source',
    'user_created_at', 'user_name', 'user_verified', 'user_friends_count',
    'user_listed_count', 'user_statuses_count', 'user_default_profile_image',
    'user_description', 'user_favourites_count', 'user_followers_count',
    'coordinates', 'lang',
])

In [7]:
# check start time & date of data

df.iloc[0]

# first Tweet downloaded Jan 7, 2021 at 00:17:42

id                                                  1346974253178970113
parsed_created_at                             2021-01-07 00:17:42+00:00
user_screen_name                                               faby1717
text                  Assault on democracy: Sen. Josh Hawley has blo...
tweet_type                                                      retweet
hashtags                                                            NaN
favorite_count                                                     7579
place                                                               NaN
possibly_sensitive                                                False
retweet_count                                                         0
user_id                                                       115280140
user_location                                                 CA Desert
user_time_zone                                                      NaN
user_urls                                                       

In [8]:
# check end time & date of data 

df.iloc[-1]

# last Tweet on Jan 7, 2021 at 03:14:11

id                                                  1347018669625286656
parsed_created_at                             2021-01-07 03:14:11+00:00
user_screen_name                                                kt_dinh
text                  Six Senators voted to sustain the objection to...
tweet_type                                                      retweet
hashtags                                                            NaN
favorite_count                                                        6
place                                                               NaN
possibly_sensitive                                                  NaN
retweet_count                                                         0
user_id                                             1249340729689686016
user_location                                                       NaN
user_time_zone                                                      NaN
user_urls                                                       

# Part 2: Text processing for NLP 

In [9]:
# tokenize, remove stopwords, remove urls, lowercase, remove punctuation, remove numbers
# import necessary libraries: nltk etc.

import string
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk import pos_tag

# English stopwords and single punctuation characters to filter out.
# Both are only used for membership tests, so keep them as sets: a lookup is
# then constant-time instead of a linear scan repeated for every token.
stop = set(stopwords.words('english'))

punc = set(string.punctuation)

def tokenizer(text):
    """Split `text` into a list of tokens using NLTK's TweetTokenizer."""
    return TweetTokenizer().tokenize(text)

def remove_url(text):
    """Return `text` with every `http://`, `https://` or `www.` link removed.

    A link runs from its prefix up to the next whitespace character; the
    whitespace around it is left untouched.
    """
    return re.sub(r"https?://\S+|www\.\S+", r'', text)

def process_text(text):
    """Turn one raw tweet into a list of cleaned tokens.

    Steps: strip URLs, tokenize, lowercase, delete digits, then drop tokens
    that are punctuation characters, stopwords, shorter than two characters,
    or that contain a space.
    """
    cleaned = []
    for token in tokenizer(remove_url(text)):
        # lowercase first, then remove every run of digits from the token
        token = re.sub('[0-9]+', '', token.lower())
        if token in punc or token in stop:
            continue
        # digit removal can leave empty or one-character leftovers
        if len(token) <= 1 or ' ' in token:
            continue
        cleaned.append(token)

    return cleaned

In [10]:
# apply text processing functions to text
df['processed_text'] = df['text'].apply(process_text)

In [11]:
# look at some of processed text
# None switches column truncation off; the old -1 sentinel has been deprecated
# since pandas 1.0 and is rejected by newer releases
pd.set_option('display.max_colwidth', None)
df['processed_text'].head(20)

  


0     [assault, democracy, sen, josh, hawley, blood, hands, capitol, coup, attempt]                                                                                                                                              
1     [call, old-fashioned, armed, insurgents, breach, capitol, building, request, president, i'd, call, attempted, coup]                                                                                                        
2     [@vritrite, @dreddersart, oh, yeah, like, politics, bad, im, like, ignore, coup, attempt, happening]                                                                                                                       
3     [ridiculously, divisive, statement, btw, gun, violence, continues, baltimore, oh, still, picked, trash]                                                                                                                    
4     [assault, democracy, sen, josh, hawley, blood, hands, capitol, coup, attempt]             

In [None]:
# part-of-speech tagging

ready_for_pos = df['processed_text']

def pos_tagging(text):
    """Tag the tokens of one tweet with their part of speech.

    Parameters
    ----------
    text : list of str
        Tokens of a single tweet (one entry of the `processed_text` column).

    Returns
    -------
    list of (str, str)
        The `(token, tag)` pairs produced by `nltk.pos_tag`.
    """
    # Return the tagger's result directly: binding it to a local called
    # `pos_tag` would shadow the imported function and raise UnboundLocalError.
    return pos_tag(text)

df['pos_tagged'] = ready_for_pos.apply(pos_tagging)

In [None]:
# lemmatizing

# imported in this cell so that it runs on a fresh kernel; without it the
# WordNetLemmatizer name is not defined at this point of the notebook
from nltk.stem import WordNetLemmatizer

pos_tagged = df['pos_tagged']

wordnet = WordNetLemmatizer() 

# every entry of pos_tagged is a list of (token, tag) pairs; word[0] is the token
lemmatized = [[wordnet.lemmatize(word[0]) for word in words] for words in pos_tagged]

In [None]:
pos_tagged = df['pos_tagged']

In [None]:
# lemmatizing

from nltk.stem import WordNetLemmatizer 
wordnet = WordNetLemmatizer() 

# WordNetLemmatizer.lemmatize treats a word as a noun unless a part of speech
# is passed, which would leave verbs and adjectives mostly unchanged. Map the
# two-letter prefix of each tag from pos_tag onto WordNet's codes so the tags
# computed earlier are actually used; unmapped tags keep the noun default.
wordnet_pos_by_tag = {'JJ': 'a', 'VB': 'v', 'NN': 'n', 'RB': 'r'}

lemmatized = [
    [wordnet.lemmatize(word, wordnet_pos_by_tag.get(tag[:2], 'n')) for word, tag in words]
    for words in pos_tagged
]

In [None]:
# look at lemmatized text

df['lemmatized'] = lemmatized
lemmatized[:20]

In [None]:
# before vectorizing, cast lists of words back into strings

df['final_docs'] = df['lemmatized'].apply(" ".join)
# None switches column truncation off; the old -1 sentinel has been deprecated
# since pandas 1.0 and is rejected by newer releases
pd.set_option('display.max_colwidth', None)
final_docs = df['final_docs']
# positional slice: rows 3000-3019 of the filtered frame
final_docs.iloc[3000:3020]

 # Part 3: run NMF and LDA models, for topic modeling

In [16]:
# create the document-term matrix with TF-IDF weighting

# import the vectorizing tool (TF-IDF)
from sklearn.feature_extraction.text import TfidfVectorizer

# max_features=2000   keep only the 2000 most frequent terms as features
# min_df=5            a term must occur in at least 5 documents
# max_df=0.85         a term must not occur in more than 85 percent of the documents
# ngram_range=(1, 2)  use both single words and two-word sequences as terms
tfidfconverter = TfidfVectorizer(
    max_features=2000,
    min_df=5,
    max_df=0.85,
    ngram_range=(1, 2),
    stop_words='english',
)
doc_term_matrix_1 = tfidfconverter.fit_transform(df['final_docs'].values.astype('U'))

KeyError: 'final_docs'

In [None]:
doc_term_matrix_1.shape

In [None]:
#run NMF model 

#import NMF tool 
from sklearn.decomposition import NMF

# fixed seed so that re-running the notebook yields the same topics
nmf_model = NMF(n_components=6, random_state=42)
nmf_Z = nmf_model.fit_transform(doc_term_matrix_1)

In [15]:
# run LDA model

#import LDA tool 
from sklearn.decomposition import LatentDirichletAllocation

# online variational Bayes is stochastic; the fixed seed makes the fitted
# topics reproducible when the notebook is re-run
lda_model = LatentDirichletAllocation(n_components = 6, max_iter=10, learning_method='online',
                                      learning_decay=.9, random_state=42)
lda_Z = lda_model.fit_transform(doc_term_matrix_1)

NameError: name 'doc_term_matrix_1' is not defined

In [14]:
def print_topics(model, vectorizer, top_n=10):
    """Print the `top_n` highest-weighted terms of every topic in `model`.

    Parameters
    ----------
    model : fitted topic model
        Must expose `components_`, one weight vector per topic.
    vectorizer : fitted vectorizer
        Supplies the vocabulary through `get_feature_names_out()` (or the
        older `get_feature_names()` when the newer method is missing).
    top_n : int, default 10
        Number of terms printed per topic.

    For each topic a "Topic <idx>:" line is printed, followed by a list of
    `(term, weight)` tuples ordered from highest to lowest weight.
    """
    # Fetch the vocabulary once rather than once per printed term, and prefer
    # the current scikit-learn accessor over the one removed in 1.2.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()
    feature_names = [str(name) for name in feature_names]

    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        # argsort is ascending, so walk it backwards to get the largest weights first
        top_indices = topic.argsort()[:-top_n - 1:-1]
        print([(feature_names[i], topic[i]) for i in top_indices])
 
print("LDA Model:")
print_topics(lda_model, tfidfconverter )
print("=" * 20)
 
print("NMF Model:")
print_topics(nmf_model, tfidfconverter )
print("=" * 20)

LDA Model:


NameError: name 'lda_model' is not defined

# Part 4: Run visualization and testing of LDA model

In [None]:
import pyLDAvis.sklearn

In [None]:
# visualization of LDA model 
# NOTE(review): newer pyLDAvis releases appear to ship this module as
# `pyLDAvis.lda_model` instead of `pyLDAvis.sklearn` - confirm against the installed version
import pyLDAvis.sklearn

# switch pyLDAvis to inline notebook display
pyLDAvis.enable_notebook()
# build the panel from the fitted LDA model, the TF-IDF matrix it was fitted on
# and the vectorizer that produced that matrix
panel = pyLDAvis.sklearn.prepare(lda_model, doc_term_matrix_1, tfidfconverter, mds='tsne')
panel

In [None]:
# test LDA model

# log likelihood (higher score is better)
print("Log Likelihood: ", lda_model.score(doc_term_matrix_1))

# perplexity (lower score is better)
print("Perplexity: ", lda_model.perplexity(doc_term_matrix_1))