# Part 1: Import data, data cleaning

In [2]:
#import initial libraries

import numpy as np
import pandas as pd

In [3]:
#import data (retweets removed in Twarc)

df = pd.read_csv('data/pakistan_NR.csv')

In [4]:
# look at basic info about data

df.info()
# this data set consists of 4817 Tweets

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4817 entries, 0 to 4816
Data columns (total 37 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   id                            4817 non-null   int64  
 1   tweet_url                     4817 non-null   object 
 2   created_at                    4817 non-null   object 
 3   parsed_created_at             4817 non-null   object 
 4   user_screen_name              4817 non-null   object 
 5   text                          4817 non-null   object 
 6   tweet_type                    4817 non-null   object 
 7   coordinates                   0 non-null      float64
 8   hashtags                      1858 non-null   object 
 9   media                         877 non-null    object 
 10  urls                          1377 non-null   object 
 11  favorite_count                4817 non-null   int64  
 12  in_reply_to_screen_name       1492 non-null   object 
 13  in_

In [5]:
# count number of languages in data set

count_lang = df['lang'].unique()
print(len(count_lang), count_lang)

31 ['en' 'ja' 'und' 'de' 'hi' 'ta' 'in' 'mr' 'kn' 'da' 'es' 'gu' 'or' 'tl'
 'et' 'ht' 'pl' 'pt' 'tr' 'nl' 'eu' 'ur' 'it' 'sv' 'fi' 'fr' 'ca' 'no'
 'vi' 'ro' 'pa']


In [6]:
# tweets are in 31 different languages

# I'll be working only with tweets in English
# drop tweets in all other languages
# now working with 4299 Tweets 

df = df[df.lang == 'en']
df.shape

(4299, 37)

In [7]:
# drop unnecessary columns 
df = df.drop(['tweet_url', 'created_at', 'media', 'urls','in_reply_to_screen_name',
       'in_reply_to_status_id', 'in_reply_to_user_id', 'retweet_or_quote_id',
       'retweet_or_quote_screen_name', 'retweet_or_quote_user_id', 'source',
       'user_created_at', 'user_name', 'user_verified', 'user_friends_count', 'user_listed_count',
       'user_statuses_count', 'user_default_profile_image', 'user_description',
       'user_favourites_count', 'user_followers_count', 'coordinates', 'lang'], axis=1)

In [8]:
# check start time & date of data

df.iloc[0]

# first Tweet 2021-01-10 03:32:19

id                                                  1348110393105915911
parsed_created_at                             2021-01-10 03:32:19+00:00
user_screen_name                                          sharpieforall
text                  @JovanHPulitzer election switching USA-&gt;Lea...
tweet_type                                                        reply
hashtags                                                            NaN
favorite_count                                                        0
place                                                               NaN
possibly_sensitive                                                  NaN
retweet_count                                                         0
user_id                                             1324148422321020936
user_location                                                       NaN
user_time_zone                                                      NaN
user_urls                                                       

In [9]:
# check end time & date of data 

df.iloc[-1]

# last Tweet 2021-01-02 13:20:38

id                                                  1345359346360844288
parsed_created_at                             2021-01-02 13:20:38+00:00
user_screen_name                                          abdulkashmiri
text                  @Hussain_NSharif These people R puppies of Gen...
tweet_type                                                        reply
hashtags                                                            NaN
favorite_count                                                        0
place                                                               NaN
possibly_sensitive                                                  NaN
retweet_count                                                         0
user_id                                                       245495770
user_location                                                       USA
user_time_zone                                                      NaN
user_urls                                                       

# Part 2: Text processing for NLP 

In [10]:
# tokenize, remove stopwords, remove urls, lowercase, remove punctuation, remove numbers
# import necessary libraries: ntlk etc.

import string
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer 

stop = stopwords.words('english')

punc = list(set(string.punctuation))

def tokenizer(text):
    tokenizer = TweetTokenizer()
    tokens = tokenizer.tokenize(text)
    return tokens

def remove_url(text):
    url = re.compile(r"https?://\S+|www\.\S+")
    return url.sub(r'', text)

def process_text(text):
    text = remove_url(text)
    text = tokenizer(text)
    text = [word.lower() for word in text]
    text = [re.sub('[0-9]+', '', word) for word in text]
    text = [word for word in text if word not in punc]
    text = [word for word in text if word not in stop]
    text = [each for each in text if len(each) > 1]
    text = [word for word in text if ' ' not in word]
     
    return text

In [11]:
# apply text processing functions to text
df['processed_text'] = df['text'].apply(process_text)

In [12]:
# look at some of processed text
pd.set_option('display.max_colwidth', -1)
df['processed_text'][:20]

  


0     [@jovanhpulitzer, election, switching, usa, ->, leaonardo, vatican, ->, pakistan, ->, china, ->, frankfurt-usa, hours, ago, pakistan, complete, power, outage, total, grid, failure, vatican, complete, power, failure, red]              
1     [pakistan, experiences, massive, blackout, following, breakdown, national, power, grid]                                                                                                                                                   
2     [@teamlarcho, pakistan, national, power, outage, ):]                                                                                                                                                                                      
3     [weird, ..., vatican, pakistan]                                                                                                                                                                                                           
4     [@rmichanczyk, @jovanhpulitzer

In [13]:
# part-of-speech tagging 

ready_for_pos = df['processed_text']

def pos_tagging(text):
    pos_tag = [pos_tag(word) for word in ready_for_pos]

df['pos_tagged'] = df.processed_text.apply(lambda x: pos_tag(x))

In [14]:
# lemmatizing

pos_tagged = df['pos_tagged']

wordnet = WordNetLemmatizer() 

lemmatized = [[wordnet.lemmatize(word[0]) for word in words] for words in pos_tagged]

In [15]:

df['lemmatized'] = lemmatized

In [16]:
# before vectorizing, cast lists of words back into strings

df['final_docs'] = df['lemmatized'].apply(lambda x: " ".join(x))
pd.set_option('display.max_colwidth', -1)
final_docs = df['final_docs']
final_docs[3000:3020]

  after removing the cwd from sys.path.


3352    owned upto mention wmd's iraq bin laden saudi national pakistan taliban still power apparently negotiating endorsing trump slowly surely seems though iraq afghanistan going become kill box                             
3353    please allah swt give u strength courage stand tyranny injustice speak truth power allah swt seek refuge disbelief poverty allah swt bless ummat-e-muslimah allah swt forever bless pakistan                             
3354    @casey still chandragupta mauryan king annexed present day pakistan afghanistan seleucid empire mauryan power unmatched time                                                                                             
3355    @xiaomi_pakistan yes redmi power                                                                                                                                                                                         
3356    #psxupdate dgkc informs completion commissioning mw waste heat recovery power plant site

 # Part 3: run NMF and LDA models, for topic modeling

In [17]:
#create document term matrix with TFIDF

#import vectorizing tool (usee TFIDF)
from sklearn.feature_extraction.text import TfidfVectorizer
# set max_features to 2000 (specifies the number of most frequently occurring words for which we want to create feature vectors)
# set min_df to 5 (word must occur in at least 5 documents)
# set max_df to 0.85 (word must not occur in more than 85 percent of the documents) 

tfidfconverter = TfidfVectorizer(max_features=2000, min_df=5, max_df=0.85, ngram_range=(1, 2), stop_words='english')  
doc_term_matrix_1 = tfidfconverter.fit_transform(df['final_docs'].values.astype('U'))

In [18]:
doc_term_matrix_1.shape

(4299, 2000)

In [19]:
#run NMF model 

#import NMF tool 
from sklearn.decomposition import NMF

nmf_model = NMF(n_components=6, max_iter=400)
nmf = nmf_model.fit_transform(doc_term_matrix_1)

In [20]:
# run LDA model

#import LDA tool 
from sklearn.decomposition import LatentDirichletAllocation

lda_model = LatentDirichletAllocation(n_components = 6, max_iter=10, learning_method='online', learning_decay=.9)
lda = lda_model.fit_transform(doc_term_matrix_1)

In [21]:
def print_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
                        for i in topic.argsort()[:-top_n - 1:-1]])
 
print("LDA Model:")
print_topics(lda_model, tfidfconverter )
print("=" * 10)
 
print("NMF Model:")
print_topics(nmf_model, tfidfconverter )
print("=" * 10)

LDA Model:
Topic 0:
[('nationwide', 21.207327355415565), ('nationwide power', 18.707027808273605), ('blackout', 17.11886335449509), ('power blackout', 12.32922631366153), ('tripping', 12.043813795994685), ('wide', 11.078521673176212), ('phase', 10.502830184563688), ('major nationwide', 10.14218994668589), ('major', 9.067612302501725), ('wide power', 8.656485013566353)]
Topic 1:
[('outage', 34.51177682571763), ('power outage', 33.84078206312509), ('iraq', 33.420329493651096), ('outage pakistan', 30.219399548817506), ('going', 24.678269279732344), ('afghanistan', 20.064412361760468), ('apparently', 18.5028837764831), ('taliban', 18.483656449882407), ('saudi', 18.05995032586004), ('mention', 17.793405973770554)]
Topic 2:
[('super', 21.365784855521234), ('super power', 21.089533972955874), ('loses', 14.60557203409045), ('loses power', 14.605524155025087), ('pakistan loses', 13.998972478697318), ('power massive', 13.683893773213061), ('pakistan super', 12.06242341146422), ('site', 11.384884

# Part 4: Run visualization and testing of LDA model

In [22]:
# visualization of LDA model 
import pyLDAvis.sklearn

pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(lda_model, doc_term_matrix_1, tfidfconverter, mds='tsne')
panel

In [23]:
# test LDA model

# log likelihood (higher score is better)
print("Log likelihood: ", lda_model.score(doc_term_matrix_1))


# perplexity (lower score is better)
print("Perplexity: ", lda_model.perplexity(doc_term_matrix_1))

Log likelihood:  -107361.38821173174
Perplexity:  1987.4068926693155


In [24]:
# cross-validation to find best parameters for LDA model

#import cross-validation tool
from sklearn.model_selection import GridSearchCV

# define search parameters
search_params = {'n_components': [5, 8, 10, 12], 'learning_decay': [.5, .7, .9]}

# initialize model for cross-validation
lda = LatentDirichletAllocation()

# initialize grid search class 
model = GridSearchCV(lda, param_grid=search_params)

# run grid search 
model.fit(doc_term_matrix_1)

GridSearchCV(estimator=LatentDirichletAllocation(),
             param_grid={'learning_decay': [0.5, 0.7, 0.9],
                         'n_components': [5, 8, 10, 12]})

In [25]:
# what is best model? 
best_lda_model = model.best_estimator_

# print parameters for best model
print("Parameters for best model: ", model.best_params_)

# print log likelihood score
print("Log likelihood score of best model: ", model.best_score_)

# print perplexity score
print("Perplexity score of best model: ", best_lda_model.perplexity(doc_term_matrix_1))

Parameters for best model:  {'learning_decay': 0.5, 'n_components': 5}
Log likelihood score of best model:  -28300.971682239877
Perplexity score of best model:  2095.38592666388


In [26]:
# run LDA model with best parameters

#import LDA tool 
from sklearn.decomposition import LatentDirichletAllocation

lda_model_2 = LatentDirichletAllocation(n_components = 5, max_iter=10, learning_method='online', learning_decay=.7)
lda_2 = lda_model_2.fit_transform(doc_term_matrix_1)

In [27]:
# top topics of 2nd LDA model

def print_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
                        for i in topic.argsort()[:-top_n - 1:-1]])
 
print("LDA Model 2:")
print_topics(lda_model_2, tfidfconverter )
print("=" * 20)

LDA Model 2:
Topic 0:
[('india', 67.54758180145308), ('people', 54.54414857709085), ('country', 42.611285886920406), ('world', 36.208673766671026), ('china', 35.50897410394422), ('like', 32.92647799303344), ('army', 29.777839247238465), ('state', 29.606741778673364), ('power pakistan', 28.49743610588298), ('nuclear', 28.082935829522818)]
Topic 1:
[('power breakdown', 64.43839134424574), ('blackout', 60.65054934888083), ('breakdown', 60.05950225554813), ('pakistan power', 49.43320647531248), ('major', 42.08514862801497), ('power blackout', 42.05838654287263), ('breakdown pakistan', 42.05012969439131), ('city', 40.39246082434444), ('country', 38.70955292131782), ('entire', 37.99241677903017)]
Topic 2:
[('hope', 49.40399869678017), ('power supply', 48.82806045831825), ('supply', 48.290107016578624), ('hospital', 45.9493780830317), ('pray', 42.91889783114101), ('ventilator', 41.50456271149436), ('currently', 40.8826666935611), ('hope hospital', 40.50661094920823), ('hospital pakistan', 40.

In [28]:
# make results of NMF model a column in dataframe

topic_values = nmf_model.transform(doc_term_matrix_1)
df['NMF_topic'] = topic_values.argmax(axis=1)

In [29]:
# make results of NMF model a column in dataframe

topic_values = lda_model_2.transform(doc_term_matrix_1)
df['LDA2_topic'] = topic_values.argmax(axis=1)

In [39]:
# visualization of 2nd LDA model 
pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(lda_model_2, doc_term_matrix_1, tfidfconverter, mds='tsne')
panel

pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(lda_model_2, doc_term_matrix_1, tfidfconverter, mds='tsne')
panel

In [77]:
df['text'][df.LDA2_topic==2][:20]

3      Weird...Vatican and Pakistan\nhttps://t.co/E7XqAAdpGE                                                                                                                                                                                                                                          
11     Meanwhile,please pray for all those who are currently on ventilators.I hope every hospital in Pakistan have an adequate power back system for continues power supply during this time #blackoutinPakistan https://t.co/OnM5bPNeXH                                                              
18     #Pakistan &amp; #Vatican Out Of Power? Blackouts? or Reign of Terror is over?\n\n #Blackout #10daysofdarkness ?                                                                                                                                                                                
30     Meanwhile, please pray for all those who are currently on ventilators. I hope every hospital in Pakistan has