# Part 1: Data cleaning 

In [1]:
#import initial libraries

import pandas as pd
import numpy as np


In [3]:
#import data 

df = pd.read_csv('data/pakistan.csv')

In [4]:
# look at basic info about data

df.info()
# this data set consists of 20286 Tweets
# Twarc search for keywords "Pakistan" AND "power" after major nationwide power outage was reported 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20286 entries, 0 to 20285
Data columns (total 37 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   id                            20286 non-null  int64  
 1   tweet_url                     20286 non-null  object 
 2   created_at                    20286 non-null  object 
 3   parsed_created_at             20286 non-null  object 
 4   user_screen_name              20286 non-null  object 
 5   text                          20286 non-null  object 
 6   tweet_type                    20286 non-null  object 
 7   coordinates                   0 non-null      float64
 8   hashtags                      8270 non-null   object 
 9   media                         1989 non-null   object 
 10  urls                          3189 non-null   object 
 11  favorite_count                20286 non-null  int64  
 12  in_reply_to_screen_name       1492 non-null   object 
 13  i

In [5]:
# count number of languages in data set

count_lang = df['lang'].unique()
print(len(count_lang), count_lang)

31 ['en' 'ja' 'und' 'hi' 'de' 'ta' 'in' 'mr' 'gu' 'kn' 'da' 'es' 'ur' 'nl'
 'or' 'tl' 'et' 'ht' 'pt' 'pl' 'tr' 'vi' 'eu' 'it' 'sv' 'no' 'fi' 'fr'
 'ca' 'ro' 'pa']


In [7]:
# tweets are in 31 different languages

# I'll be working only with Tweets in English
# drop tweets in all other languages
# now working with 19459 Tweets 

df = df[df.lang == 'en']
df.shape

(19459, 37)

In [9]:
# check start time & date of data

df.iloc[0]

# first Tweet Sun Jan 10 03:32:31

id                                                            1348110445333209091
tweet_url                       https://twitter.com/alienguru02/status/1348110...
created_at                                         Sun Jan 10 03:32:31 +0000 2021
parsed_created_at                                       2021-01-10 03:32:31+00:00
user_screen_name                                                      alienguru02
text                            Power blackout reported in multiple cities of ...
tweet_type                                                                retweet
coordinates                                                                   NaN
hashtags                                                                      NaN
media                                                                         NaN
urls                                                                          NaN
favorite_count                                                               5187
in_reply_to_scre

In [10]:
# check end time & date of data 

df.iloc[-1]

# Sat Jan 02 12:56:37

id                                                            1345353303115395076
tweet_url                       https://twitter.com/byzaa_g/status/13453533031...
created_at                                         Sat Jan 02 12:56:37 +0000 2021
parsed_created_at                                       2021-01-02 12:56:37+00:00
user_screen_name                                                          byzaa_g
text                            Bilateral trade between Pakistan&amp;Sri Lanka...
tweet_type                                                                retweet
coordinates                                                                   NaN
hashtags                        SriLanka Pakistan PakArmy Khilafah StopMuslimC...
media                                                                         NaN
urls                                                                          NaN
favorite_count                                                                 16
in_reply_to_scre

# Part 2: Text processing for NLP 

In [11]:
# create variable for "text" column 

text = df['text'] 

In [12]:
# tokenize, remove stopwords, remove urls, lowercase, remove punctuation, remove numbers
# import necessary libraries: ntlk etc.

import string
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer 

stop = stopwords.words('english')

punc = list(set(string.punctuation))

def tokenizer(text):
    tokenizer = TweetTokenizer()
    tokens = tokenizer.tokenize(text)
    return tokens

def remove_url(text):
    url = re.compile(r"https?://\S+|www\.\S+")
    return url.sub(r'', text)

def process_text(text):
    text = remove_url(text)
    text = tokenizer(text)
    text = [word.lower() for word in text]
    text = [re.sub('[0-9]+', '', word) for word in text]
    text = [word for word in text if word not in punc]
    text = [word for word in text if word not in stop]
    text = [each for each in text if len(each) > 1]
    text = [word for word in text if ' ' not in word]
     
    return text

In [13]:
# apply text processing functions to text

df['processed_text'] = df['text'].apply(process_text)

In [14]:
# look at some of processed text

pd.set_option('display.max_colwidth', -1)
df['processed_text'][:20]

  This is separate from the ipykernel package so we can avoid doing imports until


0     [power, blackout, reported, multiple, cities, pakistan, pakistan, media]                                                                                                                                                                            
1     [true, sad, state, affairs, #pakistan, nationwide, power, cut, pak, fears, either, coup, air, strike, claim, killed, crows, damaged, trees, heart, hearts, know, india, sent, strong, message, balakot, airstrikes]                                 
2     [power, blackout, reported, multiple, cities, pakistan, pakistan, media]                                                                                                                                                                            
3     [major, power, grid, failure, #pakistan, karachi, islamabad, lahore, total, blackout]                                                                                                                                                            

In [15]:
# part-of-speech tagging 

ready_for_pos = df['processed_text']

def pos_tagging(text):
    pos_tag = [pos_tag(word) for word in ready_for_pos]

df['pos_tagged'] = df.processed_text.apply(lambda x: pos_tag(x))

In [16]:
# lemmatizing

pos_tagged = df['pos_tagged']

wordnet = WordNetLemmatizer() 

lemmatized = [[wordnet.lemmatize(word[0]) for word in words] for words in pos_tagged]

In [17]:
# look at lemmatized text

df['lemmatized'] = lemmatized
lemmatized[:20]

[['power',
  'blackout',
  'reported',
  'multiple',
  'city',
  'pakistan',
  'pakistan',
  'medium'],
 ['true',
  'sad',
  'state',
  'affair',
  '#pakistan',
  'nationwide',
  'power',
  'cut',
  'pak',
  'fear',
  'either',
  'coup',
  'air',
  'strike',
  'claim',
  'killed',
  'crow',
  'damaged',
  'tree',
  'heart',
  'heart',
  'know',
  'india',
  'sent',
  'strong',
  'message',
  'balakot',
  'airstrikes'],
 ['power',
  'blackout',
  'reported',
  'multiple',
  'city',
  'pakistan',
  'pakistan',
  'medium'],
 ['major',
  'power',
  'grid',
  'failure',
  '#pakistan',
  'karachi',
  'islamabad',
  'lahore',
  'total',
  'blackout'],
 ['@jovanhpulitzer',
  'election',
  'switching',
  'usa',
  '->',
  'leaonardo',
  'vatican',
  '->',
  'pakistan',
  '->',
  'china',
  '->',
  'frankfurt-usa',
  'hour',
  'ago',
  'pakistan',
  'complete',
  'power',
  'outage',
  'total',
  'grid',
  'failure',
  'vatican',
  'complete',
  'power',
  'failure',
  'red'],
 ['pakistan',
  'ex

In [18]:
# before vectorizing, cast lists of words back into strings

df['final_docs'] = df['lemmatized'].apply(lambda x: " ".join(x))
pd.set_option('display.max_colwidth', -1)
final_docs = df['final_docs']
final_docs[3000:3020]

  after removing the cwd from sys.path.


3211    #update massive power blackout plunged entire pakistan darkness report dawn news                                                                               
3212    large area pakistan plunged darkness first time happened country's power infrastructure fragile past separatist attack blamed outage appear case time stay safe
3213    pakistan blackout leaf million without power ... nuclear power                                                                                                 
3214    power shut major city pakistan mean whole pakistan power moment happened rarely anyone update reason                                                           
3215    pakistan experience massive blackout following breakdown national power grid                                                                                   
3216    massive power outage plunge pakistan darkness                                                                                                           

In [19]:
#create document term matrix with TFIDF

#import vectorizing tool (usee TFIDF)
from sklearn.feature_extraction.text import TfidfVectorizer
# set max_features to 2000 (specifies the number of most frequently occurring words for which we want to create feature vectors)
# set min_df to 5 (word must occur in at least 5 documents)
# set max_df to 0.85 (word must not occur in more than 85 percent of the documents) 

tfidfconverter = TfidfVectorizer(max_features=2000, min_df=5, max_df=0.85, ngram_range=(1, 2), stop_words='english')  
doc_term_matrix_1 = tfidfconverter.fit_transform(df['final_docs'].values.astype('U'))

# Part 3: run NMF and LDA models, for topic modeling

In [21]:
#run NMF model 

#import NMF tool 
from sklearn.decomposition import NMF

nmf_model = NMF(n_components=6)
nmf_Z = nmf_model.fit_transform(doc_term_matrix_1)


In [22]:
# run LDA model

#import LDA tool 
from sklearn.decomposition import LatentDirichletAllocation

lda_model = LatentDirichletAllocation(n_components = 6, max_iter=10, learning_method='online', learning_decay=.9)
lda_Z = lda_model.fit_transform(doc_term_matrix_1)

In [23]:
def print_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
                        for i in topic.argsort()[:-top_n - 1:-1]])
 
print("LDA Model:")
print_topics(lda_model, tfidfconverter )
print("=" * 20)
 
print("NMF Model:")
print_topics(nmf_model, tfidfconverter )
print("=" * 20)

LDA Model:
Topic 0:
[('hit', 137.75718834657042), ('million', 120.59797436777058), ('pakistan hit', 113.34606698990156), ('blackout', 89.56008590234167), ('_ह', 84.09149686138308), ('million power', 82.45945143669144), ('nationwide', 75.29605261909242), ('daughter', 73.16867848798611), ('power blackout', 73.06189973487683), ('nationwide blackout', 67.49008623873388)]
Topic 1:
[('outage', 131.69533837851947), ('earth', 126.64646961163753), ('power earth', 123.23292818391491), ('power outage', 117.54423819187159), ('outage pakistan', 117.07060156811124), ('undo', 114.24973784759433), ('undo pakistan', 112.35378033326123), ('muslim', 103.78684805690335), ('unitedpakistanforummah', 102.604011583609), ('day', 96.57214801178466)]
Topic 2:
[('breakdown', 379.8711642146731), ('power grid', 328.37392279563676), ('grid', 316.1995903769569), ('bitcoin', 308.1952308865371), ('country', 303.59305876317444), ('practically', 301.58532234911866), ('darkness', 259.8310877076762), ('grid breakdown', 256

In [24]:
# insert NMF model results back into dataframe
topic_values = nmf_model.transform(doc_term_matrix_1)
df['NMF_topic'] = topic_values.argmax(axis=1)
df.head(1)

Unnamed: 0,id,tweet_url,created_at,parsed_created_at,user_screen_name,text,tweet_type,coordinates,hashtags,media,...,user_name,user_statuses_count,user_time_zone,user_urls,user_verified,processed_text,pos_tagged,lemmatized,final_docs,NMF_topic
0,1348110445333209091,https://twitter.com/alienguru02/status/1348110445333209091,Sun Jan 10 03:32:31 +0000 2021,2021-01-10 03:32:31+00:00,alienguru02,Power blackout reported in multiple cities of Pakistan: Pakistan media,retweet,,,,...,I'm DeEp❤,16867,,,False,"[power, blackout, reported, multiple, cities, pakistan, pakistan, media]","[(power, NN), (blackout, NN), (reported, VBD), (multiple, JJ), (cities, NNS), (pakistan, VBP), (pakistan, JJ), (media, NNS)]","[power, blackout, reported, multiple, city, pakistan, pakistan, medium]",power blackout reported multiple city pakistan pakistan medium,4


In [32]:
df.text[df.NMF_topic == 1][500:550]

16452    Is Bitcoin worth it?\n\nBitcoin is socially wasteful and practically useless\n\nIt is consuming electricity equivalent to the entire power consumption of Pakistan, a country of 200M+\n\nAnd what use does it offer? Practically none. Its value only represents speculative craze. https://t.co/xLpGuGtNBy
16453    Is Bitcoin worth it?\n\nBitcoin is socially wasteful and practically useless\n\nIt is consuming electricity equivalent to the entire power consumption of Pakistan, a country of 200M+\n\nAnd what use does it offer? Practically none. Its value only represents speculative craze. https://t.co/xLpGuGtNBy
16454    Is Bitcoin worth it?\n\nBitcoin is socially wasteful and practically useless\n\nIt is consuming electricity equivalent to the entire power consumption of Pakistan, a country of 200M+\n\nAnd what use does it offer? Practically none. Its value only represents speculative craze. https://t.co/xLpGuGtNBy
16456    Is Bitcoin worth it?\n\nBitcoin is socially wasteful and prac