In [5]:
pip install snscrape

Note: you may need to restart the kernel to use updated packages.


In [6]:
## import packages
import pandas as pd
from tqdm.notebook import tqdm
import snscrape.modules.twitter as sntwitter
import os
import pickle

In [36]:
## pulling tweets and save to folder
# pull via hashtag
searchText = "earthquake"
scraper = sntwitter.TwitterSearchScraper("#" + searchText)

# create a directory to hold the files
# create a parent folder
!mkdir tweets
# create a child folder based on search topic
filePath = os.path.join("tweets", searchText)
if not os.path.exists(filePath):
    os.mkdir(filePath)

mkdir: tweets: File exists


In [37]:
# hold all text in a list
columns = ['date', 'source', 'like', 'retweet', 'content']
tweets = []
j = 1

for i, tweet in enumerate(scraper.get_items()):
    
    # exit condition
    if (i > 1000):
        break;
        
    # scrape relevant info from twitter
    data = [tweet.date, tweet.source, tweet.likeCount, tweet.retweetCount, tweet.rawContent]
    # add the scraped info to a list
    tweets.append(data)

    # create a new file for every 10000 of data scraped
    if ((i+1) % 500 == 0):
        tweets.insert(0, columns)
        with open(str(filePath) + "/copy_" + str(j) + ".pickle", "wb") as file:
            pickle.dump(tweets, file)
        tweets.clear()
        j+=1
        
            
            

In [38]:
# count number of sub-files created
fileNumber = len(os.listdir('./tweets/' + str(searchText)))
print(fileNumber)

2


In [41]:
# load file object through pickle
with open("tweets/"+ searchText + "/copy_1.pickle", 'rb') as file:
    data = pickle.load(file)

fileName = []
for i in range(fileNumber):
    fileName.append("tweets/" + searchText + "/copy_" + str(i+1) + ".pickle")
    
print(fileName)

['tweets/earthquake/copy_1.pickle', 'tweets/earthquake/copy_2.pickle']


In [42]:
# merge all files into one dataframe
df = pd.concat(map(lambda file : pd.DataFrame(pd.read_pickle(file)[1:], columns=pd.read_pickle(file)[0]), fileName))

In [43]:
# check number of observations 
print(df.shape[0])

1000


In [44]:
# describe the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 0 to 499
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype              
---  ------   --------------  -----              
 0   date     1000 non-null   datetime64[ns, UTC]
 1   source   1000 non-null   object             
 2   like     1000 non-null   int64              
 3   retweet  1000 non-null   int64              
 4   content  1000 non-null   object             
dtypes: datetime64[ns, UTC](1), int64(2), object(2)
memory usage: 46.9+ KB


In [45]:
# quick peek at the dataset
df.head()

Unnamed: 0,date,source,like,retweet,content
0,2023-02-23 12:42:34+00:00,"<a href=""https://dlvrit.com/"" rel=""nofollow"">d...",0,0,"Temblor - M 4.6 - 89 km WNW of Murghob, Tajik..."
1,2023-02-23 12:42:01+00:00,"<a href=""http://reddit.com/r/EEW"" rel=""nofollo...",0,0,"🌄 #Sismo! #Earthquake! 4.2 M, 12:20 UTC on lan..."
2,2023-02-23 12:41:48+00:00,"<a href=""https://ifttt.com"" rel=""nofollow"">IFT...",0,0,mandeeptoronto: RT @khalsaaidca: Parampreet Si...
3,2023-02-23 12:41:42+00:00,"<a href=""https://mobile.twitter.com"" rel=""nofo...",1,0,Thank you very much to @eu_echo for allocating...
4,2023-02-23 12:41:41+00:00,"<a href=""https://biston.web.id"" rel=""nofollow""...",0,0,#Gempa Mag:3.7\n 23-Feb-2023 19:36:42WIB\n Lok...


In [46]:
# convert UTC datetime to local datetime
df['date'] = pd.to_datetime(df['date']).dt.tz_convert('Asia/Singapore')

In [47]:
df['date_formatted'] = pd.to_datetime(df['date']).dt.date

In [48]:
df.head()

Unnamed: 0,date,source,like,retweet,content,date_formatted
0,2023-02-23 20:42:34+08:00,"<a href=""https://dlvrit.com/"" rel=""nofollow"">d...",0,0,"Temblor - M 4.6 - 89 km WNW of Murghob, Tajik...",2023-02-23
1,2023-02-23 20:42:01+08:00,"<a href=""http://reddit.com/r/EEW"" rel=""nofollo...",0,0,"🌄 #Sismo! #Earthquake! 4.2 M, 12:20 UTC on lan...",2023-02-23
2,2023-02-23 20:41:48+08:00,"<a href=""https://ifttt.com"" rel=""nofollow"">IFT...",0,0,mandeeptoronto: RT @khalsaaidca: Parampreet Si...,2023-02-23
3,2023-02-23 20:41:42+08:00,"<a href=""https://mobile.twitter.com"" rel=""nofo...",1,0,Thank you very much to @eu_echo for allocating...,2023-02-23
4,2023-02-23 20:41:41+08:00,"<a href=""https://biston.web.id"" rel=""nofollow""...",0,0,#Gempa Mag:3.7\n 23-Feb-2023 19:36:42WIB\n Lok...,2023-02-23


In [49]:
# check the data type
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 0 to 499
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype                         
---  ------          --------------  -----                         
 0   date            1000 non-null   datetime64[ns, Asia/Singapore]
 1   source          1000 non-null   object                        
 2   like            1000 non-null   int64                         
 3   retweet         1000 non-null   int64                         
 4   content         1000 non-null   object                        
 5   date_formatted  1000 non-null   object                        
dtypes: datetime64[ns, Asia/Singapore](1), int64(2), object(3)
memory usage: 54.7+ KB


In [51]:
# dump dataframe to a pickle file
df.to_pickle("./tweets/"+ searchText + "/1000testset.pkl")

In [20]:
'''
Section below will reload a dataframe that has been saved to a pickle file
Regexp is first implemented to remove special characters from the contents column
Spacy-langdetect is then adopted to remove potentially non-english content based on cutoff score of >0.5
'''

'\nSection below will reload a dataframe that has been saved to a pickle file\nRegexp is first implemented to remove special characters from the contents column\nSpacy-langdetect is then adopted to remove potentially non-english content based on cutoff score of >0.5\n'

In [52]:
pip install spacy-langdetect

Note: you may need to restart the kernel to use updated packages.


In [22]:
!pip3 install spacy



In [23]:
!pip3 install -U spacy




In [68]:
# read in dataframe from pickle file
df = pd.read_pickle("./tweets/" + searchText + "/1000testset.pkl")

In [69]:
df.content

0      Temblor -  M 4.6 - 89 km WNW of Murghob, Tajik...
1      🌄 #Sismo! #Earthquake! 4.2 M, 12:20 UTC on lan...
2      mandeeptoronto: RT @khalsaaidca: Parampreet Si...
3      Thank you very much to @eu_echo for allocating...
4      #Gempa Mag:3.7\n 23-Feb-2023 19:36:42WIB\n Lok...
                             ...                        
495    6.8 #Earthquake Shakes #Tajikistan https://t.c...
496    #Northport send emergency relief for Turkiye, ...
497    ⚡️The #Kazakh government and #citizens have ex...
498    🔴 #Terremoto magnitudo 6.8 in #Tagikistan: il ...
499    Deprem sonrası evsiz kalan vatandaşlarımızın m...
Name: content, Length: 1000, dtype: object

In [70]:
# Data cleaning - first round
import re
import string

def clean_text_round1(text):
    '''Make text lowercase, remove text in square brackets, remove punctuation and remove words containing numbers.'''
    text = text.lower()
    text = re.sub('[.*?\,%_-]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

round1 = lambda x: clean_text_round1(x)

In [71]:
data_clean = pd.DataFrame(df.content.apply(round1))
data_clean

Unnamed: 0,content
0,temblor m km wnw of murghob tajikistan e...
1,🌄 sismo earthquake m utc on land río segundo...
2,mandeeptoronto rt khalsaaidca parampreet singh...
3,thank you very much to euecho for allocating ...
4,gempa \n \n km timurlaut deiyaipapua\n km...
...,...
495,earthquake shakes tajikistan
496,northport send emergency relief for turkiye sy...
497,⚡️the kazakh government and citizens have expr...
498,🔴 terremoto magnitudo in tagikistan il moment...


In [72]:
# Data Cleaning - second round
def clean_text_round2(text):
    '''Get rid of some additional punctuation and non-sensical text that was missed the first time around.'''
    text = re.sub('[‘’“”…_-]', '', text)
    text = re.sub('\n\t', '', text)
    text = re.sub('[0-9]+', '', text)
    return text

round2 = lambda x: clean_text_round2(x)

In [73]:
data_clean = pd.DataFrame(data_clean.content.apply(round1))
data_clean

Unnamed: 0,content
0,temblor m km wnw of murghob tajikistan e...
1,🌄 sismo earthquake m utc on land río segundo...
2,mandeeptoronto rt khalsaaidca parampreet singh...
3,thank you very much to euecho for allocating ...
4,gempa \n \n km timurlaut deiyaipapua\n km...
...,...
495,earthquake shakes tajikistan
496,northport send emergency relief for turkiye sy...
497,⚡️the kazakh government and citizens have expr...
498,🔴 terremoto magnitudo in tagikistan il moment...


In [78]:
# dataframe text preprocessing
import nltk
nltk.download('omw-1.4')
nltk.download('words')
from nltk.corpus import stopwords

# nltk.corpus.words.words() intends to remove non-english words
mystopwords = stopwords.words("english") + ['https']
WNlemma = nltk.WordNetLemmatizer()

def preprocess(text):
    tokens = nltk.word_tokenize(text)
    tokens = [WNlemma.lemmatize(t.lower()) for t in tokens]
    tokens = [t for t in tokens if t not in mystopwords]
    tokens = [t for t in tokens if len(t) >= 3]
    return tokens

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/eesoonhang/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     /Users/eesoonhang/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [95]:
# tokenize the content column
df['tokenize_content'] = df['content'].apply(preprocess)

# convert list of tokens to string of tokens
df['tokenize_content'] = df['tokenize_content'].apply(lambda x: ",".join(x))

In [97]:
df.head()

Unnamed: 0,date,source,like,retweet,content,date_formatted,tokenize_content
0,2023-02-23 20:42:34+08:00,"<a href=""https://dlvrit.com/"" rel=""nofollow"">d...",0,0,"Temblor - M 4.6 - 89 km WNW of Murghob, Tajik...",2023-02-23,"temblor,4.6,wnw,murghob,tajikistan,http,//t.co..."
1,2023-02-23 20:42:01+08:00,"<a href=""http://reddit.com/r/EEW"" rel=""nofollo...",0,0,"🌄 #Sismo! #Earthquake! 4.2 M, 12:20 UTC on lan...",2023-02-23,"sismo,earthquake,4.2,12:20,utc,land,río,segund..."
2,2023-02-23 20:41:48+08:00,"<a href=""https://ifttt.com"" rel=""nofollow"">IFT...",0,0,mandeeptoronto: RT @khalsaaidca: Parampreet Si...,2023-02-23,"mandeeptoronto,khalsaaidca,parampreet,singh,ta..."
3,2023-02-23 20:41:42+08:00,"<a href=""https://mobile.twitter.com"" rel=""nofo...",1,0,Thank you very much to @eu_echo for allocating...,2023-02-23,"thank,much,eu_echo,allocating,mill,eur,alert,f..."
4,2023-02-23 20:41:41+08:00,"<a href=""https://biston.web.id"" rel=""nofollow""...",0,0,#Gempa Mag:3.7\n 23-Feb-2023 19:36:42WIB\n Lok...,2023-02-23,"gempa,mag:3.7,23-feb-2023,19:36:42wib,lok:4.04..."


In [82]:
# visualize the tokens frequency distribution
toks =[t for t in [term.lower() for row in (nltk.word_tokenize(text) for text in df['content']) for term in row] if (t not in mystopwords and len(t)>=3)]
fd = nltk.FreqDist(toks)
print(fd.most_common(50))

[('earthquake', 1204), ('tajikistan', 293), ('turkey', 184), ('magnitude', 159), ('utc', 149), ('sismo', 128), ('deprem', 119), ('syria', 111), ('ago', 107), ('china', 103), ('chile', 84), ('temblor', 83), ('info', 79), ('min', 74), ('strikes', 72), ('usgs', 72), ('terremoto', 65), ('6.8', 63), ('7.2', 60), ('reports', 54), ('near', 52), ('depth', 51), ('region', 49), ('2023', 47), ('app', 42), ('pakistan', 42), ('tajikistanearthquake', 42), ('border', 40), ('people', 40), ('2/23/23', 39), ('2023/02/23', 38), ('2023-02-23', 37), ('में', 37), ('february', 37), ('//t.co/rblvnxzyn8', 36), ('murghob', 35), ('feb', 35), ('hit', 35), ('news', 34), ('gempa', 32), ('new', 31), ('भूकंप', 31), ('felt', 30), ('islamabad', 29), ('turkeyearthquake', 28), ('time', 27), ('4.5', 27), ('affected', 26), ('türkiye', 26), ('aid', 25)]


In [None]:
# filter out non-english text
'''
from spacy_langdetect import LanguageDetector
import spacy
nlp = spacy.load("en")
nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)

corpus = []
for text in df['content']:
    doc = nlp(text)
    detect_language = doc._.language
    if (detect_language['language'] == 'en') and (detect_language['score'] > 0.5):
        corpus.append(text)

'''