Started: 22-03-2022, Last Updated: 22-03-2022
# Preprocessing

## Preliminaries

In [1]:
# load packages
import ast
import csv
import itertools
import json
import os
import re

import numpy as np
import pandas as pd

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer

In [2]:
# set working directory
# The project folder defaults to the original author's location. Other collaborators
# can set the TERM_PAPER_DIR environment variable to their own copy instead of editing
# this cell, e.g. "/Users/deslava/Dropbox/deslava/BSE/text_mining/term_paper/".
project_dir = os.environ.get(
    "TERM_PAPER_DIR",
    "/Users/philine/Dropbox/2021-2022_BSE/T2_TextMining/term_paper/",
)
os.chdir(project_dir)

## 1 Load and explore data

In [3]:
# load data
# Pick exactly one input file; paths are relative to the working directory set above.
raw_tweets_path = 'data/all_english_tweets_v2.csv' # full dataset
#raw_tweets_path = 'data/sampled_tweets_1M_v2.csv' # sampled data

# lineterminator='\n' makes the parser treat only "\n" as the end of a record.
tweet_data = pd.read_csv(raw_tweets_path, lineterminator='\n')
tweet_data.head()

Unnamed: 0.1,Unnamed: 0,userid,username,acctdesc,location,following,followers,totaltweets,usercreatedts,tweetid,tweetcreatedts,retweetcount,text,hashtags,language,coordinates,favorite_count,extractedts,timestamp
0,357018,962369743,toonspt_,พยรลก 😉\nจะเปิดแอคเมื่อขายของเท่านั้น!!,,462,7,20549,2012-11-21 13:01:19.000000,1496738675085897729,2022-02-24 06:48:02,1897,Footage of the airport bombing in Ivano-Franki...,"[{'text': 'Ukraine', 'indices': [68, 76]}, {'t...",en,,0,2022-02-24 06:51:34.430620,24-02-2022
1,357021,1073676020,IndiaTodayFLASH,No dragging feet on news. Crisp & short news s...,,117,1152610,589066,2013-01-09 12:46:38.000000,1496738675584954371,2022-02-24 06:48:02,18,Ukraine MP Sophia Fedyna tells about the groun...,"[{'text': 'ITVideo', 'indices': [130, 138]}]",en,,0,2022-02-24 06:51:34.403108,24-02-2022
2,357023,1040140500718837760,Areopagiet,Dᴀᴅ ღ Pʜɪ & Qᴜɪ /Pᴀʀᴛɴᴇʀ | Aᴛʜᴇɪsᴛ | ADD /ASS ...,EU v2.0,5008,267,9894,2018-09-13 07:29:55.000000,1496738676335734785,2022-02-24 06:48:03,1552,A cruise missile fired by the Russian army fel...,"[{'text': 'Ukraine', 'indices': [72, 80]}, {'t...",en,,0,2022-02-24 06:51:34.384864,24-02-2022
3,357031,3821643918,Sicarius130,Christie // 刀剣乱舞 (izmt+mtiz for life) + 2.5D (...,Hong Kong,459,272,57829,2015-10-08 04:49:27.000000,1496738678332280834,2022-02-24 06:48:03,1032,"SPREAD AND SHARE, YOU CAN HELP UKRAINE #Ukrain...","[{'text': 'Ukraine', 'indices': [55, 63]}, {'t...",en,,0,2022-02-24 06:51:34.311760,24-02-2022
4,357034,485905284,TheAAntagonist,T.A.A | The handwriting on the wall: LIVE by t...,Omnipresent,12061,13484,813,2012-02-07 18:10:59.000000,1496738679548792834,2022-02-24 06:48:03,0,"Now this is wrong, absolutely wrong, for @JoeB...","[{'text': 'Putin', 'indices': [101, 107]}, {'t...",en,,0,2022-02-24 06:51:34.284328,24-02-2022


In [4]:
# print the first ten raw tweets, each followed by a spacer line
for raw_tweet in tweet_data["text"].head(10):
    print(raw_tweet, "   ", sep="\n")

Footage of the airport bombing in Ivano-Frankivsk. #Ukraine #Russia https://t.co/MLVuNyPItI
   
Ukraine MP Sophia Fedyna tells about the ground situation in Kyiv in an exclusive conversation with @Akshita_N .

#ITVideo #Russia #Ukraine #Kyiv #Kharkiv https://t.co/TtPt1UZqhz
   
A cruise missile fired by the Russian army fell on Kiev #Ukraine
#Russia 
https://t.co/x0Cty5sDjX
   
SPREAD AND SHARE, YOU CAN HELP UKRAINE #Ukraine #Russia https://t.co/rp2IFCKMi3
   
Now this is wrong, absolutely wrong, for @JoeBiden, @NATO, @UN and @EU_Commission to sit &amp; watch #Putin threaten them while flagrantly violating international norms &amp; bombing another sovereign nation #Ukraine with impunity.
Do these sanctions really work? And who's next? Poland? https://t.co/idlCYh3MIQ
   
🇺🇦 53rd Mechanized Brigade continues to suffer losses in Volnovakha area. #Russia #Ukraine #UkraineWar #RussiaWar #Europe #EU #NATO #US [88]
   
The world must act immediately.- #Ukraine is at stake. 
To do list:
1. Dev

## 2 Extract hashtags

In [5]:
def get_hashtags(text):
    """Turn the string form of a tweet's hashtag entities into a plain string.

    Parameters
    ----------
    text : str or missing value
        String representation of a list of dicts that each carry a "text" key,
        e.g. "[{'text': 'Ukraine', 'indices': [68, 76]}]".

    Returns
    -------
    str or float
        The hashtag texts joined by single spaces, or ``np.nan`` when the value
        is missing or lists no hashtags.

    Raises
    ------
    json.JSONDecodeError
        If the string can be parsed neither as a Python literal nor as JSON.
    """
    if pd.isna(text) or text == "[]":
        return np.nan
    try:
        # The column holds Python literals (single-quoted keys), so parse it as
        # such. This also copes with values that contain an apostrophe, which
        # the blanket quote replacement below would corrupt.
        entities = ast.literal_eval(text)
    except (ValueError, SyntaxError):
        # Fall back to the previous strategy for JSON-style input
        # (e.g. containing null/true/false).
        entities = json.loads(re.sub("[\']+", '"', text))
    hashtags = [entity["text"] for entity in entities]
    # An empty list (e.g. "[ ]") is treated like the "[]" shortcut above.
    return " ".join(hashtags) if hashtags else np.nan

In [6]:
# one space-separated hashtag string per tweet (NaN when the tweet has none)
tweet_data["hashtags_words"] = tweet_data["hashtags"].map(get_hashtags)

In [7]:
# Show the first five tweets with columns as rows. Slice BEFORE transposing:
# transposing the whole frame only to keep five columns is needlessly expensive.
tweet_data.head(5).T

Unnamed: 0,0,1,2,3,4
Unnamed: 0,357018,357021,357023,357031,357034
userid,962369743,1073676020,1040140500718837760,3821643918,485905284
username,toonspt_,IndiaTodayFLASH,Areopagiet,Sicarius130,TheAAntagonist
acctdesc,พยรลก 😉\nจะเปิดแอคเมื่อขายของเท่านั้น!!,No dragging feet on news. Crisp & short news s...,Dᴀᴅ ღ Pʜɪ & Qᴜɪ /Pᴀʀᴛɴᴇʀ | Aᴛʜᴇɪsᴛ | ADD /ASS ...,Christie // 刀剣乱舞 (izmt+mtiz for life) + 2.5D (...,T.A.A | The handwriting on the wall: LIVE by t...
location,,,EU v2.0,Hong Kong,Omnipresent
following,462,117,5008,459,12061
followers,7,1152610,267,272,13484
totaltweets,20549,589066,9894,57829,813
usercreatedts,2012-11-21 13:01:19.000000,2013-01-09 12:46:38.000000,2018-09-13 07:29:55.000000,2015-10-08 04:49:27.000000,2012-02-07 18:10:59.000000
tweetid,1496738675085897729,1496738675584954371,1496738676335734785,1496738678332280834,1496738679548792834


## 3 Reformat timestamp for temporal pooling

In [8]:
# Parse the creation timestamp, then read hour and minute through the datetime
# accessor instead of formatting every value to a string and casting it back.
tweet_data['tweetcreatedts'] = pd.to_datetime(tweet_data['tweetcreatedts'])
created_at = tweet_data['tweetcreatedts'].dt
tweet_data['timestamp_H'] = created_at.hour.astype('int') # extract hour
tweet_data['timestamp_M'] = created_at.minute.astype('int') # extract minute
tweet_data[['tweetcreatedts', 'timestamp', 'timestamp_H', 'timestamp_M']].head()

Unnamed: 0,tweetcreatedts,timestamp,timestamp_H,timestamp_M
0,2022-02-24 06:48:02,24-02-2022,6,48
1,2022-02-24 06:48:02,24-02-2022,6,48
2,2022-02-24 06:48:03,24-02-2022,6,48
3,2022-02-24 06:48:03,24-02-2022,6,48
4,2022-02-24 06:48:03,24-02-2022,6,48


## 4 Clean tweets
### 4.1 Basic Cleaning

In [9]:
# clean the tweets
def clean_tweet(tweet):
    """Normalise a raw tweet to lowercase ASCII letters separated by single spaces.

    Steps, in order: lowercase; drop the HTML entities Twitter emits (amp, lt, gt);
    drop links; strip the @ and # markers while keeping the mention/hashtag word;
    drop emoji shortcodes such as ":smile:"; drop anything in square brackets;
    replace every remaining non-letter with a space; collapse whitespace.

    Parameters
    ----------
    tweet : str
        Raw tweet text. A non-string value (e.g. NaN from an empty CSV field)
        yields an empty string instead of raising.

    Returns
    -------
    str
        The cleaned text; may be empty.
    """
    if not isinstance(tweet, str):
        return ""

    temp = tweet.lower()
    # Remove the whole entity. Stripping only "&amp" would let "&lt;" / "&gt;"
    # survive as the junk tokens "lt" / "gt".
    temp = re.sub(r"&(?:amp|lt|gt);?", " ", temp)
    temp = re.sub(r"http\S+", " ", temp) # links starting with "http" replace with a space
    # The dot is escaped so that only real "www." links match, not words like "awwwwww".
    temp = re.sub(r"www\.\S+", " ", temp) # links starting with "www." replace with a space
    temp = temp.replace('@', '') # remove @ in mentions
    temp = temp.replace('#', '') # remove # in hashtags
    # Only ":shortcode:" tokens count as emojis. The previous ":.*?:" pattern deleted
    # every word between two colons on a line (e.g. "breaking: x happened: y").
    temp = re.sub(r":[a-z0-9_+\-]+:", " ", temp) # remove emoji shortcodes
    temp = re.sub(r"\[.*?\]", " ", temp) # remove bracketed fragments such as "[88]"
    # Punctuation and line breaks need no dedicated pass: the next substitution
    # turns every non-letter into a space.
    temp = re.sub(r"[^A-Za-z]", " ", temp) # remove non-alphabetic characters
    temp = re.sub(r"\s+", " ", temp).strip() # remove double spaces

    return temp

In [10]:
# cleaned version of every tweet, stored next to the raw text
tweet_data['cleaned_text'] = tweet_data['text'].map(clean_tweet)

### 4.2 Tokenize

In [11]:
# Shared text-normalisation tools used by tokenize().
# Note: despite its name, `porter` holds a Snowball stemmer.
porter = SnowballStemmer("english")
lmtzr = WordNetLemmatizer()

# Topic-specific terms that are dropped together with the NLTK English stop words.
# NOTE: 'russiaukraineconflict' appears twice; harmless, since the entries end up in a set.
additional_stopwords = ['ukraine', 'russia', 'russiaukraine', 'ukrainerussia', 'russiaukraineconflict',
                        'ukrainerussianwar', 'ukrainerussiawar', 'russiaukrainewar', 'russianukrainewar',
                        'russiaukraineconflict', 'ukraineinvasion', 'ukrainian', 'russian', 'stoprussia',
                        'stopputin', 'standwithukraine', 'ukraineunderattack']

stop_words = set(stopwords.words('english'))
stop_words.update(additional_stopwords)

def tokenize(words, modulation):
    """Split cleaned text into tokens, drop stop words, optionally normalise.

    Parameters
    ----------
    words : str
        Text to tokenize; it is split on runs of non-word characters.
    modulation : int
        0 keeps tokens unchanged, 1 stems them with `porter`,
        2 lemmatizes them with `lmtzr`.

    Returns
    -------
    list of str
        Tokens that are not in `stop_words`, in their original order.

    Raises
    ------
    ValueError
        If `modulation` is not 0, 1 or 2 (previously this silently returned []).
    """
    if modulation == 0:
        normalise = None
    elif modulation == 1:
        normalise = porter.stem
    elif modulation == 2:
        normalise = lmtzr.lemmatize
    else:
        raise ValueError(
            "modulation must be 0 (none), 1 (stemming) or 2 (lemmatizing), got %r" % (modulation,)
        )

    # re.split yields '' for an empty document or at leading/trailing separators.
    # Those are not words, so they are dropped along with the stop words.
    kept = [token for token in re.split(r'\W+', words) if token and token not in stop_words]
    if normalise is None:
        return kept
    return [normalise(token) for token in kept]

In [12]:
# print the first ten cleaned tweets, each followed by a spacer line
for cleaned_tweet in tweet_data["cleaned_text"].head(10):
    print(cleaned_tweet, "   ", sep="\n")

footage of the airport bombing in ivano frankivsk ukraine russia
   
ukraine mp sophia fedyna tells about the ground situation in kyiv in an exclusive conversation with akshita n itvideo russia ukraine kyiv kharkiv
   
a cruise missile fired by the russian army fell on kiev ukraine russia
   
spread and share you can help ukraine ukraine russia
   
now this is wrong absolutely wrong for joebiden nato un and eu commission to sit watch putin threaten them while flagrantly violating international norms bombing another sovereign nation ukraine with impunity do these sanctions really work and who s next poland
   
rd mechanized brigade continues to suffer losses in volnovakha area russia ukraine ukrainewar russiawar europe eu nato us
   
the world must act immediately ukraine is at stake to do list devastating sanctions on russia now including swift fully isolate russia by all means in all formats weapons equipment for ukraine financial assistance humanitarian assistance
   
the historic mo

In [13]:
# Note: modulation selects the pre-processing: 0 (nothing) vs. 1 (stemming) vs. 2 (lemmatizing)
tweet_data['tokenized_text'] = tweet_data['cleaned_text'].apply(tokenize, modulation=0)
tweet_data[['text', 'cleaned_text', 'tokenized_text']].head()

Unnamed: 0,text,cleaned_text,tokenized_text
0,Footage of the airport bombing in Ivano-Franki...,footage of the airport bombing in ivano franki...,"[footage, airport, bombing, ivano, frankivsk]"
1,Ukraine MP Sophia Fedyna tells about the groun...,ukraine mp sophia fedyna tells about the groun...,"[mp, sophia, fedyna, tells, ground, situation,..."
2,A cruise missile fired by the Russian army fel...,a cruise missile fired by the russian army fel...,"[cruise, missile, fired, army, fell, kiev]"
3,"SPREAD AND SHARE, YOU CAN HELP UKRAINE #Ukrain...",spread and share you can help ukraine ukraine ...,"[spread, share, help]"
4,"Now this is wrong, absolutely wrong, for @JoeB...",now this is wrong absolutely wrong for joebide...,"[wrong, absolutely, wrong, joebiden, nato, un,..."


In [14]:
# For data exploration purposes: a separate copy that also carries the stemmed
# and lemmatized variants, so tweet_data itself is left unchanged.
explore = tweet_data.copy()
for column, mode in (('stemmed_text', 1), ('lemmatized_text', 2)):
    explore[column] = [tokenize(document, mode) for document in explore['cleaned_text']]
explore.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2022257 entries, 0 to 2022256
Data columns (total 26 columns):
 #   Column           Dtype         
---  ------           -----         
 0   Unnamed: 0       int64         
 1   userid           int64         
 2   username         object        
 3   acctdesc         object        
 4   location         object        
 5   following        int64         
 6   followers        int64         
 7   totaltweets      int64         
 8   usercreatedts    object        
 9   tweetid          int64         
 10  tweetcreatedts   datetime64[ns]
 11  retweetcount     int64         
 12  text             object        
 13  hashtags         object        
 14  language         object        
 15  coordinates      object        
 16  favorite_count   int64         
 17  extractedts      object        
 18  timestamp        object        
 19  hashtags_words   object        
 20  timestamp_H      int64         
 21  timestamp_M      int64         

In [15]:
# compare the raw, tokenized, lemmatized and stemmed versions of one tweet
loc = 200

views = [
    ("Original text: \n", 'text'),
    ("Tokenized text: \n", 'tokenized_text'),
    ("Lemmatized text: \n", 'lemmatized_text'),
    ("Stemmed text: \n", 'stemmed_text'),
]
for position, (label, column) in enumerate(views):
    if position > 0:
        print(" ")  # spacer line between consecutive views
    print(label, explore[column].iloc[loc])

Original text: 
 🇺🇦 Ukrainian authorities confirm they have been under sustained fire from Russian forces since 0500 hours (5 AM). #Russia #Ukraine #UkraineWar #RussiaWar #Europe #EU #NATO #US [62]
 
Tokenized text: 
 ['authorities', 'confirm', 'sustained', 'fire', 'forces', 'since', 'hours', 'ukrainewar', 'russiawar', 'europe', 'eu', 'nato', 'us']
 
Lemmatized text: 
 ['authority', 'confirm', 'sustained', 'fire', 'force', 'since', 'hour', 'ukrainewar', 'russiawar', 'europe', 'eu', 'nato', 'u']
 
Stemmed text: 
 ['author', 'confirm', 'sustain', 'fire', 'forc', 'sinc', 'hour', 'ukrainewar', 'russiawar', 'europ', 'eu', 'nato', 'us']


## 5 Export pre-processed data

In [16]:
# export the relevant dataframes
# Pick exactly one output file; paths are relative to the working directory.
# NOTE(review): the row index is written as an extra unnamed column. The
# "Unnamed: 0" columns in the loaded input look like the result of the same
# pattern; confirm whether downstream notebooks rely on that column.
preprocessed_path = 'data/all_english_tweets_preprocessed_v3.csv' # database without http's
#preprocessed_path = 'data/all_english_tweets_preprocessed_v2.csv'
#preprocessed_path = 'data/sampled_tweets_1M_v2_preprocessed.csv' # sampled data
tweet_data.to_csv(preprocessed_path)

#tweet_data.to_json('data/all_english_tweets_preprocessed.json') # full dataset
#tweet_data.to_json('data/sampled_tweets_1M_v2_preprocessed.json') # sampled data