## Preprocessing code
Time and textual preprocessing. Insert movement name in the beginning.
Example of USA Sunrise movement.

In [1]:
import pickle
import pandas as pd
import time
import datetime

In [2]:
movement = 'sunrise'
week_last_day = 'W-SAT'
crisis_starting_week = 8

# 1. Time preprocessing: Order by weeks

In [3]:
with open(f'data/raw/{movement}_tweets', 'rb') as file:
    df = pickle.load(file)    

In [4]:
df.head()

Unnamed: 0,created_at,id,full_text,retweet_count,favorite_count,favorited,retweeted,possibly_sensitive,lang,user.id,user.id_str,user.name,user.screen_name,user.location,user.description,user.url,user.followers_count,user.friends_count,user.created_at
0,Fri Jun 12 21:51:01 +0000 2020,1271560665409310720,RT @dearnonnatives: Stop. Take a breath. Relax...,519,0,False,False,,en,1151530473727746050,1151530473727746050,Sunrise Chico,SunriseChico,"Chico, CA",Building a movement of young people to fight c...,https://t.co/n6ua26W2Lu,208,260,Wed Jul 17 16:33:54 +0000 2019
1,Fri Jun 12 21:50:18 +0000 2020,1271560482671824897,RT @cecaaay: SAY HIS NAME. ROBERT FULLER. He w...,160516,0,False,False,,en,1151530473727746050,1151530473727746050,Sunrise Chico,SunriseChico,"Chico, CA",Building a movement of young people to fight c...,https://t.co/n6ua26W2Lu,208,260,Wed Jul 17 16:33:54 +0000 2019
2,Fri Jun 12 05:12:52 +0000 2020,1271309471067762688,RT @crunchwrapsup37: If BLM disappeared from y...,66854,0,False,False,,en,1151530473727746050,1151530473727746050,Sunrise Chico,SunriseChico,"Chico, CA",Building a movement of young people to fight c...,https://t.co/n6ua26W2Lu,208,260,Wed Jul 17 16:33:54 +0000 2019
3,Fri Jun 12 05:10:31 +0000 2020,1271308879490514944,RT @ChicoNR: Local activist groups met downtow...,4,0,False,False,,en,1151530473727746050,1151530473727746050,Sunrise Chico,SunriseChico,"Chico, CA",Building a movement of young people to fight c...,https://t.co/n6ua26W2Lu,208,260,Wed Jul 17 16:33:54 +0000 2019
4,Fri Jun 12 02:47:50 +0000 2020,1271272974637621250,RT @MuseWendi: if you truly think black lives ...,5902,0,False,False,,en,1151530473727746050,1151530473727746050,Sunrise Chico,SunriseChico,"Chico, CA",Building a movement of young people to fight c...,https://t.co/n6ua26W2Lu,208,260,Wed Jul 17 16:33:54 +0000 2019


In [5]:
def time_preprocessing(df, week_last_day, crisis_starting_week):
    """Takes raw data, adds timestamp, week number and crisis dummy variable
    Week last day - 'W-SAT'for USA; 'W-WED' for UK
    Crisis first week - 8 for US, 10 for UK"""
    # Extract timestamp from the textual "created_at" column
    df['timestamp'] = pd.to_datetime(df['created_at'], format='%a %b %d %H:%M:%S +0000 %Y')
    # Remove data not needed right now
    df = df[['timestamp', 'full_text', 'user.screen_name']]
    # Remove tweets not included in the period taken for research (01.11.2019-30.06.2020)
    df = df.loc[(df['timestamp'] > '2019-12-01') & (df['timestamp'] < '2020-05-31')].reset_index(drop=True)
    # Add period
    df['period'] = df['timestamp'].dt.to_period(week_last_day)
    df = df.sort_values('period').reset_index(drop=True)
    # Add case week number
    i = 1
    df['week_number'] = 0
    for group_index, group in df.groupby('period'):
        df.loc[df['period']==group_index, 'week_number'] = i
        i += 1
    # Add crisis parameter
    df['crisis'] = 1
    df.loc[df['week_number']<crisis_starting_week, 'crisis'] = 0
    
    return df

In [6]:
df_time_preprocessed = time_preprocessing(df, week_last_day, crisis_starting_week)

In [7]:
df_time_preprocessed

Unnamed: 0,timestamp,full_text,user.screen_name,period,week_number,crisis
0,2019-12-01 13:02:51,"The Governor doesn’t care, but that doesn’t me...",SunriseMvmtRVA,2019-12-01/2019-12-07,1,0
1,2019-12-03 04:53:29,👇 Hey @OregonDOT @oregonmetro @OregonGovBrown ...,SunrisePDX,2019-12-01/2019-12-07,1,0
2,2019-12-03 04:52:39,RT @SallyAnn_12: Every land use decision is a ...,SunrisePDX,2019-12-01/2019-12-07,1,0
3,2019-12-03 04:52:35,RT @BuildSoil: Remember being thankful is noth...,SunrisePDX,2019-12-01/2019-12-07,1,0
4,2019-12-03 04:52:27,"RT @mastmeghan: Honestly, it is bananas to liv...",SunrisePDX,2019-12-01/2019-12-07,1,0
...,...,...,...,...,...,...
24333,2020-05-30 04:23:33,RT @alicesperi: George Floyd’s death is an urg...,SunriseMvmtLA,2020-05-24/2020-05-30,26,1
24334,2020-05-30 04:23:06,RT @cupidastwid: LA SHOW UP!!\nTips: \n- Avoid...,SunriseMvmtLA,2020-05-24/2020-05-30,26,1
24335,2020-05-30 04:22:57,RT @SpikeFriedman: If you are personally incre...,SunriseMvmtLA,2020-05-24/2020-05-30,26,1
24336,2020-05-30 20:52:41,"Tonight, Sunrise NYC and Brooklyn Public Libra...",sunrisemvmtnyc,2020-05-24/2020-05-30,26,1


# 2. Textual preprocessing
1. Normalization
2. Removing symbols
3. Removing lexical extras and tokenization
4. Identify frequent bigrams    

In [8]:
[tweet for tweet in df_time_preprocessed['full_text'].loc[:5]]

['The Governor doesn’t care, but that doesn’t mean we can’t force him to repeal if we twist his hand hard enough. It won’t be an easy fight, and we will be fighting against quite a few Dems in the legislature, but we must win if we want to bring dignity and safety to workers https://t.co/zfT4NkSNt3',
 '👇 Hey @OregonDOT @oregonmetro @OregonGovBrown https://t.co/22bIHetmYL',
 'RT @SallyAnn_12: Every land use decision is a decision for or against climate action. https://t.co/VUPVLJYTwM',
 'RT @BuildSoil: Remember being thankful is nothing unless you act to give back and repair things.',
 "RT @mastmeghan: Honestly, it is bananas to live in a world where articles like this come out and we're just expected to carry on as usual.…",
 'RT @nomorefreeways: “That’s where the change happens: When we get millions of people who are mobilized and feel that there’s hope and there…']

## I. Normalization via Ekphrasis
1. Replace functional things (url, email etc) by tags
2. Spell correction
3. Expand contractions
4. Replace emoticons by text

In [9]:
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.dicts.emoticons import emoticons

In [10]:
# Ekphraris preprocessing pipeline
text_processor = TextPreProcessor(
    # 1. Remove stuff
    normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
        'time', 'date', 'number'],
       
    # 2. Spell correction and contraction expansion
    corrector="twitter", 
    
    unpack_contractions=True,  # Unpack contractions (can't -> can not)
    spell_correction=True,  # spell correction
        
    # 3. Replace emojis with textual expressions
    dicts=[emoticons]
)

Reading twitter - 1grams ...


  regexes = {k.lower(): re.compile(self.expressions[k]) for k, v in


In [11]:
preprocessed_stage_one = ["".join(text_processor.pre_process_doc(tweet)) for tweet in df_time_preprocessed['full_text']]

In [12]:
[tweet for tweet in preprocessed_stage_one[:5]]

['The Governor doesn’t care, but that doesn’t mean we can’t force him to repeal if we twist his hand hard enough. It won’t be an easy fight, and we will be fighting against quite a few Dems in the legislature, but we must win if we want to bring dignity and safety to workers <url>',
 '👇 Hey <user> <user> <user> <url>',
 'RT <user> : Every land use decision is a decision for or against climate action. <url>',
 'RT <user> : Remember being thankful is nothing unless you act to give back and repair things.',
 'RT <user> : Honestly, it is bananas to live in a world where articles like this come out and we are just expected to carry on as usual.…']

## II. Removing symbols
5. Removing punctuation (except functionally important ')
6. Removing numbers

In [13]:
import re

In [14]:
def symbol_removal(tweet):
    tweet = re.sub('[^a-zA-Z\' ]', ' ', tweet) #numbers, punctuation
    tweet = re.sub('\\s+', ' ', tweet) #extra whitespaces
    tweet = re.sub('[ \t]+$', '', tweet) #trailing whitespaces
    return tweet

In [15]:
preprocessed_stage_two = [symbol_removal(tweet) for tweet in preprocessed_stage_one]

In [16]:
[tweet for tweet in preprocessed_stage_two[:5]]

['The Governor doesn t care but that doesn t mean we can t force him to repeal if we twist his hand hard enough It won t be an easy fight and we will be fighting against quite a few Dems in the legislature but we must win if we want to bring dignity and safety to workers url',
 ' Hey user user user url',
 'RT user Every land use decision is a decision for or against climate action url',
 'RT user Remember being thankful is nothing unless you act to give back and repair things',
 'RT user Honestly it is bananas to live in a world where articles like this come out and we are just expected to carry on as usual']

## III. Removing lexical extras and tokenization
7. Remove stopwords
8. Lemmatize
9. Tokenize

In [17]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
# Load Spacy English module
nlp = spacy.load('en_core_web_sm')

In [18]:
remove_from_stopwords = ['just', 'serious', 'side', 'show', 'keep', 'now', 'never','all']
for word in remove_from_stopwords:
    STOP_WORDS.remove(word)

In [19]:
add_to_stopwords = ['url', 'email', 'percent', 'money', 'phone', 'user',
        'time', 'date', 'number', '-pron-', '-PRON-','rt']
for word in add_to_stopwords:
    STOP_WORDS.add(word)

In [20]:
STOP_WORDS.add('-PRON-')

In [21]:
def remove_lexical_extras(tweet):
    tweet = tweet.lower()
    nlp_text = nlp(tweet)
    lemmas = [token.lemma_ for token in nlp_text if token.lemma_ not in STOP_WORDS]
    return(lemmas)

In [22]:
preprocessed_stage_three = [remove_lexical_extras(tweet) for tweet in preprocessed_stage_two]

In [23]:
[tweet for tweet in preprocessed_stage_three[:5]]

[['governor',
  'doesn',
  't',
  'care',
  'doesn',
  't',
  'mean',
  't',
  'force',
  'repeal',
  'twist',
  'hand',
  'hard',
  'win',
  't',
  'easy',
  'fight',
  'fight',
  'dem',
  'legislature',
  'win',
  'want',
  'bring',
  'dignity',
  'safety',
  'worker'],
 [' ', 'hey'],
 ['land', 'use', 'decision', 'decision', 'climate', 'action'],
 ['remember', 'thankful', 'act', 'repair', 'thing'],
 ['honestly',
  'banana',
  'live',
  'world',
  'article',
  'like',
  'come',
  'just',
  'expect',
  'carry',
  'usual']]

## IV. Identify frequent bigrams
https://radimrehurek.com/gensim/models/phrases.html

In [24]:
from gensim.models import Phrases

In [25]:
phrases = Phrases(preprocessed_stage_three, min_count=25, threshold = 15)

In [26]:
preprocessed_stage_four = [phrases[tweet] for tweet in preprocessed_stage_three]

In [27]:
[tweet for tweet in preprocessed_stage_four[:5]]

[['governor',
  'doesn_t',
  'care',
  'doesn_t',
  'mean',
  't',
  'force',
  'repeal',
  'twist',
  'hand',
  'hard',
  'win',
  't',
  'easy',
  'fight',
  'fight',
  'dem',
  'legislature',
  'win',
  'want',
  'bring',
  'dignity',
  'safety',
  'worker'],
 [' ', 'hey'],
 ['land', 'use', 'decision', 'decision', 'climate', 'action'],
 ['remember', 'thankful', 'act', 'repair', 'thing'],
 ['honestly',
  'banana',
  'live',
  'world',
  'article',
  'like',
  'come',
  'just',
  'expect',
  'carry',
  'usual']]

## Add preprocessed text to the dataframe
And save df for further use

In [28]:
df_preprocessed = df_time_preprocessed.copy()
df_preprocessed['text_preprocessed'] = preprocessed_stage_four

In [29]:
df_preprocessed.head()

Unnamed: 0,timestamp,full_text,user.screen_name,period,week_number,crisis,text_preprocessed
0,2019-12-01 13:02:51,"The Governor doesn’t care, but that doesn’t me...",SunriseMvmtRVA,2019-12-01/2019-12-07,1,0,"[governor, doesn_t, care, doesn_t, mean, t, fo..."
1,2019-12-03 04:53:29,👇 Hey @OregonDOT @oregonmetro @OregonGovBrown ...,SunrisePDX,2019-12-01/2019-12-07,1,0,"[ , hey]"
2,2019-12-03 04:52:39,RT @SallyAnn_12: Every land use decision is a ...,SunrisePDX,2019-12-01/2019-12-07,1,0,"[land, use, decision, decision, climate, action]"
3,2019-12-03 04:52:35,RT @BuildSoil: Remember being thankful is noth...,SunrisePDX,2019-12-01/2019-12-07,1,0,"[remember, thankful, act, repair, thing]"
4,2019-12-03 04:52:27,"RT @mastmeghan: Honestly, it is bananas to liv...",SunrisePDX,2019-12-01/2019-12-07,1,0,"[honestly, banana, live, world, article, like,..."


In [30]:
df_preprocessed.to_pickle(f'data/{movement}_preprocessed')