Preprocessing code - on DemSoc

In [1]:
import pickle
import pandas as pd
import time
import datetime

In [53]:
movement = 'demsoc'

# 1. Time preprocessing: Order by weeks

In [44]:
with open(f'../Twitter data/Final data/{movement}_tweets', 'rb') as file:
    df = pickle.load(file)    

In [46]:
df.head()

Unnamed: 0,created_at,id,full_text,retweet_count,favorite_count,favorited,retweeted,possibly_sensitive,lang,user.id,user.id_str,user.name,user.screen_name,user.location,user.description,user.url,user.followers_count,user.friends_count,user.created_at
0,Wed Feb 26 18:58:55 +0000 2020,1232741852664811520,#NotAgainSU.\n\nhttps://t.co/3kqtUazEgf\n\nIn ...,0,6,False,False,False,fi,864648993137098754,864648993137098754,New York City YDSA ðŸŒ¹,nycYDSA,"New York, NY",Youth of NYC â€“ Unite! âœŠ Sign up for our emails...,https://t.co/JegJu7EXYF,2056,771,Wed May 17 01:09:19 +0000 2017
1,Wed Feb 26 18:58:40 +0000 2020,1232741788999438338,l. We stand in full support for all of the dem...,0,7,False,False,,en,864648993137098754,864648993137098754,New York City YDSA ðŸŒ¹,nycYDSA,"New York, NY",Youth of NYC â€“ Unite! âœŠ Sign up for our emails...,https://t.co/JegJu7EXYF,2056,771,Wed May 17 01:09:19 +0000 2017
2,Wed Feb 26 18:57:47 +0000 2020,1232741569222135809,This is a necessary response to the 29+ hate c...,2,5,False,False,,en,864648993137098754,864648993137098754,New York City YDSA ðŸŒ¹,nycYDSA,"New York, NY",Youth of NYC â€“ Unite! âœŠ Sign up for our emails...,https://t.co/JegJu7EXYF,2056,771,Wed May 17 01:09:19 +0000 2017
3,Wed Feb 26 18:53:15 +0000 2020,1232740425271824384,The New York City Young Democratic-Socialists ...,10,22,False,False,,en,864648993137098754,864648993137098754,New York City YDSA ðŸŒ¹,nycYDSA,"New York, NY",Youth of NYC â€“ Unite! âœŠ Sign up for our emails...,https://t.co/JegJu7EXYF,2056,771,Wed May 17 01:09:19 +0000 2017
4,Tue Dec 03 18:26:46 +0000 2019,1201930791506366467,Happy to announce that legendary battle rapper...,1,6,False,False,False,en,864648993137098754,864648993137098754,New York City YDSA ðŸŒ¹,nycYDSA,"New York, NY",Youth of NYC â€“ Unite! âœŠ Sign up for our emails...,https://t.co/JegJu7EXYF,2056,771,Wed May 17 01:09:19 +0000 2017


In [48]:
def time_preprocessing(df, week_last_day, crisis_starting_week):
    """Takes raw data, adds timestamp, week number and crisis dummy variable
    Week last day - 'W-SAT'for USA; 'W-WED' for UK
    Crisis first week - 8 for US, 10 for UK"""
    # Extract timestamp from the textual "created_at" column
    df['timestamp'] = pd.to_datetime(df['created_at'], format='%a %b %d %H:%M:%S +0000 %Y')
    # Remove data not needed right now
    df = df[['timestamp', 'full_text', 'user.screen_name']]
    # Remove tweets not included in the period taken for research (01.11.2019-30.06.2020)
    df = df.loc[(df['timestamp'] > '2019-12-01') & (df['timestamp'] < '2020-05-31')].reset_index(drop=True)
    # Add period
    df['period'] = df['timestamp'].dt.to_period(week_last_day)
    df = df.sort_values('period').reset_index(drop=True)
    # Add case week number
    i = 1
    df['week_number'] = 0
    for group_index, group in df.groupby('period'):
        df.loc[df['period']==group_index, 'week_number'] = i
        i += 1
    # Add crisis parameter
    df['crisis'] = 1
    df.loc[df['week_number']<crisis_starting_week, 'crisis'] = 0
    
    return df

In [49]:
df_time_preprocessed = time_preprocessing(df, 'W-SAT', 8)

In [52]:
df_time_preprocessed

Unnamed: 0,timestamp,full_text,user.screen_name,period,week_number,crisis
0,2019-12-02 20:14:03,Many of these with genocidal repercussions whi...,unc_ydsa,2019-12-01/2019-12-07,1,0
1,2019-12-02 19:35:32,RT @aidachavez: is this your queen https://t.c...,NorthNJDSA,2019-12-01/2019-12-07,1,0
2,2019-12-02 19:37:54,"dont know who needs to hear this, but rich peo...",NorthNJDSA,2019-12-01/2019-12-07,1,0
3,2019-12-02 19:39:21,baby yoda (handshake) all our comrades https:/...,NorthNJDSA,2019-12-01/2019-12-07,1,0
4,2019-12-02 19:39:49,"RT @AshAgony: ""For the second time this week, ...",NorthNJDSA,2019-12-01/2019-12-07,1,0
...,...,...,...,...,...,...
81444,2020-05-28 13:13:52,RT @DemSocialists: There is no justification f...,AuburnDSA,2020-05-24/2020-05-30,26,1
81445,2020-05-28 13:13:47,RT @DemSocialists: From our National Political...,AuburnDSA,2020-05-24/2020-05-30,26,1
81446,2020-05-30 14:52:35,RT @JonahFurman: This is the absolute best of ...,NYCDSA_Climate,2020-05-24/2020-05-30,26,1
81447,2020-05-28 16:12:36,RT @isaiah_kb: hello did you know using tear g...,AnchorageDSA,2020-05-24/2020-05-30,26,1


# 2. Textual preprocessing
- lowercase
- 1,3 Remove emojis, user-mentions, links
- 2 Spell correction
- 2 Expand contractions - exphrasis doesnt really work
- Remove punctuation
- (optional) Identify most frequent bigrams
- Remove stopwords bro

- last thing: nice tokenizer (keep hashtag together), but generally space-based

In [59]:
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.dicts.emoticons import emoticons

In [60]:
# Ekphraris preprocessing pipeline
text_processor = TextPreProcessor(
    # 1. Remove stuff
    normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
        'time', 'date', 'number'],
       
    # 2. Spell correction and contraction expansion
    corrector="twitter", 
    
    unpack_contractions=True,  # Unpack contractions (can't -> can not)
    spell_correction=True,  # spell correction
        
    # 3. Replace emojis with textual expressions
    dicts=[emoticons]
)

Reading twitter - 1grams ...


  regexes = {k.lower(): re.compile(self.expressions[k]) for k, v in


In [61]:
test

['Many of these with genocidal repercussions which nations still face today. Great video. https://t.co/d65auc365r',
 'RT @aidachavez: is this your queen https://t.co/ju2SfJHIAT',
 'dont know who needs to hear this, but rich people won\'t magically start "GETTING" to go to school in droves if working class/people who dont want to be in massive debt/those in poverty get the chance to go for free. that\'s literally not how life works',
 'baby yoda (handshake) all our comrades https://t.co/5BFWxXjyj0',
 'RT @AshAgony: "For the second time this week, a thin blue line flag was spotted on NYPD property. The framed banner was seen hanging from tâ€¦',
 "unless you've had the privilege of not having to think of any of these things. OR, say, being a rhodes scholar then going to harvard (yale? who cares) and trying to tell people how to live their lives",
 'RT @innocence: If you donate $10 or more to help us reach our goal to cover DNA testing expenses today, weâ€™ll send you a sticker â€” and gif

In [64]:
test_ekp = ["".join(text_processor.pre_process_doc(s)) for s in test]

In [65]:
test_ekp

['Many of these with genocidal repercussions which nations still face today. Great video. <url>',
 'RT <user> : is this your queen <url>',
 'dont know who needs to hear this, but rich people will not magically start "GETTING" to go to school in droves if working class/people who dont want to be in massive debt/those in poverty get the chance to go for free. that\'s literally not how life works',
 'baby yoda (handshake) all our comrades <url>',
 'RT <user> : "For the second time this week, a thin blue line flag was spotted on NYPD property. The framed banner was seen hanging from tâ€¦',
 'unless you have had the privilege of not having to think of any of these things. OR, say, being a rhodes scholar then going to harvard (yale? who cares) and trying to tell people how to live their lives',
 'RT <user> : If you donate <money> or more to help us reach our goal to cover DNA testing expenses today, weâ€™ll send you a sticker â€” and giftâ€¦',
 "<user> agree!!!! can not trust what our county

In [66]:
from ekphrasis.utils.nlp import unpack_contractions

In [67]:
unpack_contractions("agree!!!! can not trust what our county will do honestly and we do not, have not, will not ever support the carceral state but with or without us, it's happening so hope mr. zurofsky can do his best")

"agree!!!! can not trust what our county will do honestly and we do not, have not, will not ever support the carceral state but with or without us, it's happening so hope mr. zurofsky can do his best"

Save preprocessed dataset for further use.

In [39]:
df_time_preprocessed.to_pickle(f'data/{movement}_preprocessed')