In [2]:
import pandas as pd
import re
import os

In [3]:
data_folder_path = '/home/manuto/Documents/world_bank/bert_twitter_labor/code/twitter/data/may5_7Klabels/data_binary_pos_neg_balanced_removed_allzeros'

# Preprocess text data

## Import data

In [4]:
df = pd.read_csv(os.path.join(data_folder_path,'train_is_hired_1mo.csv'), index_col=0, encoding='utf-8')

In [5]:
df.head()

Unnamed: 0_level_0,text,class
id,Unnamed: 1_level_1,Unnamed: 2_level_1
339,I am looking for a local tweeter for hire? Loo...,0
3046,Ive learned more &amp; talked more with my co-...,1
4825,my heart dont work Im unemployed,0
5025,Gave two weeks notice 3 weeks ago. Started new...,1
4641,I just got hired at TGIF at the Rim and I alre...,1


In [3]:
df[[not text.isascii() for text in df.TweetText]].InformationType_coarse.value_counts()

not related or not informative         5612
other useful information               4332
donations and volunteering             2462
affected individuals                   1676
sympathy and support                   1247
infrastructure and utilities damage     994
caution and advice                      655
Name: InformationType_coarse, dtype: int64

In [4]:
df[[text.isascii() for text in df.TweetText]].InformationType_coarse.value_counts()

not related or not informative         20173
other useful information               14545
donations and volunteering              6463
affected individuals                    6333
sympathy and support                    3773
infrastructure and utilities damage     3565
caution and advice                      2516
Name: InformationType_coarse, dtype: int64

## Remove URL, RT, mention(@)

In [5]:
df.ProcessedText = df.TweetText.str.replace(r'http(\S)+', r'')
df.ProcessedText = df.ProcessedText.str.replace(r'http ...', r'')

In [6]:
df.ProcessedText[df.ProcessedText.str.contains(r'http')]

Series([], Name: ProcessedText, dtype: object)

In [7]:
df.ProcessedText = df.ProcessedText.str.replace(r'(RT|rt)[ ]*@[ ]*[\S]+',r'')

In [8]:
df.ProcessedText[df.ProcessedText.str.contains(r'RT[ ]?@')]

Series([], Name: ProcessedText, dtype: object)

In [9]:
df.ProcessedText = df.ProcessedText.str.replace(r'@[\S]+',r'')

## Remove non-ascii words or characters

In [10]:
df.ProcessedText = [''.join([i if ord(i) < 128 else '' for i in text]) for text in df.ProcessedText]

In [11]:
df.ProcessedText = df.ProcessedText.str.replace(r'_[\S]?',r'')

## Remove extra space

In [12]:
df.ProcessedText = df.ProcessedText.str.replace(r'[ ]{2, }',r' ')

## &, < and >

In [13]:
df.ProcessedText = df.ProcessedText.str.replace(r'&amp;?',r'and')

In [14]:
df.ProcessedText = df.ProcessedText.str.replace(r'&lt;',r'<')
df.ProcessedText = df.ProcessedText.str.replace(r'&gt;',r'>')

## Insert space between words and punctuation marks

In [15]:
df.ProcessedText = df.ProcessedText.str.replace(r'([\w\d]+)([^\w\d ]+)', r'\1 \2')

In [16]:
df.ProcessedText = df.ProcessedText.str.replace(r'([^\w\d ]+)([\w\d]+)', r'\1 \2')

## Lowercased and strip

In [17]:
df.ProcessedText = df.ProcessedText.str.lower()

In [18]:
df.ProcessedText = df.ProcessedText.str.strip()

## Calculate text length for later use in LSTM

In [19]:
df.ProcessedText_length = [len(text.split(' ')) for text in df.ProcessedText]

In [20]:
df.ProcessedText_length.value_counts()

14     3867
13     3830
15     3807
18     3736
16     3709
12     3700
17     3658
20     3635
19     3564
21     3555
22     3467
11     3390
10     3183
23     3084
9      2881
24     2631
8      2533
25     2372
26     1979
7      1973
27     1563
6      1547
28     1274
5      1179
29      910
4       867
30      762
31      496
32      366
33      271
34      161
35      115
36      101
37       41
38       39
42       23
39       21
41        9
40        8
43        7
45        7
46        5
50        3
51        3
52        3
44        2
48        2
53        1
56        1
55        1
60        1
47        1
101       1
63        1
Name: ProcessedText_length, dtype: int64

## Drop texts with length <=3 and drop duplicates

In [21]:
df = df[df.ProcessedText_length>3]

In [22]:
df = df.drop_duplicates(subset=['ProcessedText'])

## Summary of sample size and labels

In [23]:
df.shape[0]

74346

In [24]:
df.InformationType_coarse.value_counts()

not related or not informative         25785
other useful information               18877
donations and volunteering              8925
affected individuals                    8009
sympathy and support                    5020
infrastructure and utilities damage     4559
caution and advice                      3171
Name: InformationType_coarse, dtype: int64

## BERT preprocess

In [25]:
df['ProcessedText_BERT'] = '[CLS] '+df.ProcessedText

In [26]:
from pytorch_pretrained_bert import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
df['ProcessedText_BERTbase_length'] = [len(tokenizer.tokenize(sent)) for sent in df.ProcessedText_BERT]

tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
df['ProcessedText_BERTlarge_length'] = [len(tokenizer.tokenize(sent)) for sent in df.ProcessedText_BERT]

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


## Int label for later use in softmax and cross entropy loss

In [27]:
label_dict = dict()
for i, l in enumerate(list(df.InformationType_coarse.value_counts().keys())):
    label_dict.update({l: i})

df['InformationType_label'] = [label_dict[label] for label in df.InformationType_coarse]

## Save data

In [28]:
df.to_csv('en_disaster.csv')