# BERT_Preprocessing

In [1]:
#import libraries needed for preprocessing
import numpy as np
import pandas as pd
import nltk
import re
from collections import Counter
from tqdm import tqdm

## Load OvensEval Dataset

In [2]:
# Load Data as data fragem with pandas
df = pd.read_csv('data/olid-training-v1.0.tsv', sep='\t')
pd.set_option('display.max_colwidth',0)
# Display 5 first rows
df.head()

Unnamed: 0,id,tweet,subtask_a,subtask_b,subtask_c
0,86426,@USER She should ask a few native Americans what their take on this is.,OFF,UNT,
1,90194,@USER @USER Go home youâ€™re drunk!!! @USER #MAGA #Trump2020 ðŸ‘ŠðŸ‡ºðŸ‡¸ðŸ‘Š URL,OFF,TIN,IND
2,16820,Amazon is investigating Chinese employees who are selling internal data to third-party sellers looking for an edge in the competitive marketplace. URL #Amazon #MAGA #KAG #CHINA #TCOT,NOT,,
3,62688,"@USER Someone should'veTaken"" this piece of shit to a volcano. ðŸ˜‚""",OFF,UNT,
4,43605,@USER @USER Obama wanted liberals &amp; illegals to move into red states,NOT,,


## Define Data X and Target Y

In [3]:
# Define Dataset X (Tweets) and targets Y(subtasl_a) of dataset
X = df.drop(columns=['subtask_b','subtask_c'])
y = df.drop(columns=['id', 'tweet','subtask_b', 'subtask_c'])
print('Input data of shape:     {}'.format(X.shape))
print('Target data of shape:    {}'.format(y.shape))
X.head()

Input data of shape:     (13240, 3)
Target data of shape:    (13240, 1)


Unnamed: 0,id,tweet,subtask_a
0,86426,@USER She should ask a few native Americans what their take on this is.,OFF
1,90194,@USER @USER Go home youâ€™re drunk!!! @USER #MAGA #Trump2020 ðŸ‘ŠðŸ‡ºðŸ‡¸ðŸ‘Š URL,OFF
2,16820,Amazon is investigating Chinese employees who are selling internal data to third-party sellers looking for an edge in the competitive marketplace. URL #Amazon #MAGA #KAG #CHINA #TCOT,NOT
3,62688,"@USER Someone should'veTaken"" this piece of shit to a volcano. ðŸ˜‚""",OFF
4,43605,@USER @USER Obama wanted liberals &amp; illegals to move into red states,NOT


## Remove special charakters

#### Remove @User 

In [4]:
#Remove @USER 
X['tweet'] = X['tweet'].apply(lambda x: ' '.join(re.sub("@USER"," ",x).split()))
#X.head(30)

#### Transform Lowercase

In [5]:
# Transform Lowercase
X['tweet'] = X['tweet'].apply(lambda x: x.lower())

#### Remove Emojis

In [6]:
# Remove Emojis
def deEmojify(inputString):
    return inputString.encode('ascii', 'ignore').decode('ascii')

X['tweet'] = X['tweet'].apply(lambda x: deEmojify(x))

X.head()

Unnamed: 0,id,tweet,subtask_a
0,86426,she should ask a few native americans what their take on this is.,OFF
1,90194,go home youre drunk!!! #maga #trump2020 url,OFF
2,16820,amazon is investigating chinese employees who are selling internal data to third-party sellers looking for an edge in the competitive marketplace. url #amazon #maga #kag #china #tcot,NOT
3,62688,"someone should'vetaken"" this piece of shit to a volcano. """,OFF
4,43605,obama wanted liberals &amp; illegals to move into red states,NOT


#### Remove Hashtags, url, amp

In [7]:
# Remove Hasttags, url,+ amp 
X['tweet'] = X['tweet'].apply(lambda x: ' '.join(re.sub("(@[A-Za-z0-9]+)|#|url|&amp"," ",x).split()))
X.head(10)
#X['tweet'] = X['tweet'].apply(lambda x: ' '.join(re.sub("([^0-9A-Za-z \[\]\t])|(\w+:\/\/\S+)"," ",x).split()))

Unnamed: 0,id,tweet,subtask_a
0,86426,she should ask a few native americans what their take on this is.,OFF
1,90194,go home youre drunk!!! maga trump2020,OFF
2,16820,amazon is investigating chinese employees who are selling internal data to third-party sellers looking for an edge in the competitive marketplace. amazon maga kag china tcot,NOT
3,62688,"someone should'vetaken"" this piece of shit to a volcano. """,OFF
4,43605,obama wanted liberals ; illegals to move into red states,NOT
5,97670,liberals are all kookoo !!!,OFF
6,77444,oh noes! tough shit.,OFF
7,52415,was literally just talking about this lol all mass shootings like that have been set ups. its propaganda used to divide us on major issues like gun control and terrorism,OFF
8,45157,buy more icecream!!!,NOT
9,13384,canada doesnt need another cuck! we already have enough looneyleft liberals f**king up our great country! qproofs trudeaumustgo,OFF


#### Remove set of special characters (Simon Hegelich)

In [8]:
# Remove special charakters (Credits Simon Hegelich)
def remove_characters_before_tokenization(sentence,
                                          keep_apostrophes=True):
    sentence = sentence.strip()
    if keep_apostrophes:
        PATTERN = r'["|#|$|&|*|%|@|(|)|~]'
        filtered_sentence = re.sub(PATTERN, r'', sentence)
    else:
        PATTERN = r'[^a-zA-Z0-9 .!;?]'
        filtered_sentence = re.sub(PATTERN, r'', sentence)
    return filtered_sentence

In [9]:
# Remove special charakters
X['tweet'] = X['tweet'].apply(lambda x: remove_characters_before_tokenization(x))
X.head()

Unnamed: 0,id,tweet,subtask_a
0,86426,she should ask a few native americans what their take on this is.,OFF
1,90194,go home youre drunk!!! maga trump2020,OFF
2,16820,amazon is investigating chinese employees who are selling internal data to third-party sellers looking for an edge in the competitive marketplace. amazon maga kag china tcot,NOT
3,62688,someone should'vetaken this piece of shit to a volcano.,OFF
4,43605,obama wanted liberals ; illegals to move into red states,NOT


## Expand Contractions

#### Load contraction map

In [10]:
# Load contraction_map
contraction_map = {"ain't": "is not", "aren't": "are not","can't": "cannot", 
                   "can't've": "cannot have", "'cause": "because", "could've": "could have", 
                   "couldn't": "could not", "couldn't've": "could not have","didn't": "did not", 
                   "doesn't": "does not", "don't": "do not", "hadn't": "had not", 
                   "hadn't've": "had not have", "hasn't": "has not", "haven't": "have not", 
                   "he'd": "he would", "he'd've": "he would have", "he'll": "he will", 
                   "he'll've": "he will have", "he's": "he is", "how'd": "how did", 
                   "how'd'y": "how do you", "how'll": "how will", "how's": "how is", 
                   "I'd": "I would", "I'd've": "I would have", "I'll": "I will", 
                   "I'll've": "I will have","I'm": "I am", "I've": "I have", 
                   "i'd": "i would", "i'd've": "i would have", "i'll": "i will", 
                   "i'll've": "i will have","i'm": "i am", "i've": "i have", 
                   "isn't": "is not", "it'd": "it would", "it'd've": "it would have", 
                   "it'll": "it will", "it'll've": "it will have","it's": "it is", 
                   "let's": "let us", "ma'am": "madam", "mayn't": "may not", 
                   "might've": "might have","mightn't": "might not","mightn't've": "might not have", 
                   "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", 
                   "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", 
                   "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not",
                   "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", 
                   "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", 
                   "she's": "she is", "should've": "should have", "shouldn't": "should not", 
                   "shouldn't've": "should not have", "so've": "so have","so's": "so as", 
                   "this's": "this is",
                   "that'd": "that would", "that'd've": "that would have","that's": "that is", 
                   "there'd": "there would", "there'd've": "there would have","there's": "there is", 
                   "they'd": "they would", "they'd've": "they would have", "they'll": "they will", 
                   "they'll've": "they will have", "they're": "they are", "they've": "they have", 
                   "to've": "to have", "wasn't": "was not", "we'd": "we would", 
                   "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", 
                   "we're": "we are", "we've": "we have", "weren't": "were not", 
                   "what'll": "what will", "what'll've": "what will have", "what're": "what are", 
                   "what's": "what is", "what've": "what have", "when's": "when is", 
                   "when've": "when have", "where'd": "where did", "where's": "where is", 
                   "where've": "where have", "who'll": "who will", "who'll've": "who will have", 
                   "who's": "who is", "who've": "who have", "why's": "why is", 
                   "why've": "why have", "will've": "will have", "won't": "will not", 
                   "won't've": "will not have", "would've": "would have", "wouldn't": "would not", 
                   "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would",
                   "y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have",
                   "you'd": "you would", "you'd've": "you would have", "you'll": "you will", 
                   "you'll've": "you will have", "you're": "you are", "you've": "you have" }

#### Define contraction function

In [11]:
# function expanding contradictions
def expand_contractions(sentence, contraction_mapping):
    contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())), 
                                      flags=re.IGNORECASE|re.DOTALL)
    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contraction_mapping.get(match)\
                                if contraction_mapping.get(match)\
                                else contraction_mapping.get(match.lower())                       
        expanded_contraction = first_char+expanded_contraction[1:]
        return expanded_contraction
        
    expanded_sentence = contractions_pattern.sub(expand_match, sentence)
    return expanded_sentence

#### Apply contraction function

In [12]:
# call function expand contractions
X['tweet'] = X['tweet'].apply(lambda x: expand_contractions(x, contraction_map))

In [13]:
def words(text): return re.findall(r'\w+', text.lower())

WORDS = Counter(words(open('data/big.txt').read()))

def P(word, N=sum(WORDS.values())): 
    "Probability of `word`."
    return WORDS[word] / N

def correction(word): 
    "Most probable spelling correction for word."
    return max(candidates(word), key=P)

def candidates(word): 
    "Generate possible spelling corrections for word."
    return (known([word]) or known(edits1(word)) or known(edits2(word)) or [word])

def known(words): 
    "The subset of `words` that appear in the dictionary of WORDS."
    return set(w for w in words if w in WORDS)

def edits1(word):
    "All edits that are one edit away from `word`."
    letters    = 'abcdefghijklmnopqrstuvwxyz'
    splits     = [(word[:i], word[i:])    for i in range(len(word) + 1)]
    deletes    = [L + R[1:]               for L, R in splits if R]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1]
    replaces   = [L + c + R[1:]           for L, R in splits if R for c in letters]
    inserts    = [L + c + R               for L, R in splits for c in letters]
    return set(deletes + transposes + replaces + inserts)

def edits2(word): 
    "All edits that are two edits away from `word`."
    return (e2 for e1 in edits1(word) for e2 in edits1(e1))

In [14]:
tqdm.pandas()
X['tweet'] = X['tweet'].progress_apply(lambda x: ' '.join(correction(word) for word in x.split()))

100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 13240/13240 [34:18<00:00,  6.43it/s] 


###  Display preprocessed data set

In [15]:
X.head(50)

Unnamed: 0,id,tweet,subtask_a
0,86426,she should ask a few native americans what their take on this is,OFF
1,90194,go home your drunk!!! magna trump2020,OFF
2,16820,amazon is investigation chinese employees who are selling internal data to third-party sellers looking for an edge in the competitive marketplace. amazon magna bag china trot,NOT
3,62688,someone should havetaken this piece of shit to a volcano,OFF
4,43605,drama wanted liberals a illegal to move into red states,NOT
5,97670,liberals are all koko !!!,OFF
6,77444,oh does tough shit,OFF
7,52415,was literally just talking about this ll all mass shooting like that have been set up its propaganda used to divide us on major issues like gun control and terrorist,OFF
8,45157,buy more icecream!!!,NOT
9,13384,canada doesn need another luck we already have enough looneyleft liberals king up our great country proofs trudeaumustgo,OFF


### Save Dataset to CSV files

In [17]:
X.to_csv('data/X_processed.csv',index=False)
y.to_csv('data/y_processed.csv',index=False)