# PREPARING MODEL TRAINING DATA

In [85]:
# Initialise relevant packages
import pandas as pd
import pickle


# Text cleaning
from html import unescape
import re
import string
import wordsegment as ws
import emoji
import os
from sklearn.model_selection import train_test_split
ws.load() # load vocab for word segmentation

## Load Raw Datasets

In [86]:
# Read the three raw corpora into a dict keyed by dataset name.
# - Davidson: first CSV column is used as the index.
# - Founta: tab-separated file; column names are supplied here.
# - Dynabench: third CSV column is used as the index.
training_data = {
    'davidson2017': pd.read_csv(
        './Data/Raw Training Data/davidson2017.csv',
        index_col=0,
    ),
    'founta2018': pd.read_csv(
        './Data/Raw Training Data/founta2018.csv',
        names=['text', 'label', 'count_label_votes'],
        delimiter='\t',
    ),
    'dynabench2021': pd.read_csv(
        './Data/Raw Training Data/dynabench_dataset_raw.csv',
        index_col=2,
    ),
}

In [87]:
# Round 1 of Dynabench is considered low quality, so it is excluded from training.
dynabench_all_rounds = training_data['dynabench2021']
not_round_one = dynabench_all_rounds["round"] != "1"
training_data['dynabench2021'] = dynabench_all_rounds[not_round_one]
print("dropped round 1 data from dynabench")

dropped round 1 data from dynabench


In [88]:
# Dynabench ships with its own train/test/dev assignment. Capture the rows of each
# split up front (index moved into a column) so they can be merged back onto the
# cleaned data at the end.
dynabench_frame = training_data['dynabench2021']
split_column = dynabench_frame["split"]

dynabench2021_train = dynabench_frame[split_column == "train"].reset_index()
dynabench2021_test = dynabench_frame[split_column == "test"].reset_index()
dynabench2021_dev = dynabench_frame[split_column == "dev"].reset_index()

## Tidy Up Data Format

In [89]:
# Dataset-specific formatting so that every frame exposes 'text' and 'label'.

# Davidson 2017: rename its columns to the shared naming scheme
davidson_renames = {"class": "label", "tweet": "text"}
training_data['davidson2017'] = training_data['davidson2017'].rename(
    columns=davidson_renames, errors='ignore'
)

# Founta 2018: column names were already supplied when the file was loaded

# Dynabench 2021: column names already fit; only drop rows without text
# (a single NA entry according to the original note)
training_data['dynabench2021'] = training_data['dynabench2021'].dropna(subset=["text"])

In [90]:
# Bring every dataset into the same three-column layout: id, text, label.
for name in list(training_data):
    training_data[name] = (
        training_data[name]
        .reset_index()                                      # index becomes a regular column
        .rename(columns={'index': 'id'}, errors='ignore')   # an unnamed index surfaces as 'index'
        [['id', 'text', 'label']]                           # drop unnecessary columns
        .convert_dtypes()                                   # tidy up column types
    )

## Perform Basic Text Cleaning

In [91]:
# Define helper function for segmenting hashtags found through regex
def regex_match_segmentation(match):
    """Return the text of a regex match split into space-separated words.

    Intended as the replacement callback for ``re.sub``: the full matched
    string (group 0) is passed to ``ws.segment`` and the resulting pieces are
    joined with single spaces.
    """
    matched_text = match.group(0)
    words = ws.segment(matched_text)
    return ' '.join(words)

In [92]:
# Define function for cleaning text
def _emoji_predicate():
    """Return a function telling whether a single character is an emoji.

    The ``emoji`` package changed its public API over time:
    ``UNICODE_EMOJI`` started as a flat ``{emoji: name}`` dict, later became a
    dict keyed by language code, and was removed in 2.0 in favour of
    ``is_emoji``. Resolve whichever variant is installed so the cleaner
    neither crashes nor silently stops replacing emojis.
    """
    is_emoji = getattr(emoji, "is_emoji", None)
    if is_emoji is not None:
        return is_emoji

    table = getattr(emoji, "UNICODE_EMOJI_ENGLISH", None)
    if table is None:
        table = emoji.UNICODE_EMOJI
        # language-keyed layout: the actual emoji table sits under 'en'
        if isinstance(table, dict) and isinstance(table.get("en"), dict):
            table = table["en"]
    return lambda char: char in table


def clean_text(text):
    """Normalise one raw post for model training.

    Steps, in order: decode HTML entities, lowercase, replace @-mentions with
    ``[USER]``, URLs with ``[URL]`` and each emoji character with ``[EMOJI]``
    (padded with spaces), split hashtags into words, strip leading ``!``
    characters, and turn newlines/tabs into spaces.

    Parameters
    ----------
    text : str
        Raw text of a single entry.

    Returns
    -------
    str
        The cleaned text.
    """
    # convert HTML codes
    text = unescape(text)

    # lowercase before inserting the upper-case special tokens
    text = text.lower()

    # replace mentions, URLs and emojis with special token
    text = re.sub(r"@[A-Za-z0-9_-]+", '[USER]', text)
    text = re.sub(r"http\S+", '[URL]', text)
    is_emoji_char = _emoji_predicate()
    text = ''.join(' [EMOJI] ' if is_emoji_char(char) else char for char in text).strip()

    # find and split hashtags into words
    text = re.sub(r"#[A-Za-z0-9]+", regex_match_segmentation, text)

    # remove punctuation at beginning of string (quirk in Davidson data)
    text = text.lstrip("!")

    # remove newline and tab characters
    text = text.replace('\n', ' ')
    text = text.replace('\t', ' ')

    return text

In [93]:
%%time
# apply text cleaner to text columns for each dataset
for dataset in training_data:
    print(dataset)
    training_data[dataset]['text']=training_data[dataset].text.apply(clean_text)

davidson2017
founta2018
dynabench2021
CPU times: user 43.7 s, sys: 58.1 ms, total: 43.8 s
Wall time: 43.8 s


## Export Multiclass Data
## PR: let's skip the multiclass data for now and just focus on the binary case

In [94]:
# give multiclass labels string names for clarity
# Davidson et al. (2017) --> 0 is "hate speech", 1 is "offensive language", 2 is "neither"
# The result is assigned back to the column: an in-place replace on `df.label`
# is a chained update that newer pandas versions do not propagate to the frame.
davidson_label_names = {0: "hateful", 1: "offensive", 2: "neither"}
training_data['davidson2017']['label'] = (
    training_data['davidson2017']['label'].replace(davidson_label_names)
)

# print class frequencies for each dataset
for dataset in training_data:
    print(dataset)
    print(training_data[dataset].groupby('label').id.count(), '\n')

# save dictionary of cleaned datasets to pickle
# (context manager so the file handle is flushed and closed deterministically)
with open('./Data/Clean Training Data/training_data_multiclass.pkl', 'wb') as pickle_file:
    pickle.dump(training_data, pickle_file)

davidson2017
label
hateful       1430
neither       4163
offensive    19190
Name: id, dtype: int64 

founta2018
label
abusive    27150
hateful     4965
normal     53851
spam       14030
Name: id, dtype: int64 

dynabench2021
label
hate       15066
nothate    15031
Name: id, dtype: int64 



## Convert to Binary Classification Task

In [95]:
# GOAL: hateful (1) and non-hateful (0)
#
# NOTE: the Davidson mapping expects the string labels produced by the
# multiclass cell above ("hateful"/"offensive"/"neither"); if that cell is
# skipped, the numeric Davidson labels are left untouched by this step.
binary_label_maps = {
    # Davidson et al. (2017) --> "hateful", "offensive", "neither"
    'davidson2017': {'hateful': 1, 'offensive': 0, 'neither': 0},
    # Founta et al. (2018) --> "hateful", "abusive", "normal", "spam"
    'founta2018': {'hateful': 1, "abusive": 0, "normal": 0, "spam": 0},
    # Dynabench (2021) --> "hate"/"not hate" --> 1/0
    'dynabench2021': {'hate': 1, 'nothate': 0},
}

# Assign the replaced column back instead of calling replace(inplace=True) on
# `df.label`: the chained in-place form is not guaranteed to update the frame
# in newer pandas versions.
for dataset_name, label_map in binary_label_maps.items():
    training_data[dataset_name]['label'] = training_data[dataset_name]['label'].replace(label_map)

## Create Concatenation of Datasets

In [96]:
# tag every row with the name of the dataset it came from
for name, frame in training_data.items():
    frame['dataset'] = name

In [97]:
# stack the three source datasets into one frame; each frame keeps its own row index
source_names = ('davidson2017', 'founta2018', 'dynabench2021')
frames = [training_data[source] for source in source_names]

training_data['combined'] = pd.concat(frames)
training_data['combined']

Unnamed: 0,id,text,label,dataset
0,0,rt [USER]: as a woman you shouldn't complain ...,0,davidson2017
1,1,rt [USER]: boy dats cold...tyga dwn bad for c...,0,davidson2017
2,2,rt [USER] dawg!!!! rt [USER]: you ever fuck a...,0,davidson2017
3,3,rt [USER]: [USER] she look like a tranny,0,davidson2017
4,4,rt [USER]: the shit you hear about me might b...,0,davidson2017
...,...,...,...,...
30092,r5.32p,i have that opinion. absolutely everyone i act...,0,dynabench2021
30093,r5.379,"this dark female, i am not going to use the hu...",1,dynabench2021
30094,r5.4042p,i guess you are assuming that the french laws ...,1,dynabench2021
30095,r5.4641,"i mean, theres no one better right now to give...",0,dynabench2021


## Perform the datasplit

### Generating the directories 
Create an output directory (with a `Raw` subfolder) for every dataset name in the `training_data` dict.

In [98]:
# Make sure Data/<dataset>/Raw exists for every dataset in the dict.
# os.makedirs creates missing parents (including "Data" itself) and, unlike the
# previous mkdir pair, also repairs a dataset directory whose Raw folder is missing.
for directory_name in training_data.keys():
    print(directory_name)
    raw_directory = f"Data/{directory_name}/Raw"
    if os.path.isdir(raw_directory):
        print("we already have the directory")
    else:
        os.makedirs(raw_directory, exist_ok=True)

davidson2017
we already have the directory
founta2018
we already have the directory
dynabench2021
we already have the directory
combined
we already have the directory


### Save the cleaned dynabench
(This step is specific to Dynabench: it comes with a predefined train/dev/test split, whereas the other datasets are not pre-split and are split further below.)

In [99]:
# display the cleaned Davidson frame (id, text, binary label, dataset name)
training_data["davidson2017"]

Unnamed: 0,id,text,label,dataset
0,0,rt [USER]: as a woman you shouldn't complain ...,0,davidson2017
1,1,rt [USER]: boy dats cold...tyga dwn bad for c...,0,davidson2017
2,2,rt [USER] dawg!!!! rt [USER]: you ever fuck a...,0,davidson2017
3,3,rt [USER]: [USER] she look like a tranny,0,davidson2017
4,4,rt [USER]: the shit you hear about me might b...,0,davidson2017
...,...,...,...,...
24778,25291,you's a muthaf***in lie “[USER]: [USER] [USER]...,0,davidson2017
24779,25292,"you've gone and broke the wrong heart baby, an...",0,davidson2017
24780,25294,young buck wanna eat!!.. dat nigguh like i ain...,0,davidson2017
24781,25295,youu got wild bitches tellin you lies,0,davidson2017


In [100]:
# Re-attach the cleaned rows to each predefined Dynabench split through the shared
# "id" column. pd.merge defaults to an inner join, so ids that are no longer present
# in the cleaned frame drop out of the split.
cleaned_dynabench = training_data["dynabench2021"]
dynabench_trainning, dynabench_testing, dynabench_deving = (
    pd.merge(split_ids, cleaned_dynabench, on = "id")
    for split_ids in (
        dynabench2021_train["id"],
        dynabench2021_test["id"],
        dynabench2021_dev["id"],
    )
)

In [101]:
# display the merged Dynabench training split
dynabench_trainning

Unnamed: 0,id,text,label,dataset
0,19638p,there are too many costas in this country,0,dynabench2021
1,19640,there is way too many m0ng0l0id kids at my sch...,1,dynabench2021
2,19641p,i called my brother a twat and my mom got so m...,0,dynabench2021
3,19645,the worst of them are the taaaaaaaaaards becau...,1,dynabench2021
4,19645p,the worst of them are the viruses because they...,0,dynabench2021
...,...,...,...,...
24074,r5.997p,"as a tolerant man, i have some serious reserva...",0,dynabench2021
24075,r5.999,"they will get good at it, learn how to rape in...",1,dynabench2021
24076,r5.999p,"they will get good at it, learn how to work in...",0,dynabench2021
24077,r5.99p,the issue is when their weirdly pushy parents ...,0,dynabench2021


In [105]:
# Write the three Dynabench splits with identical settings. Previously the train
# file was the only one written without an explicit "\n" line terminator.
dynabench_splits = {
    "train": dynabench_trainning,
    "test": dynabench_testing,
    "valid": dynabench_deving,
}
for split_name, split_frame in dynabench_splits.items():
    out_path = f"./Data/dynabench2021/dynabench2021_{split_name}.csv"
    try:
        # pandas >= 1.5 spells the keyword `lineterminator`
        split_frame.to_csv(out_path, index=False, lineterminator="\n")
    except TypeError:
        # older pandas only knows `line_terminator`
        split_frame.to_csv(out_path, index=False, line_terminator="\n")

In [107]:
# read the exported Dynabench test split back in and display it
test = pd.read_csv(
    "./Data/dynabench2021/dynabench2021_test.csv",
    lineterminator = "\n",
)
test

Unnamed: 0,id,text,label,dataset
0,19637p,feminists? they are some of the most annoying ...,0,dynabench2021
1,19713p,james you are so odious please leave me alone,0,dynabench2021
2,19735p,fags stink almost as much as cigars,0,dynabench2021
3,19805p,dishonesty runs rife within republicans,0,dynabench2021
4,19902p,it makes me so happy to see naughty kids sent ...,0,dynabench2021
...,...,...,...,...
3004,r5.1959p,did you explain to her that it was because of ...,0,dynabench2021
3005,r5.1437,gays are very well known to be extremely sexua...,1,dynabench2021
3006,r5.2911,asian incels/anti-white male shills try to cal...,1,dynabench2021
3007,r5.961p,these people are the ones that really need to ...,0,dynabench2021


### Save the other datasets

In [22]:
def _write_csv_lf(frame, path, **to_csv_kwargs):
    """Write `frame` to `path` as CSV with "\\n" line endings.

    pandas renamed the `line_terminator` keyword of `to_csv` to
    `lineterminator` (1.5) and later removed the old name (2.0), so try the
    new spelling first and fall back to the old one on older installations.
    """
    try:
        frame.to_csv(path, lineterminator="\n", **to_csv_kwargs)
    except TypeError:
        frame.to_csv(path, line_terminator="\n", **to_csv_kwargs)


for directory_name in training_data.keys():
    print(directory_name)
    dataset_frame = training_data[directory_name]

    # every dataset gets a full cleaned export in its Raw folder (row index included)
    _write_csv_lf(dataset_frame, f"./Data/{directory_name}/Raw/{directory_name}_raw.csv")

    # Dynabench comes pre-split, so it is not split again here
    if directory_name == "dynabench2021":
        continue

    # stratified 80/10/10 split: hold out 20 %, then halve it into valid and test
    train, validation = train_test_split(
        dataset_frame, test_size=0.2, stratify=dataset_frame.label, random_state=123
    )
    valid, test = train_test_split(
        validation, test_size=0.5, stratify=validation.label, random_state=123
    )
    _write_csv_lf(train, f"./Data/{directory_name}/{directory_name}_train.csv", index=False)
    _write_csv_lf(test, f"./Data/{directory_name}/{directory_name}_test.csv", index=False)
    _write_csv_lf(valid, f"./Data/{directory_name}/{directory_name}_valid.csv", index=False)
        
        

davidson2017
founta2018
dynabench2021
combined
