# PREPARING MODEL TRAINING DATA

In [4]:
# Initialise relevant packages
import pandas as pd
import pickle


# Text cleaning
from html import unescape
import re
import string
import wordsegment as ws
import emoji
ws.load() # load wordsegment's vocab once up front; ws.segment() is used further down to split hashtags

## Load Raw Datasets

In [5]:
# Read both raw corpora into a single dict keyed by dataset name.
#   davidson2017: CSV whose first column is used as the row index
#   founta2018:   tab-separated file, read with explicitly supplied column names
training_data = {
    'davidson2017': pd.read_csv(
        './Data/Raw Training Data/davidson2017.csv',
        index_col=0,
    ),
    'founta2018': pd.read_csv(
        './Data/Raw Training Data/founta2018.csv',
        names=['text', 'label', 'count_label_votes'],
        delimiter='\t',
    ),
}

## Tidy Up Data Format

In [6]:
# Dataset-specific formatting: every frame should expose `text` and `label` columns.

# Davidson 2017: the label lives in "class" and the text in "tweet"
davidson_column_map = {"class": "label", "tweet": "text"}
training_data['davidson2017'].rename(
    columns=davidson_column_map,
    inplace=True,
    errors='ignore',
)

# Founta 2018: columns were already named `text` / `label` when the file was read

In [7]:
# Bring every dataset into the same three-column layout: id, text, label.
for dataset, frame in training_data.items():

    # Turn the current index into an `id` column. The guard makes this cell safe
    # to re-run: without it, a second execution would reset the index again and
    # leave the frame with two `id` columns.
    if 'id' not in frame.columns:
        frame = frame.reset_index().rename(columns={'index': 'id'}, errors='ignore')

    # drop unnecessary columns and tidy up column types
    # (re-binding an existing key while iterating does not change the dict's size)
    training_data[dataset] = frame[['id', 'text', 'label']].convert_dtypes()

## Perform Basic Text Cleaning

In [8]:
# Helper used as the replacement callback when hashtags are found through regex
def regex_match_segmentation(match):
    """Split the text of a regex match into words.

    Parameters
    ----------
    match : re.Match
        Match object handed over by ``re.sub``; the full matched text
        (``match.group(0)``) is passed to ``ws.segment``.

    Returns
    -------
    str
        The segments returned by ``ws.segment``, joined by single spaces.
    """
    segments = ws.segment(match.group(0))
    return ' '.join(segments)

In [9]:
# Helper for emoji detection that works across releases of the `emoji` package
def _is_emoji_char(char):
    """Return True if the single character ``char`` is listed as an emoji.

    The lookup table of the ``emoji`` package changed shape between releases:
    early versions expose a flat ``UNICODE_EMOJI`` dict, 1.x nests that dict by
    language code (so a plain ``char in emoji.UNICODE_EMOJI`` never matches), and
    newer releases drop the attribute and provide ``emoji.is_emoji`` instead.
    """
    is_emoji = getattr(emoji, 'is_emoji', None)
    if is_emoji is not None:
        return is_emoji(char)
    table = emoji.UNICODE_EMOJI
    # nested-by-language table -> use the English one; flat table -> use as is
    return char in table.get('en', table)


# Define function for cleaning text
def clean_text(text):
    """Normalise one raw tweet for model training.

    Steps, in order: decode HTML entities, lowercase, replace @-mentions with
    ``[USER]``, URLs with ``[URL]`` and emoji characters with ``[EMOJI]``,
    split hashtags into words, drop leading ``!`` characters and turn newlines
    and tabs into spaces.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned text. The special tokens stay upper-case because they are
        inserted after lowercasing.
    """

    # convert HTML codes
    text = unescape(text)

    # lowercase text
    text = text.lower()

    # replace mentions, URLs and emojis with special token
    text = re.sub(r"@[A-Za-z0-9_-]+", '[USER]', text)
    text = re.sub(r"http\S+", '[URL]', text)
    # emojis are matched one character at a time
    text = ''.join(' [EMOJI] ' if _is_emoji_char(char) else char for char in text).strip()

    # find and split hashtags into words
    text = re.sub(r"#[A-Za-z0-9]+", regex_match_segmentation, text)

    # remove punctuation at beginning of string (quirk in Davidson data)
    text = text.lstrip("!")

    # remove newline and tab characters
    text = text.replace('\n', ' ')
    text = text.replace('\t', ' ')

    return text

In [10]:
# run the text cleaner over the `text` column of every dataset
for dataset, frame in training_data.items():
    frame['text'] = frame['text'].apply(clean_text)

## Export Multiclass Data

In [11]:
# give multiclass labels string names for clarity
# Davidson et al. (2017) --> 0 is "hate speech", 1 is "offensive language", 2 is "neither"
# The replaced column is assigned back explicitly: an in-place replace on the
# `.label` accessor is a chained update, which recent pandas warns about and which
# leaves the frame untouched once Copy-on-Write is active.
davidson = training_data['davidson2017']
davidson['label'] = davidson['label'].replace({0: "hateful", 1: "offensive", 2: "neither"})

# print class frequencies for each dataset
for dataset in training_data:
    print(dataset)
    print(training_data[dataset].groupby('label').id.count(), '\n')

# save dictionary of cleaned datasets to pickle
# (context manager so the file is flushed and closed as soon as the dump finishes)
with open('./Data/Clean Training Data/training_data_multiclass.pkl', 'wb') as pkl_file:
    pickle.dump(training_data, pkl_file)

davidson2017
label
hateful       1430
neither       4163
offensive    19190
Name: id, dtype: int64 

founta2018
label
abusive    27150
hateful     4965
normal     53851
spam       14030
Name: id, dtype: int64 



## Convert to Binary Classification Task

In [12]:
# GOAL: hateful (1) and non-hateful (0)
# Each replaced column is assigned back to its frame instead of using an in-place
# replace on the `.label` accessor, which is a chained update that recent pandas
# warns about and that has no effect once Copy-on-Write is active.
# The explicit int64 cast keeps the binary labels numeric even on pandas versions
# that no longer downcast after `replace`; it raises if a label was left unmapped.

# Davidson et al. (2017) --> "hateful", "offensive", "neither"
davidson = training_data['davidson2017']
davidson['label'] = (
    davidson['label']
    .replace({'hateful': 1, 'offensive': 0, 'neither': 0})
    .astype('int64')
)

# Founta et al. (2018) --> "hateful", "abusive", "normal", "spam"
founta = training_data['founta2018']
founta['label'] = (
    founta['label']
    .replace({'hateful': 1, "abusive": 0, "normal": 0, "spam": 0})
    .astype('int64')
)

## Export Binary Data

In [13]:
# print class frequencies for each dataset
for dataset in training_data:
    print(dataset)
    print(training_data[dataset].groupby('label').id.count(), '\n')

# save dictionary of cleaned datasets to pickle
# (context manager so the file is flushed and closed as soon as the dump finishes)
with open('./Data/Clean Training Data/training_data_binary.pkl', 'wb') as pkl_file:
    pickle.dump(training_data, pkl_file)

davidson2017
label
0    23353
1     1430
Name: id, dtype: int64 

founta2018
label
0    95031
1     4965
Name: id, dtype: int64 

