# Load, clean and split labelled GHC data (Kennedy et al. 2020)

In [124]:
import pandas as pd
import csv
import datetime
import emoji
import re
import unicodedata

from html import unescape
from sklearn.model_selection import train_test_split

## Load data

In [125]:
import_df = pd.read_csv('../../0_data/raw/ghc_with_pushshift_meta_25850.tsv', sep='\t')

In [126]:
# select relevant columns and write to copy
ghc_df = import_df[['created_at', 'body', 'Text', 'Hate']].copy()

# fix datetime format
ghc_df['created_at'] = ghc_df.created_at.apply(lambda x: datetime.datetime.strptime(x.split('+')[0], '%Y-%m-%d %H:%M:%S'))

In [127]:
# create column with text that was actually annotated ("Text" in 2016-09, "body" afterwards)
text = ghc_df[ghc_df.created_at<'2016-10'].Text.append(ghc_df[ghc_df.created_at>'2016-10'].body)
text.name = 'text'
ghc_df = pd.concat([ghc_df, text], axis=1)

# remove now-redundant other text columns
ghc_df = ghc_df[['created_at','text','Hate']]

# rename 'Hate' to 'label'
ghc_df.rename(columns={'Hate':'label'}, inplace=True)

In [128]:
# total number of posts
print(f'total number of posts: {ghc_df.shape[0]} \n')

# number of posts by month
ghc_df.groupby(ghc_df.created_at.apply(lambda x: x.to_period('M'))).text.count().reset_index().rename(columns = {'text': 'number_posts'})

total number of posts: 25850 



Unnamed: 0,created_at,number_posts
0,2016-09,5176
1,2018-01,2108
2,2018-02,2126
3,2018-03,2069
4,2018-04,2060
5,2018-05,2086
6,2018-06,2111
7,2018-07,2062
8,2018-08,1941
9,2018-09,2036


In [129]:
# overall proportion of hateful posts
print('overall proportion of hateful posts: {:.1%} \n'.format(ghc_df[ghc_df.label==1].label.count()/ghc_df.shape[0]))

# proportion of hateful posts by month

gb = ghc_df.groupby([ghc_df.created_at.apply(lambda x: x.to_period('M')), 'label']).size()
prop_hateful_df = gb.to_frame('n_hateful')
prop_hateful_df['prop_hateful'] = gb/gb.groupby(level=0).sum()
prop_hateful_df.drop(0, level=1, inplace=True)
prop_hateful_df.reset_index(inplace=True)
prop_hateful_df.drop(columns=['label'])

overall proportion of hateful posts: 9.0% 



Unnamed: 0,created_at,n_hateful,prop_hateful
0,2016-09,376,0.072643
1,2018-01,190,0.090133
2,2018-02,181,0.085136
3,2018-03,190,0.091832
4,2018-04,186,0.090291
5,2018-05,232,0.111218
6,2018-06,224,0.106111
7,2018-07,214,0.103783
8,2018-08,167,0.086038
9,2018-09,179,0.087917


## Clean text

In [130]:
%%time

# Define function to clean text
def clean(text):

    # convert html
    text = unescape(text)
    
    # replace mentions, URLs and emojis with special token
    text = re.sub(r"@[A-Za-z0-9_-]+",'[USER]',text)
    text = re.sub(r"http\S+",'[URL]',text)
    text = ''.join('[EMOJI]' if (char in emoji.UNICODE_EMOJI['en']) else char for char in text).strip()
    
    # clean
    text = unicodedata.normalize("NFKD", text)
    
    return text

# create clean_text column
ghc_df['clean_text'] = ghc_df.text.apply(clean)


# save number of documents before dropping empty posts
n_docs = ghc_df.shape[0]

# drop rows with empty text
ghc_df = ghc_df[ghc_df.clean_text!=""]

print(f'{n_docs} posts, of which {n_docs - ghc_df.shape[0]} were dropped for empty string content')
print(f'{ghc_df.shape[0]} posts remain. \n')


# save number of documents before dropping posts that are just [URL], [EMOJI] or [USER]
n_docs = ghc_df.shape[0]

25850 posts, of which 0 were dropped for empty string content
25850 posts remain. 

CPU times: user 688 ms, sys: 9.87 ms, total: 698 ms
Wall time: 708 ms


In [131]:
# get overview of frequency of special tokens
for special_token in ['[USER]', '[URL]', '[EMOJI]']:
    print(f'{special_token} contained in {ghc_df[ghc_df.clean_text.str.contains(special_token)].shape[0]} documents')

[USER] contained in 16539 documents
[URL] contained in 13959 documents
[EMOJI] contained in 18143 documents


## Split and save sets

In [133]:
# split up df into train and eval with specified proportions, stratified by label
TRAIN_PROP = 0.8
export_train, export_eval = train_test_split(ghc_df[['clean_text', 'label']], train_size = TRAIN_PROP, stratify = ghc_df.label, random_state = 123)

# export as separate csvs
export_train.to_csv('../../0_data/clean/labelled_ghc/train_random.csv', index=False)
export_eval.to_csv('../../0_data/clean/labelled_ghc/eval_random.csv', index=False)

In [135]:
# split up df into train and eval with specified proportions, stratified by label
TRAIN_SIZE = 80
TEST_SIZE = 16
export_train, export_eval = train_test_split(ghc_df[['clean_text', 'label']], train_size = TRAIN_SIZE, test_size = TEST_SIZE, stratify = ghc_df.label, random_state = 123)

# export as separate csvs
export_train.to_csv('../../0_data/clean/labelled_ghc/train_random_small.csv', index=False)
export_eval.to_csv('../../0_data/clean/labelled_ghc/eval_random_small.csv', index=False)