# Load, clean and split labelled GQ data (Qian et al. 2019)

In [1]:
import pandas as pd
import csv
import datetime
import emoji
import re
import unicodedata

from html import unescape
from sklearn.model_selection import train_test_split

## Load data

In [2]:
# load texts from labelled corpus into set

# initialise empty lists --> faster than appending to dict
texts = []
hate_ids = []

with open('../../0_data/raw/qian_gab_raw.csv', 'r') as read_obj:
    csv_dict_reader = csv.DictReader(x.replace('\n', ' ') for x in read_obj)
    for row in csv_dict_reader:
        texts.append(row['text'])
        hate_ids.append(row['hate_speech_idx'])

In [3]:
%%time

# create dataframe from lists
texts = pd.Series(texts, name = 'text')
hate_ids = pd.Series(hate_ids, name = 'hate_ids')
sample_df = pd.concat([texts, hate_ids], axis=1)


# convert dtypes
sample_df['text']= sample_df.text.astype('string')

# print finished df
sample_df

CPU times: user 10.7 ms, sys: 6.01 ms, total: 16.7 ms
Wall time: 37.5 ms


Unnamed: 0,text,hate_ids
0,1. i joined gab to remind myself how retarded ...,[1]
1,1. This is what the left is really scared of. ...,[3]
2,1. It makes you an asshole. 2. Give it to a ...,[2]
3,1. So they manage to provide a whole lot of da...,[2]
4,"1. Hi there, i,m Keith, i hope you are doing w...",[3]
...,...,...
11820,1. Remember this 3 months ago? The intern that...,[2]
11821,1. #Sweden's four major dailies aftonbladet.se...,[2]
11822,1. Satisfaction and justice 2. BEST TRUMP INS...,[3]
11823,1. Twitter BANNED me and reported me to the FB...,"[2, 3]"


In [4]:
# create copy to work with
import_df = sample_df.copy()

In [5]:
# split text column by enumeration
series = import_df.text
for i in reversed(range(2,30)):
    series = series.str.split(f"{i}. \t").explode()

# merge text column with initial index
import_df = pd.DataFrame(series).merge(import_df.hate_ids, left_index=True, right_index=True).reset_index()

In [6]:
# create thread-internal id to match with hate_ids

def count_indices(c):
    return list(range(1,c+1))

ids = import_df.groupby('index').index.count().apply(lambda x: count_indices(x)).explode()
ids.name = 'id'

import_df = pd.concat([import_df, pd.DataFrame(ids).reset_index(drop=True)], axis=1)

In [7]:
# convert hate_ids to proper lists
import_df['hate_ids'] = import_df.hate_ids.apply(lambda elem: elem.strip("[]").split(", "))

In [8]:
# compare thread id to hate_id and label as hateful if there is a match
import_df['label'] = import_df.apply(lambda row: 1 if str(row.id) in row.hate_ids else 0, axis=1)

In [9]:
# desired output (from paper): 14614 hateful posts
import_df.groupby('label').label.count()

label
0    19162
1    14614
Name: label, dtype: int64

In [10]:
# select relevant columns and write to copy
gq_df = import_df[['text', 'label']].copy()

gq_df

Unnamed: 0,text,label
0,1. i joined gab to remind myself how retarded ...,1
1,1. This is what the left is really scared of. ...,0
2,That literally looks like a monkey. Why are we...,0
3,\tDumb Cunt,1
4,1. It makes you an asshole.,0
...,...,...
33771,\tfucking retard,1
33772,1. Twitter BANNED me and reported me to the FB...,0
33773,"I was routinely suspended for saying fuck, and...",1
33774,\tThey got me for dropping the cunt bomb... a ...,1


## Clean text

In [11]:
%%time

# Define function to clean text
def clean(text):

    # convert html
    text = unescape(text)
    
    # replace mentions, URLs and emojis with special token
    text = re.sub(r"@[A-Za-z0-9_-]+",'[USER]',text)
    text = re.sub(r"http\S+",'[URL]',text)
    text = ''.join('[EMOJI]' if (char in emoji.UNICODE_EMOJI['en']) else char for char in text).strip()
    
    # clean misformatting (e.g. "\xa0")
    text = unicodedata.normalize("NFKD", text)
    
    # remove newline and tab characters
    text = text.replace('\n',' ')
    text = text.replace('\t',' ')
    
    # remove leading "1. " (data artifact):
    if text.startswith('1.'):
        text = text[len('1.'):]
        
    # collapse whitespace into single whitespace
    text = re.sub(r'\s+', ' ', text)
    
    # remove leading and trailing whitespaces
    text = text.strip()
    
    return text

# create clean_text column
gq_df['clean_text'] = gq_df.text.apply(clean)

# save number of documents before dropping empty posts
n_docs = gq_df.shape[0]

# drop rows with empty text
gq_df = gq_df[gq_df.clean_text!=""]

print(f'{n_docs} posts, of which {n_docs - gq_df.shape[0]} were dropped for empty string content')
print(f'{gq_df.shape[0]} posts remain. \n')

# save number of documents before dropping posts that are just [URL], [EMOJI] or [USER]
n_docs = gq_df.shape[0]

33776 posts, of which 982 were dropped for empty string content
32794 posts remain. 

CPU times: user 1.45 s, sys: 22.6 ms, total: 1.48 s
Wall time: 1.49 s


In [12]:
# get overview of frequency of special tokens
for special_token in ['[USER]', '[URL]', '[EMOJI]']:
    print(f'{special_token} contained in {gq_df[gq_df.clean_text.str.contains(special_token)].shape[0]} documents')

[USER] contained in 17238 documents
[URL] contained in 13596 documents
[EMOJI] contained in 20974 documents


## TOTAL: Split and save sets

In [18]:
# full df
export_train, export_eval = train_test_split(gq_df[['clean_text', 'label']], train_size = 0.8, stratify = gq_df.label, random_state = 123)
export_train.to_csv('../../0_data/clean/labelled_gq/train_rand_26k.csv', index=False)
export_eval.to_csv('../../0_data/clean/labelled_gq/test_rand_6k.csv', index=False)
    
for size, name in [(20000, "20k"), (10000, "10k"), (5000, "5k"), (2000, "2k"), (1000, "1k")]:
    export_sample_train, _ = train_test_split(export_train, train_size = size, stratify = export_train.label, random_state = 123)
    export_train.to_csv(f'../../0_data/clean/labelled_gq/train_rand_{name}.csv', index=False)