In [1]:
# import relevant packages

import pandas as pd
import os
import re

from html import unescape
from sklearn.model_selection import train_test_split

pd.set_option('display.max_colwidth', None)

## Load raw datasets

In [2]:
df_dict = dict()

PATH = "./data/0_raw"

for file in os.listdir(PATH):
    if "ipynb" not in file:
        print(file)
        df_dict[file.removesuffix(".csv")] = pd.read_csv(f"{PATH}/{file}")

dynabench2021_english.csv
fortuna2019_portuguese.csv
basile2019_spanish.csv
sanguinetti2020_italian.csv


## Reformat columns
Need separate logic for different datasets. 1 is for hateful, 0 for non-hateful.

In [3]:
# Dynabench 2021 / English
df_dict["dynabench2021_english"].label.replace({"hate":1, "nothate":0}, inplace=True)

# Fortuna 2019 / Portuguese
df_dict["fortuna2019_portuguese"].rename(columns={"hatespeech_comb": "label"}, inplace=True)

# Basile 2019 / Spanish
df_dict["basile2019_spanish"].rename(columns={"HS": "label"}, inplace=True)

# Sanguinetti 2020 / Italian
df_dict["sanguinetti2020_italian"].rename(columns={"hs": "label"}, inplace=True)

# drop redundant columns
for dataset in df_dict:
    df_dict[dataset] = df_dict[dataset][["text", "label"]]

## Clean text

In [4]:
def clean_text(text):
    text = unescape(text)
    text = re.sub(r"@[A-Za-z0-9_-]+",'[USR]',text)
    text = re.sub(r"http\S+",'[URL]',text)
    text = re.sub(r"\n",' ',text)
    text = re.sub(r"\t",' ',text)
    text = text.strip()
    return text

for dataset in df_dict:
    df_dict[dataset]["text"] = df_dict[dataset].text.apply(lambda x: clean_text(x))

## Show descriptive stats

In [5]:
def descriptive_stats(df):
    n_total = df.shape[0]
    n_hate = df.label.sum()
    print("{} entries, of which {} ({:.2%}) are hateful.".format(n_total, n_hate, n_hate/n_total))
    return df.label.sum()/len(df), len(df)

for dataset in df_dict:
    print(dataset.upper())
    descriptive_stats(df_dict[dataset])
    display(df_dict[dataset].sample(5))
    print()

DYNABENCH2021_ENGLISH
41144 entries, of which 22175 (53.90%) are hateful.


Unnamed: 0,text,label
8825,"cover your boobs in public if you are breastfeeding, women can be so inappriopriate",1
674,there's no point in a tucute's existence. Let's get rid of them,1
28893,"hello everyone, Hi there, Is there anyone here who would like to chat with me? I have no friends! We can talk in email or PM on Reddit!, hey bro just tell me, are you a refugee ?",0
3311,This computer is nothing but useless junk,0
37081,I can do without homos,1



FORTUNA2019_PORTUGUESE
5670 entries, of which 1788 (31.53%) are hateful.


Unnamed: 0,text,label
4903,RT [USR]: ESSA MULHER É FODA DEMAIS!! ELA DEVERIA SER EXEMPLO PRA TODAS AS. MULHERES NO MUNDO! [URL],0
4440,RT [USR]: VAGABUNDO CHEGA AGARRANDO AS MENINA SEJEM CAVALHEIROS: OLA TUDO BEM VOCÊ É LINDA SEUS LABIOS DEVEM TER GOSTO DE MÉL VAMO _,1
3875,RT [USR]: [USR] Volta para a tua terra,1
2664,[USR] me fala na agenda feminista que dia é o protesto na frente da mesquita islamica que vou la gritar junto [USR],1
1771,"[USR] [USR] [USR] vai na páginad o Flavio, ele fez uma live explicando",0



BASILE2019_SPANISH
4950 entries, of which 2055 (41.52%) are hateful.


Unnamed: 0,text,label
31,me saqué un 9 en el final de biología para vos hija de puta q no me quisiste promocionar por unas centésimas EN TU CARA Y EN TU CANCHA,0
573,Callate zorra chupa pija,1
1933,[USR] [USR] [USR] Es que la puta libre lucha denuncia la explotación sexual. No tiene nada que ver! Por la explotación laboral en la industria textil vamos a abolir la industria textil? No. Vamos a perseguir la explotación,0
2691,"Pedro Sánchez cede y autoriza a Open Arms a desembarcar 87 inmigrantes ilegales - Rambla Libre. ""Que importancia tienen estas admisiones cuando por las costas de Cádiz, en apoyo de las mafias, estamos admitiendo a miles cada semana"" dice Pedro Sánchez. [URL]",0
27,[USR] Tu eres demasiado perra 😂,1



SANGUINETTI2020_ITALIAN
8100 entries, of which 3388 (41.83%) are hateful.


Unnamed: 0,text,label
7552,[USR] Le merde dei #rom non saranno mai come NOI. E tu caro [USR] per continuare a fare #televisione devi continuare a #leccare il culo ai maiali #sinistronzi del #pd. Sei solo un #quacquaracqua.,1
2205,#GABBIAOPEN l’8% delle nostre imprese è gestito da immigrati. E creano lavoro anche per gli italiani [USR] #welcomerefugees,0
116,"Ministro Esteri dell'Eritrea: ""L'Italia non vuole collaborare con noi su migranti. Perché"" - Il Fogliettone [URL]",0
7140,"SUI #MIGRANTI LA #SINISTRA CON LA BAVA ALLA BOCCA PARLA DI #DEPORTAZIONI E NAZISMO Un trasloco da una struttura che tra l'altro in passato aveva creato problemi fatto passare dalla sinistra come atto razzista, nazista, da lager. Ormai hanno oltrepassato il limite della decenza! [URL]",0
3126,"[USR] [USR] è vero...com'è vero che tutti i terroristi sono musulmani....sarà un caso, mai un valdese ha messo bombe!",1





## Create and export splits

In [6]:
# set aside 2.5k from each dataset for testing

TEST_SIZE = 2500

for dataset in df_dict:
    df_dict[dataset], testset = train_test_split(df_dict[dataset], test_size = TEST_SIZE, random_state=123)
    testset.to_csv(f"./data/1_clean/{dataset}/test_{TEST_SIZE}.csv", index=False)

In [7]:
# create differently-sized train portions from rest of data

SEEDS = 10 # for repeated experiments with different random data selection
N_RANGE = [10, 20, 50, 100, 200, 500, 1000, 2000, 5000, 10000, 20000]

for dataset in df_dict:
    for n in N_RANGE:
        if n<len(df_dict[dataset]):
            for random_state in range(1, SEEDS+1):
                df_dict[dataset].sample(n, random_state = random_state).to_csv(f"./data/1_clean/{dataset}/train/train_{n}_rs{random_state}.csv",index=False)