In [1]:
# import relevant packages

import pandas as pd
import os
import re

from html import unescape
from sklearn.model_selection import train_test_split

pd.set_option('display.max_colwidth', None)

## Load raw datasets

In [2]:
df_dict = dict()

PATH = "./data/0_raw"

for file in os.listdir(PATH):
    if "ipynb" not in file:
        print(file)
        df_dict[file.removesuffix(".csv")] = pd.read_csv(f"{PATH}/{file}")

dynabench2021_english.csv
fortuna2019_portuguese.csv
basile2019_spanish.csv
sanguinetti2020_italian.csv


## Reformat columns
Need separate logic for different datasets. 1 is for hateful, 0 for non-hateful.

In [3]:
# Dynabench 2021 / English
df_dict["dynabench2021_english"].label.replace({"hate":1, "nothate":0}, inplace=True)

# Fortuna 2019 / Portuguese
df_dict["fortuna2019_portuguese"].rename(columns={"hatespeech_comb": "label"}, inplace=True)

# Basile 2019 / Spanish
df_dict["basile2019_spanish"].rename(columns={"HS": "label"}, inplace=True)

# Sanguinetti 2020 / Italian
df_dict["sanguinetti2020_italian"].rename(columns={"hs": "label"}, inplace=True)

# drop redundant columns
for dataset in df_dict:
    df_dict[dataset] = df_dict[dataset][["text", "label"]]

## Clean text

In [4]:
def clean_text(text):
    text = unescape(text)
    text = re.sub(r"@[A-Za-z0-9_-]+",'[USR]',text)
    text = re.sub(r"http\S+",'[URL]',text)
    text = re.sub(r"\n",' ',text)
    text = re.sub(r"\t",' ',text)
    text = text.strip()
    return text

for dataset in df_dict:
    df_dict[dataset]["text"] = df_dict[dataset].text.apply(lambda x: clean_text(x))

## Show descriptive stats

In [5]:
def descriptive_stats(df):
    n_total = df.shape[0]
    n_hate = df.label.sum()
    print("{} entries, of which {} ({:.2%}) are hateful.".format(n_total, n_hate, n_hate/n_total))
    return df.label.sum()/len(df), len(df)

for dataset in df_dict:
    print(dataset.upper())
    descriptive_stats(df_dict[dataset])
    display(df_dict[dataset].sample(5))
    print()

DYNABENCH2021_ENGLISH
41144 entries, of which 22175 (53.90%) are hateful.


Unnamed: 0,text,label
32504,"REMINDER: You're under Constitutional obligation to give a sh*t about black people, hispanic people or anyone else...",0
1010,Gay men hate women,1
6682,chink is a fucking unethical concept,0
41024,Politicians are rubbish,0
30724,"The French have always had looser morals than the rest of the world, but were also the most accepting of homosexuality. It is because of it being degenerate behaviour, not a coincidence.",1



FORTUNA2019_PORTUGUESE
5670 entries, of which 1788 (31.53%) are hateful.


Unnamed: 0,text,label
1852,"[USR] pensava: sou horrorosa, gorda, tosca, feia, chata. ngm vai me querer. n mereço ngm alem dessa pessoa.",0
5513,"'Vai também ser lançado um manual de boas maneiras para lidar com fufas, gays e transsexuais, os chamados LGTB' - notícias matinais na rádio",1
2436,"Lívia Nepomuceno, FOX Sports. Só consigo olhar para uma coisa. Duas, no caso. [URL]",0
5065,"Sem dúvida que os nacionalistas falam a mesma linguagem. Cá como lá, ao lado das forças da ordem! [USR] _ [URL]",0
5415,"Uber de puta: a mulher não quer ser uma GP, mas precisa de uma grana, vai lá e dá uma. Mais barato e mais prático Fica a ideia ae",1



BASILE2019_SPANISH
6600 entries, of which 2739 (41.50%) are hateful.


Unnamed: 0,text,label
1081,[USR] TU ERES UNA PUTA DE NOM GUARRA.,1
373,Puta madre😩 mi novia es una belleza 😍😩,0
2813,"-En que se parece una mujer a una foca ? -En que una es gorda, tiene bigote y huele a pescado y la otra vive en el mar",1
6284,"[USR] La ley debe ser para todos igual, cualquier preferencia convierte la ley en papel sin valor. Soy inmigrante ahora y me apego a la legalidad q me exigen, pero aquí hay preferencias ""humanitarias"" con sesgo político",0
1684,CÁLLATE PUTA ZORRA BORRACHA DROGADICTA [URL],1



SANGUINETTI2020_ITALIAN
8100 entries, of which 3388 (41.83%) are hateful.


Unnamed: 0,text,label
224,"[USR] non vogliamo rom disonesti, ok, io non voglio neanche italiani disonesti, tutto qui.",0
5630,Dedicato alla sinistra che giustifica il reato di favoreggiamento dell'immigrazione clandestina e sputa sulla legalità. Vergogna buonista. [URL],0
3689,"Ogni #suicidio che avviene per colpa della #crisi deve fare #riflettere il #governo.pagare i #terroristi si, #imprendiotri #impiccati",1
7262,"[USR] Lo so, lo so, alla gente piaciono i morbidi, i paciosi, quelli che ti dicono ""va tutto bene"" anche quando stai per morire. Capisco. Il problema è che qui si sta demolendo tutto e l'attenzione è solo su sti cazzo di migranti. Affondatele quelle cazzo di navi perdiana.",1
995,#Milano ville abusive dei nomadi [URL] “Se costruisco io demoliscono in un giorno se costruiscono rom non succede nulla”,0





## Create and export splits

In [6]:
# set aside 2.5k from each dataset for testing

TEST_SIZE = 2500

for dataset in df_dict:
    df_dict[dataset], testset = train_test_split(df_dict[dataset], test_size = TEST_SIZE, random_state=123)
    testset.to_csv(f"./data/1_clean/{dataset}/test_{TEST_SIZE}.csv", index=False)

In [7]:
# create differently-sized train portions from rest of data

SEEDS = 10 # for repeated experiments with different random data selection
N_RANGE = [10, 20, 50, 100, 200, 500, 1000, 2000, 5000, 10000, 20000]

for dataset in df_dict:
    for n in N_RANGE:
        if n<len(df_dict[dataset]):
            for random_state in range(1, SEEDS+1):
                df_dict[dataset].sample(n, random_state = random_state).to_csv(f"./data/1_clean/{dataset}/train/train_{n}_rs{random_state}.csv",index=False)