In [1]:
# import relevant packages

import pandas as pd
import os
import re

from html import unescape
from sklearn.model_selection import train_test_split

# never truncate cell contents when displaying DataFrames, so that full texts stay readable
pd.set_option('display.max_colwidth', None)

## Load raw datasets

In [2]:
# Load every raw CSV found in PATH into a dict of DataFrames,
# keyed by the file name without its ".csv" extension.
df_dict = dict()

PATH = "./data/0_raw"

for file in os.listdir(PATH):
    # Skip Jupyter artefacts (e.g. .ipynb_checkpoints) and anything that is not a CSV file:
    # a stray non-CSV file would otherwise be handed to pd.read_csv and stored under a key
    # that still carries its extension.
    if "ipynb" not in file and file.endswith(".csv"):
        print(file)
        df_dict[file.removesuffix(".csv")] = pd.read_csv(os.path.join(PATH, file))

dynabench2021_english.csv
ousidhoum2019_french.csv
fortuna2019_portuguese.csv
basile2019_spanish.csv
ousidhoum2019_arabic.csv
sanguinetti2020_italian.csv


## Reformat columns
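Each dataset stores its labels differently, so each one needs its own conversion logic. After this step every dataset has a `text` column and a binary `label` column, where 1 means hateful and 0 means non-hateful.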

In [3]:
# Dynabench 2021 / English
# Map the string labels to integers and assign the result back to the frame.
# Calling .replace(..., inplace=True) on the Series obtained via attribute access is a chained
# in-place update, which does not write through to the DataFrame under pandas Copy-on-Write.
# Note: any value other than "hate" / "nothate" becomes NaN with .map.
df_dict["dynabench2021_english"]["label"] = df_dict["dynabench2021_english"]["label"].map({"hate": 1, "nothate": 0})

# Fortuna 2019 / Portuguese, Basile 2019 / Spanish, Sanguinetti 2020 / Italian
# these datasets already hold a label column, it only needs to be renamed to "label"
label_columns = {
    "fortuna2019_portuguese": "hatespeech_comb",
    "basile2019_spanish": "HS",
    "sanguinetti2020_italian": "hs",
}
for d, label_column in label_columns.items():
    df_dict[d] = df_dict[d].rename(columns={label_column: "label"})

# Ousidhoum 2019 / Arabic & French
for d in ["ousidhoum2019_arabic", "ousidhoum2019_french"]:
    # an entry counts as hateful (1) if its sentiment annotation contains "hateful", else 0
    df_dict[d]["label"] = df_dict[d]["sentiment"].str.contains("hateful", regex=False).astype(int)
    # text was already cleaned in a way that conflicts with our later cleaning, so we align it here
    df_dict[d]["text"] = (
        df_dict[d]["tweet"]
        .str.replace("@user", "[USR]", regex=False)
        .str.replace("@url", "[URL]", regex=False)
    )

# drop redundant columns
# .copy() makes each result an independent frame, so that later column assignments
# do not operate on a slice of the raw data (avoids SettingWithCopyWarning)
for dataset in df_dict:
    df_dict[dataset] = df_dict[dataset][["text", "label"]].copy()

## Clean text

In [4]:
def clean_text(text):
    """Normalise a single raw text string.

    HTML entities are unescaped first. Then, in this order, @-mentions are
    replaced by "[USR]", anything starting with "http" (up to the next
    whitespace) by "[URL]", and newlines and tabs by a single space each.
    Leading and trailing whitespace is stripped from the result.
    """
    substitutions = (
        (r"@[A-Za-z0-9_-]+", "[USR]"),  # user mentions
        (r"http\S+", "[URL]"),          # links
        (r"\n", " "),                   # line breaks
        (r"\t", " "),                   # tabs
    )
    cleaned = unescape(text)
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

# run the cleaning function over the text column of every dataset
for name, frame in df_dict.items():
    frame["text"] = frame["text"].apply(clean_text)

## Show descriptive stats

In [5]:
def descriptive_stats(df):
    """Print and return the size and hateful share of a dataset.

    Parameters
    ----------
    df : DataFrame with a `label` column whose sum is the number of hateful entries.

    Returns
    -------
    tuple of (share of hateful entries, number of entries)
    """
    n_total = len(df)
    n_hate = df["label"].sum()
    share_hate = n_hate / n_total
    print("{} entries, of which {} ({:.2%}) are hateful.".format(n_total, n_hate, share_hate))
    return share_hate, n_total

# for each dataset: print its name and label statistics, then show five random rows
for name, frame in df_dict.items():
    print(name.upper())
    descriptive_stats(frame)
    display(frame.sample(5))
    print()

DYNABENCH2021_ENGLISH
41144 entries, of which 22175 (53.90%) are hateful.


Unnamed: 0,text,label
35067,women are only good for $ex,1
2934,tennis is not for you as your muscles are not strong,0
19344,Men staying at home is a danger to women apparently. Then tell those women to leave!,1
24391,Totally agree. I could never find a woman or man attractive to go out with. I might think they look good and are very beautiful ect. But I couldnt be in a relationship with them as I'm aromantic.,0
20013,"Open your eyes: It's called the ''Alice in wonderland technique' The aim of the Alice in Wonderland or confusion technique is to confound the expectations and conditioned reactions of those immigrent vermin. They are accustomed to a world that makes sense, at least to him: a world of continuity and logic, a predictable world.",1



OUSIDHOUM2019_FRENCH
4014 entries, of which 399 (9.94%) are hateful.


Unnamed: 0,text,label
649,[USR] /o je sais pas. je comprends pas. j'en ai marre. va encore se reprendre toute cette violence dans la gueule. #lassitude,0
1075,[USR] c’est demain attardé,0
826,les rebeus faut arrêter chaque dîner de faire du couscous naima qui danse juste la honte !!!! faut revoir t'on repas #udpp,0
2755,dsl beyondelabeach mais ton compte étais trop simple sautée jeune attardé,0
254,les daronnes renois elles parlent avec les yeux mdrr,0



FORTUNA2019_PORTUGUESE
5670 entries, of which 1788 (31.53%) are hateful.


Unnamed: 0,text,label
2242,jApOnÊs AbRe TeU oLhO,1
3445,Pelo menos ia proteger o ocidente do islam.,0
3197,O orgulho hetero é tão válido quanto o orgulho homo. Se um pode ter orgulho por que o outro não? #orgulhohetero,0
2485,[USR] Nazista é contra JUDEU..contra Israel... bolsonaro é pro-Israel.... o que uma pessoa que defende israel tem a ver com nazi?,1
3788,Raiane sapatão fld dá mh bunda KKKKKKKKK,1



BASILE2019_SPANISH
6600 entries, of which 2739 (41.50%) are hateful.


Unnamed: 0,text,label
1157,Estoy hasta la polla de no poder dejar que la gente me etiquete en cosas en FB porque tengo familiares mayores imbeciles.,0
3866,[USR] Cállate hoy!!!! La puta que te pario!!!! Deja 1 DÍA SIN MALAS NOTICIAS... 1 día de paz!!!!! La recalcada concha de tu madre,0
6211,"Esto se pasa de castaño oscuro. Hay que endurecer las medidas. Esas personas no merecen estar expuestas a tal peligro. Que electrifiquen las vallas, y ya que cada inmigrante decida si asumir tal riesgo. [URL]",1
2510,LOS ARABES DE AMÉRICA. La FEF necesitará $ 5.200.000 (cinco millones doscientos mil dolares) para el pago del Cuerpo Técnico comandado por Hernán Darío Gómez que llega mañana y será presentado a las 11h00. Los cuatro años de contrato hoy se convierten en un grave albur.,0
3826,"han puesto un centro para acoger a inmigrantes en mi pueblo y me he apuntado como voluntaria, voy mañana y la verdad es que estoy nerviosa",0



OUSIDHOUM2019_ARABIC
3353 entries, of which 755 (22.52%) are hateful.


Unnamed: 0,text,label
2279,#في_بلادك_تقدري تعيشي بلا ما تسمعي : عورة ، بلاصتك فالكوزينة ، مسطاشة ، عاهرة ، بايرة ...,0
349,[USR] [USR] خنازير البنا,0
2076,نيويورك تايمز: هكذا يعيش اللاجئون اليمنيون بكوريا الجنوبية: [URL],0
2181,[USR] [USR] على اساس انتي بروفيسورة بعلم شراب بول البعير الطازج ومتفتهمين كلامي,0
1607,[USR] [USR] عليك اللعنه يا خنزير سنجعل جثثكم تتعفن وتأكلها الكلاب,1



SANGUINETTI2020_ITALIAN
8100 entries, of which 3388 (41.83%) are hateful.


Unnamed: 0,text,label
967,Paolini: “Altri 1302 #profughi in provincia di #Pesaro. Presidio di 24 ore della #Lega” [URL],0
1530,"#PapaFrancesco: “Vittime della guerra del terrorismo, di un lavoro da schiavi: Gesù è in ognuno di loro”… [URL]",0
6942,"[USR] [USR] Invece le feccie islamiche e l'islam sono una bella cosa. Al massimo non ne parliamo... Ipocrisia, portami via.",1
3028,quando anche il vaticano sarà attaccato ci accorgeremo che l'islam forse non è una religione di pace! vero Boldrini? svegliamoci!,1
6916,"#SALVINI: ""ECCO I 'POVERI IMMIGRATI' CHE DISTRUGGONO LE AUTO E I NEGOZI E AGGREDISCONO LE FORZE DELL’ORDINE... A CASA!!!!!"" [URL]",1





## Create and export splits

In [6]:
# set aside 2.5k from each dataset for testing
# except for Ousidhoum in French and Arabic, where train set would otherwise be too small

TEST_SIZE = 2500
TEST_SIZE_OUSIDHOUM = 1000

# NOTE: this cell replaces each entry of df_dict with the remainder left after removing the
# test set, so running it a second time without reloading would split the data again.
for dataset in df_dict:
    n_test = TEST_SIZE_OUSIDHOUM if "ousid" in dataset else TEST_SIZE

    # make sure the output folder exists, to_csv does not create missing directories
    out_dir = f"./data/1_clean/{dataset}"
    os.makedirs(out_dir, exist_ok=True)

    df_dict[dataset], testset = train_test_split(df_dict[dataset], test_size=n_test, random_state=123)
    testset.to_csv(f"{out_dir}/test_{n_test}.csv", index=False)

In [7]:
# create differently-sized train portions from rest of data

SEEDS = 10 # for repeated experiments with different random data selection
N_RANGE = [10, 20, 50, 100, 200, 500, 1000, 2000, 5000, 10000, 20000]

for dataset in df_dict:
    print(dataset.upper())

    # make sure the output folder exists, to_csv does not create missing directories
    train_dir = f"./data/1_clean/{dataset}/train"
    os.makedirs(train_dir, exist_ok=True)

    for n in N_RANGE:
        # only sizes strictly smaller than the available data are exported
        if n < len(df_dict[dataset]):
            print(f"  saving n = {n} training set")
            # one sample per seed, each drawn independently from the same remaining data
            for random_state in range(1, SEEDS+1):
                train_sample = df_dict[dataset].sample(n, random_state=random_state)
                train_sample.to_csv(f"{train_dir}/train_{n}_rs{random_state}.csv", index=False)
    print()

DYNABENCH2021_ENGLISH
  saving n = 10 training set
  saving n = 20 training set
  saving n = 50 training set
  saving n = 100 training set
  saving n = 200 training set
  saving n = 500 training set
  saving n = 1000 training set
  saving n = 2000 training set
  saving n = 5000 training set
  saving n = 10000 training set
  saving n = 20000 training set

OUSIDHOUM2019_FRENCH
  saving n = 10 training set
  saving n = 20 training set
  saving n = 50 training set
  saving n = 100 training set
  saving n = 200 training set
  saving n = 500 training set
  saving n = 1000 training set
  saving n = 2000 training set

FORTUNA2019_PORTUGUESE
  saving n = 10 training set
  saving n = 20 training set
  saving n = 50 training set
  saving n = 100 training set
  saving n = 200 training set
  saving n = 500 training set
  saving n = 1000 training set
  saving n = 2000 training set

BASILE2019_SPANISH
  saving n = 10 training set
  saving n = 20 training set
  saving n = 50 training set
  saving n = 