In [1]:
# import relevant packages

import pandas as pd
import os
import re

from html import unescape
from sklearn.model_selection import train_test_split

pd.set_option('display.max_colwidth', None)

## Load raw datasets

In [2]:
# Load every raw CSV in PATH into a dict of DataFrames keyed by the file name
# without its ".csv" extension (e.g. "basile2019_spanish").
df_dict = dict()

PATH = "./data/0_raw"

for file in os.listdir(PATH):
    # Only read actual CSV files: this skips notebook checkpoint folders as well
    # as any other stray entry (hidden files, sub-directories) that
    # pd.read_csv could not parse.
    if file.endswith(".csv"):
        print(file)
        df_dict[file.removesuffix(".csv")] = pd.read_csv(os.path.join(PATH, file))

dynabench2021_english.csv
ousidhoum2019_french.csv
fortuna2019_portuguese.csv
basile2019_spanish.csv
ousidhoum2019_arabic.csv
sanguinetti2020_italian.csv


## Reformat columns
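Each dataset stores its labels differently, so each needs its own reformatting logic. After this step every dataset has a `text` column and a binary `label` column: 1 for hateful, 0 for non-hateful.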

In [3]:
# Bring every dataset into the same two-column layout: "text" and "label"
# (1 = hateful, 0 = non-hateful).

# Dynabench 2021 / English
# The mapped column is assigned back explicitly. Calling
# `.replace(..., inplace=True)` on `df.label` is a chained in-place update: it
# raises a FutureWarning on recent pandas versions and does not modify the
# DataFrame at all once Copy-on-Write is enabled.
df_dict["dynabench2021_english"]["label"] = df_dict["dynabench2021_english"]["label"].replace(
    {"hate": 1, "nothate": 0}
)

# Fortuna 2019 / Portuguese, Basile 2019 / Spanish, Sanguinetti 2020 / Italian
# These datasets only need their label column renamed to "label".
label_columns = {
    "fortuna2019_portuguese": "hatespeech_comb",
    "basile2019_spanish": "HS",
    "sanguinetti2020_italian": "hs",
}
for name, source_column in label_columns.items():
    df_dict[name] = df_dict[name].rename(columns={source_column: "label"})

# Ousidhoum 2019 / Arabic & French
for d in ["ousidhoum2019_arabic", "ousidhoum2019_french"]:
    # any sentiment annotation that contains "hateful" counts as the positive class
    df_dict[d]["label"] = df_dict[d].sentiment.apply(lambda x: 1 if "hateful" in x else 0)
    # text was already cleaned in a way that conflicts with our later cleaning, so we align it here
    # (the later mention regex would otherwise turn the "@url" placeholder into "[USR]")
    df_dict[d]["text"] = df_dict[d].tweet.apply(lambda x: x.replace("@user", "[USR]").replace("@url", "[URL]"))

# drop redundant columns
# .copy() makes each result an independent DataFrame, so assigning to its
# columns later does not trigger SettingWithCopyWarning.
for dataset in df_dict:
    df_dict[dataset] = df_dict[dataset][["text", "label"]].copy()

## Clean text

In [4]:
def clean_text(text):
    """Normalise a single raw post.

    Steps, in order: decode HTML entities, replace @-mentions with "[USR]",
    replace links starting with "http" with "[URL]", turn newlines and tabs
    into single spaces, and strip leading/trailing whitespace.

    Args:
        text: The raw post as a string.

    Returns:
        The cleaned string.
    """
    # (pattern, replacement) pairs, applied one after another
    substitutions = (
        (r"@[A-Za-z0-9_-]+", "[USR]"),
        (r"http\S+", "[URL]"),
        (r"\n", " "),
        (r"\t", " "),
    )
    # Entities are decoded first so that encoded mentions/links are caught too.
    cleaned = unescape(text)
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

# Run clean_text over the "text" column of every dataset.
for frame in df_dict.values():
    frame["text"] = frame["text"].apply(clean_text)

## Show descriptive stats

In [5]:
def descriptive_stats(df):
    """Print and return the size and hateful share of a dataset.

    Args:
        df: DataFrame with a numeric "label" column where 1 marks a hateful entry.

    Returns:
        A tuple (share of hateful entries, number of rows).
    """
    n_total = len(df)
    n_hate = df["label"].sum()
    hate_share = n_hate / n_total
    print("{} entries, of which {} ({:.2%}) are hateful.".format(n_total, n_hate, hate_share))
    return hate_share, n_total

# For each dataset: print its name and summary line, then show five random rows.
for name, frame in df_dict.items():
    print(name.upper())
    descriptive_stats(frame)
    display(frame.sample(5))
    print()

DYNABENCH2021_ENGLISH
41144 entries, of which 22175 (53.90%) are hateful.


Unnamed: 0,text,label
40215,Muslims are doing great at disrupting our society,1
5466,"black people are everywhere, white people are everywhere. The world isnt really defined by colour of skin, it's defined by strength of character",0
2013,W O M E N D O N O T B E L O N G T O T H E W O R K P L A C E,1
30859,why do swifites hate gay people so much? even gays hate swifties soo much.,0
37574,Pakistanis dont belong here,1



OUSIDHOUM2019_FRENCH
4014 entries, of which 399 (9.94%) are hateful.


Unnamed: 0,text,label
1895,soeur elle sept ans lui apprend jouer au echec elle nous dis mais pourquoi faut proteger le roi ? c'est une gauchiste certifiee,0
737,[USR] donc répète pour les mongol du rn: la balance de l’immigration en france c’est environ 300 000 personn… [URL],0
2129,[USR] aller aller sale arabe y’a aucune générosité avec vous salut.,0
813,[USR] [USR] [USR] non je préfère juste que tu arrêtes d'employer ton vocabulaire de gauchiste (c… [URL],0
3259,[USR] [USR] [USR] c'est dingue ce terrorisme intellectuel! vous avez le même prénom certes mais… [URL],0



FORTUNA2019_PORTUGUESE
5670 entries, of which 1788 (31.53%) are hateful.


Unnamed: 0,text,label
71,"A censura democrática é a pior delas todas porque é arbitrária, hipócrita e cobarde. Finge não existir, mas existe! [URL]",0
2946,Nem uma menos no STF [URL] via [USR],0
4279,RT [USR]: Arthur é a favor da sexualização das crianças? [URL],1
4405,RT [USR]: Mais uma vez a dualidade de critérios. Ricardo Costa muito mais rápido a denunciar as identidades dos [USR] do que os a _,0
487,Aquele símbolo que dá aquela fome [URL],0



BASILE2019_SPANISH
6600 entries, of which 2739 (41.50%) are hateful.


Unnamed: 0,text,label
5528,[USR] Cállatee Jajajaja la puta madre,0
4018,"🧐🤔🤔 “puritanismo sexual” en pocas a Catherine Deneuve no le parece acoso que un imbécil te toque porque le gustas, es un coqueteo tonto... #fuckU Cien artistas e intelectuales francesas contra el “puritanismo” sexual | Cultura | EL PAÍS [URL]",0
3111,[USR] [USR] Por dioss que puta vergüenza es esta! 😠,0
5143,"Aggg!!! Eres un cabron, un gilipollas, un imbecil. Que tal si de una puta vez dejas de jugar con los sentimientos d… [URL]",0
6202,"[USR] Lo que tienes que hacer, presuntuoso petimetre populista, es aprobar un presupuesto para llenar de muros, cámaras infrarrojas y concertinas todas nuestras fronteras y devolver en caliente a los que se cuelen ¿Que quieres, la MUERTE de nuestros guardias civiles? [USR]",0



OUSIDHOUM2019_ARABIC
3353 entries, of which 755 (22.52%) are hateful.


Unnamed: 0,text,label
2908,[USR] هذا العمليه الإجراميه الإرهابيه برعايتكم يا دموييين يا انجاس تسوون فتنه بين المسيحيين الأقباط والمسلمين ل… [URL],1
2013,يعجبني اكثر ان الحريم يقولون اكسينشن ونحسبه الفورد الكبير وهو وصلات شعر,0
903,-والله كانت متبرجه يا شيخ,0
482,هههههههههههههههههههه بول البعير يضاهي الكحول بالنسبة للسعوديين,1
1355,[USR] [USR] [USR] [USR] [USR] [USR] ياللوح ماهو بول البعير بول الناقه وفي… [URL],0



SANGUINETTI2020_ITALIAN
8100 entries, of which 3388 (41.83%) are hateful.


Unnamed: 0,text,label
2757,"#SCABBIA . Ma anche #pidocchi, asili e bambini infestati da immigrati che NON eseguono trattamento. [URL]",1
3517,[USR] sveglia Renzi non devi dire che è tutto tranquillo devi chiudetele frontiere ..tu Renzi fai entrare terroristi,1
6187,Zuccaro il magistrato leghista massone escluso dai suoi colleghi dalle indagini mafiose e costretto per farsi notare ad attaccare le ONG e i poveri immigrati 🤔🤔,0
2184,"La Polizia di #Terracina ha controllato il quartiere denominato “Capanne”, luogo di aggregazione di stranieri [URL]",0
2941,"[USR] Ma si tiene in albergo gli immigrati negri dotandoli di tutto,telefono compreso. Renzi ed Alfano: fate schifo!!",1





## Create and export splits

In [6]:
# set aside 2.5k from each dataset for testing
# except for Ousidhoum in French and Arabic, where train set would otherwise be too small
#
# NOTE: this cell replaces df_dict[dataset] with the non-test remainder, so
# re-running it without re-running the cells above would carve a second test
# set out of the remainder.

TEST_SIZE = 2500
SMALL_TEST_SIZE = 1000  # test size for the smaller Ousidhoum datasets

for dataset in df_dict:
    test_size = SMALL_TEST_SIZE if "ousid" in dataset else TEST_SIZE
    out_dir = f"./data/1_clean/{dataset}"
    # to_csv does not create missing directories, so make sure the target exists
    os.makedirs(out_dir, exist_ok=True)
    df_dict[dataset], testset = train_test_split(df_dict[dataset], test_size=test_size, random_state=123)
    testset.to_csv(f"{out_dir}/test_{test_size}.csv", index=False)

# export all non-test data, so we can use it for active learning later
for dataset in df_dict:
    df_dict[dataset].to_csv(f"./data/1_clean/{dataset}/train_{len(df_dict[dataset])}.csv", index=False)

In [7]:
# create differently-sized train portions from rest of data
# For every dataset and every size n in N_RANGE that is smaller than the
# available data, SEEDS random samples of size n are written to
# ./data/1_clean/<dataset>/train/train_<n>_rs<seed>.csv

SEEDS = 10 # for repeated experiments with different random data selection
N_RANGE = [10, 20, 50, 100, 200, 500, 1000, 2000, 5000, 10000, 20000]

for dataset in df_dict:
    print(dataset.upper())
    train_dir = f"./data/1_clean/{dataset}/train"
    # to_csv does not create missing directories, so make sure the target exists
    os.makedirs(train_dir, exist_ok=True)
    n_available = len(df_dict[dataset])
    for n in N_RANGE:
        if n < n_available:
            print(f"  saving n = {n} training set")
            for random_state in range(1, SEEDS+1):
                # the seed doubles as the sample's random_state and as its file-name suffix
                df_dict[dataset].sample(n, random_state=random_state).to_csv(
                    f"{train_dir}/train_{n}_rs{random_state}.csv", index=False
                )
    print()

DYNABENCH2021_ENGLISH
  saving n = 10 training set
  saving n = 20 training set
  saving n = 50 training set
  saving n = 100 training set
  saving n = 200 training set
  saving n = 500 training set
  saving n = 1000 training set
  saving n = 2000 training set
  saving n = 5000 training set
  saving n = 10000 training set
  saving n = 20000 training set

OUSIDHOUM2019_FRENCH
  saving n = 10 training set
  saving n = 20 training set
  saving n = 50 training set
  saving n = 100 training set
  saving n = 200 training set
  saving n = 500 training set
  saving n = 1000 training set
  saving n = 2000 training set

FORTUNA2019_PORTUGUESE
  saving n = 10 training set
  saving n = 20 training set
  saving n = 50 training set
  saving n = 100 training set
  saving n = 200 training set
  saving n = 500 training set
  saving n = 1000 training set
  saving n = 2000 training set

BASILE2019_SPANISH
  saving n = 10 training set
  saving n = 20 training set
  saving n = 50 training set
  saving n = 