In [1]:
# import relevant packages

import pandas as pd
import os
import re

from html import unescape
from sklearn.model_selection import train_test_split

pd.set_option('display.max_colwidth', None)

## Load raw datasets

In [2]:
df_dict = dict()

PATH = "./data/0_raw"

for file in os.listdir(PATH):
    if "ipynb" not in file:
        print(file)
        df_dict[file.removesuffix(".csv")] = pd.read_csv(f"{PATH}/{file}")

dynabench2021_english.csv
ousidhoum2019_french.csv
fortuna2019_portuguese.csv
basile2019_spanish.csv
ousidhoum2019_arabic.csv
sanguinetti2020_italian.csv


## Reformat columns
Need separate logic for different datasets. 1 is for hateful, 0 for non-hateful.

In [3]:
# Dynabench 2021 / English
df_dict["dynabench2021_english"].label.replace({"hate":1, "nothate":0}, inplace=True)

# Fortuna 2019 / Portuguese
df_dict["fortuna2019_portuguese"].rename(columns={"hatespeech_comb": "label"}, inplace=True)

# Basile 2019 / Spanish
df_dict["basile2019_spanish"].rename(columns={"HS": "label"}, inplace=True)

# Sanguinetti 2020 / Italian
df_dict["sanguinetti2020_italian"].rename(columns={"hs": "label"}, inplace=True)

# Ousidhoum 2019 / Arabic & French
for d in ["ousidhoum2019_arabic", "ousidhoum2019_french"]:
    df_dict[d]["label"] = df_dict[d].sentiment.apply(lambda x: 1 if "hateful" in x else 0)
    # text was already cleaned in a way that conflicts with our later cleaning, so we align it here
    df_dict[d]["text"] = df_dict[d].tweet.apply(lambda x: x.replace("@url", "http"))
    
# drop redundant columns
for dataset in df_dict:
    df_dict[dataset] = df_dict[dataset][["text", "label"]]

## Clean text

In [4]:
def clean_text(text):
    text = unescape(text)
    text = re.sub(r"@[A-Za-z0-9_-]+",'@user',text)
    text = re.sub(r"http\S+",'http',text)
    text = re.sub(r"\n",' ',text)
    text = re.sub(r"\t",' ',text)
    text = text.replace("[URL]", "http")
    text = text.strip()
    return text

for dataset in df_dict:
    df_dict[dataset]["text"] = df_dict[dataset].text.apply(lambda x: clean_text(x))

## Show descriptive stats

In [5]:
def descriptive_stats(df):
    n_total = df.shape[0]
    n_hate = df.label.sum()
    print("{} entries, of which {} ({:.2%}) are hateful.".format(n_total, n_hate, n_hate/n_total))
    return df.label.sum()/len(df), len(df)

for dataset in df_dict:
    print(dataset.upper())
    descriptive_stats(df_dict[dataset])
    display(df_dict[dataset].sample(5))
    print()

DYNABENCH2021_ENGLISH
41144 entries, of which 22175 (53.90%) are hateful.


Unnamed: 0,text,label
20623,Women are so fucking self centered and incapable of seeing the bigger picture. Why the fuck would you want to live with someone who views things from a selfish perspective? It sucks. You want to know why it sucks? Because women.,1
35912,gays are vile,1
9791,"youre a cotton picker! what the fuck even is that? That's an 18th century fucking insult boi and you stop me calling that, I'll break your face",1
21865,Zyklon b is just a dezinfectant though not a deadly virus,0
6218,"I adore you man, you are just anti-nigga as I am",1



OUSIDHOUM2019_FRENCH
4014 entries, of which 399 (9.94%) are hateful.


Unnamed: 0,text,label
3055,putain d'attardé,0
1427,@user @user @user mdr les renois qui like fort les insultes vous voit,0
3446,@user sans emploie hhhhh. c'est un rebeu qui lui pris ça place. quelle grosse merde. de toute façon les fachos… http,0
1259,@user ils se sont cru dans un film juste des gars un peut attardé un peut trop influencé par les films/virtuel… http,0
1790,mdrr le mec parle des bites de noirs ça se voit il connaît pas les teubs des rebeus — bon ça est mdrrrr stop http,1



FORTUNA2019_PORTUGUESE
5670 entries, of which 1788 (31.53%) are hateful.


Unnamed: 0,text,label
3886,RT @user: eu toda vez que vejo alguém falando que racismo inverso existe http,0
1832,"@user dá uma fuçada lá, mãe, primeiro livro da minha amiga @user www.aescritoraeomusico.blogspot.com (eu reviso)",0
4519,RT @user @user nao paga pau. Ele paga uma mocreia para tirar a zica.,1
1924,"@user hahahah Só 3 maconheiras vaiam o Mito: vira notícia; E isso aqui, G1?http",0
659,"BOA PRELIMINAR, MAS AGORA A GERAÇÃO CREME DE AVELÃ EMOCIONADA PODE SE RETIRAR: VAI COMEÇAR FUTEBOL DE VERDADE: LIBERTADORES E COPA DO BRASIL",0



BASILE2019_SPANISH
6600 entries, of which 2739 (41.50%) are hateful.


Unnamed: 0,text,label
2822,@user Me enseñaron que a los hijos de puta como tú hay que dejarlos bien muertos.,0
1863,"@user Ella es más bonita, tú eres más perra.",1
358,Putos moros se están cargando el pais. http,1
739,@user @user @user Tu no te mereces q yo me esfuerce en poner ni una #puta coma para q te ahogues al leer #bastardo,0
5293,@user @user tu eres diamante zorra,1



OUSIDHOUM2019_ARABIC
3353 entries, of which 755 (22.52%) are hateful.


Unnamed: 0,text,label
2381,تخيل بنفس الوقت تكلم خنزير بالواقع ويسبك سبودر بالخاص ويمنشنك وسيف ؟ اي انحطاط توصلت له يا انا,0
2842,@user هؤلاء مجرد أنعام، كرم الله الأنعام، وأنما خنازير بشريه، مأكلها خبيث ومشربها خبيث، ومسكنها خبيث، ول… http,1
2077,ترى شكلج غبي وانتي تستنقصين من بنات جنسج وتدافعين عن الشباب وتبررين لهم التحرش بحجة ان البنت مب متحجبة او تسوي شوو!… http,0
430,@user @user خنزير يرتوت لخنزير,0
2188,من أول انطباع تعرف أنه ازق من مر على النادي بعد خنزير اليونايتد http,0



SANGUINETTI2020_ITALIAN
8100 entries, of which 3388 (41.83%) are hateful.


Unnamed: 0,text,label
1664,"@user @user Vallo a dire alla protettrice dei rom, Diana Pavlovic! ������",0
6799,NESSUNO È STATO BUTTATO PER STRADATUTTE STRONZATEIL DECRETO SALVINI ATTACCA GLI INTERESSI DELL'IMMIGRAZIONE CLANDESTINACHE COSTA MILIONI DI EURO AL GIORNOAGLI ITALIANI.Decreto Salvini: a chi fa paura il ritorno della legalità? http,0
2691,Trovano una #rom in casa a rubare ma non la denunciano xchè temono ritorsioni da parte dei #rom http succede in #Italia,0
4782,Ma guarda. L’Autorità Marittima di #Panama ha comunicato ha avviato il processo di cancellazione dal proprio registro navale della #Aquarius di @user e @user perché “non tiene conto delle procedure legali internazionali sul recupero degli immigrati nel Mediterraneo”.,0
1923,Dal campo rom di Castel Romano alla Capitale per rubare portafogli: arrestate due donne ‘in trasferta’ http,0





## Create and export splits

In [6]:
# set aside 2.5k from each dataset for testing
# except for Ousidhoum in French and Arabic, where train set would otherwise be too small

TEST_SIZE = 2500

for dataset in df_dict:
    if "ousid" in dataset:
        df_dict[dataset], testset = train_test_split(df_dict[dataset], test_size = 1000, random_state=123)
        testset.to_csv(f"./data/1_clean/{dataset}/test_1000.csv", index=False)
    else:
        df_dict[dataset], testset = train_test_split(df_dict[dataset], test_size = TEST_SIZE, random_state=123)
        testset.to_csv(f"./data/1_clean/{dataset}/test_{TEST_SIZE}.csv", index=False)
        
# export all non-test data, so we can use it for active learning later
for dataset in df_dict:
    df_dict[dataset].to_csv(f"./data/1_clean/{dataset}/train_{len(df_dict[dataset])}.csv", index=False)

In [7]:
# create differently-sized train portions from rest of data

SEEDS = 10 # for repeated experiments with different random data selection
N_RANGE = [10, 20, 50, 100, 200, 500, 1000, 2000, 5000, 10000, 20000]

for dataset in df_dict:
    print(dataset.upper())
    for n in N_RANGE:
        if n<len(df_dict[dataset]):
            print(f"  saving n = {n} training set")
            for random_state in range(1, SEEDS+1):
                df_dict[dataset].sample(n, random_state = random_state).to_csv(f"./data/1_clean/{dataset}/train/train_{n}_rs{random_state}.csv",index=False)
    print()

DYNABENCH2021_ENGLISH
  saving n = 10 training set
  saving n = 20 training set
  saving n = 50 training set
  saving n = 100 training set
  saving n = 200 training set
  saving n = 500 training set
  saving n = 1000 training set
  saving n = 2000 training set
  saving n = 5000 training set
  saving n = 10000 training set
  saving n = 20000 training set

OUSIDHOUM2019_FRENCH
  saving n = 10 training set
  saving n = 20 training set
  saving n = 50 training set
  saving n = 100 training set
  saving n = 200 training set
  saving n = 500 training set
  saving n = 1000 training set
  saving n = 2000 training set

FORTUNA2019_PORTUGUESE
  saving n = 10 training set
  saving n = 20 training set
  saving n = 50 training set
  saving n = 100 training set
  saving n = 200 training set
  saving n = 500 training set
  saving n = 1000 training set
  saving n = 2000 training set

BASILE2019_SPANISH
  saving n = 10 training set
  saving n = 20 training set
  saving n = 50 training set
  saving n = 