In [1]:
# import relevant packages

import pandas as pd
import os
import re

from html import unescape
from sklearn.model_selection import train_test_split

pd.set_option('display.max_colwidth', None)

## Load raw datasets

In [2]:
# Load every raw dataset into a dict of DataFrames, keyed by the file name
# with a trailing ".csv" removed (e.g. "dyn21_en.csv" -> "dyn21_en").
df_dict = dict()

PATH = "../0_data/main/0_raw"

for file in os.listdir(PATH):
    # skip any directory entry whose name contains "ipynb"
    # (presumably notebook checkpoint artefacts -- confirm against the data folder)
    if "ipynb" not in file:
        # raw string: "\." in a plain literal is an invalid escape sequence
        # and triggers a warning on recent Python versions
        name = re.sub(r'\.csv$', '', file)
        print(name)
        df_dict[name] = pd.read_csv(f"{PATH}/{file}")

san20_it
for19_pt
bas19_es
ous19_ar
dyn21_en
ous19_fr


## Reformat columns
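Each dataset uses different column names and label encodings, so each needs its own reformatting logic. After this step every dataset has a `text` column and a binary `label` column (1 = hateful, 0 = non-hateful).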

In [3]:
# Bring every dataset into the same two-column layout: "text" and a binary "label"
# (1 = hateful, 0 = non-hateful). Each source needs its own handling.

# Dynabench 2021 / English
# Assign the recoded column back instead of calling replace(..., inplace=True) on
# the Series: a chained in-place call does not update the parent DataFrame when
# pandas Copy-on-Write is active.
df_dict["dyn21_en"]["label"] = df_dict["dyn21_en"]["label"].replace({"hate": 1, "nothate": 0})

# Fortuna 2019 / Portuguese
df_dict["for19_pt"] = df_dict["for19_pt"].rename(columns={"hatespeech_comb": "label"})

# Basile 2019 / Spanish
df_dict["bas19_es"] = df_dict["bas19_es"].rename(columns={"HS": "label"})

# Sanguinetti 2020 / Italian
df_dict["san20_it"] = df_dict["san20_it"].rename(columns={"hs": "label"})

# Ousidhoum 2019 / Arabic & French
for d in ["ous19_ar", "ous19_fr"]:
    # any sentiment annotation that contains the substring "hateful" counts as hateful
    df_dict[d]["label"] = df_dict[d]["sentiment"].str.contains("hateful", regex=False).astype(int)
    # text was already cleaned in a way that conflicts with our later cleaning, so we align it here
    df_dict[d]["text"] = df_dict[d]["tweet"].str.replace("@url", "http", regex=False)

# drop redundant columns
for dataset in df_dict:
    # explicit copy so that later column assignments act on an independent frame
    df_dict[dataset] = df_dict[dataset][["text", "label"]].copy()

## Clean text

In [4]:
def clean_text(text):
    """Normalise one raw post into the placeholder format expected by XLM-T.

    Steps, in order: decode HTML entities, replace @-mentions with "@user",
    replace anything starting with "http" (up to the next whitespace) with
    "http", turn newlines and tabs into single spaces, replace the literal
    "[URL]" marker with "http", and strip leading/trailing whitespace.

    Parameters
    ----------
    text : str
        The raw text of a single entry.

    Returns
    -------
    str
        The cleaned text.
    """
    cleaned = unescape(text)

    # (pattern, replacement) pairs, applied one after another in this order
    substitutions = (
        (r"@[A-Za-z0-9_-]+", '@user'),  # format expected by XLM-T
        (r"http\S+", 'http'),           # format expected by XLM-T
        (r"\n", ' '),
        (r"\t", ' '),
    )
    for pattern, placeholder in substitutions:
        cleaned = re.sub(pattern, placeholder, cleaned)

    # "[URL]" markers are mapped to the same placeholder as real links
    return cleaned.replace("[URL]", "http").strip()

# Run the cleaning function over the text column of every loaded dataset.
for frame in df_dict.values():
    frame["text"] = frame["text"].apply(clean_text)

## Show descriptive stats

In [5]:
def descriptive_stats(df):
    """Print the size and hateful share of a dataset and return both figures.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a numeric "label" column; its sum is taken as the number
        of hateful entries.

    Returns
    -------
    tuple
        (share of hateful entries, number of entries).
    """
    n_rows = len(df)
    hateful = df["label"].sum()
    share = hateful / n_rows

    summary = "{} entries, of which {} ({:.2%}) are hateful.".format(n_rows, hateful, share)
    print(summary)

    return share, n_rows

# For each dataset: print its name and summary line, then show five random rows.
for name, frame in df_dict.items():
    print(name.upper())
    descriptive_stats(frame)
    display(frame.sample(5))
    print()

SAN20_IT
8100 entries, of which 3388 (41.83%) are hateful.


Unnamed: 0,text,label
2142,"@user sono persone di vari paesi europei,varie religioni e dunque anche musulmani.Isis contro ogni essere umano che vuole la libertà'",0
919,"RT Corriere ""Il rapper Bello Figo canta la vita ""comoda"" del profugo: e Mussolini ci casca … http http",0
4095,@user @user @user @user Rimandate tutti i migranti a casa loro.,1
3573,@user Per la nostra sicurezza bisogna identificare tutti quei fetidi di beduini islamici che sono andati davanti al colosseo e cacciarli,1
2928,#Piazzapulita:caro Formigli basta basta noi italiani siamo alla fame e voi ci rompete i coglioni con gli immigrati,1



FOR19_PT
5670 entries, of which 1788 (31.53%) are hateful.


Unnamed: 0,text,label
1211,dps d ontem eu pensei assim q so qro uma sapatao p chamar d meu camiaozino de amor,1
299,Algumas razões para explicar o Fenômeno Bolsonaro @user @user @user http,0
2917,"NÃO USO ISSO GOSTO DO TÉTE A TÉTE, CHEGA ENCOCHANDO E CHAMANDO DE DELÍCIA, UM AMANTE A MODA ANTIGA #JoaquinResponde http",1
796,"cara, pode me chamar do que quiser! Quero ver refutar o que falei com fatos, lógica e verdades! http",0
4823,"RT @user: Quão 'democratas' e 'tolerantes' são estes esquerdistas, estes 'direitinhas' e estes 'politicamente correctos'. http _",0



BAS19_ES
6600 entries, of which 2739 (41.50%) are hateful.


Unnamed: 0,text,label
1057,Cambio de junta directiva club activo 20-30 Colon Sur en Club Árabe http,0
1957,#MuyRicoMuyRicoPero pues te gusta cualquier gata pintada de güera. #D0nLuckyLuciano #MaffiaTeam http,0
2772,Espero que tengan un buen año y reciban lo que merezcan menos yo porque yo me jodería por puta JAJAJAJAJJAJAAJAJAJAJJAJAJAJA,0
2419,"@user Es mierda licuada de perro, por que tu eres una perra no puede ser tuya. Si waaaa?",1
4878,@user Está pinche zorra cállate,1



OUS19_AR
3353 entries, of which 755 (22.52%) are hateful.


Unnamed: 0,text,label
3255,@user هي بقت عالثقافة بلد شكبرة كاعد يرجع ليورة مثل بول البعير يوم بعد يوم,0
1555,@user وجهة مزبلية مش سياحية ! شعب وسخ و ارض وسخة ! عيب عليكم,0
827,@user @user مدري شفتي التحرش ؟!,0
3333,@user كلامك رائع وسليم لكن لماذا لاتتكلم عن عنصرية الفلسطينيين الذين يعايرون ابناء الحرمين في بول البعير و… @user,0
371,@user @user @user @user انت روح اشرب لك كوبين بول البعير بتاعكم وانت تبئه كويس انشاء الله… @user,1



DYN21_EN
41144 entries, of which 22175 (53.90%) are hateful.


Unnamed: 0,text,label
548,it is not ok to be a pikey,1
40252,i think most refugees would agree that rapists are scum of the earth,0
37263,Its not ok to discriminate gays but I'm really not fond of them,1
40919,marriage is for bloody working class people,0
31456,"Its name is kungflu not covid19 virus, it is hard to explain what is acceptable racism to those who only react when there is a discrimination against them.",1



OUS19_FR
4014 entries, of which 399 (9.94%) are hateful.


Unnamed: 0,text,label
2940,un attardé qui ne comprend rien à l'anglais et parle le français comme ribery.,0
1625,@user tu ja respondeu mongol,0
1952,@user j...avais grave besoin d...entendre ca merci mec !,0
2880,hier soir je ss fait insulté de «sale pd de merde» parce qu’un renoi aigri commandé un filet-o-fish à 1h45am d… http,0
605,@user et fais moi pas dire ce que j’ai pas dis l’afrique appartient au africain donc au renois et au maghrébin un malade toi,0





## Create and export splits

In [6]:
# set aside 2k from each dataset for testing and 500 for dev
# except for Ousidhoum in French and Arabic, where train set would otherwise be too small
#
# NOTE: this cell replaces df_dict[dataset] with its training portion, so running it
# a second time without reloading the data splits the already-reduced frame again.

TEST_SIZE = 2000
DEV_SIZE = 500

# (dev size, test size) for the datasets that deviate from the defaults above;
# a dataset is matched when the key occurs in its name
SPLIT_OVERRIDES = {
    "ous19_fr": (500, 1500),
    "ous19_ar": (300, 1000),
}

CLEAN_PATH = "../0_data/main/1_clean"

for dataset in df_dict:
    dev_size, test_size = next(
        (sizes for key, sizes in SPLIT_OVERRIDES.items() if key in dataset),
        (DEV_SIZE, TEST_SIZE),
    )

    # make sure the per-dataset output folder exists before writing into it
    out_dir = f"{CLEAN_PATH}/{dataset}"
    os.makedirs(out_dir, exist_ok=True)

    # first carve off dev+test together, then divide that portion into dev and test
    df_dict[dataset], devtest = train_test_split(df_dict[dataset], test_size=dev_size + test_size, random_state=123)
    devset, testset = train_test_split(devtest, test_size=test_size, random_state=123)
    devset.to_csv(f"{out_dir}/dev_{dev_size}.csv", index=False)
    testset.to_csv(f"{out_dir}/test_{test_size}.csv", index=False)

# export all data that is not test or dev, so we can use it for active learning later
for dataset in df_dict:
    df_dict[dataset].to_csv(f"{CLEAN_PATH}/{dataset}/train_{len(df_dict[dataset])}.csv", index=False)

In [7]:
# create differently-sized train portions from rest of data
# (df_dict[dataset] is sampled as-is; for every size n, one file per random seed is written)

SEEDS = 10 # for repeated experiments with different random data selection
N_RANGE = [10, 20, 30, 40, 50, 100, 200, 300, 400, 500, 1000, 2000, 3000, 4000, 5000, 10000, 20000]
MAX_N_OTHER = 2000 # largest training-set size saved for datasets other than Dynabench

for dataset in df_dict:
    print(dataset.upper())

    # make sure the target folder exists before writing into it
    train_dir = f"../0_data/main/1_clean/{dataset}/train"
    os.makedirs(train_dir, exist_ok=True)

    for n in N_RANGE:

        # a sample of size n is only drawn when the data has more than n rows
        if n >= len(df_dict[dataset]):
            continue

        # save all splits for Dynabench, splits up to 2k for other datasets
        if "dyn21" not in dataset and n > MAX_N_OTHER:
            continue

        print(f"  saving n = {n} training set")
        for random_state in range(1, SEEDS+1):
            df_dict[dataset].sample(n, random_state = random_state).to_csv(f"{train_dir}/train_{n}_rs{random_state}.csv",index=False)

    print()

SAN20_IT
  saving n = 10 training set
  saving n = 20 training set
  saving n = 30 training set
  saving n = 40 training set
  saving n = 50 training set
  saving n = 100 training set
  saving n = 200 training set
  saving n = 300 training set
  saving n = 400 training set
  saving n = 500 training set
  saving n = 1000 training set
  saving n = 2000 training set

FOR19_PT
  saving n = 10 training set
  saving n = 20 training set
  saving n = 30 training set
  saving n = 40 training set
  saving n = 50 training set
  saving n = 100 training set
  saving n = 200 training set
  saving n = 300 training set
  saving n = 400 training set
  saving n = 500 training set
  saving n = 1000 training set
  saving n = 2000 training set

BAS19_ES
  saving n = 10 training set
  saving n = 20 training set
  saving n = 30 training set
  saving n = 40 training set
  saving n = 50 training set
  saving n = 100 training set
  saving n = 200 training set
  saving n = 300 training set
  saving n = 400 train