In [1]:
# import relevant packages

import pandas as pd
import os
import re

from html import unescape
from sklearn.model_selection import train_test_split

# do not truncate long text cells when dataframes are displayed (None = no column-width limit)
pd.set_option('display.max_colwidth', None)

## Load raw datasets

In [2]:
df_dict = dict()

PATH = "../0_data/main/0_raw"

# Read every raw file in PATH into df_dict, keyed by the file name without its ".csv" suffix.
for file in sorted(os.listdir(PATH)):
    # skip anything notebook-related that may sit in the data folder (e.g. checkpoint folders)
    if "ipynb" not in file:
        # raw string: "\." inside a plain string literal is an invalid escape sequence
        dataset_name = re.sub(r'\.csv$', '', file)
        print(dataset_name)
        df_dict[dataset_name] = pd.read_csv(f"{PATH}/{file}")

bas19_es
dyn21_en
for19_pt
has19_hi
has20_hi
has21_hi
ous19_ar
ous19_fr
san20_it


## Reformat columns
Each dataset uses a different raw format, so each needs its own reformatting logic. After this step every dataset has a binary `label` column: 1 for hateful, 0 for non-hateful.

In [3]:
# Bring every dataset into a common format: a "text" column and a binary "label" column
# (1 = hateful, 0 = non-hateful), plus the "split" column where the raw data provides one.

# Dynabench 2021 / English
# Assign the result back rather than calling replace(..., inplace=True) on the selected
# column: an in-place call on a selected column is chained assignment, which pandas warns
# about and which does not update the parent dataframe under copy-on-write.
df_dict["dyn21_en"]["label"] = df_dict["dyn21_en"]["label"].replace({"hate": 1, "nothate": 0})

# Fortuna 2019 / Portuguese
df_dict["for19_pt"] = df_dict["for19_pt"].rename(columns={"hatespeech_comb": "label"})

# Basile 2019 / Spanish
df_dict["bas19_es"] = df_dict["bas19_es"].rename(columns={"HS": "label"})

# Sanguinetti 2020 / Italian
df_dict["san20_it"] = df_dict["san20_it"].rename(columns={"hs": "label"})

# Ousidhoum 2019 / Arabic & French
for d in ["ous19_ar", "ous19_fr"]:
    # any sentiment annotation that contains "hateful" counts as hateful
    df_dict[d]["label"] = df_dict[d].sentiment.apply(lambda x: 1 if "hateful" in x else 0)
    # text was already cleaned in a way that conflicts with our later cleaning, so we align it here
    df_dict[d]["text"] = df_dict[d].tweet.apply(lambda x: x.replace("@url", "http"))

# HASOC 19, 20 and 21 / Hindi
for d in ["has19_hi", "has20_hi", "has21_hi"]:
    # only the "HATE" class of task_2 is treated as hateful; every other value maps to 0
    df_dict[d]["label"] = df_dict[d].task_2.apply(lambda x: 1 if x == "HATE" else 0)

# drop redundant columns
for dataset in df_dict:
    keep_cols = ["text", "label"]
    if "split" in df_dict[dataset].columns:
        keep_cols.append("split")
    # .copy() makes the subset an independent dataframe, so later column assignments
    # (e.g. text cleaning) do not trigger SettingWithCopyWarning
    df_dict[dataset] = df_dict[dataset][keep_cols].copy()

## Clean text

In [4]:
def clean_text(text):
    """Normalise one raw text entry and return the cleaned string.

    Steps, in order: decode HTML entities, replace user mentions with "@user",
    replace URLs with "http", turn newlines and tabs into spaces, replace the
    "[URL]" placeholder with "http", and strip surrounding whitespace.
    """
    cleaned = unescape(text)

    # (pattern, replacement) pairs, applied in this order
    substitutions = (
        (r"@[A-Za-z0-9_-]+", '@user'),  # format expected by XLM-T
        (r"http\S+", 'http'),           # format expected by XLM-T
        (r"\n", ' '),
        (r"\t", ' '),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    # format expected by XLM-T
    return cleaned.replace("[URL]", "http").strip()

# clean the text column of every dataset
for name, frame in df_dict.items():
    frame["text"] = frame["text"].apply(clean_text)

## Show descriptive stats

In [5]:
def descriptive_stats(df):
    """Print the size of a dataset and its share of hateful entries.

    Expects a dataframe with a numeric "label" column whose sum is the number
    of hateful entries. Returns a tuple (hateful share, number of entries).
    """
    total = len(df)
    hateful = df["label"].sum()
    hateful_share = hateful / total
    print("{} entries, of which {} ({:.2%}) are hateful.".format(total, hateful, hateful_share))
    return hateful_share, total

# for every dataset: print its name and summary stats, then show five random entries
for name, frame in df_dict.items():
    print(name.upper())
    descriptive_stats(frame)
    display(frame.sample(5))
    print()

BAS19_ES
6600 entries, of which 2739 (41.50%) are hateful.


Unnamed: 0,text,label
3792,"1cosa que no entiendo es porque un t√≠o puede ense√±ar la polla en redes sociales pero si una t√≠a ense√±a las tetas,el chichi o lo que sea se ve mal Es as√≠ o me equivoco? Porque creo que no,creo que todav√≠a falta cierta libertad o mentalidad abierta mejor dicho",0
406,Ac√° nos huevean si nos encapuchamos pero en los paises √°rabes estariamos todas obligadas a encapucharnos ü§∑‚Äç‚ôÄÔ∏è,0
5535,Me gusta Gigi Hadid y no busco atacarla pero si yo subo una foto de mi axila sin depilar me tacharian de guarra y p‚Ä¶ http,0
5723,"@user La concha de tu madre gordo hijo de puta!! La hija de puta de ti mujer es la que no pac√≠fica las cosas,gordo falopero!",1
2375,le tengo que ense√±ar a mi perra a cerrar la puerta porque la abre y no se da cuenta de quE TENGO EL AIRE PRENDIDO,0



DYN21_EN
41144 entries, of which 22175 (53.90%) are hateful.


Unnamed: 0,text,label,split
40917,black coffees are disgusting,0,test
18986,"I don't think they are as attracted to ol' Hook Nose than they are to his MONEY. He also talks a good game, knowing what to say and when to say it. Case in point: I had to listen to an Jewish coworker waffle on and on about his wealth and how ""cultured"" he was. Decked out in high end clothes, metro-sexual hair doo, and frequently reminding us lowly Goyim that he has an advanced degree. He was clearly trolling for female coworkers. Some were actually falling for it. Luckily most of us could see straight through the charade, and we were able to talk these ladies out of associating with this guy.",1,train
40762,ch!nks shouldn't be allowed to adopt,1,test
1028,Why dont you go back to Africa?,1,dev
19296,These black women are big as this black whale,1,train



FOR19_PT
5670 entries, of which 1788 (31.53%) are hateful.


Unnamed: 0,text,label
4545,RT @user: Eu farei a biografia do Bolsonaro http,0
2385,Leiam com aten√ß√£o todos os lados antes de julgarem.... http,0
2450,@user e quando n√£o p√¥de? tem alguma lei que pro√≠be? tem alguma pol√≠cia que prenda quem n√£o usa? tem algu√©m defendendo isso?,0
1236,E a seguir quer provar que o Isl√£o e compat√≠vel com estes valores. Enfim. http,1
880,"Claro que N√ÉO, eles tem medo do TRUMP.kkkk http",1



HAS19_HI
5983 entries, of which 746 (12.47%) are hateful.


Unnamed: 0,text,label,split
5933,"‡§Ø‡•á ‡§è‡§ï ‡§Æ‡§π‡§ø‡§≤‡§æ ‡§π‡•à ‡§ú‡•ã BJP ‡§∏‡•á ‡§§‡§æ‡§≤‡•ç‡§≤‡•Å‡§ï ‡§∞‡§ñ‡§§‡•Ä ‡§π‡•à ‡§Æ‡§π‡§ø‡§≤‡§æ‡§ì‡§Ç ‡§ï‡•á ‡§¨‡§æ‡§∞‡•á ‡§Æ‡•á‡§Ç BJP ‡§ï‡•á ‡§Æ‡§π‡§ø‡§≤‡§æ‡§ì ‡§ï‡§æ ‡§µ‡§ø‡§ö‡§æ‡§∞ ‡§¶‡•á‡§ñ‡§ø‡§è @user ‡§ï‡•ã ‡§ê‡§∏‡•á ‡§≤‡•ã‡§ó‡•ã ‡§™‡§∞ ‡§ú‡§≤‡•ç‡§¶ ‡§∏‡•á ‡§ú‡§≤‡•ç‡§¶ ‡§ï‡§æ‡§∞‡•ç‡§Ø‡§µ‡§æ‡§π‡•Ä ‡§ï‡§∞‡§®‡•Ä ‡§ö‡§æ‡§π‡§ø‡§è, ‡§ê‡§∏‡•á ‡§≤‡•ã‡§ó ‡§¶‡•á‡§∂ ‡§ï‡•Ä ‡§è‡§ï‡§§‡§æ ‡§ï‡•á ‡§≤‡§ø‡§è ‡§ò‡§æ‡§§‡§ï ‡§π‡•à‡§Ç, ‡§â‡§Æ‡•ç‡§Æ‡•Ä‡§¶ ‡§π‡•à @user ‡§ú‡§≤‡•ç‡§¶ ‡§∏‡•á ‡§ú‡§≤‡•ç‡§¶ ‡§ï‡§æ‡§∞‡•ç‡§Ø‡§µ‡§æ‡§π‡•Ä ‡§ï‡§∞‡•á‡§ó‡•Ä",0,test
2336,". @user ‡§ú‡•Ä, ‡§ñ‡§ü‡•ç‡§ü‡•Ä ‡§Æ‡•Ä‡§†‡•Ä ‡§á‡§≤‡§æ‡§π‡§æ‡§¨‡§æ‡§¶‡•Ä ‡§∏‡§ø‡§Ø‡§æ‡§∏‡§§ ‡§ï‡•Ä ‡§Ø‡§æ‡§¶ ‡§¶‡§ø‡§≤‡§æ‡§§‡§æ ‡§Ø‡•á ‡§™‡•ã‡§∏‡•ç‡§ü‡§∞ ‡§Ø‡§æ‡§¶ ‡§π‡•à ‡§Ü‡§™‡§ï‡•ã ?",0,train
3484,‡§π‡§µ‡§æ‡§≤‡§æ ‡§ü‡•á‡§∞‡§∞ ‡§´‡§Ç‡§°‡§ø‡§Ç‡§ó ‡§ï‡§æ ‡§Æ‡•Å‡§ñ‡•ç‡§Ø ‡§∏‡•ã‡§∞‡•ç‡§∏ ‡§π‡•à @user,0,train
3958,"‡§§‡§∞‡§¨‡•á‡§ú‡§º ‡§ï‡§æ ‡§Æ‡§∞‡§®‡§æ ‡§≠‡•Ä ‡§õ‡•ã‡§ü‡•Ä ‡§∏‡•Ä ‡§π‡•Ä ‡§§‡•ã ‡§¨‡§æ‡§§ ‡§π‡•à, ‡§á‡§§‡§®‡§æ ‡§π‡§Ç‡§ó‡§æ‡§Æ‡§æ ‡§ï‡•ç‡§Ø‡•ã‡§Ç ‡§´‡§ø‡§∞ ‡§ú‡§¨ ‡§§‡•Å‡§Æ‡•ç‡§π‡§æ‡§∞‡•Ä ‡§Æ‡§∏‡•ç‡§ú‡§ø‡§¶‡•á‡§Ç ‡§ü‡•Ç‡§ü‡•á‡§Ç‡§ó‡•Ä ‡§§‡§¨ ‡§≠‡•Ä ‡§õ‡•ã‡§ü‡•Ä ‡§¨‡§æ‡§§ ‡§π‡•Ä ‡§ï‡§π‡•ã‡§ó‡•á ‡§ï‡•ç‡§Ø‡§æ ‡§∏‡•Å‡§Ö‡§∞ ‡§ï‡•Ä ‡§î‡§≤‡§æ‡§¶",0,train
4424,"‡§ö‡•ã‡§¶‡•Ç ‡§∏‡§æ‡§≤‡•á, ‡§ï‡§ü‡•á ‡§π‡•Å‡§è ‡§≤‡•Å‡§Ç‡§° ‡§ï‡•Ä ‡§Ü‡§ß‡•Ä ‡§™‡•à‡§¶‡§æ‡§á‡§∂... ‡§ö‡•å‡§ï‡•Ä‡§¶‡§æ‡§∞ ‡§ú‡•á‡§≤ ‡§Æ‡•á‡§Ç ‡§®‡§π‡•Ä ‡§π‡•à ‡§î‡§∞ ‡§®‡§æ ‡§π‡•Ä ‡§â‡§∏‡§™‡•á ‡§ï‡•ã‡§à ‡§ï‡•á‡§∏ ‡§≠‡•Ä ‡§π‡•à‡•§ ‡§≤‡§≤‡•Å‡§Ü ‡§ú‡•á‡§≤ ‡§Æ‡•á‡§Ç ‡§π‡•à ‡§µ‡•ã ‡§≠‡•Ä ‡§¨‡§π‡•Å‡§§ ‡§∏‡§æ‡§∞‡•á ‡§Æ‡§æ‡§Æ‡§≤‡•ã‡§Ç ‡§Æ‡•á‡§Ç‡•§ ‡§∏‡§ú‡§º‡§æ ‡§Ø‡§æ‡§´‡•ç‡§§‡§æ ‡§ï‡•à‡§¶‡•Ä ‡§π‡•à ‡§µ‡•ã",0,train



HAS20_HI
4232 entries, of which 347 (8.20%) are hateful.


Unnamed: 0,text,label,split
862,‡§¶‡•á‡§∂ ‡§ï‡•Ä ‡§ú‡§®‡§§‡§æ ‡§ï‡§ø‡§∏‡•Ä ‡§ï‡•ã ‡§®‡§π‡•Ä‡§Ç ‡§¨‡§ñ‡•ç‡§∂‡•á‡§ó‡•Ä ‡§ú‡•ã ‡§ó‡§≤‡§§‡•Ä ‡§ï‡§∞‡•á‡§ó‡§æ ‡§â‡§∏‡§ï‡•ã ‡§∏‡•Å‡§®‡§®‡§æ ‡§î‡§∞ ‡§∏‡§π‡§®‡§æ ‡§≠‡•Ä ‡§¨‡§¢‡§º‡•á‡§ó‡§æ,0,train
4023,"@user '‡§Æ‡§æ‡§Ç' ‡§π‡§∞ ‡§¶‡•Å‡§É‡§ñ ‡§ï‡§æ ‡§á‡§≤‡§æ‡§ú ‡§π‡•à‡•§ ‡§¶‡•Å‡§®‡§ø‡§Ø‡§æ ‡§Æ‡•á‡§Ç ‡§ê‡§∏‡•Ä ‡§ï‡•ã‡§à ‡§∏‡§Æ‡§∏‡•ç‡§Ø‡§æ ‡§®‡§π‡•Ä‡§Ç, ‡§ú‡§ø‡§∏‡§ï‡§æ ‡§∏‡§Æ‡§æ‡§ß‡§æ‡§® ‡§Æ‡§æ‡§Ç ‡§ï‡•á ‡§™‡§æ‡§∏ ‡§®‡§π‡•Ä‡§Ç‡•§üòá#HappyMothersDay",0,test
2648,"RT @user: ‡§î‡§∞ ‡§â‡§ß‡§∞ ‡§ö‡•Ä‡§® ‡§ï‡•á ‡§∞‡§æ‡§∑‡•ç‡§ü‡•ç‡§∞‡§™‡§§‡§ø ‡§ï‡§æ ‡§¨‡§°‡§º‡§æ ‡§¨‡§Ø‡§æ‡§®, ""‡§ï‡§æ‡§Ç‡§ó ‡§ö‡•Ç‡§Ç ‡§Æ‡•ã‡§¶‡•Ä ‡§™‡•Å‡§Ç ‡§ö‡§ø‡§Ç‡§ó ‡§ö‡§æ‡§Ç‡§ó ‡§™‡•Å‡§Ç‡§ó"" ‡§Ö‡§∞‡•ç‡§• - . . . ‡§Ü‡§Ø‡•á‡§ó‡§æ ‡§Æ‡•ã‡§¶‡•Ä ‡§π‡•Ä..!! üòúüòú",0,train
558,"@user @user http ‡§Ø‡•á ‡§¶‡•á‡§ñ‡•ã ‡§Æ‡•ã‡§¶‡•Ä ‡§ú‡•Ä ‡§®‡•á ‡§Æ‡•á‡§π‡•Å‡§≤ ‡§ö‡•å‡§ï‡§∏‡•Ä ‡§ï‡•ã ""‡§≠‡§æ‡§à"" ‡§ï‡§π‡§æ. ‡§Ö‡§¨ ‡§¨‡§§‡§æ‡§ì ‡§ï‡•ç‡§Ø‡§æ ‡§ñ‡•ç‡§Ø‡§æ‡§≤ ‡§π‡•à?",0,train
2330,@user ‡§Æ‡§§‡§¶‡§æ‡§® ‡§ú‡§∞‡•Ç‡§∞ ‡§ï‡§∞‡•á‡•§ ‡§ö‡§æ‡§π‡•á BJP‡§ï‡•ã ‡§ï‡§∞‡•ã ‡§ö‡§æ‡§π‡•á ‡§Æ‡•ã‡§¶‡•Ä ‡§ú‡•Ä ‡§ï‡•ã ‡§ï‡§∞‡•ã ‡§ö‡§æ‡§π‡•á ‡§ï‡§Æ‡§≤ ‡§ï‡•ã ‡§ï‡§∞‡•ã üëçüëçüôèüôè,0,train



HAS21_HI
4594 entries, of which 566 (12.32%) are hateful.


Unnamed: 0,text,label,split
680,"‡§è‡§Æ ‡§°‡•Ä ‡§¨‡•ã‡§≤‡•á -""‡§ï‡•ç‡§≤‡§æ‡§∏‡§ø‡§ï‡§≤ ‡§§‡§¨‡§≤‡§æ"" ‡§∞‡•á‡§°‡§ø‡§Ø‡•ã ‡§™‡§∞ .. ""‡§ú‡§æ‡§ï‡§ø‡§∞ ‡§π‡•Å‡§∏‡•à‡§®"" ‡§ï‡§æ ‡§§‡§¨‡§≤‡§æ ‡§¨‡§ú‡§®‡•á ‡§≤‡§ó‡§æ MD ‡§∏‡§æ‡§π‡§¨ ‡§ñ‡•Å‡§∂ ‡§π‡•ã‡§ï‡§∞ ‡§ï‡§æ‡§∞ ‡§≤‡•á‡§ï‡§∞ ‡§ò‡§∞ ‡§ï‡•Ä ‡§§‡§∞‡§´ ‡§®‡§ø‡§ï‡§≤‡•á‡•§ ‡§Ö‡§ö‡§æ‡§®‡§ï ‡§è‡§ï ‡§ï‡§æ‡§∞ ‡§®‡•á ‡§ì‡§µ‡§∞‡§ü‡•á‡§ï ‡§ï‡§∞‡§ï‡•á ‡§∏‡§æ‡§Æ‡§®‡•á ‡§∏‡•á ‡§ï‡•ç‡§∞‡§æ‡§∏ ‡§ï‡§ø‡§Ø‡§æ... MD ‡§∏‡§æ‡§π‡§¨ ‡§ï‡•á ‡§Æ‡•Å‡§Å‡§π ‡§∏‡•á ‡§®‡§ø‡§ï‡§≤‡§æ ""‡§π‡§∞‡§æ‡§Æ‡•Ä"" ‡§∞‡•á‡§°‡§ø‡§Ø‡•ã ‡§¨‡§ú‡§æ - ""‡§®‡§Æ‡§∏‡•ç‡§ï‡§æ‡§∞ ‡§¶‡•ã‡§∏‡•ç‡§§‡•ã ‡§Æ‡•à‡§Ç ‡§Ö‡§∞‡§µ‡§ø‡§®‡•ç‡§¶ ‡§ï‡•á‡§ú‡§∞‡•Ä‡§µ‡§æ‡§≤ ‡§¨‡•ã‡§≤ ‡§∞‡§π‡§æ ‡§π‡•Ç‡§Å......"" üòÄüòÄüòÄüòÇüòÇüòÇ",0,train
2803,Pappu to usko bol diye the upne lekin final race mein Pappu to ap hi ban gaye lagta hain. #‡§®‡§∞‡•á‡§Ç‡§¶‡•ç‡§∞_‡§Æ‡•ã‡§¶‡•Ä_‡§ó‡•ç‡§≤‡•ã‡§¨‡§≤_‡§™‡§™‡•ç‡§™‡•Ç_‡§π‡•à #‡§®‡§∞‡•á‡§Ç‡§¶‡•ç‡§∞_‡§Æ‡•ã‡§¶‡•Ä_‡§ó‡•ç‡§≤‡•ã‡§¨‡§≤_‡§™‡§™‡•ç‡§™‡•Ç #‡§®‡§∞‡•á‡§Ç‡§¶‡•ç‡§∞_‡§ï‡§ø_bakch‡§Æodi #‡§≠‡§æ‡§∑‡§£‡§¨‡§æ‡§ú_‡§Æ‡•ã‡§¶‡•Ä #IndiaCovidCrisis,0,train
4180,@user ‡§™‡§∞ ‡§§‡•Å ‡§§‡•ã ‡§Ö‡§¨‡•ç‡§¶‡•Å‡§≤ ‡§ï‡§ø ‡§®‡§æ‡§ú‡§æ‡§Ø‡§ú‡§º ‡§î‡§≤‡§æ‡§¶ ‡§π‡•à‡§Ç ‡§¨‡•á... üòú,0,train
3901,‡§ö‡§æ‡§®‡•ç‡§¶ ‡§î‡§∞ ‡§Æ‡§Ç‡§ó‡§≤ ‡§™‡§∞ ‡§ú‡•Ä‡§µ‡§® ‡§ñ‡•ã‡§ú‡§®‡•á ‡§µ‡§æ‡§≤‡§æ ‡§á‡§Ç‡§∏‡§æ‡§® ‡§Ü‡§ú ‡§™‡•É‡§•‡•ç‡§µ‡•Ä ‡§™‡§∞ ‡§ñ‡•Å‡§¶ ‡§ï‡§æ ‡§ú‡•Ä‡§µ‡§® ‡§¨‡§ö‡§æ‡§®‡•á ‡§Æ‡•à ‡§≤‡§ó‡§æ ‡§π‡•à #AatmanirbharBharat #COVIDEmergency #PMModi #NASAEXPORTS #IndiaCovidCrisis,0,train
3046,@user @user #Hindus DYING #HindusLivesMatter #KindAttnPMNarendraModi #coronasecondwaveinindia #ModiMadeDisaster #ResignModi #BJP #RSS LIARS I WARNED 2 YRS BACK #RSSTerrorists BANNED BY SARDAR PATEL IGNORED #‡§∂‡•á‡§∞_‡§®‡§π‡•Ä‡§Ç_‡§≠‡•á‡§°‡§º‡§ø‡§Ø‡§æ_‡§™‡§æ‡§≤_‡§≤‡§ø‡§Ø‡§æ_‡§≠‡§ï‡•ç‡§§‡•ã #MautKaSaudagar,0,train



OUS19_AR
3353 entries, of which 755 (22.52%) are hateful.


Unnamed: 0,text,label
835,ŸÖÿπÿ±ŸàŸÅ ÿÆŸÜÿ≤Ÿäÿ± ŸÖŸÜ ŸäŸàŸÖŸá,0
780,ŸÉŸÑŸÖŸÜ Ÿäÿ®ÿ±ÿ± ŸÑŸÇÿ™ŸÑ ÿ±Ÿàÿ≠ ÿ®ÿ¥ÿ±ÿå ŸáŸà ŸÉÿßÿ¶ŸÜ Ÿàÿ≥ÿÆ ŸàŸÖÿ±Ÿäÿ∂ÿå ŸàŸÖÿ¥ÿßÿ±ŸÉ ÿ®ÿßŸÑŸÇÿ™ŸÑÿå ŸàŸÖÿßŸÉŸà ÿßŸä ÿßÿ≥ÿ™ÿ´ŸÜÿßÿ°. #ÿ™ÿßÿ±Ÿá_ŸÅÿßÿ±ÿ≥,0
1954,ŸäÿπŸÜŸä Ÿäÿßÿ¥ŸàŸäÿ™ ÿÆŸÜÿßÿ≤Ÿäÿ± ÿßŸÑÿ¥ÿ±ŸÅ ŸÖŸÑŸàÿ¥ ŸÖÿπŸÜŸä ÿπŸÜÿØŸÉŸÖ ŸäÿπŸÜŸä ÿßŸÜÿ™ŸÖ ŸÖŸÅŸäÿ¥ ÿπŸÜÿØŸÉŸÖ ŸÉÿ±ŸÖŸá ÿ®ÿ≥ÿ®ÿ®ŸÉŸÖ Ÿäÿßÿ¥ŸàŸäÿ™ ÿ≤ÿ®ÿßŸÑÿ© ÿßŸÑŸÜÿßÿ≥ ÿßÿÆÿØÿ™ ŸÅŸÉÿ±ÿ© ÿπŸÜ ÿ≥ÿ™ÿßÿ™ ŸÖÿµÿ±‚Ä¶ http,0
438,@user @user ŸÉŸäŸÅ ÿ™ŸÇŸàŸÑŸàŸÜ ÿπŸÑŸâ ÿ™ÿ±ÿßŸÖÿ® ÿÆŸÜÿ≤Ÿäÿ± ÿßŸàŸÜÿ™ŸÖ ÿπŸÜÿØŸÉŸÖ ÿßŸÉÿ®ÿ± ŸÇÿßÿπÿØÿ© ÿπÿ≥ŸÉÿ±Ÿäÿ© ŸÑŸáÿü! ÿßŸÑŸÑŸá ŸäŸÑÿπŸÜ ÿ≠ŸÑŸäÿ® ÿßŸÑÿ≠ŸÖŸäÿ± ÿßŸÑŸä ÿ≥Ÿàÿß ŸÅŸä ÿπŸÇŸàŸÑŸÉŸÖ ŸÉÿ∞ÿß,0
2079,Ÿäÿß ŸÉŸÅÿßÿ± ŸÇÿ±Ÿäÿ¥ Ÿäÿß ÿ¥ÿßÿ±ÿ®Ÿä ÿ®ŸàŸÑ ÿßŸÑÿ®ÿπŸäÿ±,1



OUS19_FR
4014 entries, of which 399 (9.94%) are hateful.


Unnamed: 0,text,label
3994,@user @user fake en plus c'est de tr√®s mauvaise qualit√© personne ne s'exprime comme un attard√©‚Ä¶ http,0
2766,la pr√©f√©rence islamiste de la gauche est motiv√©e par le client√©lisme mais √©galement par l‚Äôanticl√©ricalisme.,0
3055,putain d'attard√©,0
1239,@user @user la r√©torique pr√©fer√©e de l'extreme droite. je suis pas raciste j'ai un ami arabe. ca ne marche‚Ä¶ http,0
2431,donc j‚Äô√©tais dans le m√©tro en train de coller mon nez contre mon √©cran comme un putain d‚Äôattard√© seul avec frust‚Ä¶ http,0



SAN20_IT
8100 entries, of which 3388 (41.83%) are hateful.


Unnamed: 0,text,label,split
3880,@user si ma oggi i nemici della nostra civilta sono i musulmani http,1,train
3775,"A quanto pare, il corano, oltre che in bagno si pu√≤ usare anche per questo. Ma pare non funzioni molto bene..... http",1,train
7738,"#Facciamorete: i figli non servono ad esser famiglia. #LGBT: il sesso che d√† figli non val pi√π di anale e dildo. #Gay: i figli si comprano. #Femministe: si possono uccidere, se embrioni. #Pd: son sostituibili cogli stranieri. #Burioni: sono infetti. Evviva la cultura dell'#amore.",0,test
4401,MALEDETTISSIME MMERDE..... POI PER√í I PROFUGHI DEL CAZZO VANNO SOCCORSI E OSPITATI NEGLI ALBERGHI CON IL WIFI... VI AUGURO TUTTO IL MALE POSSIBILE,1,train
6653,@user @user @user @user @user Ma perch√© non andate a Milano ad aiutare i loro stranieri nella stessa situazione? Ah no l√¨ ci sono i compagni,0,train





## Create and export splits

In [6]:
# set aside 2k from each dataset for testing and 500 for dev
# except for Ousidhoum in French and Arabic, where train set would otherwise be too small
# and for HASOC 19 and 20 in Hindi, where the provided test splits are used
# NOTE(review): this comment previously named HASOC 20 and 21, but the code below selects
# has19_hi and has20_hi (has21_hi gets a random split) -- confirm which is intended.
#
# NOTE: this cell overwrites each df_dict entry with its remaining train portion, so it is
# not idempotent -- re-run the notebook from the top before re-running this cell.

TEST_SIZE = 2000
DEV_SIZE = 500

for dataset in df_dict:
    out_dir = f"../0_data/main/1_clean/{dataset}"
    # to_csv does not create missing folders, so make sure the target folder exists
    os.makedirs(out_dir, exist_ok=True)

    if "ous19_fr" in dataset:
        # 2000 held out in total: 500 dev + 1500 test
        df_dict[dataset], devtest = train_test_split(df_dict[dataset], test_size = 2000, random_state=123)
        devset, testset = train_test_split(devtest, test_size = 1500, random_state=123)
        devset.to_csv(f"{out_dir}/dev_500.csv", index=False)
        testset.to_csv(f"{out_dir}/test_1500.csv", index=False)
    elif "ous19_ar" in dataset:
        # 1300 held out in total: 300 dev + 1000 test
        df_dict[dataset], devtest = train_test_split(df_dict[dataset], test_size = 1300, random_state=123)
        devset, testset = train_test_split(devtest, test_size = 1000, random_state=123)
        devset.to_csv(f"{out_dir}/dev_300.csv", index=False)
        testset.to_csv(f"{out_dir}/test_1000.csv", index=False)
    elif "has19_hi" in dataset or "has20_hi" in dataset: # use provided test sets
        full_df = df_dict[dataset]
        testset = full_df[full_df["split"]=="test"]
        testset.to_csv(f"{out_dir}/test_{len(testset)}.csv", index=False)
        # the dev set is drawn from the rows marked "train"; the remainder becomes the train pool
        df_dict[dataset], devset = train_test_split(full_df[full_df["split"]=="train"], test_size = 500, random_state=123)
        devset.to_csv(f"{out_dir}/dev_500.csv", index=False)
    else:
        df_dict[dataset], devtest = train_test_split(df_dict[dataset], test_size = TEST_SIZE+DEV_SIZE, random_state=123)
        devset, testset = train_test_split(devtest, test_size = TEST_SIZE, random_state=123)
        devset.to_csv(f"{out_dir}/dev_{DEV_SIZE}.csv", index=False)
        testset.to_csv(f"{out_dir}/test_{TEST_SIZE}.csv", index=False)

# export all data that is not test or dev, so we can use it for full sample training
for dataset in df_dict:
    df_dict[dataset].to_csv(f"../0_data/main/1_clean/{dataset}/train_{len(df_dict[dataset])}.csv", index=False)

In [7]:
# create differently-sized train portions from rest of data

SEEDS = 10 # for repeated experiments with different random data selection
N_RANGE = [10, 20, 30, 40, 50, 100, 200, 300, 400, 500, 1000, 2000, 3000, 4000, 5000, 10000, 20000]

for dataset in df_dict:
    print(dataset.upper())

    train_pool = df_dict[dataset]
    train_dir = f"../0_data/main/1_clean/{dataset}/train"
    # to_csv does not create missing folders, so make sure the target folder exists
    os.makedirs(train_dir, exist_ok=True)

    # Dynabench gets every size in N_RANGE; all other datasets only sizes up to 2k
    save_all_sizes = "dyn21" in dataset

    for n in N_RANGE:
        # a sample must be strictly smaller than the available train pool
        if n >= len(train_pool):
            continue
        if not save_all_sizes and n > 2000:
            continue

        print(f"  saving n = {n} training set")
        # one sample per seed, for repeated experiments with different random selections
        for random_state in range(1, SEEDS+1):
            train_pool.sample(n, random_state = random_state).to_csv(f"{train_dir}/train_{n}_rs{random_state}.csv",index=False)

    print()

BAS19_ES
  saving n = 10 training set
  saving n = 20 training set
  saving n = 30 training set
  saving n = 40 training set
  saving n = 50 training set
  saving n = 100 training set
  saving n = 200 training set
  saving n = 300 training set
  saving n = 400 training set
  saving n = 500 training set
  saving n = 1000 training set
  saving n = 2000 training set

DYN21_EN
  saving n = 10 training set
  saving n = 20 training set
  saving n = 30 training set
  saving n = 40 training set
  saving n = 50 training set
  saving n = 100 training set
  saving n = 200 training set
  saving n = 300 training set
  saving n = 400 training set
  saving n = 500 training set
  saving n = 1000 training set
  saving n = 2000 training set
  saving n = 3000 training set
  saving n = 4000 training set
  saving n = 5000 training set
  saving n = 10000 training set
  saving n = 20000 training set

FOR19_PT
  saving n = 10 training set
  saving n = 20 training set
  saving n = 30 training set
  saving n = 