In [1]:
# import relevant packages

import pandas as pd
import os
import re

from html import unescape
from sklearn.model_selection import train_test_split

# do not truncate long text columns when dataframes are displayed,
# so the sampled entries further down can be read in full
pd.set_option('display.max_colwidth', None)

## Load raw datasets

In [19]:
# Load every raw dataset into df_dict, keyed by dataset name
# (the file name without its ".csv" extension).
df_dict = dict()

PATH = "../0_data/main/0_raw"

for file in sorted(os.listdir(PATH)):
    # skip notebook artefacts (anything with "ipynb" in its name)
    if "ipynb" not in file:
        # raw string: "\." inside a plain string literal is an invalid escape
        # sequence and triggers a warning on recent Python versions
        name = re.sub(r'\.csv$', '', file)
        print(name)
        df_dict[name] = pd.read_csv(f"{PATH}/{file}")

bas19_es
dyn21_en
for19_pt
fou18_en
has19_hi
has20_hi
has21_hi
ken20_en
ous19_ar
ous19_fr
san20_it


## Reformat columns
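Each dataset uses its own column names and label scheme, so each one needs separate logic. After this step every dataset has a `text` column and a binary `label` column, where 1 means hateful and 0 means non-hateful.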

In [20]:
# Bring every dataset into a common format: a "text" column and a binary
# "label" column (1 = hateful, 0 = non-hateful), plus "split" where present.
#
# Results are assigned back to the dataframe rather than modified through
# `inplace=True` on an attribute-accessed column: that is chained assignment,
# which warns in recent pandas versions and has no effect under Copy-on-Write.

# Dynabench 2021 / English
df_dict["dyn21_en"]["label"] = df_dict["dyn21_en"]["label"].replace({"hate": 1, "nothate": 0})

# Founta 2018 / English (only "hateful" counts as hate; abusive, normal and spam do not)
df_dict["fou18_en"]["label"] = df_dict["fou18_en"]["label"].replace(
    {"hateful": 1, "abusive": 0, "normal": 0, "spam": 0}
)

# Kennedy 2020 / English
df_dict["ken20_en"] = df_dict["ken20_en"].rename(columns={"label_hate_maj": "label"})

# Fortuna 2019 / Portuguese
df_dict["for19_pt"] = df_dict["for19_pt"].rename(columns={"hatespeech_comb": "label"})

# Basile 2019 / Spanish
df_dict["bas19_es"] = df_dict["bas19_es"].rename(columns={"HS": "label"})

# Sanguinetti 2020 / Italian
df_dict["san20_it"] = df_dict["san20_it"].rename(columns={"hs": "label"})

# Ousidhoum 2019 / Arabic & French
for d in ["ous19_ar", "ous19_fr"]:
    # an entry is hateful if "hateful" appears anywhere in its sentiment string
    df_dict[d]["label"] = df_dict[d].sentiment.apply(lambda x: 1 if "hateful" in x else 0)
    # text was already cleaned in a way that conflicts with our later cleaning, so we align it here
    # (otherwise the "@url" placeholder would be rewritten to "@user" by the mention pattern)
    df_dict[d]["text"] = df_dict[d].tweet.apply(lambda x: x.replace("@url", "http"))

# HASOC 19, 20 and 21 / Hindi
for d in ["has19_hi", "has20_hi", "has21_hi"]:
    df_dict[d]["label"] = df_dict[d].task_2.apply(lambda x: 1 if x == "HATE" else 0)

# drop redundant columns
for dataset in df_dict:
    keep_columns = ["text", "label"]
    if "split" in df_dict[dataset].columns:
        keep_columns.append("split")
    # .copy() gives an independent frame, so later column assignments
    # do not operate on a slice of the raw dataframe
    df_dict[dataset] = df_dict[dataset][keep_columns].copy()

## Clean text

In [21]:
def clean_text(text):
    """Normalise one raw post into the placeholder format expected by XLM-T.

    Steps, in order: decode HTML entities, replace user mentions with
    "@user", replace links with "http", turn newlines and tabs into spaces,
    replace the literal "[URL]" token with "http", and strip surrounding
    whitespace.

    Note: the mention pattern is not anchored to a word boundary, so an "@"
    inside a word is rewritten as well (e.g. "bl@cks" becomes "bl@user").

    Args:
        text: the raw post as a string.

    Returns:
        The cleaned string.
    """
    cleaned = unescape(text)
    substitutions = (
        (r"@[A-Za-z0-9_-]+", '@user'),  # format expected by XLM-T
        (r"http\S+", 'http'),           # format expected by XLM-T
        (r"[\n\t]", ' '),               # each newline / tab becomes one space
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    # format expected by XLM-T
    return cleaned.replace("[URL]", "http").strip()

# clean the text column of every dataset, entry by entry
for dataset in df_dict:
    df_dict[dataset]["text"] = df_dict[dataset]["text"].apply(clean_text)

## Boost proportion of hate in English datasets to match Dynabench
We run a first phase of fine-tuning on up to 20k entries from Dynabench, which is ca. 54% hateful. The other English datasets contain a lower proportion of hate, so we randomly drop non-hateful entries from them to make the proportions more comparable at 20k entries.

In [22]:
# Number of non-hateful entries to keep per dataset; all hateful entries are kept.
# - Kennedy 2020 / English: boosted to 50% hate (up from ca. 30%)
# - Founta 2018 / English: boosted to 22% hate, which is max possible (up from ca. 5%)
N_NONHATE_KEPT = {"ken20_en": 11596, "fou18_en": 17600}

for name, n_nonhate in N_NONHATE_KEPT.items():
    frame = df_dict[name]
    hateful = frame[frame.label == 1]
    nonhateful = frame[frame.label == 0].sample(n_nonhate, random_state=123)
    # concatenate, then shuffle so hateful and non-hateful rows are interleaved
    df_dict[name] = pd.concat([hateful, nonhateful]).sample(frac=1, random_state=123)


## Show descriptive stats

In [23]:
def descriptive_stats(df):
    """Print and return the size of a dataset and its share of hateful entries.

    Args:
        df: dataframe with a "label" column whose sum is the number of
            hateful entries.

    Returns:
        Tuple (share of hateful entries, number of entries).
    """
    n_entries = len(df)
    n_hateful = df.label.sum()
    share_hateful = n_hateful / n_entries
    print("{} entries, of which {} ({:.2%}) are hateful.".format(n_entries, n_hateful, share_hateful))
    return share_hateful, n_entries

# for each dataset: name, summary line, and five random entries for a sanity check
for dataset, frame in df_dict.items():
    print(dataset.upper())
    descriptive_stats(frame)
    display(frame.sample(5))
    print()

BAS19_ES
6600 entries, of which 2739 (41.50%) are hateful.


Unnamed: 0,text,label
1490,"As√≠ como a muchos les encabrona que les digan negros a mi tambien me encabrona y doble que me digan g√ºera, para m√≠ no es un alago ni mucho menos superioridad",1
4134,"üì£üó°Ô∏è#Barcelona, ciudad de #okupas, inmigrantes ilegales #manteros, #turismofobia y #narcopisos en el Raval. Magreb√≠es atacan a un filipino en ese barrio. @user desaparecida y sin dar la cara. http",1
2243,Lo √∫nico que debe votar una mujer son sus tetas cuando hacen deporte. #GravedadAnteTodo,1
5057,#NOME?,0
240,"@user @user @user @user Pues eso, c√°llate la puta boca.",0



DYN21_EN
41144 entries, of which 22175 (53.90%) are hateful.


Unnamed: 0,text,label,split
18489,Pandering to the mentally ill never lasts. A society that persists in celebrating illness is doomed.,1,train
19181,Pissing me the fuck OFF! Trampy girls,0,train
10858,"I will be honest, all bl@user deserve to be saved from this racist government and I wanna do it",0,test
26502,"i want to dedicate myself to the TOTAL and complete eradication of climate change on this planet, I would dedicate an entire life to this worthy pursuit using my own hands, for these gas guzzlers among us, must be eventually dealt with en masse! Otherwise all of HUMANITY must eventually suffer under the cruetly of their fumes.",0,train
13772,"One thought: Is it possible that since the government-sponsored imprisonement of black people based upon fictitious statistics is ""unleashing & emboldening"" the most violent factions fo BLM? I'm not justifying the violence but providing a a broader context for their demands and the hsitorcal unjustices they've suffered -just a thought.",0,train



FOR19_PT
5670 entries, of which 1788 (31.53%) are hateful.


Unnamed: 0,text,label
4265,RT @user: Este √© o islamismo..... A triste realidade que a imprensa #fakenews se recusa a nos mostrar. http,1
5281,@user gente QUE FOFURA!,0
3150,olha a√≠! O que sempre digo: - repete o mesmo chav√£o que a milit√¢ncia manda. - xinga o oponete - n√£o refuta nada do _ http,0
944,"Como voc√™ pode militar por uma causa sendo que voc√™ nem estuda, @user? #Transg√™neros http",0
5498,"Usar uma frase sem economias de caracteres pra me chamar de burro √© f√°cil! Quero ver refutar o que falei com fatos, _ http",0



FOU18_EN
22565 entries, of which 4965 (22.00%) are hateful.


Unnamed: 0,text,label
62053,But if nothing bad happened then hell I'd even give up 20 years I still have to ask my dad what the hell was going on though,0
11192,"RT @user: im sorry, sometimes im fucked up.",0
14461,"Guided Meditations: For Calmness, Awareness, And Love... http #SelfDevelopment http",0
92552,"@user I had the biggest Lego Star Wars collection ever when I was a kid haha, fuck knows where it is now hope‚Ä¶ http",0
66325,"@user Yeah not great. The ref yesterday was pissing me off! Our end product is awful at the moment, too wasteful",1



HAS19_HI
5983 entries, of which 746 (12.47%) are hateful.


Unnamed: 0,text,label,split
783,‡§∏‡•Ç‡§Ö‡§∞ ‡§ï‡•Ä ‡§™‡•à‡§¶‡§æ‡§á‡§∂ ‡§Ö‡§∞‡§®‡•ç‡§¶‡•Ä ‡§ï‡•Ä ‡§î‡§≤‡§æ‡§¶ ‡§ï‡•á‡§ú‡§∞‡•Ä‡§µ‡§æ‡§≤ ‡§ï‡§æ ‡§Æ‡•Å‡§§ ‡§™‡§ø‡§®‡•á ‡§µ‡§æ‡§≤‡•á ‡§π‡§ø‡§ú‡•ç‡§°‡•á ‡§Ü‡§™‡§ø‡§Ø‡•ã ‡§∏‡•á ‡§Ö‡§™‡§®‡•Ä ‡§ó‡§æ‡§®‡•ç‡§¶ ‡§Æ‡§∞‡•ç‡§µ‡§æ‡§®‡•á ‡§µ‡§æ‡§≤‡•á ‡§Æ‡§æ‡§¶‡§∞‡§∏‡•ã‡§¶ ‡§§‡•Ç ‡§Ö‡§™‡§®‡•Ä ‡§Ö‡§Æ‡•ç‡§Æ‡•Ä ‡§ï‡•ã ‡§∏‡•å‡§™ ‡§¶‡•á ‡§Ü‡§™ ‡§ï‡•á ‡§ó‡•Å‡§Ç‡§°‡•ã ‡§ï‡•ã ‡§î‡§∞ ‡§â‡§®‡•ç‡§π‡•Ä ‡§∏‡•á ‡§∞‡§æ‡§∂‡§® ‡§ï‡§æ‡§∞‡•ç‡§° ‡§¨‡§®‡§µ‡§æ ‡§î‡§∞ ‡§≠‡•ã‡§ï,0,train
3877,"‡§ï‡§∂‡•ç‡§Æ‡•Ä‡§∞ ‡§ï‡§æ ‚Äú‡§∞‡§ä‡§´‚Äù ‡§ï‡§≤‡§ï‡§§‡•ç‡§§‡§æ ‡§ï‡•á 5 ‡§ü‡•Å‡§∞‡§ø‡§∏‡•ç‡§ü ‡§ï‡•ã ‡§¨‡§ö‡§æ‡§§‡•á ‡§π‡•Å‡§è ‡§ñ‡§º‡•Å‡§¶ ‡§°‡•Ç‡§¨ ‡§ó‡§Ø‡§æ‡•§ ‡§°‡•Ç‡§¨‡§®‡•á ‡§µ‡§æ‡§≤‡•á ‡§®‡•á ‡§Ø‡•á ‡§≠‡•Ä ‡§®‡§π‡•Ä ‡§∏‡•ã‡§ö‡§æ ‡§ï‡§ø ‡§µ‡•ã ‡§ú‡§ø‡§®‡•ç‡§π‡•á ‡§¨‡§ö‡§æ‡§®‡•á ‡§ï‡•Ä ‡§ï‡•ã‡§∂‡§ø‡§∂ ‡§ï‡§∞ ‡§∞‡§π‡§æ ‡§µ‡•ã ‡§§‡•Ä‡§®‡•ã ‚Äú‡§π‡§ø‡§Ç‡§¶‡•Ç‚Äù ‡§π‡•à‡§Ç, ‡§¨‡§∏ ‡§á‡§Ç‡§∏‡§æ‡§®‡§ø‡§Ø‡§§ ‡§ï‡•ã ‡§∏‡§∞‡•ç‡§µ‡§™‡•ç‡§∞‡§•‡§Æ ‡§∞‡§ñ‡§æ‡•§ ‡§ú‡§ó‡§π-‡§ú‡§ó‡§π ‡§ï‡§∂‡§Æ‡•Ä‡§∞‡§ø‡§Ø‡•ã‡§Ç ‡§ï‡•ã ‡§™‡•Ä‡§ü‡§®‡•á ‡§µ‡§æ‡§≤‡•á ‡§§‡§•‡§æ‡§ï‡§•‡§ø‡§§ ‡§¶‡•á‡§∂‡§≠‡§ï‡•ç‡§§ ‡§ó‡•å‡§∞ ‡§∏‡•á ‡§¶‡•á‡§ñ‡•ã‡•§ @user Bhai",0,train
3373,"‡§∏‡§≠‡•Ä ‡§∏‡§æ‡§•‡§ø‡§Ø‡•ã‡§Ç ‡§∏‡•á ‡§µ‡§ø‡§®‡§§‡•Ä ‡§™‡•Ç‡§∞‡•ç‡§µ‡§ï ‡§ï‡§π‡§§‡§æ ‡§π‡•Å ‡§ï‡•Ä 15%‡§Æ‡•Å‡§∏‡•ç‡§≤‡§ø‡§Æ ‡§Ü‡§¨‡§æ‡§¶‡•Ä ‡§µ‡§æ‡§≤‡•á ,,‡§î‡§∞‡§Ç‡§ó‡§æ‡§¨‡§æ‡§¶ ‡§∏‡•á ‡§á‡§Æ‡•ç‡§§‡§ø‡§Ø‡§æ‡§ú ‡§ú‡§º‡§≤‡•Ä‡§≤ AIMIM ‡§ï‡•á‡•§‡§ü‡§ø‡§ï‡§ü ‡§∏‡•á ‡§ú‡•Ä‡§§ ‡§∏‡§ï‡§§‡•á ‡§π‡•à ‡§§‡•ã 42%‡§Æ‡•Å‡§∏‡•ç‡§≤‡§ø‡§Æ ‡§Ü‡§¨‡§æ‡§¶‡•Ä ‡§µ‡§æ‡§≤‡•á ‡§¨‡§π‡§∞‡§æ‡§á‡§ö ‡§∏‡•á ‡§ï‡•ç‡§Ø‡•ã ‡§®‡§π‡•Ä------? @user @user @user",1,train
3016,"Earlier I doubted, but now CONFIRMED!!! This man is murderer of thousands of people in #GujratRiot ‡§Ü‡§ú ‡§§‡§ï ‡§ú‡§ø‡§§‡§®‡•á #‡§®‡•Ä‡§ö ‡§Æ‡§∞‡•á ‡§•‡•à, ‡§â‡§® ‡§∏‡§¨ ‡§ï‡•á ‡§¨‡§¶‡§≤‡•á ‡§á‡§∏‡§ï‡§æ ‡§ú‡§®‡•ç‡§Æ ‡§π‡•Å‡§Ü ‡§π‡•à !!!!!",0,train
3264,"‡§Æ‡•Ä‡§°‡§ø‡§Ø‡§æ ‡§ï‡•á #ExitPoll ‡§™‡§∞ BJP ‡§ï‡•Ä ‡§∏‡•Å‡§®‡§æ‡§Æ‡•Ä ‡§ö‡§≤ ‡§∞‡§π‡•Ä ‡§π‡•à, ‡§Ø‡•á ‡§¶‡•á‡§ñ‡§ï‡§∞ ‡§Æ‡•Å‡§ù‡•á ‡§§‡•á‡§∞‡•Ä ‡§Æ‡•á‡§π‡§∞‡§¨‡§æ‡§®‡§ø‡§Ø‡§æ ‡§´‡§ø‡§≤‡•ç‡§Æ ‡§ï‡•á #‡§ï‡•Å‡§§‡•ç‡§§‡•á ‡§ï‡•Ä ‡§µ‡§´‡§æ‡§¶‡§æ‡§∞‡•Ä ‡§Ø‡§æ‡§¶ ‡§Ü ‡§ó‡§à- #‡§ú‡•Ä_‡§π‡§æ‡§Å #ExitPoll2019",0,train



HAS20_HI
4232 entries, of which 347 (8.20%) are hateful.


Unnamed: 0,text,label,split
863,"RT @user: ""‡§Æ‡•á‡§∞‡•Ä ‡§¨‡•á‡§ü‡•Ä ‡§¨‡§ö‡§™‡§® ‡§∏‡•á ‡§π‡•Ä ‡§ö‡§≤ ‡§´‡§ø‡§∞ ‡§®‡§π‡•Ä‡§Ç ‡§∏‡§ï‡§§‡•Ä‡•§‡§ß‡•Ä‡§∞‡•á-‡§ß‡•Ä‡§∞‡•á ‡§ú‡§¨ ‡§µ‡§π ‡§¨‡§°‡§º‡•Ä ‡§π‡•ã‡§®‡•á ‡§≤‡§ó‡•Ä ‡§§‡§¨ ‡§â‡§∏‡•á ‡§ï‡§Ç‡§ß‡•á ‡§™‡§∞ ‡§â‡§†‡§æ ‡§ï‡§∞ ‡§ñ‡•á‡§§‡•ã‡§Ç ‡§Æ‡•á‡§Ç ‡§≤‡•á ‡§ú‡§æ‡§®‡§æ ‡§Æ‡•á‡§∞‡•á ‡§≤‡§ø‡§è‚Ä¶",0,train
811,RT @user: ‡§∏‡•Å‡§¨‡§π ‡§ï‡§æ ‡§≠‡•Ç‡§≤‡§æ ‡§Ö‡§ó‡§∞ ‡§∂‡§æ‡§Æ ‡§ï‡•ã ‡§ò‡§∞ ‡§≤‡•å‡§ü‡§ï‡§∞ ‡§Ü‡§Ø‡•á ‡§§‡•ã ‡§â‡§∏‡•á ‡§≠‡•Ç‡§≤‡§æ ‡§®‡§π‡•Ä‡§Ç ‡§ï‡§π‡§§‡•á..ü§óü§óüòÅüòÇüòÇ http,0,train
1447,"RT @user: ‡§Æ‡§π‡§æ‡§® ‡§µ‡•à‡§¶‡§ø‡§ï ‡§∏‡§Ç‡§∏‡•ç‡§ï‡•É‡§§‡§ø ‡§ï‡•á ‡§µ‡§ø‡§∞‡•Å‡§¶‡•ç‡§ß ‡§¶‡•Å‡§∑‡•ç‡§™‡•ç‡§∞‡§ö‡§æ‡§∞, ‡§∑‡§°‡•ç‡§Ø‡§Ç‡§§‡•ç‡§∞ ‡§ï‡§∞‡§ï‡•á ‡§®‡•Ä‡§ö‡§æ ‡§¶‡§ø‡§ñ‡§æ‡§ï‡§∞ ‡§ß‡§∞‡•ç‡§Æ‡§æ‡§Ç‡§§‡§∞‡§£ ‡§ï‡§∞‡§ï‡•á ‡§¶‡•á‡§∂ ‡§ï‡•ã ‡§§‡•ã‡§°‡§º‡§®‡•á ‡§µ‡§æ‡§≤‡•ã‡§Ç ‡§∏‡•á ‡§¨‡§ö‡§®‡•á ‡§ï‡•á ‡§≤‡§ø‡§è‚Ä¶",0,train
4142,RT @user: ‡§∂‡§∞‡•Ä‡§∞ ‡§Æ‡§∞ ‡§ú‡§æ‡§§‡§æ ‡§π‡•à ‡§ì‡§∞ ‡§Ü‡§§‡•ç‡§Æ‡§æ ‡§≠‡§ü‡§ï‡§§‡•Ä ‡§π‡•à ‡§≤‡§ø‡§ñ ‡§≤‡•ã ‡§≠‡§ï‡•ç‡§§‡•ã‡§Ç... ‡§µ‡•à‡§∏‡•á ‡§π‡•Ä 23 ‡§Æ‡§à ‡§ï‡•ã ‡§≠‡§æ‡§ú‡§™‡§æ ‡§Æ‡§∞ ‡§ú‡§æ‡§è‡§ó‡•Ä ‡§î‡§∞ ‡§≠‡§ï‡•ç‡§§ ‡§≠‡§ü‡§ï‡•á‡§Ç‡§ó‡•á........ ‡§á‡§ß‚Ä¶,0,test
602,"RT @user: üíùüí¶‡§§‡•Å‡§Æ‡•ç‡§π‡•á ‡§®‡§æ ‡§¶‡•á‡§ñ ‡§ï‡§∞ ‡§ï‡§¨ ‡§§‡§ïü§î ‡§∏‡§¨‡•ç‡§∞ ‡§ï‡§∞‡•Ç‡§Å.,. üòì ‡§Ü‡§Å‡§ñ‡•á ‡§§‡•ã ‡§¨‡§Å‡§¶ üôà‡§ï‡§∞ ‡§≤‡•Ç‡§Å ,‡§™‡§∞ ‡§á‡§∏ ‡§¶‡§ø‡§≤ ‚ù§‡§ï‡§æ ‡§ï‡•ç‡§Ø‡§æ ‡§ï‡§∞‡•Ç‡§Å..!! üòçü§î http",0,train



HAS21_HI
4594 entries, of which 566 (12.32%) are hateful.


Unnamed: 0,text,label,split
702,Infection (corona) Or Injection ( vaccine ) ‡§ö‡•Å‡§®‡§æ‡§µ ‡§Ü‡§™‡§ï‡•ã ‡§ï‡§∞‡§®‡§æ ‡§π‡•à ‡•§ üôèüôè #CovidVaccine #CovidIndia #CovidCrisisIndia #vaccinated #COVIDEmergency2021,0,train
779,‡§¨‡§Ç‡§ó‡§æ‡§≤ ‡§Æ‡•á‡§Ç ‡§á‡§∏ ‡§Ü‡§§‡§Ç‡§ï‡§µ‡§æ‡§¶‡§ø ‡§ï‡•ã ‡§ú‡•á‡§≤ ‡§Æ‡•á‡§Ç ‡§π‡•ã‡§®‡§æ ‡§ö‡§æ‡§π‡§ø‡§è ‡§•‡§æ ‡§≤‡•á‡§ï‡§ø‡§® ‡§á‡§∏‡•á ‡§Æ‡•Å‡§ñ‡•ç‡§Ø‡§Æ‡§Ç‡§§‡•ç‡§∞‡•Ä ‡§¨‡§®‡§æ ‡§¶‡§ø‡§Ø‡§æ ‡§ó‡§Ø‡§æ #BengalBurning http,0,train
10,‡§ú‡•ã ‡§∂‡§π‡§æ‡§¨‡•Å‡§¶‡•ç‡§¶‡•Ä‡§® ‡§ï‡•á ‡§∏‡§æ‡§• ‡§π‡•à‡§Ç ‡§µ‡•ã ‡§á‡§∏ ‡§ü‡•ç‡§µ‡•Ä‡§ü ‡§ï‡•ã ‡§∞‡•Ä‡§ü‡•ç‡§µ‡•Ä‡§ü ‡§ï‡§∞‡•á‡§Ç ‡§î‡§∞ ‡§á‡§∏ ‡§π‡•à‡§∏‡§ü‡•à‡§ó ‡§™‡§∞ ‡§ü‡•ç‡§µ‡•Ä‡§ü ‡§ï‡§∞‡•á‡§Ç #JusticeForShahabuddin,0,train
3217,@user @user @user @user @user @user @user @user @user @user ‡§π‡§Æ ‡§ï‡•á‡§Ç‡§¶‡•ç‡§∞ ‡§∏‡§∞‡§ï‡§æ‡§∞ ‡§∏‡•á #CBI ‡§ú‡§æ‡§Ç‡§ö ‡§ï‡•Ä ‡§Æ‡§æ‡§Ç‡§ó ‡§ï‡§∞‡§§‡§æ ‡§π‡•Ç‡§Å ‡§ì ‡§≠‡•Ä ‡§∏‡•Å‡§™‡•ç‡§∞‡•Ä‡§Æ ‡§ï‡•ã‡§∞‡•ç‡§ü ‡§ï‡•Ä ‡§®‡§ø‡§ó‡§∞‡§æ‡§®‡•Ä ‡§Æ‡•á‡§Ç ‡§®‡§ø‡§∑‡•ç‡§™‡§ï‡•ç‡§∑ ‡§ú‡§æ‡§Ç‡§ö ‡§ï‡•Ä ‡§Æ‡§æ‡§Ç‡§ó ‡§ï‡§∞‡§§‡§æ ‡§π‡•Ç‡§Å ‡§á‡§∂‡•ç‡§Æ‡•á ‡§¶‡•ã‡§∑‡•Ä ‡§≤‡•ã‡§ó‡•ã ‡§ï‡•ã ‡§´‡§æ‡§Ç‡§∏‡•Ä ‡§ï‡•Ä ‡§∏‡§ú‡§æ ‡§π‡•ã‡§®‡•Ä ‡§ö‡§æ‡§π‡§ø‡§è #JusticeForShahabuddin,0,train
1774,‡§ú‡§ø‡§∏ ‡§∂‡§ñ‡•ç‡§∏ ‡§ï‡•ã ‡§ú‡•á‡§≤ ‡§∏‡•á ‡§≤‡•á‡§®‡•á ‡§ï‡•á ‡§≤‡§ø‡§è ‡§π‡§ú‡§æ‡§∞‡•ã‡§Ç ‡§ó‡§æ‡§°‡§ø‡§Ø‡§æ‡§Ç ‡§ú‡§æ‡§§‡•Ä ‡§•‡•Ä‡•§‡•§‡•§‡•§ ‡§Ü‡§ú ‡§â‡§∏‡§ï‡•Ä ‡§Æ‡§Ø‡•ç‡§Ø‡§æ‡§§ ‡§ò‡§∞ ‡§π‡•à ‡§®‡§æ ‡§ú‡§æ ‡§∏‡§ï‡•Ä‡•§‡•§ ‡§§‡§∞‡§∏ ‡§Ü‡§§‡§æ ‡§π‡•à ‡§á‡§∏ ‡§ï‡•ã‡§Æ ‡§™‡§∞‡•§‡•§‡•§‡•§ @user @user @user @user #JusticeForShahabuddin,1,train



KEN20_EN
23192 entries, of which 11596 (50.00%) are hateful.


Unnamed: 0,text,label
8070,Your first task should be to lock that little clit of yours up securely. Free cocks on a sissy is fucking disgusting and they have no right to be uncaged.,1
10812,Maybe they should import millions more immigrants to drive wages down and increase their tax burdens?,0
1758,Ahh yeah I get what you're saying. Well time to grab them and cut their dicks off. Hard to commie rape when you know you're dick will get cut off.,1
28719,You cuffed a girl that fucked you and one of your niggas within the same week URL,1
19872,Kill em all put them hoes to REST üòÇüòÇüòÇ,1



OUS19_AR
3353 entries, of which 755 (22.52%) are hateful.


Unnamed: 0,text,label
390,@user @user ÿßŸÑÿß ÿÆŸÜÿßÿ≤Ÿäÿ± ŸÇÿ∑ÿ± ŸäŸÇŸàŸÑŸàŸÜ ÿßŸäÿ±ÿßŸÜ ÿØŸàŸÑŸá ÿ¥ÿ±ŸäŸÅŸá Ÿà ÿÆŸÑŸäŸÅÿ© ÿßŸÑÿ¥Ÿàÿßÿ∞ ÿßŸÑŸÖÿ™ÿ≠ÿßŸÑŸÅ ŸÖÿπŸáŸÖ,1
644,@user @user ŸáŸáŸáŸáŸáŸáŸáŸáŸáŸáŸáŸáŸáŸáŸáŸáŸáŸáŸáŸáŸáŸáŸáŸáŸáŸáŸáŸáŸáŸáŸáŸáŸáŸáŸáŸáŸáŸáŸáŸáŸáŸáŸáŸáŸáŸáŸáŸá ÿµÿßÿ±ŸàÿÆ ÿ®ŸàŸÑ ÿßŸÑÿ®ÿπŸäÿ± ÿ•ÿ≠ŸÜÿß ÿßŸÑŸÑŸä ÿØŸÇŸäŸÜÿß ÿÆÿ¥ŸàŸÖŸÉŸÖ‚Ä¶ http,1
2313,@user @user ÿπÿßŸÑŸÖ ÿ¥ŸäÿπŸä ŸäŸàÿµŸä ÿ®ÿ¥ÿ±ÿ® ÿ®ŸàŸÑ ÿßŸÑÿ®ÿπŸäÿ± http,1
2101,Ÿàÿ®ÿµŸÅÿ™ŸÜÿß ŸÖÿ∫ÿßÿ±ÿ®ÿ© ŸÑÿß ŸÜŸÇÿ®ŸÑ ÿØÿπŸÖ ŸÖŸÜ ÿÆÿßŸÜŸÜÿß ŸÅŸä ŸäŸàŸÖ ÿßŸÑÿßŸäÿßŸÖ ŸàŸÉÿßŸÜ Ÿäÿ¥ÿ™ŸÖŸÜÿß Ÿäÿß ÿ¥ÿßÿ±ÿ® ÿ®ŸàŸÑ ÿßŸÑÿ®ÿπŸäÿ± http,1
2303,@user @user Ÿáÿ∞ÿß ÿßŸÑŸÇÿ±ÿ∂ÿßŸàŸä ÿÆŸÜÿ≤Ÿäÿ± ŸàŸÜÿµÿßÿ® ŸàŸÇÿØ Ÿàÿ¨ÿØ ÿ∂ÿßŸÑÿ™Ÿá ŸÅŸä ÿ≠ŸÉÿßŸÖ ŸÇÿ∑ÿ± ÿßŸÑÿ£ÿ∫ÿ®Ÿäÿßÿ° Ÿàÿ≥Ÿäÿ∑ÿ± ÿπŸÑŸäŸáŸÖ Ÿàÿ£ÿÆÿ∞ ŸÖŸÜ ÿ£ŸÖŸàÿßŸÑ ÿßŸÑÿ¥‚Ä¶ http,1



OUS19_FR
4014 entries, of which 399 (9.94%) are hateful.


Unnamed: 0,text,label
2248,@user @user @user @user ils mangent pas de porc parce que quand il les tuaie... http,0
2246,@user @user et ouais des mongol √† marseille,0
3166,peut-√™tre que ceux qui √©mettent les oukases non violents sont relativement prot√©g√©s de la violence. mais il semble‚Ä¶ http,0
3706,@user @user ¬´ @user islamo - gauchiste ? ¬ª,0
3545,@user @user @user @user encore un autre mongol qui veut se donner bonne conscience a‚Ä¶ http,0



SAN20_IT
8100 entries, of which 3388 (41.83%) are hateful.


Unnamed: 0,text,label,split
118,"Roma, gli spezzano il dito per rubargli fede: caccia a due rom http http",0,train
4191,"@user La bella domanda √®: se sono profughi, come fanno a sapere dove rivolgersi per denunciare una persona, da parte un ministro del governo italiano???? Chi li sta guidando???? Ci scommetterei una mano che proviene tutto dal pdisastro!!!!!",1,train
1300,Inchiesta a torino - Trovato in Croazia il tesoro della ‚Äúregina‚Äù rom | Liguria | Genova | Il Secolo XIX http,0,train
6917,"Ma @user dimmi ,vorresti far sbarcare questi finti rifugiati politici e finti profughi? Se lo farai ,prometto che non avrai un solo voto da noi !!!! Rispetta le idee √® i principi del CAPITANO SALVINI. Non devi cedere ,tutti abbiamo un cuore ,ma no x questi accattoni. http",1,test
5604,@user @user @user Accidentiüò≤ A pensare che prima la Calabria era un'oasi di pace Ad esempio non c'erano mai stati sequestri di persona Solo con i migranti ospitati a #Riace sono iniziati questi episodi N.B. rispetto i calabresi la cui maggioranza √® formata da brave persone,0,train





## Create and export splits

In [24]:
# Set aside a dev and a test set from each dataset and export them; whatever
# remains is kept in df_dict as the training pool and exported as well.
# Default: 2k entries for testing and 500 for dev, except
# - for Ousidhoum in French and Arabic, where train set would otherwise be too small
# - for the HASOC datasets listed in PROVIDED_TEST_SETS, where the provided test split is used
# NOTE(review): an earlier comment named HASOC 20 and 21 here, but the code matches
# has19_hi and has20_hi (has21_hi gets a random split) -- confirm which is intended.
# NOTE: this cell overwrites df_dict[dataset] with the training pool, so re-running it
# on its own shrinks the data further; re-run the notebook from the top instead.

TEST_SIZE = 2000
DEV_SIZE = 500
CLEAN_PATH = "../0_data/main/1_clean"

# (dev size, test size) overrides for datasets too small for the defaults
SMALL_SET_SIZES = {"ous19_fr": (500, 1500), "ous19_ar": (300, 1000)}
# datasets whose provided "test" split is exported unchanged
PROVIDED_TEST_SETS = ("has19_hi", "has20_hi")

for dataset in df_dict:
    out_dir = f"{CLEAN_PATH}/{dataset}"
    # to_csv does not create missing directories
    os.makedirs(out_dir, exist_ok=True)
    full = df_dict[dataset]

    if any(key in dataset for key in PROVIDED_TEST_SETS):  # use provided test sets
        dev_size = DEV_SIZE
        testset = full[full["split"] == "test"]
        testset.to_csv(f"{out_dir}/test_{len(testset)}.csv", index=False)
        # dev set is drawn from the provided train split only
        trainset, devset = train_test_split(full[full["split"] == "train"], test_size=dev_size, random_state=123)
    else:
        dev_size, test_size = next(
            (sizes for key, sizes in SMALL_SET_SIZES.items() if key in dataset),
            (DEV_SIZE, TEST_SIZE),
        )
        # first carve out dev+test together, then split that portion into dev and test
        trainset, devtest = train_test_split(full, test_size=dev_size + test_size, random_state=123)
        devset, testset = train_test_split(devtest, test_size=test_size, random_state=123)
        testset.to_csv(f"{out_dir}/test_{test_size}.csv", index=False)

    devset.to_csv(f"{out_dir}/dev_{dev_size}.csv", index=False)

    # export all data that is not test or dev, so we can use it for full sample training
    df_dict[dataset] = trainset
    trainset.to_csv(f"{out_dir}/train_{len(trainset)}.csv", index=False)

In [25]:
# create differently-sized train portions from rest of data
# (df_dict holds the training pool of each dataset at this point)

SEEDS = 10 # for repeated experiments with different random data selection
N_RANGE = [10, 20, 30, 40, 50, 100, 200, 300, 400, 500, 1000, 2000, 3000, 4000, 5000, 10000, 20000]

# English datasets get every size in N_RANGE; all other datasets only sizes up to 2k
ENGLISH_KEYS = ("dyn21", "ken20", "fou18")
MAX_N_OTHER = 2000

for dataset in df_dict:
    print(dataset.upper())
    train_pool = df_dict[dataset]
    train_dir = f"../0_data/main/1_clean/{dataset}/train"
    # to_csv does not create missing directories
    os.makedirs(train_dir, exist_ok=True)
    is_english = any(key in dataset for key in ENGLISH_KEYS)

    for n in N_RANGE:
        # only sizes strictly smaller than the training pool are sampled
        if n >= len(train_pool):
            continue
        if not (is_english or n <= MAX_N_OTHER):
            continue

        print(f"  saving n = {n} training set")
        # one independent random sample per seed
        for random_state in range(1, SEEDS+1):
            train_pool.sample(n, random_state = random_state).to_csv(f"{train_dir}/train_{n}_rs{random_state}.csv",index=False)

    print()

BAS19_ES
  saving n = 10 training set
  saving n = 20 training set
  saving n = 30 training set
  saving n = 40 training set
  saving n = 50 training set
  saving n = 100 training set
  saving n = 200 training set
  saving n = 300 training set
  saving n = 400 training set
  saving n = 500 training set
  saving n = 1000 training set
  saving n = 2000 training set

DYN21_EN
  saving n = 10 training set
  saving n = 20 training set
  saving n = 30 training set
  saving n = 40 training set
  saving n = 50 training set
  saving n = 100 training set
  saving n = 200 training set
  saving n = 300 training set
  saving n = 400 training set
  saving n = 500 training set
  saving n = 1000 training set
  saving n = 2000 training set
  saving n = 3000 training set
  saving n = 4000 training set
  saving n = 5000 training set
  saving n = 10000 training set
  saving n = 20000 training set

FOR19_PT
  saving n = 10 training set
  saving n = 20 training set
  saving n = 30 training set
  saving n = 