In [1]:
# import relevant packages
import os
import pandas as pd
import re
from html import unescape
from sklearn.model_selection import train_test_split

## Load raw datasets

In [7]:
# Directory that holds the raw per-dataset CSV files.
PATH = "./data/0_raw"

# Maps each dataset name (the file name without its ".csv" suffix) to its DataFrame.
df_dict = {}

# os.listdir returns entries in arbitrary order and includes everything in the
# directory, so sort for a reproducible load order and skip anything that is
# not a CSV file (e.g. checkpoint folders or OS metadata files).
for file in sorted(os.listdir(PATH)):
    if not file.endswith(".csv"):
        continue
    print(file)
    df_dict[file.removesuffix(".csv")] = pd.read_csv(os.path.join(PATH, file))

fortuna2019_portuguese.csv
basile2019_spanish.csv
sanguinetti2020_italian.csv


## Reformat columns

## Clean text

## Create and save splits

In [5]:
# calculates the percentage of positive class in a dataset
def _hate_percentage(df):
    return df.label.sum()/len(df), len(df)

def _percentage_message(percentage, split_name, dataset_name):
    # displays the percentage of the positive class in a dataset
    print(f'percentage of hate in {split_name} in {dataset_name} is {percentage[0]*100:.2f}% out of {percentage[1]} examples')

def _dataset_save(df, language, split):
    # a function to save the resultant data splits
    path = os.path.abspath('data')
    path = os.path.join(f'data/{language}/{split}/')
    df.to_csv(os.path.join(path, 'data.csv'), index=False)
    
def _clean_text(text):
    # a function that contains the entire logic for text cleaning
    text = unescape(text)
    text = re.sub(r"@[A-Za-z0-9_-]+",'@user',text)
    text = re.sub(r"http\S+",'[URL]',text)
    text = re.sub(r"\n",' ',text)
    text = re.sub(r"\t",' ',text)
    text = text.strip()
    return text

def spliter(df, language, random_state=123):
    """Split a labelled dataset into train/val/test, report balance, and save.

    The data is split 80/20 stratified on ``label``; the held-out 20% is then
    halved (again stratified) into validation and test. The positive-class
    share is printed for the full dataset and for each split, after which each
    split is written to disk via ``_dataset_save``.

    Args:
        df: DataFrame with a ``label`` column used for stratification.
        language: Dataset language; used in the printed report and passed to
            ``_dataset_save``.
        random_state: Seed passed to both ``train_test_split`` calls.
    """
    train, held_out = train_test_split(
        df, test_size=0.2, stratify=df.label, random_state=random_state
    )
    dev, test = train_test_split(
        held_out, test_size=0.5, stratify=held_out.label, random_state=random_state
    )

    splits = {'train': train, 'val': dev, 'test': test}

    # Report the full dataset first, then every split.
    for split_name, frame in [('total', df), *splits.items()]:
        _percentage_message(_hate_percentage(frame), split_name, language)

    # Save only after all reports have been printed.
    for split_name, frame in splits.items():
        _dataset_save(frame, language, split_name)

## Portuguese

We start by loading the Portuguese dataset and displaying the first five lines.

In [3]:
# Read the raw Portuguese CSV into a DataFrame and preview its first five rows.
# NOTE(review): this reads from "data/raw_datasets", whereas the loading cell near the
# top of the notebook uses "./data/0_raw" — confirm which directory is current.
df_portuguese = pd.read_csv("data/raw_datasets/fortuna2019_portuguese.csv")
df_portuguese.head()

Unnamed: 0,text,hatespeech_comb,hatespeech_G1,annotator_G1,hatespeech_G2,annotator_G2,hatespeech_G3,annotator_G3
0,@__andrea__b \nO cara vive em outro mundo\nNão...,1,1,A,1.0,V,0,E
1,@_carmeloneto Estes incompetentes não cuidam n...,0,1,D,0.0,V,0,C
2,@_carmeloneto \nOs 'cumpanhero' quebraram toda...,0,1,A,0.0,B,0,E
3,@_GlitteryKisses é isso não conseguem pensar n...,0,0,C,0.0,V,0,D
4,@_iglira bom dia macaco branco haha,1,0,A,1.0,I,1,E


We encapsulate the logic to preprocess the Portuguese dataset in a function and then apply it to the dataset.

In [4]:
def fortuna_preprocess(df):
    """Preprocess the Portuguese dataset.

    Renames ``hatespeech_comb`` to ``label``, cleans every entry of the
    ``text`` column with ``_clean_text``, and keeps only the ``text`` and
    ``label`` columns.

    Args:
        df: Raw Portuguese DataFrame with ``text`` and ``hatespeech_comb`` columns.

    Returns:
        DataFrame restricted to the ``text`` and ``label`` columns.
    """
    renamed = df.rename(columns={"hatespeech_comb": "label"})
    renamed["text"] = renamed["text"].apply(_clean_text)
    # Boolean mask over the column index; columns keep their original order.
    keep_mask = renamed.columns.isin(['text', 'label'])
    return renamed[renamed.columns[keep_mask]]

# Reduce the raw frame to cleaned text plus label.
df_portuguese = fortuna_preprocess(df_portuguese)
# Uses spliter's default random_state (123): prints the class balance and writes the train/val/test CSVs.
spliter(df_portuguese, 'Portuguese')

percentage of hate in total in Portuguese is 31.53% out of 5670 examples
percentage of hate in train in Portuguese is 31.53% out of 4536 examples
percentage of hate in val in Portuguese is 31.57% out of 567 examples
percentage of hate in test in Portuguese is 31.57% out of 567 examples


## Italian

We load the Italian dataset and inspect the first few lines.

In [6]:
# Read the raw Italian CSV into a DataFrame and preview its first five rows.
# NOTE(review): this reads from "data/raw_datasets", whereas the loading cell near the
# top of the notebook uses "./data/0_raw" — confirm which directory is current.
df_italian = pd.read_csv("data/raw_datasets/sanguinetti2020_italian.csv")
df_italian.head()

Unnamed: 0,id,text,hs,stereotype,split
0,2066,"È terrorismo anche questo, per mettere in uno ...",0,0,train
1,2045,@user @user infatti finché ci hanno guadagnato...,0,0,train
2,61,"Corriere: Tangenti, Mafia Capitale dimenticata...",0,0,train
3,1259,"@user ad uno ad uno, perché quando i migranti ...",0,0,train
4,949,Il divertimento del giorno? Trovare i patrioti...,0,0,train


Below we apply the preprocessing and splitting logic specific to the Italian dataset. Because this dataset differs slightly from the other two (it already comes with its own train/test split), we use custom splitting logic, as shown in the code below.

In [7]:
def sanguinetti_preprocess(df):
    """Preprocess the Italian dataset.

    Renames ``hs`` to ``label``, cleans every entry of the ``text`` column
    with ``_clean_text``, and keeps only the ``text``, ``label`` and ``split``
    columns (``split`` is retained so the original partition can be reused).

    Args:
        df: Raw Italian DataFrame with ``text``, ``hs`` and ``split`` columns.

    Returns:
        DataFrame restricted to the ``text``, ``label`` and ``split`` columns.
    """
    renamed = df.rename(columns={"hs": "label"})
    renamed["text"] = renamed["text"].apply(_clean_text)
    # Boolean mask over the column index; columns keep their original order.
    keep_mask = renamed.columns.isin(['text', 'label', 'split'])
    return renamed[renamed.columns[keep_mask]]

df_italian = sanguinetti_preprocess(df_italian)

# Reuse the partition recorded in the "split" column: "test" rows are used
# as-is, "train" rows are split further below.
traindev = df_italian.loc[df_italian["split"] == "train", ["text", "label"]]
italian_test = df_italian.loc[df_italian["split"] == "test", ["text", "label"]]

# Hold out 10% of the original train rows as validation data, stratified on the label.
italian_train, italian_dev = train_test_split(
    traindev, test_size=0.1, stratify=traindev.label, random_state=123
)

LANGUAGE = 'Italian'
italian_splits = {'train': italian_train, 'val': italian_dev, 'test': italian_test}

# Report the share of the positive class for the full dataset, then per split.
for split_name, frame in [('total', df_italian), *italian_splits.items()]:
    _percentage_message(_hate_percentage(frame), split_name, LANGUAGE)

# Write each split to disk once all reports have been printed.
for split_name, frame in italian_splits.items():
    _dataset_save(frame, LANGUAGE, split_name)

percentage of hate in total in Italian is 41.83% out of 8100 examples
percentage of hate in train in Italian is 40.45% out of 6153 examples
percentage of hate in val in Italian is 40.50% out of 684 examples
percentage of hate in test in Italian is 49.25% out of 1263 examples


## Spanish

Finally, we load the Spanish dataset and look at the first few examples as well.

In [8]:
# Read the raw Spanish CSV into a DataFrame and preview its first five rows.
# NOTE(review): this reads from "data/raw_datasets", whereas the loading cell near the
# top of the notebook uses "./data/0_raw" — confirm which directory is current.
df_spanish = pd.read_csv("data/raw_datasets/basile2019_spanish.csv")
df_spanish.head()

Unnamed: 0,id,text,HS,TR,AG
0,20001,Easyjet quiere duplicar el número de mujeres p...,1,0,0
1,20002,El gobierno debe crear un control estricto de ...,1,0,0
2,20003,Yo veo a mujeres destruidas por acoso laboral ...,0,0,0
3,20004,"— Yo soy respetuoso con los demás, sólamente l...",0,0,0
4,20007,Antonio Caballero y como ser de mal gusto e ig...,0,0,0


Then we preprocess and split the dataset.

In [9]:
def basile_preprocess(df):
    """Preprocess the Spanish dataset.

    Renames ``HS`` to ``label``, cleans every entry of the ``text`` column
    with ``_clean_text``, and keeps only the ``text`` and ``label`` columns.

    Args:
        df: Raw Spanish DataFrame with ``text`` and ``HS`` columns.

    Returns:
        DataFrame restricted to the ``text`` and ``label`` columns.
    """
    renamed = df.rename(columns={"HS": "label"})
    renamed["text"] = renamed["text"].apply(_clean_text)
    # Boolean mask over the column index; columns keep their original order.
    keep_mask = renamed.columns.isin(['text', 'label'])
    return renamed[renamed.columns[keep_mask]]

# Reduce the raw frame to cleaned text plus label.
df_spanish = basile_preprocess(df_spanish)
# Uses spliter's default random_state (123): prints the class balance and writes the train/val/test CSVs.
spliter(df_spanish, 'Spanish')

percentage of hate in total in Spanish is 41.52% out of 4950 examples
percentage of hate in train in Spanish is 41.52% out of 3960 examples
percentage of hate in val in Spanish is 41.62% out of 495 examples
percentage of hate in test in Spanish is 41.41% out of 495 examples
