# Pipeline for Dataset Merge 

In [1]:
import csv
import pandas as pd
import os
from IPython.display import display


## Configure Data Directory and Dataset Names

In [2]:
# All data paths are resolved relative to the directory the kernel was started in.
cwd = os.getcwd()
# Pass the components separately so os.path.join actually does the joining; the
# trailing '' makes the result end with a path separator, so plain string
# concatenation such as `data_dir + name` still yields a valid path.
data_dir = os.path.join(cwd, 'data', '')
# Names of the corpus sub-folders below `data_dir`; later cells index this list by position.
datasets = ['germeval2018','germeval2019','hasoc2019','hasoc2020','polly','hatespeech_refugees']
# File extensions that the merge functions accept as raw input files.
formats = [".txt",".csv",".tsv"]
#dataset = datasets[0]



## Preprocessing Functions for the Individual Datasets

### Preprocessing GermEval Datasets

In [3]:
def preprocessGermEval(dataset, base_dir=None, extensions=None):
    """Merge all raw files of one GermEval corpus into a single DataFrame.

    Every file in the corpus folder whose name ends with one of the accepted
    extensions is read as a header-less, tab-separated table. Only the first
    two columns are kept, and a third column holding ``dataset`` is added.
    In column 1 the value ``'OFFENSE'`` is recoded to ``'1'`` and ``'OTHER'``
    to ``'0'``; any other value is left unchanged.

    Parameters
    ----------
    dataset : str
        Name of the corpus folder; must contain ``"germeval"``.
    base_dir : str, optional
        Directory containing the corpus folder. Defaults to the notebook-level
        ``data_dir``.
    extensions : iterable of str, optional
        Accepted file-name endings. Defaults to the notebook-level ``formats``.

    Returns
    -------
    pandas.DataFrame or None
        Frame with columns ``0``, ``1`` and ``2``; the row index of each source
        file is kept as-is, so index labels repeat across files. An empty
        frame is returned when no file matches. ``None`` is returned (after
        printing a message) when ``dataset`` is not a GermEval name or when
        the folder already contains a file with ``"merged"`` in its name.
    """
    if "germeval" not in dataset:
        print("This function can only preprocess GermEval datasets")
        return None

    # Fall back to the notebook configuration unless the caller overrides it.
    root = data_dir if base_dir is None else base_dir
    suffixes = tuple(formats if extensions is None else extensions)
    dataset_dir = os.path.join(root, dataset)

    # os.listdir returns names in arbitrary order; sorting keeps the row order
    # (and therefore any later "keep first" de-duplication) reproducible.
    files = sorted(os.listdir(dataset_dir))
    if any("merged" in file for file in files):
        print("already merged")
        return None

    frames = []
    for file in files:
        # Match on the real extension so a name such as "x.txt.csv" is read
        # once instead of once per contained extension.
        if not file.endswith(suffixes):
            continue
        print("Merge file: " + file)
        # NOTE(review): default quote handling can merge several lines into one
        # row when a line contains an unbalanced '"'. If the raw files are
        # unquoted TSV, consider quoting=csv.QUOTE_NONE - confirm against the
        # raw line counts.
        df_file = pd.read_csv(os.path.join(dataset_dir, file), sep='\t', header=None)
        df_file = df_file.iloc[:, 0:2].copy()
        df_file[2] = dataset
        frames.append(df_file)

    if not frames:
        return pd.DataFrame()

    # One concat instead of repeated DataFrame.append (removed in pandas 2.0).
    df = pd.concat(frames)
    if 1 in df.columns:
        # Vectorised recoding; writing to the rows yielded by iterrows() is not
        # guaranteed to change the frame.
        df[1] = df[1].replace({'OFFENSE': '1', 'OTHER': '0'})
    return df
                    
#preprocessGermEval(dataset)

### Preprocessing Hasoc Datasets

In [4]:
def preprocessHasoc(dataset, delimiter, base_dir=None, extensions=None):
    """Merge all raw files of one HASOC corpus into a single DataFrame.

    Every file in the corpus folder whose name ends with one of the accepted
    extensions is read as a header-less table. The first row of each file is
    dropped (it is treated as the file's header line), the columns at
    positions 1 and 2 are kept and relabelled ``0`` and ``1``, and a column
    ``2`` holding ``dataset`` is added. Newline characters inside the kept
    cells are removed. In column 1 ``'HOF'`` becomes ``'1'`` and ``'NOT'``
    becomes ``'0'``; other values are left unchanged.

    Parameters
    ----------
    dataset : str
        Name of the corpus folder; must contain ``"hasoc"``.
    delimiter : str
        Field separator of the raw files, passed to ``pandas.read_csv``.
    base_dir : str, optional
        Directory containing the corpus folder. Defaults to the notebook-level
        ``data_dir``.
    extensions : iterable of str, optional
        Accepted file-name endings. Defaults to the notebook-level ``formats``.

    Returns
    -------
    pandas.DataFrame or None
        Frame with columns ``0``, ``1`` and ``2``; per-file row labels are
        kept, so they start at 1 and repeat across files. An empty frame is
        returned when no file matches. ``None`` is returned (after printing a
        message) when ``dataset`` is not a HASOC name or when the folder
        already contains a file with ``"merged"`` in its name.
    """
    if "hasoc" not in dataset:
        print("This function can only preprocess HASOC datasets")
        return None

    # Fall back to the notebook configuration unless the caller overrides it.
    root = data_dir if base_dir is None else base_dir
    suffixes = tuple(formats if extensions is None else extensions)
    dataset_dir = os.path.join(root, dataset)

    # Sorted for a reproducible row order; os.listdir order is arbitrary.
    files = sorted(os.listdir(dataset_dir))
    if any("merged" in file for file in files):
        print("already merged")
        return None

    frames = []
    for file in files:
        # Match on the real extension so no file is read more than once.
        if not file.endswith(suffixes):
            continue
        print("Merge file: " + file)
        # NOTE(review): default quote handling can merge lines that contain an
        # unbalanced '"' - confirm the row counts against the raw files.
        df_file = pd.read_csv(os.path.join(dataset_dir, file), sep=delimiter, header=None)
        # header=None turns the header line into data row 0, so skip it here;
        # this equals dropping index label 0 after concatenation but cannot
        # fail when nothing was loaded.
        df_file = df_file.iloc[1:, 1:3].replace('\n', '', regex=True)
        df_file = df_file.rename(columns={1: 0, 2: 1})
        df_file[2] = dataset
        frames.append(df_file)

    if not frames:
        return pd.DataFrame()

    # One concat instead of repeated DataFrame.append (removed in pandas 2.0).
    df = pd.concat(frames)
    if 1 in df.columns:
        # Vectorised recoding; writing to the rows yielded by iterrows() is not
        # guaranteed to change the frame.
        df[1] = df[1].replace({'HOF': '1', 'NOT': '0'})
    return df
#preprocessHasoc(dataset)         

### Preprocessing Polly Corpus

In [5]:
def preprocessPollyCorpus(dataset, base_dir=None):
    """Load the Polly corpus and label every entry as hate speech.

    Reads ``hatespeech_polly.csv`` from the corpus folder as a header-less,
    comma-separated table, drops its first row (treated as the header line),
    keeps the first column with newline characters removed, and adds a label
    column and a column holding ``dataset``.

    Parameters
    ----------
    dataset : str
        Name of the corpus folder; must contain ``"polly"``.
    base_dir : str, optional
        Directory containing the corpus folder. Defaults to the notebook-level
        ``data_dir``.

    Returns
    -------
    pandas.DataFrame or None
        Frame with columns ``0`` (text), ``1`` (always the string ``'1'``) and
        ``2`` (``dataset``); row labels start at 1. ``None`` is returned (after
        printing a message) when ``dataset`` is not the Polly corpus.
    """
    if "polly" not in dataset:
        print("This function can only preprocess the Polly Corpus!")
        return None

    # Fall back to the notebook configuration unless the caller overrides it.
    root = data_dir if base_dir is None else base_dir
    raw = pd.read_csv(os.path.join(root, dataset, "hatespeech_polly.csv"), sep=',', header=None)

    # header=None turns the header line into data row 0, so skip it. Only the
    # first column is needed because the label column is overwritten anyway.
    df = raw.iloc[1:, 0:1].replace('\n', '', regex=True)
    # Use the string '1' like the other preprocessing functions in this
    # notebook, so the merged label column does not mix int and str values.
    df[1] = '1'
    df[2] = dataset
    return df
    
    
#preprocessPollyCorpus(dataset)

### Preprocessing Refugee Corpus

In [6]:
def preprocessRefugeeCorpus(dataset, base_dir=None):
    """Load the refugee corpus, keeping only rows whose two labels agree.

    Reads ``hatespeech_refugees.csv`` from the corpus folder as a header-less,
    comma-separated table. A row is kept only when its values in columns 1 and
    2 are equal; rows with a missing value in either column never compare
    equal and are dropped. In column 1 ``'NO'`` becomes ``'0'`` and ``'YES'``
    becomes ``'1'``; other values are left unchanged. The first two columns
    are kept and a column holding ``dataset`` is added.

    Parameters
    ----------
    dataset : str
        Name of the corpus folder; must contain ``"refugee"``.
    base_dir : str, optional
        Directory containing the corpus folder. Defaults to the notebook-level
        ``data_dir``.

    Returns
    -------
    pandas.DataFrame or None
        Frame with columns ``0``, ``1`` and ``2``; the original row labels of
        the kept rows are preserved. ``None`` is returned (after printing a
        message) when ``dataset`` is not the refugee corpus.
    """
    if "refugee" not in dataset:
        print("This function can only preprocess the Refugee Corpus!")
        return None

    # Fall back to the notebook configuration unless the caller overrides it.
    root = data_dir if base_dir is None else base_dir
    raw = pd.read_csv(os.path.join(root, dataset, "hatespeech_refugees.csv"), sep=',', header=None)

    # Boolean mask instead of appending matching rows one by one
    # (DataFrame.append was removed in pandas 2.0 and was quadratic).
    # A header line is filtered out by the same test, provided its two label
    # captions differ - presumably the case; verify against the raw file.
    labels_agree = raw.iloc[:, 1] == raw.iloc[:, 2]
    df_new = raw.loc[labels_agree].iloc[:, 0:2].copy()
    df_new[1] = df_new[1].replace({'NO': '0', 'YES': '1'})
    df_new[2] = dataset
    return df_new
#preprocessRefugeeCorpus(dataset)

## Preprocess and Load All Datasets

In [7]:
# Every corpus is rebuilt from its raw files by the matching preprocessing
# function. Earlier revisions of this cell instead read cached
# `<name>_merged.csv` files from `data/<name>/`; those reads are not used here.

# GermEval 2018 and 2019 are the first two entries of `datasets`.
df_germeval2018, df_germeval2019 = [
    preprocessGermEval(name) for name in datasets[0:2]
]

# The two HASOC releases differ only in their field separator.
df_hasoc2019, df_hasoc2020 = [
    preprocessHasoc(name, delimiter)
    for name, delimiter in zip(datasets[2:4], ('\t', ';'))
]

# The refugee corpus is processed before Polly, matching the order in the list below.
df_hatespeech_refugees = preprocessRefugeeCorpus(datasets[5])
df_polly = preprocessPollyCorpus(datasets[4])

# Order matters downstream: the frames are concatenated in exactly this order.
df_datasets = [
    df_germeval2018,
    df_germeval2019,
    df_hasoc2019,
    df_hasoc2020,
    df_hatespeech_refugees,
    df_polly,
]

display(df_datasets)

Merge file: germeval2018_test.txt
Merge file: germeval2018_training.txt
Merge file: germeval2019_training_subtask12.txt
Merge file: germeval2019GoldLabelsSubtask3.txt
Merge file: germeval2019GoldLabelsSubtask1_2.txt
Merge file: germeval2019_training_subtask3.txt
Merge file: german_dataset.tsv
Merge file: hasoc_de_test_gold.tsv
Merge file: hasoc_2020_de_test_new.csv
Merge file: hasoc_2020_de_train_new.csv


[                                                      0  1             2
 0     Meine Mutter hat mir erzählt, dass mein Vater ...  0  germeval2018
 1     @Tom174_ @davidbest95 Meine Reaktion; |LBR| Ni...  0  germeval2018
 2     #Merkel rollt dem Emir von #Katar, der islamis...  0  germeval2018
 3     „Merle ist kein junges unschuldiges Mädchen“ K...  0  germeval2018
 4     @umweltundaktiv Asylantenflut bringt eben nur ...  1  germeval2018
 ...                                                 ... ..           ...
 5004  Gegens. Zul. zu Patenamt &amp; gegenseitige An...  0  germeval2018
 5005  @GlasenappHenrik Zu Merkel fällt mir nur ein, ...  1  germeval2018
 5006  @KokoLores20 @krippmarie Ein richtiges Zeichen...  1  germeval2018
 5007  @Hartes_Geld ,Honecker‘Merkel macht uns zur ,D...  1  germeval2018
 5008  Warum wurden die G20-Chaoten nicht sofort auf ...  0  germeval2018
 
 [8534 rows x 3 columns],
                                                       0  1             2
 0     @jo

## Merge all Dataframes into one dataset

In [8]:
def mergeDatasets(df_datasets):
    """Concatenate the per-corpus frames into one frame with named columns.

    Parameters
    ----------
    df_datasets : iterable of pandas.DataFrame or None
        Frames to stack, in the order they should appear in the result.
        ``None`` entries and empty frames are skipped.

    Returns
    -------
    pandas.DataFrame
        All rows of the input frames with a fresh ``0..n-1`` index. Columns
        labelled ``0``, ``1`` and ``2`` are renamed to ``'text'``, ``'hate'``
        and ``'dataset'``. When there is nothing to merge, an empty frame with
        those three columns is returned.
    """
    # The preprocessing functions return None when they skip a corpus, so
    # filter those out instead of failing inside the concatenation.
    frames = [df for df in df_datasets if df is not None and not df.empty]
    if not frames:
        return pd.DataFrame(columns=['text', 'hate', 'dataset'])

    # A single concat replaces the repeated DataFrame.append calls (removed in
    # pandas 2.0); ignore_index=True does the job of reset_index + drop("index").
    df_all_datasets = pd.concat(frames, ignore_index=True)
    return df_all_datasets.rename(columns={0: 'text', 1: 'hate', 2: 'dataset'})
    

In [9]:
# Stack the per-corpus frames into one frame with the columns text / hate / dataset.
df = mergeDatasets(df_datasets)

In [11]:
# Drop rows whose text already occurred. pandas keeps the first occurrence by
# default, so which row (and label) survives depends on the order in which the
# corpora were concatenated.
df = df.drop_duplicates(subset='text')

In [23]:
# Renumber the rows 0..n-1; removing duplicates may have left gaps in the index.
df = df.reset_index(drop=True)
# The file is written relative to the kernel's working directory. With the
# default index=True the row numbers are stored as an unnamed first column.
df.to_csv("final_dataset.csv",sep=",")

In [24]:
# Show the final merged frame as the cell output.
df

Unnamed: 0,text,hate,dataset
0,"Meine Mutter hat mir erzählt, dass mein Vater ...",0,germeval2018
1,@Tom174_ @davidbest95 Meine Reaktion; |LBR| Ni...,0,germeval2018
2,"#Merkel rollt dem Emir von #Katar, der islamis...",0,germeval2018
3,„Merle ist kein junges unschuldiges Mädchen“ K...,0,germeval2018
4,@umweltundaktiv Asylantenflut bringt eben nur ...,1,germeval2018
...,...,...,...
45126,RT @name: Urin wird im Hoden gespeichert,1,polly
45127,"RT @name: Viele von denen, die sich jetzt über...",1,polly
45128,RT @name: Wenn eine Schule schon wegen der rel...,1,polly
45129,"RT @news: Als Twitter @name sperrte, schwiegst...",1,polly
