In [27]:
import pandas as pd
import numpy as np
import csv
from tqdm import tqdm
from sklearn.model_selection import train_test_split

## GHC

In [4]:
# Load the raw Gab Hate Corpus annotations (one row per post/annotator pair).
# A forward slash is used because 'GHC\G...' relies on an invalid escape
# sequence and only resolves on Windows; 'GHC/...' works on every platform.
ghc_df = pd.read_csv('GHC/GabHateCorpus_annotations.tsv', sep='\t')
ghc_df.head()

Unnamed: 0,ID,Annotator,Text,Hate,HD,CV,VO,REL,RAE,SXO,GEN,IDL,NAT,POL,MPH,EX,IM
0,27044,4,Ah the PSYOPS antifa crew is back. That’s how ...,0,0,0,0,,,,,,,,,,
1,27044,15,Ah the PSYOPS antifa crew is back. That’s how ...,0,0,0,0,,,,,,,,,,
2,27044,10,Ah the PSYOPS antifa crew is back. That’s how ...,0,0,0,0,,,,,,,,,,
3,27044,8,Ah the PSYOPS antifa crew is back. That’s how ...,0,0,0,0,,,,,,,,,,
4,27045,4,Get the new Android app update released today ...,0,0,0,0,,,,,,,,,,


In [55]:
# Number of distinct post IDs (a missing ID, if any, counts as one value).
print(ghc_df['ID'].nunique(dropna=False))

27665


In [69]:
# Collapse the per-annotator rows into one label per post and write them to
# GHC/ghc_preprocessed.tsv with columns (txt, hate):
#   * no annotator marked the post as hate                  -> hate = 0
#   * hate votes >= floor(2/3 * number of annotations)      -> hate = 1
#   * anything in between (some votes, below the threshold) -> post is dropped
# NOTE(review): assumes `Hate` holds 0/1 votes, as in the preview above.
#
# The file is opened with 'w' so re-running the cell regenerates it instead of
# appending a second header and duplicate rows; newline='' is what the csv
# module requires to avoid blank lines on Windows; the `with` block closes the
# file even if the loop raises.
ghc_groups = ghc_df.groupby('ID', sort=False)  # one pass; keeps first-seen ID order
with open('GHC/ghc_preprocessed.tsv', 'w', encoding='utf-8', newline='') as f:
    writer = csv.writer(f, delimiter='\t')
    writer.writerow(['txt', 'hate'])
    for _, rows in tqdm(ghc_groups, total=ghc_groups.ngroups):
        n_annotations = len(rows)
        # skipna=False: a missing vote makes the sum NaN, so the post is skipped.
        n_hate_votes = rows['Hate'].sum(skipna=False)
        text = rows['Text'].iloc[0]
        # Test the zero-vote case first: when the floored threshold is 0 (a
        # single annotation), `votes >= threshold` would otherwise label a
        # post with no hate votes as hate.
        if n_hate_votes == 0:
            writer.writerow([text, '0'])
        elif n_hate_votes >= int(n_annotations * (2 / 3)):
            writer.writerow([text, '1'])

100%|██████████| 27665/27665 [00:19<00:00, 1384.81it/s]


In [40]:
# Read the aggregated posts back in and show the class balance.
ghc_df_after = pd.read_csv('GHC/ghc_preprocessed.tsv', sep='\t')
ghc_df_after.loc[:, 'hate'].value_counts()

0    19852
1     2563
Name: hate, dtype: int64

In [37]:
# Split the aggregated GHC posts 80/10/10 into train / validation / test and
# export each split as a TSV.
# NOTE(review): the split is not stratified on `hate`, so the share of hate
# posts can differ slightly between splits -- confirm this is intended.
RANDOM_SEED = 42  # fixed so the split is reproducible

# Hold out 20% of the rows, then halve the hold-out into validation and test.
df_train, df_holdout = train_test_split(
    ghc_df_after,
    test_size=0.2,
    random_state=RANDOM_SEED,
)
df_val, df_test = train_test_split(
    df_holdout,
    test_size=0.5,
    random_state=RANDOM_SEED,
)

# One write loop instead of three copy-pasted calls. The columns txt/hate are
# renamed to text/is_hate in the output header; the DataFrame index is still
# written as the first column (to_csv default).
ghc_split_files = {
    'real_dataset/ghc_train.tsv': df_train,
    'real_dataset/ghc_val.tsv': df_val,
    'real_dataset/ghc_test.tsv': df_test,
}
for out_path, split_df in ghc_split_files.items():
    split_df.to_csv(out_path, sep='\t', header=['text', 'is_hate'], columns=['txt', 'hate'])

In [45]:
# Class balance of the second column in each split, in train / val / test order.
for split_df in (df_train, df_val, df_test):
    label_counts = split_df.iloc[:, 1].value_counts()
    print(label_counts)

0    15859
1     2073
Name: hate, dtype: int64
0    2003
1     238
Name: hate, dtype: int64
0    1990
1     252
Name: hate, dtype: int64


## Stormfront

In [62]:
# Replacement table applied to the `label` column: annotation string -> 0/1 target.
LABELS = {'noHate': 0, 'hate': 1}

def get_text(file_id):
    """Return the first line of a Stormfront post file, without its line ending.

    Args:
        file_id: Stem of the file under ``Stormfront/all_files/``; ``.txt`` is
            appended to build the path.

    Returns:
        str: The first line of the file with the trailing newline removed, or
        an empty string if the file is empty.

    Raises:
        FileNotFoundError: If no file exists for ``file_id``.
    """
    # NOTE(review): only the first line is read; presumably each file holds a
    # single sentence -- confirm against the dataset layout.
    with open(f'Stormfront/all_files/{file_id}.txt', encoding='utf-8') as f:
        # readline() keeps the terminating '\n'; strip it so it does not end
        # up inside the text column of the exported TSV.
        return f.readline().rstrip('\n')

# Build the Stormfront table: attach each post's text, map the annotation
# strings to 0/1 via LABELS, and export the (text, is_hate) columns.
df = pd.read_csv('Stormfront/annotations_metadata.csv')
df.index.name = 'doc_id'  # written as the first column of the TSV
df['text'] = df['file_id'].apply(get_text)
# Assign the result back instead of calling replace(..., inplace=True) on the
# column: the chained in-place form acts on a temporary Series and does not
# update `df` under pandas Copy-on-Write. Labels that are not keys of LABELS
# are left unchanged.
df['label'] = df['label'].replace(LABELS)
df = df.rename(columns={'label': 'is_hate'})
# Write to the path the next cell reads back ('Stormfront/stormfront.tsv');
# the previous 'stormfront.tsv' target landed in the working directory instead.
df.to_csv('Stormfront/stormfront.tsv', sep='\t', columns=['text', 'is_hate'])

In [48]:
# Read the exported Stormfront table back in and show how the labels are distributed.
stormfront_df_after = pd.read_csv('Stormfront/stormfront.tsv', sep='\t')
stormfront_df_after.loc[:, 'is_hate'].value_counts()

0           9507
1           1196
relation     168
idk/skip      73
Name: is_hate, dtype: int64

In [51]:
# Keep only the rows with a binary label; any other annotation value (the
# counts above show 'relation' and 'idk/skip') is dropped.
# The column is compared as strings so the filter also works if pandas parsed
# it as integers -- isin(['0', '1']) on an integer column would match nothing.
has_binary_label = stormfront_df_after['is_hate'].astype(str).isin(['0', '1'])
# .copy() makes the result an independent frame, so later column assignments
# on it do not trigger chained-assignment warnings.
stormfront_df_final = stormfront_df_after[has_binary_label].copy()
stormfront_df_final.head()

Unnamed: 0,doc_id,text,is_hate
0,0,"As of March 13th , 2014 , the booklet had been...",0
1,1,In order to help increase the booklets downloa...,0
2,2,( Simply copy and paste the following text int...,0
3,3,Click below for a FREE download of a colorfull...,1
4,4,Click on the `` DOWNLOAD ( 7.42 MB ) '' green ...,0


In [53]:
# Split the binary-labelled Stormfront rows 80/10/10 into train / validation /
# test and export each split as a TSV with columns (text, is_hate).
RANDOM_SEED = 42

# Hold out 20% of the rows, then halve the hold-out into validation and test.
df_train, df_holdout = train_test_split(
    stormfront_df_final, test_size=0.2, random_state=RANDOM_SEED
)
df_val, df_test = train_test_split(
    df_holdout, test_size=0.5, random_state=RANDOM_SEED
)

# Same export settings for every split, written in train / val / test order.
stormfront_split_files = {
    'real_dataset/stormfront_train.tsv': df_train,
    'real_dataset/stormfront_val.tsv': df_val,
    'real_dataset/stormfront_test.tsv': df_test,
}
for out_path, split_df in stormfront_split_files.items():
    split_df.to_csv(out_path, sep='\t', columns=['text', 'is_hate'], header=['text', 'is_hate'])