# Create MetaHateES

In [None]:
import pandas as pd

# (index, display name) of every source corpus that is merged into MetaHateES.
my_datasets = [
    (1, 'DETESTS 2024'),
    (2, 'EXIST 2021 & 2023'),  # was 'EXIST 2O21' (letter O instead of the digit 0)
    (3, 'HaSCoSVa'),
    (4, 'HaterNet'),
    (5, 'NewsCom-TOX'),
    (6, 'OffendES'),
    (7, 'SemEval 2019 Hateval'),
    (8, 'Spanish MisoCorpus 2020'),
    (9, 'HateFootball'),
    (10, 'MeTwo'),
]

In [None]:
# 1. DETESTS
# Two text genres: comments on news articles (detests) and posts on Twitter reacting to hoaxes (stereohoax) about the integration of immigrants
# Uses part of the data from NewsCom-TOX
# Output format: text, label (0-1), source (twitter or news), dataset (detests2024), content_type (stereotype), variation (europe)
df = pd.read_csv('data/detests2024.csv')
# 'implicit' is not part of the output; it is selected here so that dropna() also
# discards rows in which it is missing.
df = df[['source', 'text', 'stereotype', 'implicit']]
df = df.dropna()
# drop_duplicates() returns a new frame; without the assignment the call has no effect.
df = df.drop_duplicates(subset='text', keep='first')
df = df.rename(columns={'stereotype': 'label'})
# Assign the mapped column back instead of calling replace(inplace=True) on a column
# selection, which is chained in-place mutation and does not update `df` under Copy-on-Write.
df['source'] = df['source'].replace({'stereohoax': 'twitter', 'detests': 'news'})
df['dataset'] = 'detests2024'
df['content_type'] = 'stereotype'  # was misspelled 'sterotype'
df['variation'] = 'europe'
df = df[['label', 'text', 'source', 'variation', 'dataset', 'content_type']]

In [34]:
# 2. EXIST 2023
# Output format: text, label (0-1), source (twitter), dataset (exist2023), content_type (sexism), variation (europe)
# This step loads the train and dev splits, keeps the Spanish rows together with the
# two annotation columns the label is derived from, and de-duplicates by text.
df_train = pd.read_csv('data/exist2023.tsv', sep='\t')
df_dev = pd.read_csv('data/exist2023_dev.tsv', sep='\t')
df = pd.concat([df_train, df_dev])
df = df[df['lang'] == 'es']
df = df[['tweet', 'labels_task1', 'labels_task2']]
df = df.dropna()
df = df.rename(columns={'tweet': 'text'})
# drop_duplicates() returns a new frame; without the assignment the call has no effect.
df = df.drop_duplicates(subset='text', keep='first')

def calculate_label(row):
    """Return 1 when the annotators' majority marks the row as sexist, else 0.

    A row is labelled 1 only if both conditions hold:
      * at least half of the ``labels_task1`` annotations are ``"YES"``;
      * at least half of the ``labels_task2`` annotations are ``"JUDGEMENTAL"``
        or ``"DIRECT"``.
    Ties (exactly half) count as a majority.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide ``labels_task1`` and ``labels_task2``. Each may be a real
        list of annotations or the text form of one (e.g. ``"['YES', 'NO']"``),
        which is what ``read_csv`` produces for such a column.
        NOTE(review): assumes the text form is a Python/JSON list literal;
        confirm against the TSV files.

    Returns
    -------
    int
        1 or 0.
    """
    import ast  # local import: only needed to decode stringified annotation lists

    def _annotations(value):
        # Without decoding, .count()/len() would operate on the characters of the
        # string, so the majority test would practically never succeed.
        if isinstance(value, str):
            try:
                parsed = ast.literal_eval(value)
            except (ValueError, SyntaxError):
                return value  # not a literal: keep the raw value, as before
            if isinstance(parsed, (list, tuple)):
                return list(parsed)
        return value

    task1 = _annotations(row["labels_task1"])
    task2 = _annotations(row["labels_task2"])
    majority_task1_yes = task1.count("YES") >= len(task1) / 2
    majority_task2 = sum(label in ["JUDGEMENTAL", "DIRECT"] for label in task2) >= len(task2) / 2
    return 1 if majority_task1_yes and majority_task2 else 0
# Derive the binary label row by row, attach the constant metadata columns and
# keep only the unified output columns, in their fixed order.
df = df.assign(
    label=df.apply(calculate_label, axis=1),
    source='twitter',
    dataset='exist2023',
    content_type='sexism',
    variation='europe',
)[['label', 'text', 'source', 'variation', 'dataset', 'content_type']]

In [41]:
# 2. EXIST 2021
# Output format: text, label, source (taken from the file), dataset (exist2021), content_type (sexism), variation (europe)
# The label is the raw 'task1' value; it is not converted to 0/1 in this cell.
df_train = pd.read_csv('data/exist2021_train.tsv', sep='\t')
df_test = pd.read_csv('data/exist2021_test.tsv', sep='\t')
df = pd.concat([df_train, df_test])
df = df[df['language'] == 'es']
df = df[['text', 'source', 'task1']]
df = df.dropna()
df = df.rename(columns={'task1': 'label'})
# drop_duplicates() returns a new frame; without the assignment the call has no effect.
df = df.drop_duplicates(subset='text', keep='first')
df['dataset'] = 'exist2021'
df['content_type'] = 'sexism'
df['variation'] = 'europe'
df = df[['label', 'text', 'source', 'variation', 'dataset', 'content_type']]

In [3]:
# 3. HaSCoSVa
# Output format: text, label, source (twitter), dataset (hascosva), content_type (hate_speech)
# 'label', 'text' and 'variation' are used exactly as they come from the input file.
df = pd.read_csv('data/hascosva.tsv', sep='\t')
df = df.dropna()
# drop_duplicates() returns a new frame; without the assignment the call has no effect.
df = df.drop_duplicates(subset='text', keep='first')
df['dataset'] = 'hascosva'
df['source'] = 'twitter'
df['content_type'] = 'hate_speech'
df = df[['label', 'text', 'source', 'variation', 'dataset', 'content_type']]

In [22]:
# 4. HaterNet
# Output format: text, label, source (twitter), dataset (haternet), content_type (hate_speech), variation (europe)
# 'label' and 'text' are used exactly as they come from the input file.
df = pd.read_csv('data/haternet.tsv', sep='\t')
df = df.dropna()
# drop_duplicates() returns a new frame; without the assignment the call has no effect.
df = df.drop_duplicates(subset='text', keep='first')
df['dataset'] = 'haternet'
df['source'] = 'twitter'
df['content_type'] = 'hate_speech'
df['variation'] = 'europe'
df = df[['label', 'text', 'source', 'variation', 'dataset', 'content_type']]

In [5]:
# 5. NewsCom-TOX
# Output format: text, label (raw 'toxicity' value), source (news), dataset (newscom-tox), content_type (toxicity), variation (europe)
df_train = pd.read_csv('data/newscom-tox_train.csv', sep=',')
df_test = pd.read_csv('data/newscom-tox_test.csv', sep=',')
df = pd.concat([df_train, df_test])
df = df[['comment', 'toxicity']]
df = df.dropna()
df = df.rename(columns={'comment': 'text', 'toxicity': 'label'})
# drop_duplicates() returns a new frame; without the assignment the call has no effect.
df = df.drop_duplicates(subset='text', keep='first')
df['dataset'] = 'newscom-tox'
df['source'] = 'news'
df['content_type'] = 'toxicity'
df['variation'] = 'europe'
df = df[['label', 'text', 'source', 'variation', 'dataset', 'content_type']]

In [None]:
# 6. OffendES
# Output format: text, label (0-1), source (copied from the 'media' column), dataset (offendes), content_type (offensive), variation (europe)
df_dev = pd.read_csv('data/offendes_dev.tsv', sep='\t')
df_test = pd.read_csv('data/offendes_test.tsv', sep='\t')
df_train = pd.read_csv('data/offendes_train.tsv', sep='\t')
df = pd.concat([df_dev, df_test, df_train])
df = df.dropna()
# drop_duplicates() returns a new frame; without the assignment the call has no effect.
df = df.drop_duplicates(subset='comment', keep='first')
# Collapse the four annotation codes into a binary label (NO/NOE -> 0, OFG/OFP -> 1).
# The result is assigned back instead of using replace(inplace=True) on a column
# selection, which is chained in-place mutation and does not update `df` under Copy-on-Write.
df['label'] = df['label'].replace({'NO': 0, 'NOE': 0, 'OFG': 1, 'OFP': 1})
df = df.rename(columns={'comment': 'text'})
df['dataset'] = 'offendes'
df['source'] = df['media']
df['content_type'] = 'offensive'
df['variation'] = 'europe'
df = df[['label', 'text', 'source', 'variation', 'dataset', 'content_type']]

In [15]:
# 7. SemEval 2019 HatEval
# Output format: text, label (raw 'HS' value), source (twitter), dataset (hateval), content_type (hate_speech), variation (europe)
df_dev = pd.read_csv('data/hateval_dev.csv', sep=',')
df_test = pd.read_csv('data/hateval_test.csv', sep=',')
df_train = pd.read_csv('data/hateval_train.csv', sep=',')
df = pd.concat([df_dev, df_test, df_train])
df = df.dropna()
# drop_duplicates() returns a new frame; without the assignment the call has no effect.
df = df.drop_duplicates(subset='text', keep='first')
df = df.rename(columns={'HS': 'label'})
df['dataset'] = 'hateval'
df['source'] = 'twitter'
df['content_type'] = 'hate_speech'
df['variation'] = 'europe'  # they use castilian spanish speaking annotators - the data is castilian spanish
df = df[['label', 'text', 'source', 'variation', 'dataset', 'content_type']]

In [None]:
# 8. MisoCorpus
# Output format: text, label (raw 'labels' value), source (twitter), dataset (misocorpus), content_type (misogyny),
# variation (europe when the author's country is Spain, latam otherwise)
df = pd.read_csv('data/misocorpus.csv', sep=',')
df = df.dropna()
# drop_duplicates() returns a new frame; without the assignment the call has no effect.
df = df.drop_duplicates(subset='text', keep='first')
# The 'dataset' column was never set in this cell although it is selected below.
df['dataset'] = 'misocorpus'
df['source'] = 'twitter'
df['content_type'] = 'misogyny'
df["variation"] = df["post_author_country_location"].apply(lambda x: "europe" if x == "Spain" else "latam")
df = df.rename(columns={'labels': 'label'})
df = df[['label', 'text', 'source', 'variation', 'dataset', 'content_type']]

In [1]:
# 9. HateFootball
# Output format: text, label (0-1), source (twitter), dataset (hate-football), content_type (aggressive), variation (europe)
df = pd.read_csv('data/hate-football.csv', sep=',')
df = df.rename(columns={'tweet': 'text'})
# A missing label is read as a float NaN, on which the substring test below would
# raise TypeError; drop rows without text or label first.
df = df.dropna(subset=['text', 'label'])
# A row is positive when its raw label mentions "racist" or "misogyny".
df["label"] = df["label"].apply(lambda x: 1 if "racist" in x or "misogyny" in x else 0)
df['source'] = 'twitter'
df['content_type'] = 'aggressive'  # was misspelled 'aggresive'
df['variation'] = 'europe'
df['dataset'] = 'hate-football'
df = df[['label', 'text', 'source', 'variation', 'dataset', 'content_type']]

In [2]:
# 10. MeTwo
# Output format: text, label (1 when 'categoria' is exactly "SEXIST", else 0), source (twitter),
# dataset (metwo), content_type (sexism), variation (europe)
df = pd.read_csv('data/MeTwo.csv', sep=',')
df = df.assign(
    label=df["categoria"].apply(lambda category: 1 if category == "SEXIST" else 0),
    source='twitter',
    content_type='sexism',
    variation='europe',  # expressions used to create the data are in castilian spanish
    dataset='metwo',
)[['label', 'text', 'source', 'variation', 'dataset', 'content_type']]

In [None]:
# Merge the per-dataset TSV files into the final MetaHateES corpus.
# NOTE(review): the per-dataset cells visible in this notebook do not write these
# files; presumably they are exported in a separate step - confirm.
dataset_names = [
    'detests2024.tsv',
    'exist2021.tsv',
    'exist2023.tsv',
    'hascosva.tsv',
    'haternet.tsv',
    'hateval.tsv',
    'misocorpus.tsv',
    'newscom-tox.tsv',
    'offendes.tsv',
    'hate-football.tsv',
    'metwo.tsv',
]

# Read everything first and concatenate once; growing a frame with concat inside
# the loop copies the accumulated data on every iteration.
frames = [pd.read_csv(f'data/{name}', sep='\t') for name in dataset_names]
full_df = pd.concat(frames)
full_df = full_df.dropna()
# drop_duplicates() returns a new frame; without the assignment the call has no effect.
full_df = full_df.drop_duplicates(subset='text', keep='first')
full_df = full_df[full_df['variation'] == 'europe']

# Labels arrive in several spellings; map them all to 0/1 before the integer cast.
label_spellings = {
    'non-sexist': 0, 'sexist': 1,
    '0': 0, '1': 1,
    '0.0': 0, '1.0': 1,
}
full_df = full_df.assign(label=full_df['label'].replace(label_spellings).astype(int))
full_df.to_csv('data/MetaHateES.tsv', sep='\t', index=False)

In [None]:
import pandas as pd

# Descriptive statistics of the merged corpus, collected into a one-column table.
df = pd.read_csv('data/MetaHateES.tsv', sep='\t')


def _count_and_share(mask):
    """Format a boolean Series as '<number of True> (<share of True as a percentage>)'."""
    return f"{mask.sum()} ({mask.mean():.2%})"


def _as_percent(share):
    """Format a fraction as a percentage with two decimals."""
    return f"{share:.2%}"


summary = {}

summary['Total samples'] = len(df)
summary['Hate speech samples'] = _count_and_share(df['label'] == 1)
summary['Non-hate samples'] = _count_and_share(df['label'] == 0)

summary['Unique sources'] = df['source'].unique().tolist()
summary['Unique datasets'] = df['dataset'].unique().tolist()
dataset_count = df['dataset'].value_counts().head(15)
summary['Count per dataset'] = dataset_count.to_dict()

# The share previously compared against 'only europe', a value that is never
# assigned to 'variation', so it always showed 0.00%; count and share now use the same mask.
summary['Samples from Europe only'] = _count_and_share(df['variation'] == 'europe')

top_content = df['content_type'].value_counts().head(10)
summary['Top 10 content types'] = top_content.to_dict()

hate_by_content = df[df['content_type'].isin(top_content.index)]
hate_pct_by_content = (
    hate_by_content.groupby('content_type')['label']
    .mean()
    .sort_values(ascending=False)
    .apply(_as_percent)
)
summary['Hate by content type (top 10)'] = hate_pct_by_content.to_dict()

hate_by_source = df.groupby('source')['label'].mean().sort_values(ascending=False).apply(_as_percent)
summary['Hate by source'] = hate_by_source.to_dict()

hate_by_source_count = df[df['label'] == 1].groupby('source').size().sort_values(ascending=False)
summary['Hate count by source'] = hate_by_source_count.to_dict()

count_by_source = df.groupby('source').size().sort_values(ascending=False)
summary['Count by source'] = count_by_source.to_dict()

summary_df = pd.DataFrame.from_dict(summary, orient='index', columns=['Value'])

summary_df

In [None]:
# Number of samples for every (content_type, source) combination.
df.value_counts(subset=['content_type', 'source'])