# Load processed datasets
Load the processed datasets that were produced by `datasets.ipynb`.

In [1]:
import pickle

# Read the pickled collection of processed datasets from disk.
# NOTE(review): pickle.load can execute arbitrary code stored in the file -- only load pickles you produced yourself.
path = '/storage2/mamille3/hegemonic_hate/tmp/processed_datasets.pkl'
with open(path, mode='rb') as pkl_file:
    hate_datasets = pickle.load(pkl_file)
# Display the names of the loaded datasets
hate_datasets.keys()

dict_keys(['cad', 'elsherief2021', 'sbic', 'kennedy2020', 'salminen2018'])

# Split datasets
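Output: train, dev and test folds for the with-hegemonic and no-hegemonic splits. Each split is resampled to a 30/70 hate/no-hate ratio; the folds are cut from a random shuffle (not stratified), so the per-fold ratios are only approximately 30/70.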

In [20]:
# TODO: loop over every dataset (for dataset in hate_datasets); only 'cad' is handled for now.

# Build the no-hegemonic variant: keep every row whose group_label is not "hegemonic".
# Rows with a missing (NaN) group_label compare as "not equal" and are therefore kept.
no_hegemonic = hate_datasets['cad'].loc[lambda df: df.group_label != "hegemonic"]

# Report size and label inventory before (full 'cad') and after (no_hegemonic) the filter.
for frame in (hate_datasets['cad'], no_hegemonic):
    print(len(frame))
    print(frame.group_label.unique())

27494
[nan 'other' 'marginalized' 'hegemonic']
27159
[nan 'other' 'marginalized']


In [21]:
# Sample to specific ratios of hate/nonhate
hate_ratio = 0.3

# Original ratios (though the data have already been lightly processed, which can affect this)
# print(hate_datasets['cad'].hate.value_counts(normalize=True))
# print(hate_datasets['cad'].hate.value_counts())

# Desired sampling of non-hate. Keep all hate rows (for no_hegemonic).
# The non-hate count is chosen so that hate rows make up `hate_ratio` of the result.
n_hate = int(no_hegemonic.hate.sum())
n_samples = {
    True: n_hate,
    False: int((n_hate*(1-hate_ratio))/hate_ratio)
}

# Draw n_samples[label] rows from each hate class. The draw is seeded so that re-running the
# notebook from a fresh kernel selects the same rows (previously only the shuffle below was seeded).
resampled_no_heg = no_hegemonic.groupby('hate').apply(
    lambda x: x.sample(n_samples[x.name], random_state=9)
)
# groupby/apply prepends the group key as an index level; drop it to get the original row index back
resampled_no_heg.index = resampled_no_heg.index.droplevel('hate')
# Shuffle so the two classes are interleaved before any positional train/dev/test cut
resampled_no_heg = resampled_no_heg.sample(frac=1, random_state=9)
print(resampled_no_heg.hate.value_counts())
print(resampled_no_heg.hate.value_counts(normalize=True))
print(resampled_no_heg.group_label.value_counts())
n_samples

False    9081
True     3892
Name: hate, dtype: int64
False    0.699992
True     0.300008
Name: hate, dtype: float64
marginalized    2356
other           1536
Name: group_label, dtype: int64


{True: 3892, False: 9081}

In [22]:
# Sample with_hegemonic dataset
# Reuses n_samples computed from the no-hegemonic data, so both splits end up with the same
# number of hate and non-hate rows. The per-class draw is seeded so that re-running the
# notebook from a fresh kernel selects the same rows.
resampled_with_heg = hate_datasets['cad'].groupby('hate').apply(
    lambda x: x.sample(n_samples[x.name], random_state=9)
)
# groupby/apply prepends the group key as an index level; drop it to get the original row index back
resampled_with_heg.index = resampled_with_heg.index.droplevel('hate')
# Shuffle so the two classes are interleaved before any positional train/dev/test cut
resampled_with_heg = resampled_with_heg.sample(frac=1, random_state=9)
print(resampled_with_heg.hate.value_counts())
print(resampled_with_heg.hate.value_counts(normalize=True))
print(resampled_with_heg.group_label.value_counts())

False    9081
True     3892
Name: hate, dtype: int64
False    0.699992
True     0.300008
Name: hate, dtype: float64
marginalized    2175
other           1409
hegemonic        308
Name: group_label, dtype: int64


In [26]:
# Split into train/dev/test 60/20/20
import numpy as np

def _split_folds(frame, bounds=(0.6, 0.8)):
    """Cut a frame into consecutive train/dev/test folds by row position.

    Uses positional ``.iloc`` slicing rather than ``np.split``: calling ``np.split`` on a
    DataFrame relies on ``DataFrame.swapaxes``, which pandas has deprecated.

    Args:
        frame: DataFrame to cut; rows are taken in their current order, so it should
            already be shuffled.
        bounds: Cumulative fractions of the rows at which the train and dev folds end.

    Returns:
        Dict with keys 'train', 'dev' and 'test' (in that order) mapping to the folds.
    """
    train_end, dev_end = (int(bound * len(frame)) for bound in bounds)
    return {
        'train': frame.iloc[:train_end],
        'dev': frame.iloc[train_end:dev_end],
        'test': frame.iloc[dev_end:],
    }


with_heg = _split_folds(resampled_with_heg)
no_heg = _split_folds(resampled_no_heg)

# Sanity check: the folds partition each resampled frame without losing rows
assert sum(len(fold) for fold in with_heg.values()) == len(resampled_with_heg)
assert sum(len(fold) for fold in no_heg.values()) == len(resampled_no_heg)

# Report fold sizes and the group_label distribution within each fold
print('with_heg')
for name, fold in with_heg.items():
    print(f'{name}: {len(fold)} instances')
    print(fold.group_label.value_counts())
    print()
print()
print('no_heg')
for name, fold in no_heg.items():
    print(f'{name}: {len(fold)} instances')
    print(fold.group_label.value_counts())

with_heg
train: 7783 instances
marginalized    1344
other            850
hegemonic        187
Name: group_label, dtype: int64

dev: 2595 instances
marginalized    438
other           274
hegemonic        66
Name: group_label, dtype: int64

test: 2595 instances
marginalized    393
other           285
hegemonic        55
Name: group_label, dtype: int64


no_heg
train: 7783 instances
marginalized    1459
other            922
Name: group_label, dtype: int64
dev: 2595 instances
marginalized    466
other           312
Name: group_label, dtype: int64
test: 2595 instances
marginalized    431
other           302
Name: group_label, dtype: int64


In [19]:
# Save out
# Only written to the project tmp directory for now; once the important splits are settled,
# save them out as csvs in hate_speech/<dataset> instead.
# NOTE(review): this stores the full resampled frames, not the train/dev/test folds held in
# with_heg / no_heg, and the cell's execution count precedes the cells it depends on --
# confirm the payload is what downstream code expects.
dataset = 'cad'
outpath = f'/storage2/mamille3/hegemonic_hate/tmp/{dataset}_hegsplits_{hate_ratio}hate.pkl'
heg_splits = {'with_heg': resampled_with_heg, 'no_heg': resampled_no_heg}
with open(outpath, mode='wb') as out_file:
    pickle.dump(heg_splits, out_file)