# Load processed datasets
From datasets.ipynb

In [30]:
import pickle

# Pickle holding the processed datasets (per the heading of this section, it comes from datasets.ipynb).
# NOTE: pickle.load can execute arbitrary code stored in the file -- only load pickles you created yourself.
path = '/storage2/mamille3/hegemonic_hate/tmp/processed_datasets.pkl'
with open(path, mode='rb') as pickle_file:
    hate_datasets = pickle.load(pickle_file)

# Show the names of the datasets that were loaded
hate_datasets.keys()

dict_keys(['cad', 'elsherief2021', 'sbic', 'kennedy2020', 'salminen2018'])

# Split datasets
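Output: train, dev, and test folds for the with-hegemonic and no-hegemonic splits. Each resampled split targets a 30/70 hate/no-hate ratio; because the folds are cut from a shuffled (not stratified) frame, the ratio within each individual fold is only approximately 30/70.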

In [39]:
# Proportion of hate (True) vs. non-hate (False) rows in the salminen2018 dataset as loaded
hate_datasets['salminen2018']['hate'].value_counts(normalize=True)

True     0.733706
False    0.266294
Name: hate, dtype: float64

In [44]:
import pandas as pd
import pdb

import numpy as np  # kept so `np` stays defined for any later cell; the folds below are cut with .iloc

# Seed shared by the row sampling and the shuffles so the folds are identical on every run.
# (The shuffles already used 9; the row sampling used to be unseeded.)
SPLIT_SEED = 9

# Target share of hate rows in each resampled dataset
hate_ratio = 0.30 # have to upsample nonhate from Kennedy+2020 if <0.3 and from Salminen+2018 if <0.7ish


def resample_by_hate(frame, n_per_class, random_state=SPLIT_SEED):
    """Draw a fixed number of rows from each `hate` class of `frame`.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data with a `hate` column to group on.
    n_per_class : dict
        Maps every `hate` value present in `frame` (True / False) to the
        number of rows to draw from that class.
    random_state : int
        Seed handed to `DataFrame.sample` so the draw is reproducible.

    Returns
    -------
    pandas.DataFrame
        The sampled rows with their original index labels, classes
        concatenated in group order. A class is sampled with replacement
        only when more rows are requested than it contains (upsampling).

    Raises
    ------
    KeyError
        If `frame` contains a `hate` value that is missing from `n_per_class`.
    """
    sampled = []
    for is_hate, rows in frame.groupby('hate'):
        desired_n = n_per_class[is_hate]
        upsample = bool(desired_n > len(rows))
        sampled.append(rows.sample(desired_n, replace=upsample, random_state=random_state))
    if not sampled:
        # No rows at all: return an empty frame with the same columns
        return frame.iloc[0:0]
    return pd.concat(sampled, axis=0)


def split_train_dev_test(frame, train_end=0.6, dev_end=0.7):
    """Cut an already-shuffled frame into contiguous train/dev/test folds.

    `train_end` and `dev_end` are cumulative fractions of the rows, so the
    defaults give a 60/10/30 split. Positional slicing is used instead of
    `np.split`, which relies on the deprecated `DataFrame.swapaxes`.

    Returns a dict with keys 'train', 'dev' and 'test' (in that order).
    """
    n_rows = len(frame)
    train_stop, dev_stop = int(train_end*n_rows), int(dev_end*n_rows)
    return {
        'train': frame.iloc[:train_stop],
        'dev': frame.iloc[train_stop:dev_stop],
        'test': frame.iloc[dev_stop:],
    }


# NOTE(review): every name assigned inside this loop is rebound on each iteration,
# so once the loop finishes they hold the frames of the last dataset only.
# NOTE(review): upsampling with replacement happens before the train/dev/test cut,
# so copies of the same row can end up in different folds -- confirm this is acceptable.
for dataset in sorted(hate_datasets):
    print(dataset)

    # Remove instances with hegemonic labels
    no_hegemonic = hate_datasets[dataset].query('group_label != "hegemonic"')

    # Desired sampling of non-hate. Keep all hate rows (for no_hegemonic)
    n_hate = no_hegemonic.hate.sum()
    n_samples = {
        True: n_hate,
        False: int((n_hate*(1-hate_ratio))/hate_ratio)
    }

    # no_heg dataset: resample each class, then shuffle
    resampled_no_heg = resample_by_hate(no_hegemonic, n_samples)
    resampled_no_heg = resampled_no_heg.sample(frac=1, random_state=SPLIT_SEED)

    # Sample with_hegemonic dataset
    # Want to preserve all the hegemonic hate instances so take them out first, then add them back in
    hegemonic_hate = hate_datasets[dataset].query('hate and group_label=="hegemonic"')
    # Draw fewer non-hegemonic hate rows so the total hate count stays at n_hate once
    # the hegemonic hate rows are added back
    n_samples[True] = n_hate-len(hegemonic_hate)
    no_hegemonic_hate = hate_datasets[dataset].query('not (hate and group_label=="hegemonic")')
    resampled_with_heg = resample_by_hate(no_hegemonic_hate, n_samples)
    resampled_with_heg = pd.concat([resampled_with_heg, hegemonic_hate], axis=0) # add hegemonic back in
    resampled_with_heg = resampled_with_heg.sample(frac=1, random_state=SPLIT_SEED)

    # Split into train/dev/test 60/10/30
    with_heg = split_train_dev_test(resampled_with_heg)
    no_heg = split_train_dev_test(resampled_no_heg)

    print('with_heg')
    for name, fold in with_heg.items():
        print(f'{name}: {len(fold)} instances')
        print(fold.group_label.value_counts())
        # Hate ratio of this fold; folds are not stratified, so it only approximates hate_ratio
        print(fold.hate.value_counts(normalize=True))
        print()

    print('********************************************')

cad
with_heg
train: 7783 instances
marginalized    1299
other            874
hegemonic        208
Name: group_label, dtype: int64
False    0.694077
True     0.305923
Name: hate, dtype: float64

dev: 1298 instances
marginalized    213
other           154
hegemonic        38
Name: group_label, dtype: int64
False    0.687982
True     0.312018
Name: hate, dtype: float64

test: 3892 instances
marginalized    633
other           384
hegemonic        89
Name: group_label, dtype: int64
False    0.715827
True     0.284173
Name: hate, dtype: float64

********************************************
elsherief2021
with_heg
train: 10395 instances
marginalized    1903
hegemonic        694
other            532
Name: group_label, dtype: int64
False    0.69899
True     0.30101
Name: hate, dtype: float64

dev: 1733 instances
marginalized    305
hegemonic       117
other            78
Name: group_label, dtype: int64
False    0.710906
True     0.289094
Name: hate, dtype: float64

test: 5198 instances
marginal

In [45]:
# Save out
# Just do /tmp for now, but when I settle on which splits are important, then save out to csvs in hate_speech/<dataset>
#
# resampled_with_heg / resampled_no_heg are rebound on every iteration of the split
# loop, so at this point they hold the frames of one dataset only: the one the loop
# processed last, whose name is still bound to `dataset`. Write the frames under that
# name alone. Dumping them once per key of hate_datasets would store the same frames
# in every file and mislabel all but one of them.
outpath = f'/storage2/mamille3/hegemonic_hate/tmp/{dataset}_hegsplits_{hate_ratio}hate.pkl'
with open(outpath, 'wb') as f:
    pickle.dump({'with_heg': resampled_with_heg, 'no_heg': resampled_no_heg}, f)