# Load processed datasets
Loads the processed datasets that were produced by `datasets.ipynb`.

In [39]:
import pickle

# Hard-coded absolute path to the pickle holding the processed datasets.
path = '/storage2/mamille3/hegemonic_hate/tmp/processed_datasets.pkl'
# NOTE(review): pickle.load can execute arbitrary code while deserializing --
# only point this at a file from a trusted source.
with open(path, 'rb') as f:
    hate_datasets = pickle.load(f)
# Show the dataset names. Later code indexes hate_datasets by these keys and uses
# each value like a DataFrame with `hate` and `group_label` columns.
hate_datasets.keys()

dict_keys(['cad', 'elsherief2021', 'sbic', 'kennedy2020', 'salminen2018'])

# Split datasets
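Output: train, dev and test folds (60/10/30) for the with-hegemonic and no-hegemonic versions of each dataset. Each resampled dataset has a 30/70 hate/no-hate ratio overall; the individual folds are only approximately 30/70 because the train/dev/test split is not stratified.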

In [40]:
import pdb
from collections import Counter

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

# Target share of hate rows in each resampled dataset; the same for every
# dataset, so it is set once before the loop.
hate_ratio = 0.30

for dataset in tqdm(sorted(hate_datasets)):
    print(dataset)

    # Remove instances with hegemonic labels
    no_hegemonic = hate_datasets[dataset].query('group_label != "hegemonic"')

    # Target row count per class, keyed by the value of `hate`.
    # Keep all hate rows (for no_hegemonic since that's the smallest set) and size
    # the non-hate class so that hate rows make up `hate_ratio` of the total.
    n_hate = no_hegemonic.hate.sum()
    n_nonhate_target = int((n_hate*(1-hate_ratio))/hate_ratio)  # int() truncates any fraction
    n_samples = {True: n_hate, False: n_nonhate_target}
    
    def get_n_samples(x, label=None):
        """ Draw the target number of rows for one hate/no-hate class.

        Args:
            x: DataFrame holding the rows of a single `hate` class.
            label: the class value (True/False) used to look up the target size
                in `n_samples`. Defaults to `x.name`, which is what
                `groupby(...).apply` sets, so one-argument calls keep working.

        Returns:
            `n_samples[label]` rows of `x`, drawn with a fixed seed. Rows are
            sampled with replacement only when more rows are requested than
            `x` contains (i.e. the class has to be upsampled).
        """
        if label is None:
            label = x.name
        desired_n = n_samples[label]
        upsample = bool(desired_n > len(x))  # more rows needed than are present
        return x.sample(desired_n, random_state=9, replace=upsample)

    # Select each class with a boolean mask rather than groupby('hate').apply(...):
    # newer pandas versions stop passing the grouping column into the applied
    # function, which the later `.query('hate')` calls rely on being present.
    # Each subset has the same rows in the same order as the groupby group, so the
    # seeded draws are unchanged, and the original index is kept as-is.
    class_samples = []
    for label in sorted(n_samples):  # False, then True -- the order groupby would use
        class_rows = no_hegemonic[no_hegemonic.hate == label]
        if len(class_rows):  # groupby would skip a class with no rows
            class_samples.append(get_n_samples(class_rows, label=label))
    resampled_no_heg = pd.concat(class_samples, axis=0)

    # Shuffle so the two classes are interleaved before the positional fold split
    resampled_no_heg = resampled_no_heg.sample(frac=1, random_state=9)

    # Sample with_hegemonic dataset
    # Want to preserve all the hegemonic instances (hate or non-hate) for maximum differences between datasets.
    # Want to make this exactly the same as no_hegemonic, but with hegemonic instances replacing others
    hegemonic_hate = hate_datasets[dataset].query('hate and group_label=="hegemonic"')
    hegemonic_nonhate = hate_datasets[dataset].query('(not hate) and (group_label=="hegemonic")')

    # Number of hegemonic non-hate rows to include: match their share of all
    # non-hate rows in the full dataset, scaled to the non-hate target size.
    desired_n_hegemonic_nonhate = int(len(hegemonic_nonhate)/len(hate_datasets[dataset].query('not hate')) * n_samples[False])
    # Sample hegemonic non-hate with replacement only when more rows are wanted
    # than exist (happens when non-hate had to be oversampled).
    replacement = desired_n_hegemonic_nonhate > len(hegemonic_nonhate)

    # Every draw is seeded so the with_heg split is reproducible across runs,
    # like the no_heg split it is derived from.
    resampled_with_heg = pd.concat([
        # non-hegemonic hate rows, leaving room for all hegemonic hate rows
        resampled_no_heg.query('hate').sample(n_hate-len(hegemonic_hate), random_state=9),
        hegemonic_hate,
        # non-hegemonic non-hate rows, leaving room for the hegemonic non-hate sample
        resampled_no_heg.query('not hate').sample(n_samples[False] - desired_n_hegemonic_nonhate, random_state=9),
        hegemonic_nonhate.sample(desired_n_hegemonic_nonhate, replace=replacement, random_state=9),
    ], axis=0)

    # Shuffle so the concatenated pieces are interleaved before the positional fold split
    resampled_with_heg = resampled_with_heg.sample(frac=1, random_state=9)

    # Test overlap between with_heg and no_heg to see if it's maximum that it can be.
    # Compare the index labels as multisets (rows can repeat after upsampling):
    # Counter subtraction keeps only labels that occur more often in with_heg than
    # in no_heg, and those should be exactly the hegemonic rows that were added.
    with_heg_index_counts = Counter(resampled_with_heg.index)
    no_heg_index_counts = Counter(resampled_no_heg.index)
    unique_heg = with_heg_index_counts - no_heg_index_counts
    n_expected_hegemonic = len(hegemonic_hate) + desired_n_hegemonic_nonhate
    assert sum(unique_heg.values()) == n_expected_hegemonic

    # Split into train/dev/test 60/10/30
    # The frames were shuffled above, so contiguous positional slices act as a
    # random split. Plain .iloc slicing yields the same three pieces as
    # np.split(frame, [60%, 70%]) without routing a DataFrame through numpy.
    # NOTE(review): the split is not stratified, so each fold's hate ratio is only
    # approximately 30%. Rows that were upsampled with replacement can also appear
    # in more than one fold.
    with_heg, no_heg = {}, {}
    for folds, resampled in ((with_heg, resampled_with_heg), (no_heg, resampled_no_heg)):
        train_end = int(0.6*len(resampled))
        dev_end = int(0.7*len(resampled))
        folds['train'] = resampled.iloc[:train_end]
        folds['dev'] = resampled.iloc[train_end:dev_end]
        folds['test'] = resampled.iloc[dev_end:]

    # Report size, group-label counts and hate/no-hate proportions for each
    # with_heg fold, followed by a blank line per fold.
    print('with_heg')
    for name in with_heg:
        fold = with_heg[name]
        fold_report = (
            f'{name}: {len(fold)} instances',
            fold.group_label.value_counts(),
            fold.hate.value_counts(normalize=True),
            '',  # blank separator line between folds
        )
        print(*fold_report, sep='\n')

    # Visual divider between datasets
    print('********************************************')

    # Save out
    # Just do /tmp for now, but when I settle on which splits are important, then save out to csvs in hate_speech/<dataset>
    # One pickle per dataset holding both split variants, each a dict of train/dev/test folds.
    outpath = f'/storage2/mamille3/hegemonic_hate/tmp/{dataset}_hegsplits_{hate_ratio}hate.pkl'
    splits_by_variant = {'with_heg': with_heg, 'no_heg': no_heg}
    with open(outpath, mode='wb') as f:
        pickle.dump(splits_by_variant, f)

  0%|          | 0/5 [00:00<?, ?it/s]

cad
with_heg
train: 7587 instances
marginalized    1244
other            838
hegemonic        182
Name: group_label, dtype: int64
False    0.701595
True     0.298405
Name: hate, dtype: float64

dev: 1265 instances
marginalized    230
other           128
hegemonic        38
Name: group_label, dtype: int64
False    0.686957
True     0.313043
Name: hate, dtype: float64

test: 3794 instances
marginalized    628
other           414
hegemonic        92
Name: group_label, dtype: int64
False    0.701107
True     0.298893
Name: hate, dtype: float64

********************************************
elsherief2021
with_heg
train: 10393 instances
marginalized    1904
hegemonic        694
other            517
Name: group_label, dtype: int64
False    0.700279
True     0.299721
Name: hate, dtype: float64

dev: 1733 instances
marginalized    321
hegemonic       106
other            94
Name: group_label, dtype: int64
False    0.699365
True     0.300635
Name: hate, dtype: float64

test: 5197 instances
margin