# Configurations

In [None]:
# Seed used for the RNG setup and for every train_test_split call below.
SEED = 42

# Candidate party counts: 12, 16, ..., 56 (step of 4).
PSET = list(range(12, 57, 4))
NUM_PARTIES = PSET[11]    # index 0 -> 12 parties ... index 11 -> 56 parties

# Datasets to process ('permuted_mnist' is currently left out of the list).
DATASETS = ['cifar10', 'mnist', 'fmnist']

# Split mode flags. IFD selects the "identical forgettable distribution"
# branch, STRATIFY selects label-stratified sampling. The other supported
# combinations are (True, False), (False, False) and (False, True).
IFD = True
STRATIFY = True

# Data Split

In [None]:
import os
import random
import numpy as np

In [None]:
# =========== #
# Reads: SEED #
# =========== #

# NOTE: PYTHONHASHSEED is read when an interpreter starts, so this assignment
# only affects processes launched afterwards, not the hashing of this kernel.
os.environ['PYTHONHASHSEED'] = str(SEED)

# Call the seeding functions. Assigning to them (`random.seed = SEED`) would
# replace each function with an int and leave both generators unseeded.
random.seed(SEED)
np.random.seed(SEED)

In [None]:
# Project root is the parent of the current working directory; the datasets
# and every generated file live in its 'data' sub-directory.
PATH_ROOT = os.path.split(os.getcwd())[0]
PATH_DATA = os.path.join(PATH_ROOT, 'data')

for label, path in (('Root directory: ', PATH_ROOT),
                    ('Data directory: ', PATH_DATA)):
    print(label, path)

In [None]:
import pickle
import pandas as pd

from tqdm import tqdm
from torchvision import datasets

In [None]:
# ================================================================== #
# Reads: PATH_DATA and DATASETS                                      #
# (known names: 'cifar10', 'mnist', 'permuted_mnist', 'fmnist')      #
# ================================================================== #

# Maps dataset name -> DataFrame with columns
# 'indices', 'forgetting counts', 'forgettable' and 'targets'.
dfdict = {}

for ds in tqdm(DATASETS):
    print('| {} |'.format(ds.upper()))
    if ds == 'cifar10':
        train_dataset = datasets.CIFAR10(PATH_DATA, train=True, download=True)
    elif ds == 'mnist':
        train_dataset = datasets.MNIST(PATH_DATA, train=True, download=True)
    elif ds == 'permuted_mnist':
        # Not handled here: skipped, so it gets no entry in dfdict.
        continue
    elif ds == 'fmnist':
        train_dataset = datasets.FashionMNIST(PATH_DATA, train=True, download=True)
    else:
        # Fail fast instead of silently reusing the previous iteration's
        # train_dataset (and therefore its labels) for an unrecognised name.
        raise ValueError('Unknown dataset: {}'.format(ds))

    # NOTE(review): pickle.load can execute arbitrary code from the file; this
    # assumes '<ds>_sorted.pkl' was written by a trusted step of this project.
    with open(os.path.join(PATH_DATA, '{}_sorted.pkl'.format(ds)), 'rb') as f:
        data = pickle.load(f)

    print('|-- Keys: ', data.keys())
    print('|-- Size: ', len(data['indices']))

    df = pd.DataFrame.from_dict(data)
    # assumes the pickle holds exactly two equal-length entries, in the order
    # (sample indices, forgetting counts) -- TODO confirm against the producer
    df.columns = ['indices', 'forgetting counts']
    # Order rows by sample index and renumber the row labels from 0.
    df = df.sort_values('indices').reset_index(drop=True)
    # 1 when the sample was forgotten at least once, else 0.
    # Builtin int replaces the np.int alias, which NumPy 1.24 removed.
    df['forgettable'] = (df['forgetting counts'] > 0).astype(int)
    # Labels in the dataset's own order; presumably row i is sample i after the
    # sort above -- verify that 'indices' covers 0..N-1.
    df['targets'] = train_dataset.targets

    print('|-- Forgettables: ')
    print(df['forgettable'].value_counts())
    display(df.head())

    df.to_csv(os.path.join(PATH_DATA, '{}_flagged.csv'.format(ds)), index=False, encoding='utf-8')

    dfdict[ds] = df

In [None]:
# Report, per dataset, how many samples are flagged forgettable and how many
# are not.
for ds, df in tqdm(dfdict.items()):
    flagged = df['forgettable']
    forget_df = df[flagged == 1]
    unforget_df = df[flagged == 0]

    print('| {} |'.format(ds.upper()))
    print('|-- Forgettable samples: ', len(forget_df))
    print('|-- Unforgettable samples: ', len(unforget_df))

In [None]:
# ================================================== #
# Reads: PATH_DATA, NUM_PARTIES, IFD, STRATIFY       #
# ================================================== #

# Maps dataset name -> output directory
#   <PATH_DATA>/<ds>/<NUM_PARTIES>parties/<ifd|non_ifd>/<stratified|random>
dirdict = {}

for ds in tqdm(dfdict):
    print('| {} |'.format(ds.upper()))
    PATH_DSET = os.path.join(PATH_DATA, ds)
    print('|-- Directory: ', PATH_DSET)

    PATH_PART = os.path.join(PATH_DSET, '{}parties'.format(NUM_PARTIES))
    print('|-- {}-party directory: '.format(NUM_PARTIES), PATH_PART)

    if IFD:
        PATH_FGD = os.path.join(PATH_PART, 'ifd')
        print('|-- Identical forgettable distribution directory: ', PATH_FGD)
    else:
        PATH_FGD = os.path.join(PATH_PART, 'non_ifd')
        print('|-- Non-identical forgettable distribution directory: ', PATH_FGD)

    if STRATIFY:
        PATH_TGD = os.path.join(PATH_FGD, 'stratified')
        print('|-- Stratified label distribution directory: ', PATH_TGD)
    else:
        PATH_TGD = os.path.join(PATH_FGD, 'random')
        print('|-- Random label distribution directory: ', PATH_TGD)

    # One call creates the leaf and every missing parent, and is a no-op when
    # the tree already exists (no exists()/mkdir() race).
    os.makedirs(PATH_TGD, exist_ok=True)

    dirdict[ds] = PATH_TGD

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# ======================================================== #
# Reads: SEED, NUM_PARTIES, IFD, STRATIFY, dfdict, dirdict #
# ======================================================== #

# Partition each dataset's sample indices into NUM_PARTIES parties and write
# one CSV per party ('<ds>_p<k>.csv', k starting at 1) into dirdict[ds].
#
# NOTE(review): the code below uses values of the 'indices' column as row
# labels (DataFrame.drop) and as row positions (DataFrame.iloc). That is only
# correct if 'indices' equals the 0..N-1 row numbering of each df -- confirm.
for ds, df in tqdm(dfdict.items()):
    savedir = dirdict[ds]
    print('| {} |'.format(savedir.upper()))
    
    # One dict per party: {'all': ..., 'forget': ..., 'unforget': ...},
    # each value a numpy array of sample indices.
    parties = []

    if IFD:
        # Forgettable and unforgettable samples are split separately so that
        # every party receives the same number of each.
        forget_df = df.loc[df['forgettable'] == 1]
        unforget_df = df.loc[df['forgettable'] == 0]

        # Per-party quota of each kind, rounded down.
        forget_items = int(len(forget_df.index) / NUM_PARTIES)
        unforget_items = int(len(unforget_df.index) / NUM_PARTIES)

        # Shrinking pools of samples not yet assigned to a party.
        tmp_df1 = forget_df.copy()
        tmp_df2 = unforget_df.copy()

        # Draw the first NUM_PARTIES - 1 parties; the last one is built from
        # whatever remains after the loop.
        for i in range(NUM_PARTIES - 1):
            if STRATIFY:
                # Stratify on the class label within each pool.
                f, _ = train_test_split(tmp_df1['indices'], train_size=forget_items, random_state=SEED, shuffle=True, stratify=tmp_df1['targets'])
                u, _ = train_test_split(tmp_df2['indices'], train_size=unforget_items, random_state=SEED, shuffle=True, stratify=tmp_df2['targets'])
            else:
                f, _ = train_test_split(tmp_df1['indices'], train_size=forget_items, random_state=SEED, shuffle=True)
                u, _ = train_test_split(tmp_df2['indices'], train_size=unforget_items, random_state=SEED, shuffle=True)
            # Remove the drawn samples from the pools. drop() interprets the
            # values of f / u as row labels (see the NOTE at the top).
            tmp_df1 = tmp_df1.drop(f)
            tmp_df2 = tmp_df2.drop(u)
            p = pd.concat([f, u])
            p = p.to_numpy()
            f = f.to_numpy()
            u = u.to_numpy()
            d = {'all': p, 'forget': f, 'unforget': u}
            parties.append(d)

        # Last party: everything left in both pools, so it also absorbs the
        # remainder when the counts are not divisible by NUM_PARTIES.
        f = tmp_df1['indices']
        u = tmp_df2['indices']
        p = pd.concat([f, u])
        p = p.to_numpy()
        f = f.to_numpy()
        u = u.to_numpy()
        d = {'all': p, 'forget': f, 'unforget': u}
        parties.append(d)

        # In case not equally divided
        # parties[-1]['forget'] = np.append(parties[-1]['forget'], parties[-2]['forget'][-1])
        # parties[-2]['forget'] = parties[-2]['forget'][:-1]
        # parties[-2]['unforget'] = np.append(parties[-2]['unforget'], parties[-1]['unforget'][-1])
        # parties[-1]['unforget'] = parties[-1]['unforget'][:-1]
        # parties[-1]['all'] = np.concatenate([parties[-1]['forget'], parties[-1]['unforget']])
        # parties[-2]['all'] = np.concatenate([parties[-2]['forget'], parties[-2]['unforget']])
    else:
        # All samples are split together; the forgettable / unforgettable
        # membership of each party is looked up afterwards.
        tmp_df = df.copy()
        # Per-party quota, rounded down.
        num_items = int(len(tmp_df.index) / NUM_PARTIES)
        
        for i in range(NUM_PARTIES - 1):
            if STRATIFY:
                p, _ = train_test_split(tmp_df['indices'], train_size=num_items, random_state=SEED, shuffle=True, stratify=tmp_df['targets'])
            else:
                p, _ = train_test_split(tmp_df['indices'], train_size=num_items, random_state=SEED, shuffle=True)
            # Values of p are used as row labels here (see the NOTE at the top).
            tmp_df = tmp_df.drop(p)
            p = p.to_numpy()
            # df.iloc[p] selects by position; the boolean mask is built on the
            # full df. NOTE(review): this relies on pandas aligning the mask to
            # the selected rows by label -- confirm on the pandas version in use.
            f = df.iloc[p].loc[df['forgettable'] == 1, 'indices'].to_numpy()
            u = df.iloc[p].loc[df['forgettable'] == 0, 'indices'].to_numpy()
            d = {'all': p, 'forget': f, 'unforget': u}
            parties.append(d)
            
        # Last party: every sample still left in the pool.
        p = tmp_df['indices']
        p = p.to_numpy()
        f = df.iloc[p].loc[df['forgettable'] == 1, 'indices'].to_numpy()
        u = df.iloc[p].loc[df['forgettable'] == 0, 'indices'].to_numpy()
        d = {'all': p, 'forget': f, 'unforget': u}
        parties.append(d)

    # Print the three array shapes of each party and save its rows of df.
    for i, p in enumerate(parties):
        print('|-- [{:>2}]'.format(i + 1), p['all'].shape, p['forget'].shape, p['unforget'].shape)
        # Positional row lookup by sample index (see the NOTE at the top).
        tmp_df = df.iloc[p['all']]
        tmp_df.to_csv(os.path.join(savedir, '{}_p{}.csv'.format(ds, i + 1)), index=False, encoding='utf-8')
        print('|---- Saved as: ', '{}_p{}.csv'.format(ds, i + 1))
    print()
    # NOTE(review): this break ends the loop after the first dataset in dfdict,
    # so the remaining datasets are never split -- confirm this is intentional.
    break