# DATA PARTITIONING

Given a raw data set, this notebook performs the following operations:
- randomly partitions a raw data set into training and test sets
- further splits the training set into training and validation fold combinations
- applies `MaxAbsScaler` and exports scaled and raw data as CSV and pickle files

This ensures that fairness processors trained in other notebooks use the same data partitioning.

## 1. Parameters and preparations

In [1]:
##### IMPORTS

# working paths
%run code_00_working_paths.py

import sys
sys.path.append(func_path)

import pickle
import numpy as np
import time
from sklearn.preprocessing import MaxAbsScaler
import os

from load_data import load_dataset

from aif360.metrics import BinaryLabelDatasetMetric

pip install 'aif360[Reductions]'
pip install 'aif360[Reductions]'
pip install 'aif360[inFairness]'
pip install 'aif360[Reductions]'


In [2]:
##### PARAMETERS

# name of the data set; the raw file is read from <data_path>raw/<data>.csv
# expected to be one of ['data1', 'data2', ..., 'data50']
# NOTE(review): the recorded output of the data import cell for 'german' is
# "ValueError: Dataset 'german' is not recognized." - confirm the intended value
data = 'german'

# share of observations held out as the test set
test_ratio = 0.3

# number of cross-validation folds created from the training set
num_folds = 10

# random seed used when shuffling the data
seed = 1

## 2. Data import

In [3]:
##### DATA IMPORT

# location of the raw CSV; os.path.join (also used for the output directory)
# yields a valid path whether or not data_path ends with a separator
raw_data_file = os.path.join(data_path, 'raw', data + '.csv')

# load the raw data set with the project loader
dataset_orig = load_dataset(path = raw_data_file,
                            data = data)

ValueError: Dataset 'german' is not recognized.

In [83]:
##### DATA PREP

# protected attribute and the encodings of the privileged / unprivileged group
protected           = 'race'
privileged_groups   = [{protected: 1}]
unprivileged_groups = [{protected: 0}]

# check dimensions of the data frame stored in the data set metadata
source_frame = dataset_orig.metadata['params']['df']
print(source_frame.shape)

(800, 6)


In [84]:
# convert the loaded data set to pandas; the DataFrame is the first element
# of the value returned by convert_to_dataframe()
converted = dataset_orig.convert_to_dataframe()
df = converted[0]

# preview the first few rows
preview = df.head()
print(preview)

     priors  sentence  race  rehab=Not-participated  rehab=Participated  \
0  1.026650  3.155870   1.0                     1.0                 0.0   
1  0.477052  1.551613   1.0                     0.0                 1.0   
2  1.031786  4.398357   1.0                     0.0                 1.0   
3  0.913128  2.880531   1.0                     1.0                 0.0   
4  1.233794  3.899129   1.0                     0.0                 1.0   

   target  
0     2.0  
1     2.0  
2     2.0  
3     2.0  
4     2.0  


## 3. Partitioning and scaling

In [85]:
##### PARTITIONING

# set seed
np.random.seed(seed)

# part of the data set name after its first four characters, i.e. the number N
# when the name follows the 'dataN' scheme
# NOTE(review): not referenced by any cell visible in this notebook
dataset_number = data[4:]

# directory that receives all exported partitions; created if it is missing
output_dir = os.path.join(data_path, 'prepared')
os.makedirs(output_dir, exist_ok=True)

# train / test partitioning
# NOTE(review): no seed is passed to this split - presumably it relies on the
# NumPy seed set above; confirm against the aif360 split() implementation
dataset_orig_train, dataset_orig_test = dataset_orig.split([1 - test_ratio], shuffle=True)
tr = dataset_orig_train.convert_to_dataframe()[0]
te = dataset_orig_test.convert_to_dataframe()[0]

# export test set as CSV (without the index column) and as pickle;
# the context manager makes sure the pickle file is flushed and closed
test_file_stem = os.path.join(output_dir, data + '_' + 'orig_test')
te.to_csv(test_file_stem + '.csv', index=False, header=True)
with open(test_file_stem + '.pkl', 'wb') as pkl_file:
    pickle.dump(dataset_orig_test, pkl_file)
print(tr.shape, te.shape)

# cross-validation on the training set: skf is indexed by fold number
skf = dataset_orig_train.split(num_or_size_splits = num_folds, seed = seed, shuffle = True)

(560, 6) (240, 6)


In [86]:
##### FOLD PARTITIONING AND SCALING

def _export_dataset(dataset, file_stem):
    """Write `dataset` to `<file_stem>.csv` and `<file_stem>.pkl`; return its DataFrame."""
    frame = dataset.convert_to_dataframe()[0]
    frame.to_csv(file_stem + '.csv', index=False, header=True)
    # the context manager makes sure the pickle file is flushed and closed
    with open(file_stem + '.pkl', 'wb') as pkl_file:
        pickle.dump(dataset, pkl_file)
    return frame


# data set attributes that are stacked row-wise when folds are merged
fold_arrays = ['features', 'instance_names', 'instance_weights',
               'labels', 'protected_attributes', 'scores']

# timer
cv_start = time.time()

# data partitioning loop
for fold in range(num_folds):

    ##### DATA PARTITIONING

    # validation fold
    data_valid = skf[fold].copy()

    # train folds: start from a copy of the first remaining fold and replace
    # each array by the concatenation over all remaining folds (in fold order)
    train_parts = [skf[f] for f in range(num_folds) if f != fold]
    data_train  = train_parts[0].copy()
    if len(train_parts) > 1:
        for name in fold_arrays:
            stacked = np.concatenate([getattr(part, name) for part in train_parts], axis = 0)
            setattr(data_train, name, stacked)

    # test set
    data_test = dataset_orig_test.copy()

    # export the unscaled folds as CSV and pickle
    orig_prefix = os.path.join(output_dir, f'{data}_orig_{fold}_')
    tr = _export_dataset(data_train, orig_prefix + 'train')
    va = _export_dataset(data_valid, orig_prefix + 'valid')
    print('--', tr.shape, va.shape)


    ##### SCALING

    # scale features: the scaler is fitted on the training folds only and then
    # applied to the validation fold and the test set
    # (note: despite its name, min_max_scaler is a MaxAbsScaler)
    min_max_scaler      = MaxAbsScaler()
    data_train.features = min_max_scaler.fit_transform(data_train.features)
    data_valid.features = min_max_scaler.transform(data_valid.features)
    data_test.features  = min_max_scaler.transform(data_test.features)

    # export the scaled folds and the fold-specific scaled test set
    scaled_prefix = os.path.join(output_dir, f'{data}_scaled_{fold}_')
    tr = _export_dataset(data_train, scaled_prefix + 'train')
    va = _export_dataset(data_valid, scaled_prefix + 'valid')
    te = _export_dataset(data_test,  scaled_prefix + 'test')

# print performance
print('')
print('Finished in {:.2f} minutes'.format((time.time() - cv_start) / 60))

-- (504, 6) (56, 6)
-- (504, 6) (56, 6)
-- (504, 6) (56, 6)
-- (504, 6) (56, 6)
-- (504, 6) (56, 6)
-- (504, 6) (56, 6)
-- (504, 6) (56, 6)
-- (504, 6) (56, 6)
-- (504, 6) (56, 6)
-- (504, 6) (56, 6)

Finished in 0.01 minutes
