## Split Data Error Mitigation Example

In [None]:
!pip install -e ../../../responsible-ai-mitigations

In [None]:
import pandas as pd
import zipfile
import pathlib
from raimitigations.dataprocessing import Split

from urllib.request import urlretrieve
from IPython.core.interactiveshell import InteractiveShell
# Render the value of every bare expression in a cell, not only the last one,
# so that mid-cell calls such as `dataset.head()` are displayed.
InteractiveShell.ast_node_interactivity = "all"

In [None]:
# Download and unpack the sample datasets two directories above the notebook,
# unless a previous run already extracted them there, then load the HR
# promotion training set.
outdirname = 'mitigations-datasets.2.22.2022'
zipfilename = outdirname + '.zip'
download_root = '../../'

# Test the location the archive is actually extracted into. Checking the bare
# directory name against the current working directory (which this cell never
# populates) would never match, forcing a fresh download on every run.
if not pathlib.Path(download_root + outdirname).exists():
    urlretrieve(
        'https://publictestdatasets.blob.core.windows.net/data/' + zipfilename,
        download_root + zipfilename,
    )
    with zipfile.ZipFile(download_root + zipfilename, 'r') as unzip:
        unzip.extractall(download_root)

data_dir = '../../' + outdirname + '/hr_promotion'
# employee_id is dropped before the data is handed to Split.
dataset = pd.read_csv(data_dir + '/train.csv').drop(['employee_id'], axis=1)
# Fixed seed so the splits below are reproducible.
seed = 42

dataset.head()


In [None]:
# Parameters of Split (as described in this notebook)

# dataset - pandas DataFrame.
# target – A string with the name of the label column, or the integer index of the label column (zero-based).
# train_size – The training data split size. The default is 0.9, which splits the dataset into 90% training and 10% testing.
    # The training and test split sizes add up to 1.
# random_state – Controls the randomization of the algorithm.
    # ‘None’: the random number generator is the RandomState instance used by np.random.
# categorical_features – A Boolean flag that indicates the presence of categorical features. Default is True.
# drop_null: If the flag is set to True, records with null values are dropped; otherwise they are replaced by the mean.
    # Default is True.
# drop_duplicates: If the flag is set to True, duplicate records are dropped. Default is False.
# is_stratify: If the flag is set to True, the data is split in a stratified fashion. Default is False.
    # NOTE(review): presumably the target column supplies the class labels — confirm against the Split documentation.

# Data before the split: preview the first rows and print (rows, columns).
dataset.head()
print(dataset.shape)

# Resolve the label column's positional index by name instead of hard-coding it.
target_index = dataset.columns.get_loc('is_promoted')

# Arguments are positional; assuming the signature follows the parameter order
# listed above, this is train_size=0.9, random_state=seed and all four flags
# (categorical_features, drop_null, drop_duplicates, is_stratify) set to True.
data_split =  Split(dataset,target_index , 0.9, seed, True, True, True, True)
# Equivalent call that passes the label column by name instead of by index:
# data_split =  Split(dataset, 'is_promoted', 0.9, seed, True, True, True, True)
# split() yields the training portion and the test portion, in that order.
train_data, test_data = data_split.split()


# Data after the split: preview the training portion and print its (rows, columns).
train_data.head()
print(train_data.shape)

In [None]:
# stratify ON: split 50/50 with stratification enabled and compare the class
# balance of the label column in the two halves.
print('Stratify ON')

# Split parameters in positional order, with the defaults listed in this notebook:
#                 dataset
#                 target
#                 train_size
#                 random_state = None
#                 categorical_features = True
#                 drop_null = True
#                 drop_duplicates = False
#                 is_stratify = False

# Look the label column up by name rather than relying on a hard-coded
# position, which silently points at the wrong column if the layout changes.
target_index = dataset.columns.get_loc('is_promoted')

# Assuming the positional order above: categorical_features=False,
# drop_null=False, drop_duplicates=True, is_stratify=True.
data_split = Split(dataset, target_index, 0.5, seed, False, False, True, True)
train_data, test_data = data_split.split()

# Count each class once per split, then report promoted (1) over not promoted (0).
df_train = pd.DataFrame(train_data)
train_counts = df_train.is_promoted.value_counts()
print('Train dataset % of target (1 over 0): ' + str(train_counts[1] / train_counts[0]))

df_test = pd.DataFrame(test_data)
test_counts = df_test.is_promoted.value_counts()
print('Test dataset % of target (1 over 0): ' + str(test_counts[1] / test_counts[0]))

# Optional bar charts of the class counts:
# df_train.is_promoted.value_counts().plot(kind='bar', title=' Train dataset count (is_promoted)')
# df_test.is_promoted.value_counts().plot(kind='bar', title='Test dataset count (is_promoted)')



In [None]:
# stratify OFF: the same 50/50 split with stratification disabled, so the class
# ratios of the two halves can be compared with a stratified split.
print('Stratify OFF')

# Look the label column up by name rather than relying on a hard-coded
# position, which silently points at the wrong column if the layout changes.
target_index = dataset.columns.get_loc('is_promoted')

# Assuming the positional order (dataset, target, train_size, random_state,
# categorical_features, drop_null, drop_duplicates, is_stratify): only the
# final flag, is_stratify, differs from a stratified 50/50 split.
data_split = Split(dataset, target_index, 0.5, seed, False, False, True, False)
train_data, test_data = data_split.split()

# Count each class once per split, then report promoted (1) over not promoted (0).
df = pd.DataFrame(train_data)
train_counts = df.is_promoted.value_counts()
print('Train dataset % of target (1 over 0): ' + str(train_counts[1] / train_counts[0]))

df = pd.DataFrame(test_data)
test_counts = df.is_promoted.value_counts()
print('Test dataset % of target (1 over 0): ' + str(test_counts[1] / test_counts[0]))
