## Transform Data Error Mitigation Example

In [1]:
import sys
sys.path.append('../../../ResponsibleAIToolbox-Mitigation/')

import pandas as pd
from dataprocessing import Transformer

# from IPython.core.interactiveshell import InteractiveShell
# InteractiveShell.ast_node_interactivity = "all"

# print(__doc__)

In [2]:
seed = 42
data_dir = '../datasets/hr_promotion'

# Read the training CSV from data_dir and discard the 'employee_id' column.
dataset = (
    pd.read_csv(data_dir + '/train.csv')
    .drop(columns=['employee_id'])
)

# Preview the first rows of the loaded frame.
dataset.head()

FileNotFoundError: [Errno 2] No such file or directory: '../datasets/hr_promotion/train.csv'

In [None]:
# # Parameters

# dataset - A pandas DataFrame containing the data to transform.
# target - A string naming the label column, or the zero-based integer index of the label column.

# transformer_type - Enum object for available transformations. 
    # StandardScaler: sklearn.preprocessing.StandardScaler  
        # Standardize features by removing the mean and scaling to unit variance.   
        # z = (x - u) / s (where u is the mean of the training samples or zero if with_mean=False, and s is the standard 
        # deviation of the training samples or one if with_std=False). 
    # MinMaxScaler: sklearn.preprocessing.MinMaxScaler 
        # Transform features by scaling each feature to a given range. This estimator scales and translates each feature 
        # individually such that it is in the given range on the training set, e.g. between zero and one.  
    # RobustScaler: sklearn.preprocessing.RobustScaler 
        # Scale features using statistics that are robust to outliers. This Scaler removes the median and scales the data 
        # according to the quantile range (defaults to IQR: Interquartile Range). The IQR is the range between the 1st 
        # quartile (25th quantile) and the 3rd quartile (75th quantile).  
    # PowerTransformer: sklearn.preprocessing.PowerTransformer 
        # Apply a power transform feature-wise to make data more Gaussian-like. This is useful for modeling issues related 
        # to heteroscedasticity (non-constant variance), or other situations where normality is desired. 
        # Box-Cox transform requires input data to be strictly positive, while Yeo-Johnson supports both positive and negative data.  
    # QuantileTransformer: sklearn.preprocessing.QuantileTransformer 
        # Transform features using quantiles information. This method transforms the features to follow a uniform or a normal 
        # distribution. Therefore, for a given feature, this transformation tends to spread out the most frequent values. 
        # It also reduces the impact of (marginal) outliers: this is therefore a robust preprocessing scheme.  
    # Normalizer: sklearn.preprocessing.Normalizer 
        # Normalize samples individually to unit norm. Each sample (i.e. each row of the data matrix) with at least one 
        # nonzero component is rescaled independently of other samples so that its norm (l1, l2 or inf) equals one. 
        
# transform_features - List of the features to transform. The list can contain either the indexes or the names of the features.

# random_state - Control the randomization of the algorithm. 
    # ‘None’: the random number generator is the RandomState instance used by np.random.

# method - {‘yeo-johnson’, ‘box-cox’}, default=’yeo-johnson’  

# output_distribution - {‘uniform’, ‘normal’}, Marginal distribution for the transformed data. The choices are 
    # ‘uniform’ (default) or ‘normal’.
    
# transform_features = ['department', 'region', 'education','gender', 'recruitment_channel']
# transform_features = [6,7,8]


def _apply_and_announce(transformer, label):
    """Call ``transformer.transform()``, then print ``label`` followed by an empty line."""
    transformer.transform()
    print(label)
    print('')


# The label column can be identified by position or by name: the first example
# passes the integer index, the remaining examples pass the column name.
target_index = dataset.columns.get_loc('is_promoted')

# Each transformer below receives None as its fourth argument (no feature subset)
# and the shared seed as its fifth.
standard_scaler = Transformer(
    dataset, target_index, Transformer.TransformerType.StandardScaler, None, seed
)
_apply_and_announce(standard_scaler, 'StandardScaler transformer')

minmax_scaler = Transformer(
    dataset, 'is_promoted', Transformer.TransformerType.MinMaxScaler, None, seed
)
_apply_and_announce(minmax_scaler, 'MinMaxScaler transformer')

robust_scaler = Transformer(
    dataset, 'is_promoted', Transformer.TransformerType.RobustScaler, None, seed
)
_apply_and_announce(robust_scaler, 'RobustScaler transformer')

power_transformer = Transformer(
    dataset, 'is_promoted', Transformer.TransformerType.PowerTransformer, None, seed
)
_apply_and_announce(power_transformer, 'PowerTransformer transformer')

quantile_transformer = Transformer(
    dataset, 'is_promoted', Transformer.TransformerType.QuantileTransformer, None, seed
)
_apply_and_announce(quantile_transformer, 'QuantileTransformer transformer')

normaliser = Transformer(
    dataset, 'is_promoted', Transformer.TransformerType.Normalizer, None, seed
)
_apply_and_announce(normaliser, 'Normalizer transformer')


In [None]:

# Position of the label column; used by the second example below.
target_index = dataset.columns.get_loc('is_promoted')

# Example 1: transform all data. None is passed in place of a feature list and
# the label column is given by name.
standard_scaler = Transformer(
    dataset,
    'is_promoted',
    Transformer.TransformerType.StandardScaler,
    None,
    seed,
)
standard_scaler.transform()
print('StandardScaler transformer')
print('')

# Example 2: transform specific features only. Features are listed by name here;
# the positional alternative would be [0, 1].
# NOTE(review): confirm these columns are numeric, or that Transformer handles
# non-numeric columns, before relying on this example.
transform_features = ['department', 'region']

standard_scaler = Transformer(
    dataset,
    target_index,
    Transformer.TransformerType.StandardScaler,
    transform_features,
    seed,
)
standard_scaler.transform()
print('StandardScaler transformer for: ' + str(transform_features))
print('')
