In [3]:
import sys
# Extend the module search path with the toolbox checkout. The path is relative,
# so it resolves against the working directory the kernel was started in.
# Presumably this is what makes `errorsmitigation` importable below -- verify the layout.
sys.path.append('../../../ResponsibleAIToolbox-Mitigation/')

import pandas as pd
from errorsmitigation.dataprocessing import DataTransformer

# from IPython.core.interactiveshell import InteractiveShell
# InteractiveShell.ast_node_interactivity = "all"

# print(__doc__)

In [4]:
# Seed forwarded to the DataTransformer calls in the cells below.
seed = 42

# Folder holding the HR-promotion CSV files (relative to the working directory).
data_dir = '../../datasets/hr_promotion'

# Read the training split and discard the employee_id column.
dataset = pd.read_csv(data_dir + '/train.csv').drop(columns=['employee_id'])

# Preview the first rows.
dataset.head()

Unnamed: 0,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
0,Sales & Marketing,region_7,Master's & above,f,sourcing,1,35,5.0,8,1,0,49,0
1,Operations,region_22,Bachelor's,m,other,1,30,5.0,4,0,0,60,0
2,Sales & Marketing,region_19,Bachelor's,m,sourcing,1,34,3.0,7,0,0,50,0
3,Sales & Marketing,region_23,Bachelor's,m,other,2,39,1.0,10,0,0,50,0
4,Technology,region_26,Bachelor's,m,other,1,45,3.0,2,0,0,73,0


In [8]:
# # Parameters

# dataset - A pandas DataFrame representing the data to transform.
# target – A string with the name of the label column, or the zero-based integer index of the label column.

# transformer_type - Enum object for available transformations. 
    # StandardScaler: sklearn.preprocessing.StandardScaler  
        # Standardize features by removing the mean and scaling to unit variance.   
        # z = (x - u) / s (where u is the mean of the training samples or zero if with_mean=False, and s is the standard 
        # deviation of the training samples or one if with_std=False). 
    # MinMaxScaler: sklearn.preprocessing.MinMaxScaler 
        # Transform features by scaling each feature to a given range. This estimator scales and translates each feature 
        # individually such that it is in the given range on the training set, e.g. between zero and one.  
    # RobustScaler: sklearn.preprocessing.RobustScaler 
        # Scale features using statistics that are robust to outliers. This Scaler removes the median and scales the data 
        # according to the quantile range (defaults to IQR: Interquartile Range). The IQR is the range between the 1st 
        # quartile (25th quantile) and the 3rd quartile (75th quantile).  
    # PowerTransformer: sklearn.preprocessing.PowerTransformer 
        # Apply a power transform feature-wise to make data more Gaussian-like. This is useful for modeling issues related 
        # to heteroscedasticity (non-constant variance), or other situations where normality is desired. 
        # Box-Cox transform requires input data to be strictly positive, while Yeo-Johnson supports both positive and negative data.  
    # QuantileTransformer: sklearn.preprocessing.QuantileTransformer 
        # Transform features using quantiles information. This method transforms the features to follow a uniform or a normal 
        # distribution. Therefore, for a given feature, this transformation tends to spread out the most frequent values. 
        # It also reduces the impact of (marginal) outliers: this is therefore a robust preprocessing scheme.  
    # Normalizer: sklearn.preprocessing.Normalizer
        # Normalize samples individually to unit norm. Each sample (i.e. each row of the data matrix) with at least one 
        # nonzero component is rescaled independently of other samples so that its norm (l1, l2 or inf) equals one. 
        
# transform_features – List of the features to transform. The list may contain either the indexes or the names of the features.

# random_state - Control the randomization of the algorithm. 
    # ‘None’: the random number generator is the RandomState instance used by np.random.

# method - {‘yeo-johnson’, ‘box-cox’}, default=‘yeo-johnson’. The power transform to apply (see PowerTransformer above).

# output_distribution - {‘uniform’, ‘normal’}, Marginal distribution for the transformed data. The choices are 
    # ‘uniform’ (default) or ‘normal’.
    
# transform_features = ['department', 'region', 'education','gender', 'recruitment_channel']
# transform_features = [6,7,8]


# The label column can be identified either by its zero-based position
# (used for the first call) or by its name (used for the remaining calls).
target_index = dataset.columns.get_loc('is_promoted')

# In every call below the fourth argument (the feature list) is None and the
# fifth is the shared seed.

# StandardScaler -- the only stanza that prints the value returned by Transform().
standard_scaler = DataTransformer(
    dataset,
    target_index,
    DataTransformer.TransformerType.StandardScaler,
    None,
    seed,
)
print(standard_scaler.Transform())
print('StandardScaler transformer', '', sep='\n')

# MinMaxScaler
minmax_scaler = DataTransformer(
    dataset,
    'is_promoted',
    DataTransformer.TransformerType.MinMaxScaler,
    None,
    seed,
)
minmax_scaler.Transform()
print('MinMaxScaler transformer', '', sep='\n')

# RobustScaler
robust_scaler = DataTransformer(
    dataset,
    'is_promoted',
    DataTransformer.TransformerType.RobustScaler,
    None,
    seed,
)
robust_scaler.Transform()
print('RobustScaler transformer', '', sep='\n')

# PowerTransformer
power_transformer = DataTransformer(
    dataset,
    'is_promoted',
    DataTransformer.TransformerType.PowerTransformer,
    None,
    seed,
)
power_transformer.Transform()
print('PowerTransformer transformer', '', sep='\n')

# QuantileTransformer
quantile_transformer = DataTransformer(
    dataset,
    'is_promoted',
    DataTransformer.TransformerType.QuantileTransformer,
    None,
    seed,
)
quantile_transformer.Transform()
print('QuantileTransformer transformer', '', sep='\n')

# Normalizer
normaliser = DataTransformer(
    dataset,
    'is_promoted',
    DataTransformer.TransformerType.Normalizer,
    None,
    seed,
)
normaliser.Transform()
print('Normalizer transformer', '', sep='\n')


                      0          1                 2         3  4   5    6   \
0      Sales & Marketing   region_7  Master's & above  sourcing  1  35  5.0   
1             Operations  region_22        Bachelor's     other  1  30  5.0   
2      Sales & Marketing  region_19        Bachelor's  sourcing  1  34  3.0   
3      Sales & Marketing  region_23        Bachelor's     other  2  39  1.0   
4             Technology  region_26        Bachelor's     other  1  45  3.0   
...                  ...        ...               ...       ... ..  ..  ...   
54803         Technology  region_14        Bachelor's  sourcing  1  48  3.0   
54804         Operations  region_27  Master's & above     other  1  37  2.0   
54805          Analytics   region_1        Bachelor's     other  1  27  5.0   
54806  Sales & Marketing   region_9               NaN  sourcing  1  29  1.0   
54807                 HR  region_22        Bachelor's     other  1  27  1.0   

       7  8  9   10 11        12        13  
0     

In [6]:

# Zero-based position of the label column.
target_index = dataset.columns.get_loc('is_promoted')

# Case 1: no feature list (None) -- transform all data; label given by name.
standard_scaler = DataTransformer(
    dataset,
    'is_promoted',
    DataTransformer.TransformerType.StandardScaler,
    None,
    seed,
)
standard_scaler.Transform()
print('StandardScaler transformer', '', sep='\n')

# Case 2: restrict the transformation to specific features, selected by name.
# (Per the parameter notes, feature indexes such as [0, 1] are accepted as well.)
transform_features = ['department', 'region']

standard_scaler = DataTransformer(
    dataset,
    target_index,
    DataTransformer.TransformerType.StandardScaler,
    transform_features,
    seed,
)
standard_scaler.Transform()
print('StandardScaler transformer for: ' + str(transform_features), '', sep='\n')


StandardScaler transformer

StandardScaler transformer for: ['department', 'region']

