# Rebalance Class (imblearn)

In [1]:
import sys
sys.path.append('../../notebooks')

import pandas as pd
import numpy as np
from raimitigations.dataprocessing import Rebalance
from notebooks.download import download_datasets

## 1 - Dataset with Headers

In [2]:
data_dir = '../../../datasets/'
download_datasets(data_dir)
dataset =  pd.read_csv(data_dir + 'hr_promotion/train.csv')
dataset.drop(columns=['employee_id'], inplace=True)
dataset

Unnamed: 0,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
0,Sales & Marketing,region_7,Master's & above,f,sourcing,1,35,5.0,8,1,0,49,0
1,Operations,region_22,Bachelor's,m,other,1,30,5.0,4,0,0,60,0
2,Sales & Marketing,region_19,Bachelor's,m,sourcing,1,34,3.0,7,0,0,50,0
3,Sales & Marketing,region_23,Bachelor's,m,other,2,39,1.0,10,0,0,50,0
4,Technology,region_26,Bachelor's,m,other,1,45,3.0,2,0,0,73,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
54803,Technology,region_14,Bachelor's,m,sourcing,1,48,3.0,17,0,0,78,0
54804,Operations,region_27,Master's & above,f,other,1,37,2.0,6,0,0,56,0
54805,Analytics,region_1,Bachelor's,m,other,1,27,5.0,3,1,0,79,0
54806,Sales & Marketing,region_9,,m,sourcing,1,29,1.0,2,0,0,45,0


In [3]:
dataset['is_promoted'].value_counts()

0    50140
1     4668
Name: is_promoted, dtype: int64

In [4]:
rebalance = Rebalance(
				df=dataset,
				rebalance_col='is_promoted',
				k_neighbors=6,
				verbose=False
			)
df_resample = rebalance.fit_resample()
print(df_resample['is_promoted'].value_counts())
df_resample

0    50140
1    50140
Name: is_promoted, dtype: int64


Unnamed: 0,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,awards_won?,avg_training_score,is_promoted
0,Sales & Marketing,region_7,Master's & above,f,sourcing,1.0,35.000000,5.000000,8.000000,0.0,49.000000,0
1,Operations,region_22,Bachelor's,m,other,1.0,30.000000,5.000000,4.000000,0.0,60.000000,0
2,Sales & Marketing,region_19,Bachelor's,m,sourcing,1.0,34.000000,3.000000,7.000000,0.0,50.000000,0
3,Sales & Marketing,region_23,Bachelor's,m,other,2.0,39.000000,1.000000,10.000000,0.0,50.000000,0
4,Technology,region_26,Bachelor's,m,other,1.0,45.000000,3.000000,2.000000,0.0,73.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...
100275,Operations,region_2,Master's & above,m,sourcing,1.0,48.218391,2.109195,19.109195,0.0,62.781609,1
100276,Sales & Marketing,region_25,Master's & above,m,other,1.0,37.000000,1.603526,6.698237,0.0,50.000000,1
100277,Sales & Marketing,region_25,Master's & above,m,other,1.0,35.000000,5.000000,4.299350,0.0,47.000000,1
100278,Finance,region_22,Bachelor's,f,sourcing,1.0,27.924155,3.924155,4.000000,0.0,58.000000,1


In [5]:
rebalance = Rebalance(
				df=dataset,
				rebalance_col='is_promoted',
				strategy_over={0:50140, 1:20000},
				k_neighbors=6
			)
df_resample = rebalance.fit_resample()
print(df_resample['is_promoted'].value_counts())

No categorical columns specified. These columns have been automatically identfied as the following:
['department', 'region', 'education', 'gender', 'recruitment_channel']
No columns specified for imputation. These columns have been automatically identified:
['education', 'previous_year_rating']
Running oversampling...
...finished
0    50140
1    20000
Name: is_promoted, dtype: int64


In [6]:
cat_df = dataset.drop(columns=['no_of_trainings', 'age', 'previous_year_rating', 'length_of_service', 'awards_won?', 'avg_training_score'])
X = cat_df.drop(columns=['is_promoted'])
y = cat_df['is_promoted']

print(type(X))

rebalance = Rebalance(
				strategy_over=0.5,
				k_neighbors=4
			)
x_resample, y_resample = rebalance.fit_resample(X=X, y=y)
y_resample.value_counts()

<class 'pandas.core.frame.DataFrame'>
No categorical columns specified. These columns have been automatically identfied as the following:
['department', 'region', 'education', 'gender', 'recruitment_channel']
No columns specified for imputation. These columns have been automatically identified:
['education']
Running oversampling...
...finished


0    50140
1    25070
Name: is_promoted, dtype: int64

In [7]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=0)

rebalance = Rebalance(
				df=dataset,
				rebalance_col='is_promoted',
				strategy_over={0:50140, 1:20000},
				over_sampler=smote
			)
df_resample = rebalance.fit_resample()
print(df_resample['is_promoted'].value_counts())
df_resample

No categorical columns specified. These columns have been automatically identfied as the following:
['department', 'region', 'education', 'gender', 'recruitment_channel']
No columns specified for imputation. These columns have been automatically identified:
['education', 'previous_year_rating']

Over Sampler already provided.

Running oversampling...
...finished
0    50140
1    50140
Name: is_promoted, dtype: int64


Unnamed: 0,no_of_trainings,age,previous_year_rating,length_of_service,awards_won?,avg_training_score,department_Finance,department_HR,department_Legal,department_Operations,...,region_region_7,region_region_8,region_region_9,education_Below Secondary,education_Master's & above,education_NULL,gender_m,recruitment_channel_referred,recruitment_channel_sourcing,is_promoted
0,1.000000,35.000000,5.000000,8.000000,0.0,49.000000,0,0,0,0,...,1,0,0,0,1,0,0,0,1,0
1,1.000000,30.000000,5.000000,4.000000,0.0,60.000000,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
2,1.000000,34.000000,3.000000,7.000000,0.0,50.000000,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
3,2.000000,39.000000,1.000000,10.000000,0.0,50.000000,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,1.000000,45.000000,3.000000,2.000000,0.0,73.000000,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100275,1.000000,33.274008,5.000000,7.725992,0.0,57.274008,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
100276,1.550476,30.449524,3.000000,4.000000,0.0,87.000000,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
100277,1.986988,50.973976,3.013012,6.000000,0.0,63.013012,0,0,0,0,...,0,0,0,0,1,0,1,0,0,1
100278,1.000000,32.000000,4.763964,2.763964,0.0,60.000000,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1


In [8]:
rebalance = Rebalance(
				df=dataset,
				rebalance_col='is_promoted',
				strategy_over={0:50140, 1:10000},
				strategy_under='auto'
			)
df_resample = rebalance.fit_resample()
print(df_resample['is_promoted'].value_counts())
df_resample

No categorical columns specified. These columns have been automatically identfied as the following:
['department', 'region', 'education', 'gender', 'recruitment_channel']
No columns specified for imputation. These columns have been automatically identified:
['education', 'previous_year_rating']
Running oversampling...
...finished
Running undersampling...
...finished
0    49358
1    10000
Name: is_promoted, dtype: int64


Unnamed: 0,no_of_trainings,age,previous_year_rating,length_of_service,awards_won?,avg_training_score,department_Finance,department_HR,department_Legal,department_Operations,...,region_region_7,region_region_8,region_region_9,education_Below Secondary,education_Master's & above,education_NULL,gender_m,recruitment_channel_referred,recruitment_channel_sourcing,is_promoted
0,1.0,35.000000,5.000000,8.000000,0.000000,49.000000,0,0,0,0,...,1,0,0,0,1,0,0,0,1,0
1,1.0,30.000000,5.000000,4.000000,0.000000,60.000000,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
2,1.0,34.000000,3.000000,7.000000,0.000000,50.000000,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
3,2.0,39.000000,1.000000,10.000000,0.000000,50.000000,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,1.0,45.000000,3.000000,2.000000,0.000000,73.000000,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59353,1.0,32.764739,4.117631,7.882369,0.117631,95.000000,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
59354,1.0,43.790991,2.790991,2.790991,0.000000,75.790991,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
59355,1.0,28.000000,4.458665,2.541335,1.000000,84.458665,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
59356,1.0,33.594383,5.000000,3.000000,0.000000,72.000000,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [9]:
rebalance = Rebalance(
				df=dataset,
				rebalance_col='is_promoted',
				strategy_over={0:50140, 1:10000},
				under_sampler=True
			)
df_resample = rebalance.fit_resample()
print(df_resample['is_promoted'].value_counts())
df_resample

No categorical columns specified. These columns have been automatically identfied as the following:
['department', 'region', 'education', 'gender', 'recruitment_channel']
No columns specified for imputation. These columns have been automatically identified:
['education', 'previous_year_rating']
Running oversampling...
...finished
Running undersampling...
...finished
0    49337
1    10000
Name: is_promoted, dtype: int64


Unnamed: 0,no_of_trainings,age,previous_year_rating,length_of_service,awards_won?,avg_training_score,department_Finance,department_HR,department_Legal,department_Operations,...,region_region_7,region_region_8,region_region_9,education_Below Secondary,education_Master's & above,education_NULL,gender_m,recruitment_channel_referred,recruitment_channel_sourcing,is_promoted
0,1.000000,35.000000,5.000000,8.000000,0.0,49.000000,0,0,0,0,...,1,0,0,0,1,0,0,0,1,0
1,1.000000,30.000000,5.000000,4.000000,0.0,60.000000,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
2,1.000000,34.000000,3.000000,7.000000,0.0,50.000000,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
3,2.000000,39.000000,1.000000,10.000000,0.0,50.000000,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,1.000000,45.000000,3.000000,2.000000,0.0,73.000000,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59332,1.948997,30.000000,5.000000,4.000000,0.0,84.000000,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
59333,1.511305,38.488695,4.488695,11.000000,0.0,60.000000,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
59334,1.000000,25.475312,3.329256,1.000000,0.0,52.000000,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
59335,1.111413,26.111413,3.000000,3.111413,0.0,71.888587,0,0,0,1,...,0,0,0,0,0,0,1,0,0,1


In [10]:
dataset['no_of_trainings'].unique()

array([ 1,  2,  3,  4,  7,  5,  6,  8, 10,  9])

In [11]:
rebalance = Rebalance(
				df=dataset,
				rebalance_col='no_of_trainings',
				strategy_over="minority",
				under_sampler=False
			)
df_resample = rebalance.fit_resample()
print(df_resample['is_promoted'].value_counts())
df_resample

No categorical columns specified. These columns have been automatically identfied as the following:
['department', 'region', 'education', 'gender', 'recruitment_channel']
No columns specified for imputation. These columns have been automatically identified:
['education', 'previous_year_rating']
Running oversampling...
...finished
0.0    94513
1.0     4668
Name: is_promoted, dtype: int64


Unnamed: 0,department,region,education,gender,recruitment_channel,age,previous_year_rating,length_of_service,awards_won?,avg_training_score,is_promoted,no_of_trainings
0,Sales & Marketing,region_7,Master's & above,f,sourcing,35.000000,5.000000,8.000000,0.0,49.000000,0.0,1
1,Operations,region_22,Bachelor's,m,other,30.000000,5.000000,4.000000,0.0,60.000000,0.0,1
2,Sales & Marketing,region_19,Bachelor's,m,sourcing,34.000000,3.000000,7.000000,0.0,50.000000,0.0,1
3,Sales & Marketing,region_23,Bachelor's,m,other,39.000000,1.000000,10.000000,0.0,50.000000,0.0,2
4,Technology,region_26,Bachelor's,m,other,45.000000,3.000000,2.000000,0.0,73.000000,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
99176,Procurement,region_22,Bachelor's,m,sourcing,40.646478,3.215493,3.000000,0.0,71.476995,0.0,8
99177,Procurement,region_2,Bachelor's,m,sourcing,31.844303,1.538924,2.730538,0.0,69.461076,0.0,8
99178,Procurement,region_2,Master's & above,m,sourcing,28.271698,3.000000,2.757233,0.0,69.135849,0.0,8
99179,Procurement,region_31,Bachelor's,m,sourcing,34.674388,1.224796,3.000000,0.0,70.149864,0.0,8
