In [None]:
import sys

import pandas as pd

# Must run before the imports below, exactly as in the original ordering.
sys.path.append('../../ResponsibleAIToolbox-Mitigation/')
from imblearn.under_sampling import TomekLinks
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek

from errorsmitigation.dataprocessing import DataRebalance
from databalanceanalysis.databalanceanalysis.aggregate_measures import AggregateBalanceMeasure
from databalanceanalysis.databalanceanalysis.distribution_measures import DistributionBalanceMeasure
from databalanceanalysis.databalanceanalysis.feature_measures import FeatureBalanceMeasure
from databalanceanalysis.databalanceanalysis.utils import undummify

from IPython.core.interactiveshell import InteractiveShell
# Echo every top-level expression in a cell, not only the last one; several
# cells below rely on this to show two results side by side.
InteractiveShell.ast_node_interactivity = "all"

In [None]:
# Dataset location, the columns the balance measures look at, the label
# column, and the seed constant.
data_dir = '../datasets/hr_promotion'
cols_of_interest = ['education', 'gender']
label_col = 'is_promoted'
seed = 42

# Read the training split, drop the identifier column, then remove duplicate
# rows and rows with missing values in one chain.
df = (
    pd.read_csv(data_dir + '/train.csv')
    .drop(['employee_id'], axis=1)
    .drop_duplicates()
    .dropna()
)
df.head()
df.shape

In [None]:
# Feature balance measures on the cleaned data, before any resampling.
# `feat_measures1` is kept so later cells can show it next to the
# post-rebalance result. FeatureBalanceMeasure comes from the top import cell.
feature_measures = FeatureBalanceMeasure(cols_of_interest, label_col)

feat_measures1 = feature_measures.measures(df)
feat_measures1

In [None]:
# Distribution balance measures on the cleaned data, before any resampling.
# `dist_measures1` is kept for the later before/after display.
# DistributionBalanceMeasure comes from the top import cell.
dist_measures = DistributionBalanceMeasure(cols_of_interest)
dist_measures1 = dist_measures.measures(df)
dist_measures1


In [None]:
# Aggregate balance measures on the cleaned data, before any resampling.
# `agg_measures1` is kept for the later before/after display.
# AggregateBalanceMeasure comes from the top import cell.
agg_measures = AggregateBalanceMeasure(cols_of_interest)
agg_measures1 = agg_measures.measures(df)
agg_measures1

In [None]:
# Seed the stochastic samplers so the synthetic rows are the same after a
# kernel restart. TomekLinks takes no random_state argument.
smote_tomek = SMOTETomek(random_state=seed)
smote = SMOTE(random_state=seed)
tomek = TomekLinks()

# One-hot encode the categorical columns. "-" is the separator between column
# name and level, and the same separator is passed to `undummify` later.
dummy_df = pd.get_dummies(df, prefix_sep="-")
dummy_df.head()

In [None]:
# Rebuild a single `gender` column from the encoded frame so it can be used as
# the rebalancing target (presumably the other categorical columns stay
# one-hot encoded -- verify against `undummify`).
gender_df = undummify(dummy_df, prefix_sep="-", col="gender")
# Preview only; the full frame is not echoed.
gender_df.head()

In [None]:
# Rebalance on `gender` using the SMOTE sampler. Positional arguments are kept
# in their original order; `seed` replaces the duplicated literal 42
# (presumably the random state -- confirm against DataRebalance's signature).
data_balance_smote = DataRebalance(gender_df, 'gender', 'auto', seed, None, smote, None)

# Shapes before and after, to see how many rows the rebalance added.
print(gender_df.shape)
smote_df = data_balance_smote.Rebalance()
print(smote_df.shape)


In [None]:
smote_df.head()

# Rebuild a single `education` column on the gender-rebalanced frame, then
# check how its levels are distributed.
education_df = undummify(smote_df, prefix_sep="-", col="education")
education_df.head()
education_df["education"].value_counts()


In [None]:
# Second pass: rebalance on `education` using the SMOTE sampler. `seed`
# replaces the duplicated literal 42 (presumably the random state -- confirm
# against DataRebalance's signature).
data_balance_smote_2 = DataRebalance(education_df, 'education', 'auto', seed, None, smote, None)

# Shapes before and after, to see how many rows the rebalance added.
print(education_df.shape)
smote_df_2 = data_balance_smote_2.Rebalance()
print(smote_df_2.shape)

In [None]:
# Level counts of `education` in `education_df`.
# NOTE(review): `education_df` is the input to the second rebalance; its output
# is `smote_df_2`. Confirm which frame was meant to be inspected here.
education_df['education'].value_counts()

In [None]:
# Last rows of `education_df`.
education_df.tail()

In [None]:
# NOTE(review): same expression as two cells earlier; candidate for removal.
education_df['education'].value_counts()

In [None]:
# First rows of `education_df`.
education_df.head()

In [None]:
# new_df = undummify(smote_df, prefix_sep="-")

In [None]:
# Both expressions are displayed because ast_node_interactivity is "all":
# first the measures for `education_df`, then the pre-resampling baseline.
# NOTE(review): `education_df` precedes the second rebalance (`smote_df_2`);
# confirm this is the frame that should be compared.
feature_measures.measures(education_df)
feat_measures1

In [None]:
# Distribution measures for `education_df`, followed by the pre-resampling
# baseline; both are displayed because ast_node_interactivity is "all".
dist_measures.measures(education_df)
dist_measures1

In [None]:
# Aggregate measures for `education_df`, followed by the pre-resampling
# baseline; both are displayed because ast_node_interactivity is "all".
agg_measures.measures(education_df)
agg_measures1

## IGNORE BELOW FOR NOW

In [None]:
# NOTE(review): this cell looks stale relative to the active part of the notebook:
#   * `undummify` is called with `col_list=[...]` here, while the live calls
#     earlier pass `col=...` -- confirm which keyword the current API accepts.
#   * 'race' is not in `cols_of_interest` (['education', 'gender']); confirm
#     the loaded dataset actually has this column.
#   * The tomek and smote_tomek builders receive identical arguments with None
#     in every slot after the column name, so the `tomek` and `smote_tomek`
#     samplers created earlier are never passed -- presumably they were meant
#     to be supplied here.
race_df = undummify(dummy_df, prefix_sep = "-", col_list = ['race'])
data_rebalance_smote =  DataRebalance(race_df, 'race', None, None, None, smote, None)
data_rebalance_tomek = DataRebalance(race_df, 'race', None, None, None, None, None)
data_rebalance_smote_tomek = DataRebalance(race_df, 'race', None, None, None, None, None)
# Note: this rebinds `smote_df`, replacing the gender-rebalanced frame above.
smote_df = data_rebalance_smote.Rebalance()
tomek_df = data_rebalance_tomek.Rebalance()
smote_tomek_df = data_rebalance_smote_tomek.Rebalance()


In [None]:
# Preview the rebalanced frame instead of echoing all of it.
smote_df.head()

In [None]:
# df = undummify(df, "-")
# smote_df = undummify(smote_df, "-")
# smote_tomek_df = undummify(smote_tomek_df, "-")
# tomek_df = undummify(tomek_df, "-")

In [None]:
# Re-encode `smote_df` with get_dummies, then undummify the 'sex' column.
# NOTE(review): uses `col_list=`, whereas the live calls earlier use `col=`;
# 'sex' is also not in `cols_of_interest` -- confirm against the dataset.
smote_sex_df = undummify( pd.get_dummies(smote_df, prefix_sep= "-"), prefix_sep = "-", col_list = ["sex"]) 

In [None]:
# Same re-encode / undummify step for the Tomek and SMOTE-Tomek frames.
# NOTE(review): uses `col_list=`, whereas the live calls earlier use `col=`.
tomek_sex_df = undummify( pd.get_dummies(tomek_df, prefix_sep= "-"), prefix_sep = "-", col_list = ["sex"]) 
smote_tomek_sex_df = undummify( pd.get_dummies(smote_tomek_df, prefix_sep= "-"), prefix_sep = "-", col_list = ["sex"]) 

In [None]:
# Second rebalance pass, this time on 'sex'.
# NOTE(review): as in the 'race' cell, the tomek and smote_tomek builders are
# given None in every sampler slot, so `tomek` / `smote_tomek` are unused.
data_rebalance_smote =  DataRebalance(smote_sex_df, 'sex', None, None, None, smote, None)
data_rebalance_tomek = DataRebalance(tomek_sex_df, 'sex', None, None, None, None, None)
data_rebalance_smote_tomek = DataRebalance(smote_tomek_sex_df, 'sex', None, None, None, None, None)
smote_df = data_rebalance_smote.Rebalance()
tomek_df = data_rebalance_tomek.Rebalance()
# The SMOTE-Tomek rebalance is disabled, so `smote_tomek_df` keeps its value
# from the 'race' cell and `data_rebalance_smote_tomek` is built but not run.
# smote_tomek_df = data_rebalance_smote_tomek.Rebalance()

In [None]:
# NOTE(review): `undummify` is called with no arguments here, while every other
# call in this notebook passes a frame and a separator. These look like
# unfinished placeholders and will presumably raise TypeError -- fill in the
# intended arguments before running this section.
smote_df = undummify()
tomek_df = undummify()
# smote_tomek_df = undummify()

In [None]:
# Feature balance measures on the original (un-resampled) `df`, reusing the
# `feature_measures` object built earlier. The redundant re-import of
# FeatureBalanceMeasure is dropped; this cell never referenced it.
feat_measures = feature_measures.measures(df)

First, look at the feature balance measures for the dataset before applying SMOTE or Tomek links.

In [None]:
# Display the feature measures computed for the original `df`.
feat_measures

After applying the SMOTE method, these are the feature balance measures:

In [None]:
# Feature measures for the current `smote_df`.
feature_measures.measures(smote_df)

After applying the SMOTE-Tomek method, these are the feature balance measures:

In [None]:
# NOTE(review): the later re-assignments of `smote_tomek_df` are commented out,
# so this still measures the frame produced by the 'race' rebalance cell --
# confirm that is intended.
feature_measures.measures(smote_tomek_df)

After applying the Tomek links method, these are the feature balance measures:

In [None]:
# Feature measures for the current `tomek_df`.
feature_measures.measures(tomek_df)

In [None]:
# Distribution balance measures on the original (un-resampled) `df`.
# DistributionBalanceMeasure comes from the top import cell; the duplicate
# import that used to sit here is removed.
dist_measures = DistributionBalanceMeasure(cols_of_interest)
dist_measures.measures(df)


In [None]:
# Distribution measures for the current `smote_df`.
dist_measures.measures(smote_df)

In [None]:
# Distribution measures for `smote_tomek_df`.
# NOTE(review): this frame still comes from the 'race' rebalance cell, because
# its later re-assignments are commented out.
dist_measures.measures(smote_tomek_df)

In [None]:
# Distribution measures for the current `tomek_df`.
dist_measures.measures(tomek_df)

In [None]:
# Aggregate balance measures on the original (un-resampled) `df`.
# AggregateBalanceMeasure comes from the top import cell; the duplicate import
# that used to sit here is removed.
agg_measures = AggregateBalanceMeasure(cols_of_interest)
agg_measures.measures(df)

In [None]:
# Aggregate measures for the current `smote_df`.
agg_measures.measures(smote_df)

In [None]:
# Aggregate measures for `smote_tomek_df`.
# NOTE(review): this frame still comes from the 'race' rebalance cell, because
# its later re-assignments are commented out.
agg_measures.measures(smote_tomek_df)

In [None]:
# Aggregate measures for the current `tomek_df`.
agg_measures.measures(tomek_df)