## Explore All Error Mitigation Offerings

In [None]:
!pip install raimitigations

In [None]:

from raimitigations.dataprocessing import Split, Transformer, Rebalance, RandomSample

%matplotlib inline
import matplotlib
from matplotlib import pyplot as plt
from matplotlib import cm

import pandas as pd
import numpy as np
from lightgbm import LGBMClassifier
import zipfile
import pathlib
import urllib

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report


from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, scale, MinMaxScaler, PowerTransformer


from imblearn.under_sampling import TomekLinks
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek

from urllib.request import urlretrieve
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"


### Load dataset

In [None]:
# Download and unpack the public sample datasets (only when they are not already
# on disk), then load the HR-promotion training CSV into `dataset`.
outdirname = "mitigations-datasets.2.22.2022"
zipfilename = outdirname + ".zip"
# The archive is downloaded to and extracted under this directory.
download_root = "../../"
# The existence check has to look where the archive is extracted; checking
# `outdirname` relative to the working directory never finds the extracted
# folder and triggers a fresh download on every run.
if not pathlib.Path(download_root + outdirname).exists():
    urlretrieve(
        "https://publictestdatasets.blob.core.windows.net/data/" + zipfilename,
        download_root + zipfilename,
    )
    with zipfile.ZipFile(download_root + zipfilename, "r") as unzip:
        unzip.extractall(download_root + ".")

data_dir = download_root + outdirname + "/hr_promotion"
dataset = pd.read_csv(data_dir + "/train.csv")

# Seed reused by the later cells wherever a random_state is passed.
seed = 42
dataset.shape
dataset.head()

### Random Sample

In [None]:
# Draw a random sample of the loaded dataset with raimitigations' RandomSample.
# The label column is identified by its zero-based position in the frame.
dataset_target = dataset.columns.get_loc("is_promoted")

# NOTE(review): the arguments are positional. 0.8 is presumably the sample
# fraction and the booleans presumably toggle preprocessing options - confirm
# against the RandomSample signature.
data_sample = RandomSample(dataset, dataset_target, 0.8, False, False, False, True)
random_sample = data_sample.random_sample()

# `random_sample` is the input of both Split calls further down.
random_sample.shape
random_sample.head()

### Split Dataset 

In [None]:
# Split parameters (presumably in this positional order - confirm against the
# Split signature):
#   dataset              - pandas DataFrame to split.
#   target               - name of the label column, or its zero-based integer index.
#   train_size           - training split size. The default is 0.9, which splits the
#                          dataset into 90% training and 10% testing; the training
#                          and test split values add up to 1.
#   random_state         - controls the randomization of the algorithm. If None, the
#                          random number generator is the RandomState instance used
#                          by np.random.
#   categorical_features - Boolean flag indicating the presence of categorical
#                          features. Default is True.
#   drop_null            - if True, records with null values are dropped; otherwise
#                          they are replaced by the mean. Default is True.
#   drop_duplicates      - if True, duplicate records are dropped. Default is False.
#   stratify             - if not None, data is split in a stratified fashion, using
#                          this as the class labels. Default is False.

# Baseline split. NOTE(review): going by the list above, the two False values
# are presumably drop_null and drop_duplicates - confirm.
random_sample_target = random_sample.columns.get_loc("is_promoted")
data_split = Split(
    random_sample, random_sample_target, 0.9, seed, True, False, False, True
)

train_data, test_data = data_split.split()


print(train_data.shape)
print(test_data.shape)

train_data.head()

### Transform Dataset

In [None]:
# Data Transformation

# Transformer parameters (presumably in this positional order - confirm against
# the Transformer signature):
#   dataset,
#   target,
#   transformer_type,
#   transform_features = None,
#   random_state = None,
#   method = 'yeo-johnson',
#   output_distribution = 'uniform'

# Transformer types:
#   StandardScaler = 1
#   MinMaxScaler = 2
#   RobustScaler = 3
#   PowerTransformer = 4
#   QuantileTransformer = 5
#   Normalizer = 6

train_data_label = train_data.columns.get_loc("is_promoted")
# train_data.iloc[:,target_index]

# Transform the training split with the StandardScaler transformer type.
dt_train = Transformer(
    train_data, train_data_label, Transformer.TransformerType.StandardScaler, None, seed
)
train_data_t = dt_train.transform()


# The label index computed from train_data is reused for test_data; this
# assumes both splits share the same column order.
# NOTE(review): the test split goes through its own Transformer built from
# test_data rather than the instance built from train_data - confirm this is
# intended, since the two splits may then be scaled with different statistics.
dt_test = Transformer(
    test_data, train_data_label, Transformer.TransformerType.StandardScaler, None, seed
)
test_data_t = dt_test.transform()

# Show the training data before and after the transformation.
train_data.head()
train_data_t.head()

### Accuracy Results

In [None]:
def split_label_index(dataset):
    """Separate the label stored under column label ``0`` from the features.

    Args:
        dataset: pandas DataFrame that has a column labelled ``0``.

    Returns:
        Tuple ``(x, y)``: ``x`` is ``dataset`` without column ``0`` (a new
        frame; the input is not modified) and ``y`` is ``dataset[0]``.
    """
    label_column = 0
    features = dataset.drop(columns=[label_column])
    labels = dataset[label_column]
    return features, labels


def split_label(dataset):
    """Separate the ``is_promoted`` label from the feature columns.

    Args:
        dataset: pandas DataFrame that has an ``is_promoted`` column.

    Returns:
        Tuple ``(x, y)``: ``x`` is ``dataset`` without the ``is_promoted``
        column (a new frame; the input is not modified) and ``y`` is
        ``dataset["is_promoted"]``.
    """
    label_name = "is_promoted"
    features = dataset.drop(columns=[label_name])
    labels = dataset[label_name]
    return features, labels


# def split_label(dataset):
#     x = dataset.drop(['is_promoted'], axis=1)
#     y = dataset['is_promoted']
#     return x, y

# Baseline model: trained on the transformed splits without any mitigation.
# The transformed frames are split with split_label_index, i.e. the label is
# read from the column labelled 0.

# splitting the training data
x_train, y_train = split_label_index(train_data_t)

# splitting the test data
x_test, y_test = split_label_index(test_data_t)

# LGBMClassifier Model
clf = LGBMClassifier(n_estimators=50)
model = clf.fit(x_train, y_train)

# Predictions on the held-out split; reused by the reporting and comparison cells.
pred = model.predict(x_test)


def conf_matrix(y, pred):
    """Build a labelled 2x2 confusion-matrix table for a binary classifier.

    Args:
        y: true labels.
        pred: predicted labels.

    Returns:
        pandas DataFrame of strings. The first row holds TP and FN, the
        second row FP and TN; each cell shows the raw count followed by the
        count normalized over the true class (``normalize="true"``).

    Note:
        The unpacking expects ``metrics.confusion_matrix`` to return a 2x2
        array.
    """
    counts = metrics.confusion_matrix(y, pred)
    rates = metrics.confusion_matrix(y, pred, normalize="true")
    (tn, fp), (fn, tp) = counts
    (tnr, fpr), (fnr, tpr) = rates

    positive_row = [f"TP = {tp} ({tpr:1.2%})", f"FN = {fn} ({fnr:1.2%})"]
    negative_row = [f"FP = {fp} ({fpr:1.2%})", f"TN = {tn} ({tnr:1.2%})"]
    return pd.DataFrame(
        [positive_row, negative_row],
        index=["True", "False"],
        columns=["Pred 1", "Pred 0"],
    )


# Report the baseline model's quality on the test split: number of
# misclassified rows, the confusion-matrix table and the per-class report.
print("number of errors on test dataset: " + str(sum(pred != y_test)))

conf_matrix(y_test, pred)

print(classification_report(y_test, pred))

### Mitigations (Nulls, Duplicates)

In [None]:
# Mitigation
# random_sample.iloc[:,random_sample_target]
def split_label(dataset):
    """Split a frame into features and the ``is_promoted`` label.

    Args:
        dataset: pandas DataFrame containing an ``is_promoted`` column.

    Returns:
        Tuple ``(x, y)`` where ``x`` is a new frame without ``is_promoted``
        and ``y`` is ``dataset["is_promoted"]``.
    """
    target = "is_promoted"
    return dataset.drop(columns=[target]), dataset[target]


def split_label_index(dataset):
    """Split a frame into features and the label held in column ``0``.

    Args:
        dataset: pandas DataFrame containing a column labelled ``0``.

    Returns:
        Tuple ``(x, y)`` where ``x`` is a new frame without column ``0`` and
        ``y`` is ``dataset[0]``.
    """
    target = 0
    return dataset.drop(columns=[target]), dataset[target]


# Second split of the same random sample. Compared with the first Split call,
# the sixth and seventh positional arguments change from False to True.
# NOTE(review): presumably these are drop_null and drop_duplicates (matching
# this section's "Nulls, Duplicates" title) - confirm against the Split signature.
data_split2 = Split(
    random_sample,
    random_sample.columns.get_loc("is_promoted"),
    0.9,
    seed,
    True,
    True,
    True,
    True,
)
train_data2, test_data2 = data_split2.split()


# Transform the mitigated training split with the StandardScaler transformer type.
dt_train2 = Transformer(
    train_data2,
    train_data2.columns.get_loc("is_promoted"),
    Transformer.TransformerType.StandardScaler,
    None,
    seed,
)
train_data_t2 = dt_train2.transform()


# NOTE(review): as in the baseline cell, the test split gets its own
# Transformer instance built from the test data - confirm this is intended.
dt_test2 = Transformer(
    test_data2,
    test_data2.columns.get_loc("is_promoted"),
    Transformer.TransformerType.StandardScaler,
    None,
    seed,
)
test_data_t2 = dt_test2.transform()


# The transformed frames are split by column label 0 (split_label_index).
x_train2, y_train2 = split_label_index(train_data_t2)
x_test2, y_test2 = split_label_index(test_data_t2)


# Same classifier configuration as the baseline model, so the two runs differ
# only in the data they were given.
clf2 = LGBMClassifier(n_estimators=50)
model2 = clf2.fit(x_train2, y_train2)
pred2 = model2.predict(x_test2)


def conf_matrix(y, pred):
    """Render a binary confusion matrix as a table of formatted strings.

    Args:
        y: true labels.
        pred: predicted labels.

    Returns:
        pandas DataFrame whose first row shows TP and FN and whose second row
        shows FP and TN. Each cell contains the raw count and, in
        parentheses, the count normalized over the true class.

    Note:
        The unpacking expects ``metrics.confusion_matrix`` to return a 2x2
        array.
    """
    raw = metrics.confusion_matrix(y, pred)
    normalized = metrics.confusion_matrix(y, pred, normalize="true")
    (tn, fp), (fn, tp) = raw
    (tnr, fpr), (fnr, tpr) = normalized

    cells = [
        [f"TP = {tp} ({tpr:1.2%})", f"FN = {fn} ({fnr:1.2%})"],
        [f"FP = {fp} ({fpr:1.2%})", f"TN = {tn} ({tnr:1.2%})"],
    ]
    return pd.DataFrame(cells, index=["True", "False"], columns=["Pred 1", "Pred 0"])


# Report model2's quality on its test split: number of misclassified rows,
# the confusion-matrix table and the per-class report.
print("number of errors on test dataset: " + str(sum(pred2 != y_test2)))
conf_matrix(y_test2, pred2)
print(classification_report(y_test2, pred2))

### Compare Results

In [None]:
# Compare results of the baseline model (`model`) and the model trained after
# the null/duplicate mitigation (`model2`): precision, recall and ROC AUC,
# each rounded to 5 decimals.

from sklearn.metrics import roc_auc_score

# Mitigated model: precision = TP / (TP + FP), recall = TP / (TP + FN).
((tn2, fp2), (fn2, tp2)) = metrics.confusion_matrix(y_test2, pred2)
precision2 = round(tp2 / (tp2 + fp2), 5)
recall2 = round(tp2 / (tp2 + fn2), 5)

# Baseline model.
((tn, fp), (fn, tp)) = metrics.confusion_matrix(y_test, pred)
precision = round(tp / (tp + fp), 5)
recall = round(tp / (tp + fn), 5)

# ROC AUC is computed from column 1 of predict_proba rather than from the
# hard predictions.
preda = model.predict_proba(x_test)[:, 1]
roc_auc = round(roc_auc_score(y_test, preda), 5)

preda2 = model2.predict_proba(x_test2)[:, 1]
roc_auc2 = round(roc_auc_score(y_test2, preda2), 5)


def compare_results():
    """Tabulate precision, recall and ROC AUC for the two models.

    Reads the notebook-level variables ``precision``, ``recall``, ``roc_auc``
    (row "No Mitigation") and ``precision2``, ``recall2``, ``roc_auc2`` (row
    "With Mitigation").

    Returns:
        pandas DataFrame of strings with one row per model and the columns
        "Precision", "Recall" and "roc_auc".
    """
    scores_by_run = {
        "No Mitigation": (precision, recall, roc_auc),
        "With Mitigation": (precision2, recall2, roc_auc2),
    }
    return pd.DataFrame(
        [[f"{score}" for score in scores] for scores in scores_by_run.values()],
        columns=["Precision", "Recall", "roc_auc"],
        index=list(scores_by_run),
    )


compare_results()

### Mitigation (Rebalance Dataset)

In [None]:
# Rebalance data

# imblearn resamplers. Only `smote` is passed to Rebalance below; `tomek` and
# `smote_tomek` are created but not used in this cell.
tomek = TomekLinks(sampling_strategy="auto")
smote = SMOTE(sampling_strategy="auto", random_state=seed)
smote_tomek = SMOTETomek(sampling_strategy="auto", random_state=seed)

# Rebalance parameters (presumably in this positional order - confirm against
# the Rebalance signature):
#
# dataset - a pandas DataFrame representing the data to rebalance.
#
# target - a string with the name, or the zero-based integer index, of the
# label column to use as the classes for rebalancing the data.
#
# sampling_strategy
#   'minority': resample only the minority class.
#   'not minority': resample all classes but the minority class.
#   'not majority': resample all classes but the majority class.
#   'all': resample all classes.
#   'auto': equivalent to 'not majority'.
#
# random_state - controls the randomization of the algorithm.
#   None: the random number generator is the RandomState instance used by np.random.
#   int: random_state is the seed used by the random number generator.
#
# smote_tomek - the SMOTETomek object (imblearn.combine.SMOTETomek) to use. If not
# given by the caller, a SMOTE object with default parameters will be given.
# NOTE(review): this default possibly means a SMOTETomek object - confirm.
#
# smote - the SMOTE object (imblearn.over_sampling.SMOTE) to use. If not given by
# the caller, a SMOTE object with default parameters will be given.
#
# tomek - the TomekLinks object (imblearn.under_sampling.TomekLinks) to use. If not
# given by the caller, a TomekLinks object with sampling_strategy='all' will be given.


# Rebalance the mitigated training split (train_data2, not the transformed
# train_data_t2) using the SMOTE resampler defined above.
train_data_rebalance3 = Rebalance(
    train_data2, train_data2.columns.get_loc("is_promoted"), "auto", seed, None, smote
)
train_data_r = train_data_rebalance3.rebalance()

# test_data_rebalance3 =  Rebalance(test_data2, test_data2.columns.get_loc('is_promoted'), 'auto', seed, None, smote)
# test_data_r = test_data_rebalance3.Rebalance()

# Only the training data is rebalanced; the test features and labels come
# straight from test_data2. Both frames are split by column name (split_label).
x_train3, y_train3 = split_label(train_data_r)
x_test3, y_test3 = split_label(test_data2)

# Training-set shape before and after rebalancing.
train_data2.shape
train_data_r.shape

### Accuracy Results

In [None]:
# Train on the rebalanced training data and predict on the test split; the
# classifier configuration matches the earlier models.
clf3 = LGBMClassifier(n_estimators=50)
model3 = clf3.fit(x_train3, y_train3)
pred3 = model3.predict(x_test3)


def conf_matrix(y, pred):
    """Format a binary confusion matrix as a 2x2 table of strings.

    Args:
        y: true labels.
        pred: predicted labels.

    Returns:
        pandas DataFrame with the rows "True" (TP, FN) and "False" (FP, TN)
        and the columns "Pred 1" and "Pred 0". Every cell shows the raw count
        and the count normalized over the true class.

    Note:
        The unpacking expects ``metrics.confusion_matrix`` to return a 2x2
        array.
    """
    count_matrix = metrics.confusion_matrix(y, pred)
    rate_matrix = metrics.confusion_matrix(y, pred, normalize="true")
    (tn, fp), (fn, tp) = count_matrix
    (tnr, fpr), (fnr, tpr) = rate_matrix

    table = pd.DataFrame(
        [
            [f"TP = {tp} ({tpr:1.2%})", f"FN = {fn} ({fnr:1.2%})"],
            [f"FP = {fp} ({fpr:1.2%})", f"TN = {tn} ({tnr:1.2%})"],
        ],
        index=["True", "False"],
        columns=["Pred 1", "Pred 0"],
    )
    return table


# Report model3's quality on the test split: number of misclassified rows,
# the confusion-matrix table and the per-class report.
print("number of errors on test dataset: " + str(sum(pred3 != y_test3)))
conf_matrix(y_test3, pred3)
print(classification_report(y_test3, pred3))

### Compare Results

In [None]:
# Compare the model trained after the null/duplicate mitigation (`model2`)
# with the model trained on the rebalanced training data (`model3`):
# precision, recall and ROC AUC, each rounded to 5 decimals.

from sklearn.metrics import roc_auc_score

# Unpack into the *2 names. Reusing tn/fp/fn/tp would overwrite the baseline
# model's counts, and precision2 and recall2 must both come from this matrix
# rather than from values left over from an earlier cell.
((tn2, fp2), (fn2, tp2)) = metrics.confusion_matrix(y_test2, pred2)
precision2 = round(tp2 / (tp2 + fp2), 5)
recall2 = round(tp2 / (tp2 + fn2), 5)


# Rebalanced model: precision = TP / (TP + FP), recall = TP / (TP + FN).
((tn3, fp3), (fn3, tp3)) = metrics.confusion_matrix(y_test3, pred3)
precision3 = round(tp3 / (tp3 + fp3), 5)
recall3 = round(tp3 / (tp3 + fn3), 5)


# ROC AUC is computed from column 1 of predict_proba rather than from the
# hard predictions.
preda3 = model3.predict_proba(x_test3)[:, 1]
roc_auc3 = round(roc_auc_score(y_test3, preda3), 5)

preda2 = model2.predict_proba(x_test2)[:, 1]
roc_auc2 = round(roc_auc_score(y_test2, preda2), 5)


def compare_results():
    """Tabulate precision, recall and ROC AUC for the three models.

    Reads the notebook-level variables ``precision``/``recall``/``roc_auc``
    (row "No Mitigation"), ``precision2``/``recall2``/``roc_auc2`` (row
    "With Mitigation") and ``precision3``/``recall3``/``roc_auc3`` (row
    "With Mitigation & Rebalance").

    Returns:
        pandas DataFrame of strings with one row per model and the columns
        "Precision", "Recall" and "roc_auc".
    """
    scores_by_run = {
        "No Mitigation": (precision, recall, roc_auc),
        "With Mitigation": (precision2, recall2, roc_auc2),
        "With Mitigation & Rebalance": (precision3, recall3, roc_auc3),
    }
    return pd.DataFrame(
        [[f"{score}" for score in scores] for scores in scores_by_run.values()],
        columns=["Precision", "Recall", "roc_auc"],
        index=list(scores_by_run),
    )


compare_results()