In [9]:
import pandas as pd
import numpy as np
from imblearn.pipeline import Pipeline
from imblearn.combine import SMOTETomek
from sklearn.preprocessing import StandardScaler
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import recall_score, precision_score, \
    confusion_matrix, accuracy_score, roc_auc_score

In [10]:
# imblearn pipeline steps: remove outliers,
# splitting dat into X, y,
# scaler,
# smote,
# QDA

# Validate


In [11]:
def iqr_removal(df, threshold):
    col_list = ['V4', 'V9', 'V10', 'V11', 'V12', 'V14', 'V16']

    print(f'Before removal: \n {df["Class"].value_counts()}')
    def outlier_treatment(datacolumn):
        sorted(datacolumn)
        q1, q3 = np.percentile(datacolumn , [25,75])
        iqr = q3 - q1
        lower_range = q1 - (threshold * iqr)
        upper_range = q3 + (threshold * iqr)

        return lower_range,upper_range

    for col in col_list:
        lower_range, upper_range = outlier_treatment(df[col])
        outliers = df.loc[(df[col] > upper_range) | (df[col] < lower_range)]
        outliers_indexes = outliers.index
        df = df.drop(outliers_indexes)

    print(f'After removal: \n {df["Class"].value_counts()}')

    return df


def print_metrcis(y_test, y_pred):
    print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
    print(f'ROC AUC: {roc_auc_score(y_test, y_pred)}')
    print(f'Precision: {precision_score(y_test, y_pred)}')
    print(f'Recall: {recall_score(y_test, y_pred)}')
    print(f'Confusion Matrix: \n {confusion_matrix(y_test, y_pred)}')

In [12]:
df = pd.read_csv('data/train_df')


missing_data = df.isnull().sum()
print('Amount of missing data: ')
print(missing_data.sum())

Amount of missing data: 
0


In [13]:
# couldnt be done in pipeline bcs pipeline modifies only
# X_train and not y_train during fit
df = iqr_removal(df, 5)

Before removal: 
 0    227451
1       394
Name: Class, dtype: int64
After removal: 
 0    226274
1       114
Name: Class, dtype: int64


In [14]:
label = 'Class'
X_train = df.drop(label, 1)
y_train = df[label]

X_test = pd.read_csv('data/test_df')
y_test = pd.read_csv('data/test_df_y_true')


In [15]:
steps = [('scaler', StandardScaler()),
         ('SMOTETomek', SMOTETomek()),
         ('QDA', QuadraticDiscriminantAnalysis())]

pipeline = Pipeline(steps=steps)


In [16]:
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

print_metrcis(y_test, y_pred)

Accuracy: 0.9541097573821143
ROC AUC: 0.9311761036716318
Precision: 0.03303637713437268
Recall: 0.9081632653061225
Confusion Matrix: 
 [[54259  2605]
 [    9    89]]


In [None]:
# Final validation results with iqr removal threshold 5:
#
# Accuracy: 0.9541097573821143
# ROC AUC: 0.9311761036716318
# Precision: 0.03303637713437268
# Recall: 0.9081632653061225
# Confusion Matrix:
#  [[54259  2605]
#  [    9    89]]