In [27]:
import pandas as pd
import os
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import (
    confusion_matrix,
    roc_curve,
    auc,
    precision_recall_curve,
    roc_auc_score,
)
from sklearn.metrics import (
    precision_recall_fscore_support,
    precision_score,
    recall_score,
    f1_score,
    accuracy_score,
    average_precision_score,
)
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
from collections import Counter
from plotly import graph_objects as go


import numpy as np

import warnings

warnings.simplefilter("ignore")


TARGET = "target"

CONTINUOUS_FEATURES = [
    "CNT_CHILDREN",
    "AMT_INCOME_TOTAL",
    "DAYS_BIRTH",
    "DAYS_EMPLOYED",
    "CNT_FAM_MEMBERS",
]

TOP_FEATURES = [
    "DAYS_BIRTH",
    "DAYS_EMPLOYED",
    "AMT_INCOME_TOTAL",
    "CNT_FAM_MEMBERS",
    "CNT_CHILDREN",
    "FLAG_OWN_CAR",
    "FLAG_OWN_REALTY",
    "FLAG_WORK_PHONE",
    "FLAG_PHONE",
    "FLAG_EMAIL",
]

BASE_INPUT = "./data/"


train_df = pd.read_csv(os.path.join(BASE_INPUT, "train.csv"))
val_df = pd.read_csv(os.path.join(BASE_INPUT, "val.csv"))
test_df = pd.read_csv(os.path.join(BASE_INPUT, "test.csv"))


train_labels = np.array(train_df.pop(TARGET))
val_labels = np.array(val_df.pop(TARGET))
test_labels = np.array(test_df.pop(TARGET))


_ = train_df.pop("ID"), val_df.pop("ID"), test_df.pop("ID")

In [3]:
for col in CONTINUOUS_FEATURES:
    scaler = StandardScaler()
    train_df[col] = scaler.fit_transform(train_df)
    val_df[col] = scaler.transform(val_df)
    test_df[col] = scaler.transform(test_df)

In [4]:
lr_model = LogisticRegression(
    max_iter=10000, penalty="l1", solver="liblinear", class_weight="balanced"
)

lr_model.fit(train_df, train_labels)

In [5]:
def evaluate(model, testing_set_x, testing_set_y, phase):

    predictions = model.predict_proba(testing_set_x)

    accuracy = accuracy_score(testing_set_y, predictions[:, 1] >= 0.5)
    roc_auc = roc_auc_score(testing_set_y, predictions[:, 1])
    precision = precision_score(testing_set_y, predictions[:, 1] >= 0.5)
    recall = recall_score(testing_set_y, predictions[:, 1] >= 0.5)
    pr_auc = average_precision_score(testing_set_y, predictions[:, 1])
    print(f"---  {phase} ----")
    result = pd.DataFrame(
        [[accuracy, precision, recall, roc_auc, pr_auc]],
        columns=["Accuracy", "Precision", "Recall", "ROC_auc", "PR_auc"],
    )
    return result


evaluate(lr_model, val_df, val_labels, "VAL STATS")

---  VAL STATS ----


Unnamed: 0,Accuracy,Precision,Recall,ROC_auc,PR_auc
0,0.601798,0.039182,0.291139,0.430705,0.043342


In [6]:
evaluate(lr_model,train_df,train_labels,'TRAIN STATS')

---  TRAIN STATS ----


Unnamed: 0,Accuracy,Precision,Recall,ROC_auc,PR_auc
0,0.626506,0.065678,0.563636,0.639822,0.077685


In [7]:
evaluate(lr_model,test_df,test_labels, 'TEST STATS')

---  TEST STATS ----


Unnamed: 0,Accuracy,Precision,Recall,ROC_auc,PR_auc
0,0.64851,0.065598,0.511364,0.604435,0.078663


In [8]:
evaluate(lr_model,val_df,val_labels,'VAL STATS')

---  VAL STATS ----


Unnamed: 0,Accuracy,Precision,Recall,ROC_auc,PR_auc
0,0.601798,0.039182,0.291139,0.430705,0.043342


In [9]:
coef = lr_model.coef_[0]
imp_features = pd.Series(train_df.columns)[list(coef != 0)]
X_train = train_df[imp_features]
X_val = val_df[imp_features]
X_test = test_df[imp_features]

In [10]:
lr_model_updated = LogisticRegression(
    max_iter=10000, penalty="l1", solver="liblinear", class_weight="balanced"
)

lr_model_updated.fit(X_train, train_labels)

In [12]:
evaluate(lr_model_updated,X_train,train_labels,'TRAIN STATS')

---  TRAIN STATS ----


Unnamed: 0,Accuracy,Precision,Recall,ROC_auc,PR_auc
0,0.626506,0.065678,0.563636,0.639822,0.077685


In [13]:
evaluate(lr_model_updated,X_val,val_labels,'VAL STATS')

---  VAL STATS ----


Unnamed: 0,Accuracy,Precision,Recall,ROC_auc,PR_auc
0,0.601798,0.039182,0.291139,0.430705,0.043342


In [28]:
evaluate(lr_model_updated,X_test,test_labels,'TEST STATS')

---  TEST STATS ----


Unnamed: 0,Accuracy,Precision,Recall,ROC_auc,PR_auc
0,0.64851,0.065598,0.511364,0.604435,0.078663


In [16]:
cor_matrix = X_train.corr().abs()
upper_tri = cor_matrix.where(np.triu(np.ones(cor_matrix.shape),k=1).astype(np.bool))
to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > 0.95)]

In [22]:
X_train_updated = X_train.drop(to_drop, axis=1)
X_val_updated = X_val.drop(to_drop, axis=1)
X_test_updated = X_test.drop(to_drop, axis=1)

In [23]:
lr_model_updated_2 = LogisticRegression(
    max_iter=10000, penalty="l1", solver="liblinear", class_weight="balanced"
)

lr_model_updated_2.fit(X_train_updated, train_labels)

In [25]:
evaluate(lr_model_updated_2, X_train_updated, train_labels, "TRAIN STATS")

---  TRAIN STATS ----


Unnamed: 0,Accuracy,Precision,Recall,ROC_auc,PR_auc
0,0.627309,0.065817,0.563636,0.640129,0.07765


In [26]:
evaluate(lr_model_updated_2, X_test_updated, test_labels, "TEST STATS")

---  TEST STATS ----


Unnamed: 0,Accuracy,Precision,Recall,ROC_auc,PR_auc
0,0.651593,0.066176,0.511364,0.604007,0.078651
