# Analysis of Models: Home Credit - Credit Risk Model Stability

Kaggle competition: https://www.kaggle.com/competitions/home-credit-credit-risk-model-stability

Contents:
- [Importing Data and Pre-Processing](#load_data)
- [Exploratory Analysis](#exp_analysis)
- [Train-Test Split](#train_test)

In [20]:
# import packages

import os
from dotenv import load_dotenv
from pyprojroot import here

# set working directory to root of github project for consistent access to helper functions
# NOTE: this runs before load_dotenv() and before the `elements.*` imports below,
# so both are resolved with the project root as the current working directory.
os.chdir(here())
# load .env variables
# (the SNOWFLAKE_* values read via os.getenv further down are presumably defined
# in that .env file -- confirm)
load_dotenv()

import warnings

# Silences every warning category for the remainder of the kernel session.
# NOTE(review): this also hides deprecation / dtype warnings raised by pandas and
# other libraries; consider narrowing the filter to specific categories or modules.
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import scipy.stats as st
import math
from datetime import datetime

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import (
    train_test_split,
    GridSearchCV,
    KFold,
)

from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LogisticRegression

import lightgbm as lgb
import shap

import snowflake.connector as sc

from elements.utils.data_processors import snake_case
from elements.utils.model_evaluation import classification_model_report

# pandas display configuration, applied in a single call as (option, value) pairs:
#   - floats are rendered with four decimal places
#   - no cap on the number of columns or rows shown when a frame is displayed
pd.set_option(
    "display.float_format", "{:.4f}".format,
    "display.max_columns", None,
    "display.max_rows", None,
)

In [31]:
# define feature columns
#
# Column names are lower-case; the loaded frame's columns are lower-cased after
# import so they can be matched against these lists.

# Continuous (numeric) candidate features.
# NOTE(review): every entry carries the "static_" prefix except
# "avg_loan_amt_last_24_months" and "avg_days_past_due_last_9_months" --
# confirm these two names match the table's columns exactly.
cont_features = [
    "static_monthly_annuity_amount",
    "static_next_month_annuity_amount",
    "static_count_apply_same_email",
    "static_count_apply_last_30_days",
    "static_count_apply_same_phone_number",
    "static_count_apply_same_employer",
    "static_count_apply_same_employer_last_7_days",
    "static_count_apply_same_mobile_phone_number",
    "static_avg_days_past_or_before_due_of_paymt_last_24_months",
    "static_avg_days_past_or_before_due_of_paymt_last_3_months",
    "static_avg_days_past_or_before_due_of_paymt_last_24_months_with_tolerance",
    "static_avg_days_past_due_of_paymt_last_24_months_with_tolerance_from_max_close_date",
    "static_avg_instals_paid_last_24_months",
    "avg_loan_amt_last_24_months",
    "avg_days_past_due_last_9_months",
    "static_avg_outstanding_bal_last_6_months",
    "static_avg_of_pays_made_last_12_months",
    "static_count_clients_have_used_same_mobile_number_last_12_months",
    "static_count_clients_have_used_same_mobile_number_last_3_months",
    "static_count_clients_have_used_same_mobile_number_last_6_months",
    "static_count_apply_match_employers_phone_and_clients",
    "static_count_clients_share_same_mobile_phone",
    # NOTE(review): the next two names differ only in word order
    # ("match_client" vs "client_match") -- verify both columns exist and are distinct.
    "static_count_apply_match_client_alt_phone",
    "static_count_apply_client_match_alt_phone",
]

# Categorical features; the pre-processing cell iterates over this list to
# normalise labels with snake_case and replace nulls with 'missing'.
cat_features = [
    "static_type_of_bank_account",
    "static_type_of_credit_card",
]

# Importing Data and Pre-Processing
<a id='load_data'></a>

In [24]:
# initialize snowflake connector
# Each connect() keyword argument is read from the environment variable paired
# with it below; os.getenv yields None for any variable that is not set.
conn = sc.connect(
    **{
        connect_arg: os.getenv(env_name)
        for connect_arg, env_name in (
            ("user", "SNOWFLAKE_USERNAME"),
            ("password", "SNOWFLAKE_PASSWORD"),
            ("account", "SNOWFLAKE_ACCOUNT"),
            ("role", "SNOWFLAKE_ROLE"),
            ("warehouse", "SNOWFLAKE_WAREHOUSE"),
        )
    }
)

In [25]:
# importing data set
# Reads every column and row of the modeling table into one in-memory DataFrame
# through the open Snowflake connection.

modeling_case_default_prediction_df = pd.read_sql(
    sql="select * from credit_risk_dw.modeling.modeling_case_default_prediction",
    con=conn,
)

In [27]:
# lowercase columns
# Every column label is lower-cased so it can be matched against the lower-case
# names listed in cont_features / cat_features.

modeling_case_default_prediction_df.columns = list(
    map(str.lower, modeling_case_default_prediction_df.columns)
)

In [28]:
# check import: prints the index range, column names, non-null counts and dtypes
# of the loaded frame

modeling_case_default_prediction_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1526659 entries, 0 to 1526658
Data columns (total 35 columns):
 #   Column                                                                               Non-Null Count    Dtype  
---  ------                                                                               --------------    -----  
 0   case_model_group_key                                                                 1526659 non-null  object 
 1   case_id                                                                              1526659 non-null  int64  
 2   model_group                                                                          1526659 non-null  object 
 3   is_default                                                                           1526659 non-null  int64  
 4   decision_date                                                                        1526659 non-null  object 
 5   decision_month                                                        

In [29]:
# number of rows per is_default value
modeling_case_default_prediction_df["is_default"].value_counts()

0    1478665
1      47994
Name: is_default, dtype: int64

In [35]:
# pre-processing
#
# For every categorical feature: normalise its labels with snake_case, then
# replace nulls with the explicit level 'missing'.
#
# The transformation is applied column-wise with Series.map. The earlier row-wise
# DataFrame.apply(axis=1) built a full row Series for each of the ~1.5M rows just
# to read a single cell, which is needlessly slow for the same result.

for col in cat_features:

    modeling_case_default_prediction_df[col] = (
        modeling_case_default_prediction_df[col]
        # na_action="ignore" leaves nulls untouched instead of passing them to
        # snake_case. How snake_case treats None/NaN is not visible here; skipping
        # them guarantees a null can never be turned into a string label before
        # the fill below.
        .map(snake_case, na_action="ignore")
        # fill-in missing values (including any null returned by snake_case)
        .fillna("missing")
    )


# Exploratory Analysis
<a id='exp_analysis'></a>

# Train-Test Split
<a id='train_test'></a>