In [1]:
import hc_transformations
import time
import joblib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV

from imblearn.pipeline import Pipeline, make_pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

from sklearn.dummy import DummyClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
#from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from lightgbm import LGBMClassifier

from sklearn.metrics import roc_auc_score, roc_curve, classification_report, confusion_matrix, f1_score, precision_score, recall_score

import shap

In [None]:
# Upper bound on the number of preprocessed rows kept for modelling
# (the preprocessing cells take the first SAMPLE_SIZE rows of the table).
SAMPLE_SIZE = 500_000

## Preprocessing and split

In [None]:
# Training data: load the source tables, build the modelling table, then keep
# the first SAMPLE_SIZE rows and separate the features (X) from the TARGET label (y).
with hc_transformations.timer("Files loading"):
    tab_csv = hc_transformations.load_csv()

with hc_transformations.timer("Process applications"):
    data = hc_transformations.preprocess_tables(tab_csv)
    y = data[:SAMPLE_SIZE]['TARGET']
    X = data[:SAMPLE_SIZE].drop(columns='TARGET')

In [None]:
# Save preprocessed data: rebuild the modelling table with drop_id=False and
# export the selected columns plus 'SK_ID_CURR' through save_columns_to_csv.
# FIXME(review): `columns` is only assigned by later cells (LASSO reduction /
# hard-coded list), so this cell raises NameError on Restart & Run All.
# NOTE(review): this cell also replaces X, y and data from the previous cell;
# presumably X now carries the SK_ID_CURR column — confirm before fitting on it.
with hc_transformations.timer("Files loading"):
    tab_csv = hc_transformations.load_csv()
with hc_transformations.timer("Process applications"):
    data = hc_transformations.preprocess_tables(tab_csv, drop_id=False)
    X = data[:SAMPLE_SIZE].drop('TARGET', axis=1)
    y = data[:SAMPLE_SIZE]['TARGET']
    hc_transformations.save_columns_to_csv(X, y, ['SK_ID_CURR'] + columns, ".", file_name='data')

In [None]:
# Test data: run the same preprocessing with train=False and drop_id=False.
# NOTE(review): this overwrites `data` (and `tab_csv`) from the cells above,
# while X and y keep the values computed earlier; no later cell in this
# notebook reads this `data` — confirm whether the cell is still needed.
with hc_transformations.timer("Files loading"):
    tab_csv = hc_transformations.load_csv()
with hc_transformations.timer("Process applications"):
    data = hc_transformations.preprocess_tables(tab_csv, train=False, drop_id=False)

## LASSO reduction

In [None]:
# Feature selection with an L1-penalised logistic regression
# (penalty='elasticnet' with l1_ratio=1 is a pure L1 / LASSO penalty):
# features whose coefficient is exactly zero after fitting are discarded.
#
# random_state is fixed because the 'saga' solver shuffles the data, so without
# a seed the set of surviving features can change from one run to the next.
#
# NOTE(review): the selector is fitted on all of X, before the train/test split
# performed further down, so the held-out rows influence which features are
# kept. Consider fitting it on the training split only.
with hc_transformations.timer("LASSO reduction"):
    steps = [('imputer', SimpleImputer(strategy='median')),
             ('scaler', StandardScaler()),
             ('model', LogisticRegression(solver='saga', penalty='elasticnet', l1_ratio=1, C=0.01,
                                          max_iter=300, class_weight='balanced', random_state=42))]
    pipeline = Pipeline(steps=steps)
    pipeline.fit(X, y)
    # coef_ has one row for a binary problem; keep the columns with a non-zero weight
    columns = list(X.columns[pipeline['model'].coef_[0] != 0])

In [None]:
# Persist the current list of selected feature names to columns.joblib
# (written to the working directory).
joblib.dump(columns, "columns.joblib")

In [None]:
# Hard-coded list of selected feature names. Running this cell replaces whatever
# `columns` held before (e.g. the list computed by the LASSO reduction cell).
# NOTE(review): presumably a snapshot of a previous LASSO run kept so the slow
# selection step can be skipped — confirm it is still in sync with the
# preprocessing output and with columns.joblib.
columns = [
 'FLAG_OWN_CAR',
 'FLAG_OWN_REALTY',
 'CNT_CHILDREN',
 'AMT_INCOME_TOTAL',
 'AMT_CREDIT',
 'AMT_ANNUITY',
 'AMT_GOODS_PRICE',
 'REGION_POPULATION_RELATIVE',
 'DAYS_BIRTH',
 'DAYS_EMPLOYED',
 'DAYS_REGISTRATION',
 'DAYS_ID_PUBLISH',
 'FLAG_EMP_PHONE',
 'FLAG_WORK_PHONE',
 'FLAG_CONT_MOBILE',
 'FLAG_PHONE',
 'FLAG_EMAIL',
 'REGION_RATING_CLIENT_W_CITY',
 'HOUR_APPR_PROCESS_START',
 'REG_REGION_NOT_LIVE_REGION',
 'LIVE_REGION_NOT_WORK_REGION',
 'REG_CITY_NOT_LIVE_CITY',
 'LIVE_CITY_NOT_WORK_CITY',
 'EXT_SOURCE_2',
 'EXT_SOURCE_3',
 'YEARS_BEGINEXPLUATATION_MODE',
 'FLOORSMAX_MODE',
 'YEARS_BEGINEXPLUATATION_MEDI',
 'FLOORSMAX_MEDI',
 'TOTALAREA_MODE',
 'EMERGENCYSTATE_MODE',
 'DEF_30_CNT_SOCIAL_CIRCLE',
 'OBS_60_CNT_SOCIAL_CIRCLE',
 'DEF_60_CNT_SOCIAL_CIRCLE',
 'DAYS_LAST_PHONE_CHANGE',
 'FLAG_DOCUMENT_2',
 'FLAG_DOCUMENT_3',
 'FLAG_DOCUMENT_4',
 'FLAG_DOCUMENT_5',
 'FLAG_DOCUMENT_6',
 'FLAG_DOCUMENT_8',
 'FLAG_DOCUMENT_9',
 'FLAG_DOCUMENT_10',
 'FLAG_DOCUMENT_11',
 'FLAG_DOCUMENT_13',
 'FLAG_DOCUMENT_14',
 'FLAG_DOCUMENT_15',
 'FLAG_DOCUMENT_16',
 'FLAG_DOCUMENT_17',
 'FLAG_DOCUMENT_18',
 'FLAG_DOCUMENT_19',
 'FLAG_DOCUMENT_20',
 'FLAG_DOCUMENT_21',
 'AMT_REQ_CREDIT_BUREAU_HOUR',
 'AMT_REQ_CREDIT_BUREAU_DAY',
 'AMT_REQ_CREDIT_BUREAU_WEEK',
 'AMT_REQ_CREDIT_BUREAU_MON',
 'AMT_REQ_CREDIT_BUREAU_QRT',
 'AMT_REQ_CREDIT_BUREAU_YEAR',
 'IS_FEMALE',
 'NAME_CONTRACT_TYPE_Cash loans',
 'NAME_CONTRACT_TYPE_Revolving loans',
 'NAME_TYPE_SUITE_Children',
 'NAME_TYPE_SUITE_Other_A',
 'NAME_TYPE_SUITE_Other_B',
 'NAME_TYPE_SUITE_Spouse, partner',
 'NAME_INCOME_TYPE_Businessman',
 'NAME_INCOME_TYPE_Maternity leave',
 'NAME_INCOME_TYPE_Pensioner',
 'NAME_INCOME_TYPE_State servant',
 'NAME_INCOME_TYPE_Student',
 'NAME_INCOME_TYPE_Unemployed',
 'NAME_INCOME_TYPE_Working',
 'NAME_EDUCATION_TYPE_Academic degree',
 'NAME_EDUCATION_TYPE_Higher education',
 'NAME_EDUCATION_TYPE_Incomplete higher',
 'NAME_EDUCATION_TYPE_Lower secondary',
 'NAME_EDUCATION_TYPE_Secondary / secondary special',
 'NAME_FAMILY_STATUS_Civil marriage',
 'NAME_FAMILY_STATUS_Married',
 'NAME_FAMILY_STATUS_Separated',
 'NAME_FAMILY_STATUS_Single / not married',
 'NAME_FAMILY_STATUS_Unknown',
 'NAME_FAMILY_STATUS_Widow',
 'NAME_HOUSING_TYPE_House / apartment',
 'NAME_HOUSING_TYPE_Municipal apartment',
 'NAME_HOUSING_TYPE_Office apartment',
 'NAME_HOUSING_TYPE_Rented apartment',
 'WEEKDAY_APPR_PROCESS_START_MONDAY',
 'WEEKDAY_APPR_PROCESS_START_SATURDAY',
 'WEEKDAY_APPR_PROCESS_START_SUNDAY',
 'WEEKDAY_APPR_PROCESS_START_TUESDAY',
 'WEEKDAY_APPR_PROCESS_START_WEDNESDAY',
 'DAYS_EMPLOYED_PERC',
 'INCOME_CREDIT_PERC',
 'INCOME_PER_PERSON',
 'ANNUITY_INCOME_PERC',
 'PAYMENT_RATE',
 'BURO_DAYS_CREDIT_MIN',
 'BURO_DAYS_CREDIT_MAX',
 'BURO_DAYS_CREDIT_VAR',
 'BURO_DAYS_CREDIT_ENDDATE_MIN',
 'BURO_DAYS_CREDIT_ENDDATE_MAX',
 'BURO_DAYS_CREDIT_ENDDATE_MEAN',
 'BURO_DAYS_CREDIT_UPDATE_MEAN',
 'BURO_CREDIT_DAY_OVERDUE_MAX',
 'BURO_CREDIT_DAY_OVERDUE_MEAN',
 'BURO_AMT_CREDIT_MAX_OVERDUE_MEAN',
 'BURO_AMT_CREDIT_SUM_MAX',
 'BURO_AMT_CREDIT_SUM_DEBT_MAX',
 'BURO_AMT_CREDIT_SUM_DEBT_SUM',
 'BURO_AMT_CREDIT_SUM_OVERDUE_MEAN',
 'BURO_AMT_CREDIT_SUM_LIMIT_SUM',
 'BURO_AMT_ANNUITY_MAX',
 'BURO_MONTHS_BALANCE_MIN_MIN',
 'BURO_MONTHS_BALANCE_MAX_MAX',
 'BURO_MONTHS_BALANCE_SIZE_MEAN',
 'BURO_MONTHS_BALANCE_SIZE_SUM',
 'BURO_CREDIT_ACTIVE_Active_MEAN',
 'BURO_CREDIT_ACTIVE_Bad debt_MEAN',
 'BURO_CREDIT_ACTIVE_Closed_MEAN',
 'BURO_CREDIT_ACTIVE_Sold_MEAN',
 'BURO_CREDIT_TYPE_Another type of loan_MEAN',
 'BURO_CREDIT_TYPE_Car loan_MEAN',
 'BURO_CREDIT_TYPE_Consumer credit_MEAN',
 'BURO_CREDIT_TYPE_Credit card_MEAN',
 'BURO_CREDIT_TYPE_Interbank credit_MEAN',
 'BURO_CREDIT_TYPE_Loan for business development_MEAN',
 'BURO_CREDIT_TYPE_Loan for the purchase of equipment_MEAN',
 'BURO_CREDIT_TYPE_Loan for working capital replenishment_MEAN',
 'BURO_CREDIT_TYPE_Microloan_MEAN',
 'BURO_CREDIT_TYPE_Mobile operator loan_MEAN',
 'BURO_CREDIT_TYPE_Mortgage_MEAN',
 'BURO_CREDIT_TYPE_Real estate loan_MEAN',
 'BURO_CREDIT_TYPE_Unknown type of loan_MEAN',
 'ACTIVE_DAYS_CREDIT_MIN',
 'ACTIVE_DAYS_CREDIT_MAX',
 'ACTIVE_DAYS_CREDIT_VAR',
 'ACTIVE_DAYS_CREDIT_ENDDATE_MIN',
 'ACTIVE_DAYS_CREDIT_ENDDATE_MAX',
 'ACTIVE_DAYS_CREDIT_ENDDATE_MEAN',
 'ACTIVE_DAYS_CREDIT_UPDATE_MEAN',
 'ACTIVE_AMT_CREDIT_MAX_OVERDUE_MEAN',
 'ACTIVE_AMT_CREDIT_SUM_MAX',
 'ACTIVE_AMT_CREDIT_SUM_MEAN',
 'ACTIVE_AMT_CREDIT_SUM_SUM',
 'ACTIVE_AMT_CREDIT_SUM_DEBT_MEAN',
 'ACTIVE_AMT_CREDIT_SUM_DEBT_SUM',
 'ACTIVE_AMT_CREDIT_SUM_LIMIT_MEAN',
 'ACTIVE_AMT_ANNUITY_MEAN',
 'ACTIVE_CNT_CREDIT_PROLONG_SUM',
 'ACTIVE_MONTHS_BALANCE_MIN_MIN',
 'ACTIVE_MONTHS_BALANCE_MAX_MAX',
 'ACTIVE_MONTHS_BALANCE_SIZE_MEAN',
 'ACTIVE_MONTHS_BALANCE_SIZE_SUM',
 'CLOSED_DAYS_CREDIT_MAX',
 'CLOSED_DAYS_CREDIT_MEAN',
 'CLOSED_DAYS_CREDIT_VAR',
 'CLOSED_DAYS_CREDIT_ENDDATE_MIN',
 'CLOSED_DAYS_CREDIT_ENDDATE_MAX',
 'CLOSED_DAYS_CREDIT_ENDDATE_MEAN',
 'CLOSED_CREDIT_DAY_OVERDUE_MAX',
 'CLOSED_AMT_CREDIT_SUM_MEAN',
 'CLOSED_AMT_CREDIT_SUM_SUM',
 'CLOSED_AMT_CREDIT_SUM_DEBT_MAX',
 'CLOSED_AMT_CREDIT_SUM_DEBT_MEAN',
 'CLOSED_AMT_CREDIT_SUM_LIMIT_MEAN',
 'CLOSED_AMT_CREDIT_SUM_LIMIT_SUM',
 'CLOSED_CNT_CREDIT_PROLONG_SUM',
 'CLOSED_MONTHS_BALANCE_MAX_MAX',
 'CLOSED_MONTHS_BALANCE_SIZE_MEAN',
 'CLOSED_MONTHS_BALANCE_SIZE_SUM',
 'PREV_AMT_ANNUITY_MIN',
 'PREV_AMT_ANNUITY_MAX',
 'PREV_AMT_ANNUITY_MEAN',
 'PREV_AMT_APPLICATION_MEAN',
 'PREV_AMT_CREDIT_MIN',
 'PREV_AMT_CREDIT_MAX',
 'PREV_APP_CREDIT_PERC_MEAN',
 'PREV_AMT_DOWN_PAYMENT_MIN',
 'PREV_AMT_DOWN_PAYMENT_MAX',
 'PREV_HOUR_APPR_PROCESS_START_MEAN',
 'PREV_RATE_DOWN_PAYMENT_MIN',
 'PREV_RATE_DOWN_PAYMENT_MAX',
 'PREV_RATE_DOWN_PAYMENT_MEAN',
 'PREV_DAYS_DECISION_MIN',
 'PREV_DAYS_DECISION_MAX',
 'PREV_CNT_PAYMENT_MEAN',
 'PREV_CNT_PAYMENT_SUM',
 'PREV_NAME_CONTRACT_TYPE_Consumer loans_MEAN',
 'PREV_NAME_CONTRACT_TYPE_Revolving loans_MEAN',
 'PREV_NAME_CONTRACT_TYPE_XNA_MEAN',
 'PREV_WEEKDAY_APPR_PROCESS_START_FRIDAY_MEAN',
 'PREV_WEEKDAY_APPR_PROCESS_START_MONDAY_MEAN',
 'PREV_WEEKDAY_APPR_PROCESS_START_SATURDAY_MEAN',
 'PREV_WEEKDAY_APPR_PROCESS_START_SUNDAY_MEAN',
 'PREV_WEEKDAY_APPR_PROCESS_START_THURSDAY_MEAN',
 'PREV_WEEKDAY_APPR_PROCESS_START_TUESDAY_MEAN',
 'PREV_NAME_CONTRACT_STATUS_Canceled_MEAN',
 'PREV_NAME_CONTRACT_STATUS_Refused_MEAN',
 'PREV_NAME_PAYMENT_TYPE_Cash through the bank_MEAN',
 'PREV_NAME_PAYMENT_TYPE_Cashless from the account of the employer_MEAN',
 'PREV_NAME_PAYMENT_TYPE_XNA_MEAN',
 'PREV_CODE_REJECT_REASON_HC_MEAN',
 'PREV_CODE_REJECT_REASON_LIMIT_MEAN',
 'PREV_CODE_REJECT_REASON_SCO_MEAN',
 'PREV_CODE_REJECT_REASON_SCOFR_MEAN',
 'PREV_CODE_REJECT_REASON_SYSTEM_MEAN',
 'PREV_CODE_REJECT_REASON_VERIF_MEAN',
 'PREV_CODE_REJECT_REASON_XAP_MEAN',
 'PREV_CODE_REJECT_REASON_XNA_MEAN',
 'PREV_NAME_CLIENT_TYPE_New_MEAN',
 'PREV_NAME_CLIENT_TYPE_Repeater_MEAN',
 'PREV_NAME_CLIENT_TYPE_XNA_MEAN',
 'PREV_NAME_PORTFOLIO_POS_MEAN',
 'PREV_NAME_PRODUCT_TYPE_walk-in_MEAN',
 'PREV_NAME_PRODUCT_TYPE_x-sell_MEAN',
 'PREV_CHANNEL_TYPE_AP+ (Cash loan)_MEAN',
 'PREV_CHANNEL_TYPE_Car dealer_MEAN',
 'PREV_CHANNEL_TYPE_Channel of corporate sales_MEAN',
 'PREV_CHANNEL_TYPE_Credit and cash offices_MEAN',
 'PREV_CHANNEL_TYPE_Regional / Local_MEAN',
 'PREV_CHANNEL_TYPE_Stone_MEAN',
 'APPROVED_AMT_ANNUITY_MAX',
 'APPROVED_AMT_ANNUITY_MEAN',
 'APPROVED_AMT_APPLICATION_MIN',
 'APPROVED_AMT_CREDIT_MIN',
 'APPROVED_AMT_CREDIT_MAX',
 'APPROVED_APP_CREDIT_PERC_VAR',
 'APPROVED_AMT_DOWN_PAYMENT_MAX',
 'APPROVED_AMT_DOWN_PAYMENT_MEAN',
 'APPROVED_HOUR_APPR_PROCESS_START_MIN',
 'APPROVED_HOUR_APPR_PROCESS_START_MAX',
 'APPROVED_RATE_DOWN_PAYMENT_MAX',
 'APPROVED_DAYS_DECISION_MIN',
 'APPROVED_DAYS_DECISION_MAX',
 'APPROVED_DAYS_DECISION_MEAN',
 'APPROVED_CNT_PAYMENT_MEAN',
 'APPROVED_CNT_PAYMENT_SUM',
 'REFUSED_AMT_ANNUITY_MAX',
 'REFUSED_AMT_ANNUITY_MEAN',
 'REFUSED_AMT_APPLICATION_MEAN',
 'REFUSED_AMT_CREDIT_MIN',
 'REFUSED_AMT_CREDIT_MAX',
 'REFUSED_APP_CREDIT_PERC_MAX',
 'REFUSED_APP_CREDIT_PERC_VAR',
 'REFUSED_AMT_DOWN_PAYMENT_MIN',
 'REFUSED_AMT_GOODS_PRICE_MIN',
 'REFUSED_AMT_GOODS_PRICE_MEAN',
 'REFUSED_HOUR_APPR_PROCESS_START_MIN',
 'REFUSED_HOUR_APPR_PROCESS_START_MAX',
 'REFUSED_RATE_DOWN_PAYMENT_MIN',
 'REFUSED_RATE_DOWN_PAYMENT_MAX',
 'REFUSED_CNT_PAYMENT_MEAN',
 'REFUSED_CNT_PAYMENT_SUM',
 'POS_MONTHS_BALANCE_MEAN',
 'POS_MONTHS_BALANCE_SIZE',
 'POS_SK_DPD_MAX',
 'POS_SK_DPD_MEAN',
 'POS_SK_DPD_DEF_MAX',
 'POS_SK_DPD_DEF_MEAN',
 'POS_NAME_CONTRACT_STATUS_Amortized debt_MEAN',
 'POS_NAME_CONTRACT_STATUS_Approved_MEAN',
 'POS_NAME_CONTRACT_STATUS_Canceled_MEAN',
 'POS_NAME_CONTRACT_STATUS_Demand_MEAN',
 'POS_NAME_CONTRACT_STATUS_Returned to the store_MEAN',
 'POS_NAME_CONTRACT_STATUS_Signed_MEAN',
 'POS_NAME_CONTRACT_STATUS_XNA_MEAN',
 'POS_COUNT',
 'INSTAL_NUM_INSTALMENT_VERSION_NUNIQUE',
 'INSTAL_DPD_MAX',
 'INSTAL_DPD_MEAN',
 'INSTAL_DPD_SUM',
 'INSTAL_DBD_MAX',
 'INSTAL_DBD_MEAN',
 'INSTAL_DBD_SUM',
 'INSTAL_PAYMENT_PERC_MEAN',
 'INSTAL_PAYMENT_DIFF_MAX',
 'INSTAL_PAYMENT_DIFF_MEAN',
 'INSTAL_PAYMENT_DIFF_SUM',
 'INSTAL_PAYMENT_DIFF_VAR',
 'INSTAL_AMT_INSTALMENT_MAX',
 'INSTAL_AMT_PAYMENT_MIN',
 'INSTAL_AMT_PAYMENT_MAX',
 'INSTAL_AMT_PAYMENT_SUM',
 'INSTAL_DAYS_ENTRY_PAYMENT_MAX',
 'INSTAL_DAYS_ENTRY_PAYMENT_MEAN',
 'INSTAL_DAYS_ENTRY_PAYMENT_SUM',
 'INSTAL_COUNT',
 'CC_MONTHS_BALANCE_MAX',
 'CC_AMT_BALANCE_MAX',
 'CC_AMT_BALANCE_MEAN',
 'CC_AMT_CREDIT_LIMIT_ACTUAL_MIN',
 'CC_AMT_CREDIT_LIMIT_ACTUAL_MEAN',
 'CC_AMT_CREDIT_LIMIT_ACTUAL_SUM',
 'CC_AMT_CREDIT_LIMIT_ACTUAL_VAR',
 'CC_AMT_DRAWINGS_ATM_CURRENT_MEAN',
 'CC_AMT_DRAWINGS_ATM_CURRENT_SUM',
 'CC_AMT_DRAWINGS_ATM_CURRENT_VAR',
 'CC_AMT_DRAWINGS_CURRENT_MEAN',
 'CC_AMT_DRAWINGS_CURRENT_SUM',
 'CC_AMT_DRAWINGS_CURRENT_VAR',
 'CC_AMT_DRAWINGS_OTHER_CURRENT_MIN',
 'CC_AMT_DRAWINGS_OTHER_CURRENT_MAX',
 'CC_AMT_DRAWINGS_OTHER_CURRENT_SUM',
 'CC_AMT_DRAWINGS_OTHER_CURRENT_VAR',
 'CC_AMT_DRAWINGS_POS_CURRENT_MIN',
 'CC_AMT_DRAWINGS_POS_CURRENT_MAX',
 'CC_AMT_DRAWINGS_POS_CURRENT_VAR',
 'CC_AMT_INST_MIN_REGULARITY_MIN',
 'CC_AMT_INST_MIN_REGULARITY_SUM',
 'CC_AMT_INST_MIN_REGULARITY_VAR',
 'CC_AMT_PAYMENT_CURRENT_MIN',
 'CC_AMT_PAYMENT_CURRENT_SUM',
 'CC_AMT_PAYMENT_CURRENT_VAR',
 'CC_AMT_PAYMENT_TOTAL_CURRENT_MIN',
 'CC_AMT_PAYMENT_TOTAL_CURRENT_MAX',
 'CC_AMT_PAYMENT_TOTAL_CURRENT_MEAN',
 'CC_AMT_RECEIVABLE_PRINCIPAL_MIN',
 'CC_AMT_RECEIVABLE_PRINCIPAL_MEAN',
 'CC_AMT_RECIVABLE_MAX',
 'CC_AMT_RECIVABLE_MEAN',
 'CC_AMT_TOTAL_RECEIVABLE_MAX',
 'CC_AMT_TOTAL_RECEIVABLE_MEAN',
 'CC_CNT_DRAWINGS_ATM_CURRENT_MIN',
 'CC_CNT_DRAWINGS_ATM_CURRENT_MAX',
 'CC_CNT_DRAWINGS_ATM_CURRENT_MEAN',
 'CC_CNT_DRAWINGS_ATM_CURRENT_VAR',
 'CC_CNT_DRAWINGS_CURRENT_MAX',
 'CC_CNT_DRAWINGS_CURRENT_MEAN',
 'CC_CNT_DRAWINGS_OTHER_CURRENT_MIN',
 'CC_CNT_DRAWINGS_OTHER_CURRENT_MAX',
 'CC_CNT_DRAWINGS_OTHER_CURRENT_MEAN',
 'CC_CNT_DRAWINGS_OTHER_CURRENT_SUM',
 'CC_CNT_DRAWINGS_POS_CURRENT_MIN',
 'CC_CNT_DRAWINGS_POS_CURRENT_MAX',
 'CC_CNT_DRAWINGS_POS_CURRENT_SUM',
 'CC_CNT_DRAWINGS_POS_CURRENT_VAR',
 'CC_CNT_INSTALMENT_MATURE_CUM_MIN',
 'CC_CNT_INSTALMENT_MATURE_CUM_MAX',
 'CC_CNT_INSTALMENT_MATURE_CUM_MEAN',
 'CC_CNT_INSTALMENT_MATURE_CUM_VAR',
 'CC_SK_DPD_MAX',
 'CC_SK_DPD_DEF_VAR',
 'CC_NAME_CONTRACT_STATUS_Active_MIN',
 'CC_NAME_CONTRACT_STATUS_Active_SUM',
 'CC_NAME_CONTRACT_STATUS_Active_VAR',
 'CC_NAME_CONTRACT_STATUS_Approved_MEAN',
 'CC_NAME_CONTRACT_STATUS_Approved_VAR',
 'CC_NAME_CONTRACT_STATUS_Completed_MIN',
 'CC_NAME_CONTRACT_STATUS_Completed_MEAN',
 'CC_NAME_CONTRACT_STATUS_Demand_MAX',
 'CC_NAME_CONTRACT_STATUS_Refused_MAX',
 'CC_NAME_CONTRACT_STATUS_Refused_SUM',
 'CC_NAME_CONTRACT_STATUS_Sent proposal_MEAN',
 'CC_NAME_CONTRACT_STATUS_Sent proposal_VAR',
 'CC_NAME_CONTRACT_STATUS_Signed_MIN',
 'CC_NAME_CONTRACT_STATUS_Signed_MAX',
 'CC_NAME_CONTRACT_STATUS_Signed_SUM',
 'CC_COUNT'
]

## train test split

In [None]:
# Stratified hold-out split restricted to the selected feature columns.
# test_size is left at the scikit-learn default; the seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X[columns],
    y,
    stratify=y,
    random_state=42,
)

## Models

### Dummy classifier

In [None]:
# Baseline: always predict the most frequent class.
# It is fitted on the training split and scored on the held-out split so that
# its numbers are directly comparable with the test ROC AUC reported for the
# real models below (the previous version fitted and scored on the full X, y).
dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(X_train, y_train)
print('Dummy accuracy:', dummy_clf.score(X_test, y_test))
print('Dummy ROC AUC score:', roc_auc_score(y_test, dummy_clf.predict_proba(X_test)[:, 1]))

### Other models

In [None]:
# Benchmark a set of candidate classifiers with default-ish settings.
# Each one is wrapped in the same median-imputation + standard-scaling pipeline,
# fitted on the training split, and scored with ROC AUC on both splits.
# Every stochastic estimator gets random_state=42 (XGBoost and LightGBM already
# had it) so that the comparison is reproducible across runs.
trained_models = []
models = [
          {"name": "Naive Bayes", "clf": GaussianNB()},
          {"name": "logistic regression", "clf": LogisticRegression(solver='saga', max_iter=500, class_weight='balanced', random_state=42)},
          {"name": "Decision Tree", "clf": DecisionTreeClassifier(class_weight='balanced', random_state=42)},
          #{"name": "SVM", "clf": SVC(probability=True, class_weight='balanced', random_state=42)},
          {"name": "XG Boost", "clf": XGBClassifier(use_label_encoder=False, random_state=42)},
          {"name": "Random Forest", "clf": RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)},
          {"name": "Gradient Boosting", "clf": GradientBoostingClassifier(n_estimators=100, random_state=42)},
          {"name": "Light GBM Classifier", "clf": LGBMClassifier(class_weight='balanced', random_state=42)}]

for model in models:
    # The timer block covers fitting plus scoring; train_time covers fitting only.
    with hc_transformations.timer(model['name']):
        pipe = Pipeline([('imputer', SimpleImputer(strategy='median')), ('scaler', StandardScaler()), ('clf', model["clf"])])
        start_time = time.time()
        pipe.fit(X_train, y_train)
        train_time = time.time() - start_time
        # ROC AUC is computed from the predicted probability of the positive class
        train_rocauc_score = roc_auc_score(y_train, pipe.predict_proba(X_train)[:, 1])
        test_rocauc_score = roc_auc_score(y_test, pipe.predict_proba(X_test)[:, 1])
        model_details = {"name": model["name"], "train_rocauc":train_rocauc_score, "test_rocauc":test_rocauc_score, "train_time": train_time, "model": pipe}
        trained_models.append(model_details)

### Training scores

In [None]:
# Line chart of train vs. test ROC AUC per classifier, ordered by test ROC AUC.
model_df = pd.DataFrame(trained_models).sort_values("test_rocauc")
ax = model_df[["train_rocauc", "test_rocauc", "name"]].plot(kind="line", x="name", figsize=(19,5), title="Classifier Performance Sorted by Test ROC AUC")
ax.legend(["Train ROC AUC", "Test ROC AUC"])

# Label every point with its score. The previous loop walked ax.patches, which
# is empty for a line plot (patches only exist for bar charts), so nothing was
# ever annotated. pandas draws a non-numeric x column at positions 0..n-1, so
# the row position is the x coordinate of each point.
for score_column in ("train_rocauc", "test_rocauc"):
    for x_pos, score in enumerate(model_df[score_column]):
        ax.annotate(str(round(score, 3)), (x_pos, score),
                    textcoords="offset points", xytext=(0, 6), ha="center")

ax.title.set_size(20)
plt.box(False)
plt.savefig('classifier_performance.png', format='png', transparent=True)

### Training times

In [None]:
# Line chart of the fitting time of each classifier, fastest first.
model_df = pd.DataFrame(trained_models).sort_values("train_time")
timing_view = model_df[["train_time", "name"]]
ax = timing_view.plot(
    kind="line",
    x="name",
    figsize=(19, 5),
    grid=True,
    title="Classifier Training Time (seconds)",
)
ax.legend(["Train Time"])
ax.title.set_size(20)
plt.box(False)
plt.savefig('classifier_time_performance.png', format='png', transparent=True)


## Hyperparameters

In [None]:
# Hyper-parameter search for the LightGBM model together with the resampling
# ratios. Keys are "<pipeline step>__<parameter>".
grid_parameters = {
                    "model__n_estimators": [1000, 1500, 2000],
                    "model__learning_rate": [0.01, 0.02],
                    "model__max_depth": [2, 3],
                    "over__sampling_strategy": [0.1, 0.2],
                    "under__sampling_strategy": [0.6, 0.7]
                    }

# SMOTE, random under-sampling and LightGBM all draw random numbers; seeding
# them makes the cross-validated scores (and therefore best_params_)
# reproducible from one run to the next.
model = LGBMClassifier(random_state=42)
over = SMOTE(random_state=42)
under = RandomUnderSampler(random_state=42)
steps = [('imputer', SimpleImputer(strategy='median')),
         ('scaler', StandardScaler()),
         ('over', over),
         ('under', under),
         ('model', model)]
pipeline = Pipeline(steps=steps)

clf = GridSearchCV(pipeline, grid_parameters, n_jobs=-1, scoring='roc_auc', cv=3)
with hc_transformations.timer("Grid search fit"):
    clf.fit(X_train, y_train)

# Mean cross-validated ROC AUC for every parameter combination
cvres = clf.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(mean_score, params)

In [None]:
# Display the parameter combination with the best mean cross-validated score.
clf.best_params_

## Training with best params

In [None]:
# Final model: same pipeline as in the grid search, with fixed hyper-parameters.
# NOTE(review): the values below are hard-coded (presumably copied from a
# previous clf.best_params_) — confirm they still match the latest search.
# The resamplers and the model are seeded so the saved artefact can be rebuilt
# identically.
over = SMOTE(sampling_strategy=0.1, random_state=42)
under = RandomUnderSampler(sampling_strategy=0.6, random_state=42)
model = LGBMClassifier(learning_rate=0.02, max_depth=3, n_estimators=2000, random_state=42)
steps = [('imputer', SimpleImputer(strategy='median')),
         ('scaler', StandardScaler()),
         ('over', over),
         ('under', under),
         ('model', model)]
lgbm_pipeline = Pipeline(steps=steps)
with hc_transformations.timer("Training"):
    lgbm_pipeline.fit(X_train, y_train)
# Persist the whole fitted pipeline (preprocessing + model) in the working directory
joblib.dump(lgbm_pipeline, 'trained_model.joblib')

In [None]:
# Reload a previously saved pipeline, replacing the one fitted in the cell above.
# NOTE(review): this path differs from where the training cell writes
# ('trained_model.joblib' in the working directory), so on a fresh run the file
# may not exist — confirm which artefact is intended.
# joblib.load unpickles the file: only load artefacts from a trusted source.
lgbm_pipeline = joblib.load('../input/lgbm-classif/trained_model.joblib')

## Feature importance

In [None]:
# Map each selected feature name to the importance reported by the fitted
# model (importances are read positionally, in `columns` order).
importances = lgbm_pipeline["model"].feature_importances_
d = {}
for position, feature_name in enumerate(columns):
    d[feature_name] = importances[position]

# Same mapping, ordered from most to least important
d2 = dict(sorted(d.items(), key=lambda item: item[1], reverse=True))
ranked_names = list(d2.keys())
ranked_scores = list(d2.values())

# Horizontal bar chart of the ten most important features
fig, ax = plt.subplots(figsize=(20,8))
bp = sns.barplot(x=ranked_scores[:10], y=ranked_names[:10], ax=ax, color="royalblue")

# Write the importance value just to the right of the end of each bar
patches = bp.patches
for bar_index, bar in enumerate(patches):
    x_val = bar.get_width() + 15
    y_val = bar.get_y() + bar.get_height()/2
    bp.annotate('{:.0f}'.format(ranked_scores[bar_index]), (x_val, y_val), ha='center', fontsize=12)

bp.xaxis.grid(True)
bp.set_xlabel("Feature importance", fontsize=12)
bp.set_title("10 most important features", fontsize=16)
plt.savefig('feature_importance.png', format='png', transparent=True)

In [None]:
# Tabular form of the importance mapping: one row per feature, with the
# feature name moved out of the index into a FEATURE_NAME column.
df_fi = (
    pd.DataFrame.from_dict(d, orient='index', columns=["IMPORTANCE"])
    .reset_index()
    .rename(columns={"index": "FEATURE_NAME"})
)

In [None]:
# Compute SHAP values for a single test row and store them next to the global
# feature importances in df_fi.
explainer = shap.TreeExplainer(lgbm_pipeline["model"])
# [:-3] drops the last three pipeline steps, so only the leading steps are
# applied to X_test (imputer + scaler for the pipeline built in the training
# cell — confirm for a pipeline loaded from disk).
X_trans = lgbm_pipeline[:-3].transform(X_test)
test_data_index = 6  # position of the explained row in the transformed test set
subsampled_test_data = X_trans[test_data_index].reshape(1,-1)
shap_values = explainer.shap_values(subsampled_test_data)

# NOTE(review): indexing shap_values[0] / [1] assumes a list with one array per
# class; whether shap_values() returns that shape depends on the installed SHAP
# version — confirm.
# assumes df_fi rows are in `columns` order so the values line up — TODO confirm
df_fi['SHAP_VALUE_0'] = shap_values[0].flatten()
df_fi['SHAP_VALUE_1'] = shap_values[1].flatten()

In [None]:
# Reload the feature importances from disk, keeping only the two base columns
# (any SHAP_VALUE_* columns added to df_fi in memory are discarded).
# FIXME(review): feature_importance.csv is only written by the next cell, so on
# a fresh run this read fails unless the file already exists.
df_fi = pd.read_csv('feature_importance.csv', usecols=['FEATURE_NAME', 'IMPORTANCE'])

In [None]:
# Export df_fi to feature_importance.csv. The DataFrame index is written too,
# as an extra leading column (to_csv default).
df_fi.to_csv('feature_importance.csv')

## Results on test data

In [None]:
# Evaluate the fitted pipeline on the held-out split using its own class
# predictions (i.e. the model's built-in decision rule).
y_pred = lgbm_pipeline.predict(X_test)
cm_default = pd.DataFrame(
    confusion_matrix(y_test, y_pred),
    index=['neg', 'pos'],
    columns=['pred_neg', 'pred_pos'],
)
print("Confusion matrix\n", cm_default)
print(classification_report(y_test, y_pred))

In [None]:
# Same report as above, but for the predictions obtained with the tuned
# decision threshold.
# FIXME(review): y_pred_final is only assigned in the "Threshold" section
# further down, so this cell raises NameError on Restart & Run All; it should
# run after that assignment.
print("Confusion matrix\n", pd.DataFrame(confusion_matrix(y_test, y_pred_final), columns=['pred_neg', 'pred_pos'], index=['neg', 'pos']))
print(classification_report(y_test, y_pred_final))

Avec le seuil de décision optimisé (`y_pred_final`) à la place du seuil par défaut (`y_pred`), le rappel (recall) de la classe 1 passe de 0.50 à 0.68.

## Threshold

In [None]:
# Predicted probability of the positive class (second predict_proba column)
# for every row of the held-out split.
y_pred_proba = lgbm_pipeline.predict_proba(X_test)[:, 1]

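Application de la fonction de coût : `real_cost = 10 * fn + 1 * fp`, où `fn` est le nombre de faux négatifs et `fp` le nombre de faux positifs ; un faux négatif coûte donc dix fois plus cher qu'un faux positif.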

In [None]:
# Sweep the decision threshold from 0 to 1 (step 0.0001) and, for each value,
# compute F1, recall and the business cost real_cost = 10 * fn + 1 * fp.
# The threshold with the lowest cost is reported.
#
# The previous version called f1_score, recall_score and confusion_matrix once
# per threshold (10,000 x 3 passes over the test set). The counts are now
# obtained for all thresholds at once: the scores of each true class are sorted
# a single time and the number of scores above each threshold is found by
# binary search. The resulting tp / fp / fn counts are identical.
#
# NOTE(review): the threshold is tuned on the same held-out split that is used
# for the final report, so the reported metrics are optimistic.
with hc_transformations.timer("Threshold evaluation"):
    thresholds = np.arange(0.0, 1.0, 0.0001)

    true_labels = np.asarray(y_test)
    scores = np.asarray(y_pred_proba)
    pos_scores = np.sort(scores[true_labels == 1])
    neg_scores = np.sort(scores[true_labels == 0])

    # A row is predicted positive when its score is strictly greater than the
    # threshold; side='right' counts the scores <= threshold, the rest are above.
    tp = pos_scores.size - np.searchsorted(pos_scores, thresholds, side='right')
    fp = neg_scores.size - np.searchsorted(neg_scores, thresholds, side='right')
    fn = pos_scores.size - tp

    # F1 = 2tp / (2tp + fp + fn) and recall = tp / (tp + fn); both are defined
    # as 0 when their denominator is 0, matching scikit-learn's default.
    f1_denominator = 2 * tp + fp + fn
    fscore = np.divide(2 * tp, f1_denominator,
                       out=np.zeros(len(thresholds)), where=f1_denominator > 0)
    recall_denominator = tp + fn
    recallscore = np.divide(tp, recall_denominator,
                            out=np.zeros(len(thresholds)), where=recall_denominator > 0)
    real_cost = (10 * fn + 1 * fp).astype(float)

    # Cheapest threshold (first one in case of ties) and the F1 reached there
    index = np.argmin(real_cost)
    thresholdOpt = round(thresholds[index], ndigits = 4)
    scoreOpt = round(fscore[index], ndigits = 4)
    print('Best Threshold: {} with F-Score: {}'.format(thresholdOpt, scoreOpt))

    df_threshold = pd.DataFrame({'Score':fscore, 'Threshold':thresholds, 'Type':'F1'})

    df_recall = pd.DataFrame({'Score':recallscore, 'Threshold':thresholds, 'Type':'recall'})

    df_real_cost = pd.DataFrame({'Score':real_cost, 'Threshold':thresholds})

    # NOTE(review): this rebinds `data`, which earlier cells use for the
    # preprocessed dataset; the name is kept for backward compatibility.
    data = pd.concat([df_threshold,df_recall], ignore_index=True)

In [None]:
# Hard-coded decision threshold; running this cell overrides the value computed
# by the threshold evaluation cell above.
# NOTE(review): presumably the result of a previous sweep, kept so the sweep
# can be skipped — confirm it matches the current model.
thresholdOpt = 0.3771

In [None]:
# Business cost as a function of the decision threshold
f, ax = plt.subplots(figsize=(8, 8))
sns.lineplot(x="Threshold", y="Score", data=df_real_cost, ax=ax)
plt.show() #savefig('threshold.png', format='png', transparent=True)

In [None]:
y_pred_final = (y_pred_proba > thresholdOpt).astype('int')

## SHAP

In [None]:
# Explain the prediction for one test row with SHAP.
explainer = shap.TreeExplainer(lgbm_pipeline["model"])
# [:-3] drops the last three pipeline steps, so X_trans is X_test after the
# leading steps only (imputer + scaler for the pipeline built in the training
# cell — confirm for a pipeline loaded from disk).
X_trans = lgbm_pipeline[:-3].transform(X_test)
test_data_index = 6  # position of the explained row in the transformed test set
subsampled_test_data = X_trans[test_data_index].reshape(1,-1)
shap_values = explainer.shap_values(subsampled_test_data)
print("SHAP expected value", explainer.expected_value)
# X_trans and subsampled_test_data are already preprocessed, so they must be fed
# to the final estimator directly. Calling lgbm_pipeline.predict_proba on them
# (as before) ran the imputer and scaler a second time and printed predictions
# for doubly-scaled inputs.
print("Model mean value", lgbm_pipeline["model"].predict_proba(X_trans).mean(axis=0))
print("Model prediction for test data", lgbm_pipeline["model"].predict_proba(subsampled_test_data))

# NOTE(review): indexing expected_value[1] / shap_values[1] assumes one entry
# per class; this depends on the installed SHAP version — confirm.
shap.force_plot(explainer.expected_value[1], shap_values[1][0], subsampled_test_data[0], feature_names=columns, show=False, matplotlib=True)
plt.show() #savefig('SHAP.png', format='png', transparent=True)

In [None]:
# Force plot for the same test row, using the index-0 entries of the expected
# value and SHAP values (presumably the negative class — confirm against the
# installed SHAP version's output layout).
shap.force_plot(explainer.expected_value[0], shap_values[0][0], subsampled_test_data[0], feature_names=columns, show=False, matplotlib=True)

In [None]:
# Global summary plot: SHAP values are recomputed here for every row of the
# transformed test set (this can be slow on a large split).
shap.summary_plot(explainer.shap_values(X_trans),
                  feature_names=columns)

In [None]:
# Decision plot for the explained test row (index-0 entries of the expected
# value and SHAP values). The third positional parameter of decision_plot is
# `features` (feature values), not the names, so the column names are passed
# explicitly as feature_names, as in the force_plot / summary_plot calls above.
shap.decision_plot(explainer.expected_value[0], shap_values[0][0], feature_names=columns)