In [29]:
# Based Calculation
import pandas as pd
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
import numpy as np

# model validation
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate

# Class Imbalance
from imblearn import under_sampling, over_sampling
from imblearn.over_sampling import SMOTE

# model : Classification
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# model evaluation
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Confusion Matrix
from sklearn.metrics import confusion_matrix, classification_report

# Hyperparameter tunning
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

# visualization
import seaborn as sns
from matplotlib import pyplot as plt

# Shap Value
import shap

# ignore warnings
import warnings
warnings.filterwarnings('ignore')

## Read Data

In [2]:
train = pd.read_csv ("train.csv")

In [3]:
train.shape

(307511, 178)

# <center>**Modelling**<center>

## Function

### Model Evaluation

In [4]:
def eval_classification(model):
    y_pred = model.predict(X_test)
    y_pred_train = model.predict(X_train)
    y_pred_proba = model.predict_proba(X_test)
    y_pred_proba_train = model.predict_proba(X_train)
    
    # Accuracy, Precision, Recall, ROC-AUC, F1-score :
    print('Accuracy (Train Set): %.2f' % accuracy_score(y_train, y_pred_train))
    print('Accuracy (Test Set): %.2f' % accuracy_score(y_test, y_pred))
    
    print('Precision (Train Set): %.2f' % precision_score(y_train, y_pred_train))
    print('Precision (Test Set): %.2f' % precision_score(y_test, y_pred))
    
    print('Recall (Train Set): %.2f' % recall_score(y_train, y_pred_train))
    print('Recall (Test Set): %.2f' % recall_score(y_test, y_pred))
    
    print("roc_auc (train-proba): %.2f" % roc_auc_score(y_train, y_pred_proba_train[:, 1]))
    print("roc_auc (test-proba): %.2f" % roc_auc_score(y_test, y_pred_proba[:, 1]))
    
    print('F1-Score (Train Set): %.2f' % f1_score(y_train, y_pred_train))
    print('F1-Score (Test Set): %.2f' % f1_score(y_test, y_pred))

def show_feature_importance(model):
    feat_importances = pd.Series(model.feature_importances_, index=X.columns)
    ax = feat_importances.nlargest(25).plot(kind='barh', figsize=(10, 8))
    ax.invert_yaxis()

    plt.xlabel('score')
    plt.ylabel('feature')
    plt.title('feature importance score')

def show_best_hyperparameter(model):
    print(model.best_estimator_.get_params())

### Confusion Matrix

In [5]:
def c_matrix (model) :
    y_pred = model.predict(X_test)
    cf_matrix = confusion_matrix(y_test, y_pred)

    # mengatur matrix
    fig = plt.figure(figsize = (7, 4))
    group_names = ['TRUE NEGATIF','FALSE POSITIF','FALSE NEGATIF','TRUE POSITIF']
    group_counts = ['{0:0.0f}'.format(value) for value in cf_matrix.flatten()]
    
    group_percentages = ['{0:.2%}'.format(value) for value in
                      cf_matrix.flatten()/np.sum(cf_matrix)]
    labels = [f'{v1}\n{v2}\n{v3}' for v1, v2, v3 in
            zip(group_names,group_counts,group_percentages)]
    labels = np.asarray(labels).reshape(2,2)

    # display matrix
    sns.heatmap(cf_matrix, annot=labels, fmt='', cmap="coolwarm", annot_kws={'size': 15})

## Feature Selection

In [6]:
X = train.drop (columns = ["SK_ID_CURR", "TARGET"], axis = 1)
y = train ["TARGET"]

In [7]:
feats = X.columns

In [8]:
from sklearn.feature_selection import f_classif
anova = pd.DataFrame(f_classif(X,y)).transpose()

In [9]:
anova.columns = ['f-score','p-value']

In [10]:
anova['columns_name'] = feats

In [11]:
# urutan feature penting bisa dilihat dari nilai f-score terbesar ke terkecil
feat_select = anova.sort_values('f-score',ascending=False).head(100)

In [12]:
feat_select = feat_select["columns_name"].unique().tolist()

## Data Split

In [13]:
X = pd.DataFrame(train,columns = feat_select)

In [14]:
X.head()

Unnamed: 0,EXT_SOURCE_2,EXT_SOURCE_3,EXT_SOURCE_1,DAYS_BIRTH,DAYS_EMPLOYED,REGION_RATING_CLIENT_W_CITY,REGION_RATING_CLIENT,DAYS_LAST_PHONE_CHANGE,CODE_GENDER,NAME_EDUCATION_TYPE,DAYS_ID_PUBLISH,REG_CITY_NOT_WORK_CITY,NAME_INCOME_TYPE,ORGANIZATION_TYPE_XNA,FLAG_EMP_PHONE,REG_CITY_NOT_LIVE_CITY,FLAG_DOCUMENT_3,DAYS_REGISTRATION,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,NAME_HOUSING_TYPE,LIVE_CITY_NOT_WORK_CITY,DEF_30_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,NAME_CONTRACT_TYPE,AMT_CREDIT,OCCUPATION_TYPE_Drivers,FLOORSMAX_AVG,ORGANIZATION_TYPE_Self-employed,FLOORSMAX_MEDI,FLOORSMAX_MODE,FLAG_DOCUMENT_6,FLAG_WORK_PHONE,OCCUPATION_TYPE_Low-skill Laborers,HOUR_APPR_PROCESS_START,ORGANIZATION_TYPE_Business Entity Type 3,FLAG_PHONE,TOTALAREA_MODE,FLAG_OWN_CAR,LIVINGAREA_AVG,ELEVATORS_AVG,LIVINGAREA_MEDI,OCCUPATION_TYPE_Accountants,ELEVATORS_MEDI,ELEVATORS_MODE,OCCUPATION_TYPE_Core staff,LIVINGAREA_MODE,ORGANIZATION_TYPE_Construction,OCCUPATION_TYPE_Sales staff,APARTMENTS_AVG,APARTMENTS_MEDI,OCCUPATION_TYPE_Managers,AMT_REQ_CREDIT_BUREAU_YEAR,APARTMENTS_MODE,ORGANIZATION_TYPE_Transport: type 3,OCCUPATION_TYPE_Security staff,OCCUPATION_TYPE_High skill tech staff,ORGANIZATION_TYPE_School,BASEMENTAREA_AVG,BASEMENTAREA_MEDI,AMT_ANNUITY,ENTRANCES_AVG,ENTRANCES_MEDI,OCCUPATION_TYPE_Cooking staff,BASEMENTAREA_MODE,FLAG_DOCUMENT_16,FLAG_DOCUMENT_13,ENTRANCES_MODE,AMT_REQ_CREDIT_BUREAU_MON,ORGANIZATION_TYPE_Medicine,ORGANIZATION_TYPE_Restaurant,ORGANIZATION_TYPE_Military,ORGANIZATION_TYPE_Police,ORGANIZATION_TYPE_Industry: type 3,ORGANIZATION_TYPE_Bank,ORGANIZATION_TYPE_Security Ministries,FLAG_DOCUMENT_14,CNT_FAM_MEMBERS,OBS_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,ORGANIZATION_TYPE_Trade: type 3,OCCUPATION_TYPE_Medicine staff,NONLIVINGAREA_AVG,NAME_TYPE_SUITE,NONLIVINGAREA_MEDI,ORGANIZATION_TYPE_Trade: type 7,FLAG_DOCUMENT_8,FLAG_DOCUMENT_18,NONLIVINGAREA_MODE,ORGANIZATION_TYPE_Agriculture,OCCUPATION_TYPE_Waiters/barmen staff,ORGANIZATION_TYPE_University,ORGANIZATION_TYPE_Government,ORGANIZATION_TYPE_Security,OCCUPATION_TYPE_Cleaning staff,REG_REGION_NOT_WORK_REGION,LANDAREA_MEDI,YEARS_BEGINEXPLUATATION_MEDI,FLAG_DOCUMENT_15,YEARS_BEGINEXPLUATATION_AVG
0,0.307542,0.155054,0.072215,0.111161,0.035563,0.5,0.5,0.264212,1,4,0.294567,0.0,7,0,1.0,0.0,1.0,0.14786,0.077441,0.256321,1,0.0,0.058824,0.083333,0,0.090287,0,0.0833,0,0.0833,0.0833,0.0,0.0,0,0.434783,1,1.0,0.0149,0,0.019,0.0,0.0193,0,0.0,0.0,0,0.0198,0,0,0.0247,0.025,0,0.04,0.0252,0,0,0,0,0.0369,0.0369,0.090032,0.069,0.069,0,0.0383,0.0,0.0,0.069,0.0,0,0,0,0,0,0,0,0.0,0.0,0.005747,0.005814,0,0,0.0,6,0.0,0,0.0,0.0,0.0,0,0,0,0,0,0,0.0,0.0375,0.9722,0.0,0.9722
1,0.727773,0.569889,0.312933,0.522886,0.066324,0.0,0.0,0.192917,0,1,0.040434,0.0,4,0,1.0,0.0,1.0,0.048071,0.271605,0.045016,1,0.0,0.0,0.0,0,0.311736,0,0.2917,0,0.2917,0.2917,0.0,0.0,0,0.478261,0,1.0,0.0714,0,0.0549,0.08,0.0558,0,0.08,0.0806,1,0.0554,0,0,0.0959,0.0968,0,0.0,0.0924,0,0,0,1,0.0529,0.0529,0.132924,0.0345,0.0345,0,0.0538,0.0,0.0,0.0345,0.0,0,0,0,0,0,0,0,0.0,0.052632,0.002874,0.002907,0,0,0.0098,1,0.01,0,0.0,0.0,0.0,0,0,0,0,0,0,0.0,0.0132,0.9851,0.0,0.9851
2,0.65019,0.81413,0.514238,0.651466,0.012561,0.5,0.5,0.189888,1,4,0.351674,0.0,7,0,1.0,0.0,0.0,0.172665,0.023569,0.134897,1,0.0,0.0,0.0,1,0.022472,0,0.226282,0,0.225897,0.222315,0.0,1.0,0,0.391304,0,1.0,0.102547,1,0.107399,0.078942,0.108607,0,0.078078,0.07449,0,0.105975,0,0,0.11744,0.11785,0,0.0,0.114231,0,0,0,0,0.088442,0.087955,0.020025,0.149725,0.149213,0,0.087543,0.0,0.0,0.145193,0.0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0,0,0.028358,6,0.028236,0,0.0,0.0,0.027022,0,0,0,1,0,0,0.0,0.067169,0.977752,0.0,0.977735
3,0.760751,0.569889,0.514238,0.649154,0.169663,0.5,0.5,0.143756,0,4,0.338613,0.0,7,0,1.0,0.0,1.0,0.398549,0.063973,0.107023,1,0.0,0.0,0.0,0,0.066837,0,0.226282,0,0.225897,0.222315,0.0,0.0,0,0.73913,1,0.0,0.102547,0,0.107399,0.078942,0.108607,0,0.078078,0.07449,0,0.105975,0,0,0.11744,0.11785,0,0.075999,0.114231,0,0,0,0,0.088442,0.087955,0.109477,0.149725,0.149213,0,0.087543,0.0,0.0,0.145193,0.009904,0,0,0,0,0,0,0,0.0,0.052632,0.005747,0.005814,0,0,0.028358,6,0.028236,0,0.0,0.0,0.027022,0,0,0,0,0,0,0.0,0.067169,0.977752,0.0,0.977735
4,0.377472,0.569889,0.514238,0.701409,0.169607,0.5,0.5,0.257689,1,4,0.480478,1.0,7,0,1.0,0.0,0.0,0.174732,0.117845,0.39288,1,1.0,0.0,0.0,0,0.116854,0,0.226282,0,0.225897,0.222315,0.0,0.0,0,0.478261,0,0.0,0.102547,0,0.107399,0.078942,0.108607,0,0.078078,0.07449,1,0.105975,0,0,0.11744,0.11785,0,0.0,0.114231,0,0,0,0,0.088442,0.087955,0.078975,0.149725,0.149213,0,0.087543,0.0,0.0,0.145193,0.0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0,0,0.028358,6,0.028236,0,1.0,0.0,0.027022,0,0,0,0,0,0,0.0,0.067169,0.977752,0.0,0.977735


In [15]:
X = X
y = train ["TARGET"]

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

print("Jumlah data pada X_train:", X_train.shape)
print("Jumlah data pada y_train:", y_train.shape)
print("Jumlah data pada X_test:", X_test.shape)
print("Jumlah data pada y_test:", y_test.shape)

print("y_train_counts:\n", y_train.value_counts())

Jumlah data pada X_train: (215257, 100)
Jumlah data pada y_train: (215257,)
Jumlah data pada X_test: (92254, 100)
Jumlah data pada y_test: (92254,)
y_train_counts:
 TARGET
0    197845
1     17412
Name: count, dtype: int64


## Handling Class Imbalance

In [17]:
X_under, y_under = under_sampling.RandomUnderSampler(random_state = 42).fit_resample(X, y)
X_over, y_over = over_sampling.RandomOverSampler(random_state = 42).fit_resample(X, y)
X_over_smote, y_over_smote = over_sampling.SMOTE(random_state = 42).fit_resample(X, y)

print("Jumlah y :\n", pd.Series(y).value_counts(), "\n")
print("Jumlah y_under :\n", pd.Series(y_under).value_counts(), "\n")
print("Jumlah y_over :\n", pd.Series(y_over).value_counts(), "\n")
print("Jumlah y_over_smote :\n", pd.Series(y_over_smote).value_counts(), "\n")

Jumlah y :
 TARGET
0    282686
1     24825
Name: count, dtype: int64 

Jumlah y_under :
 TARGET
0    24825
1    24825
Name: count, dtype: int64 

Jumlah y_over :
 TARGET
1    282686
0    282686
Name: count, dtype: int64 

Jumlah y_over_smote :
 TARGET
1    282686
0    282686
Name: count, dtype: int64 



## 1. Logistic Regression

### Evaluation

In [31]:
logreg = LogisticRegression()
logreg.fit(X_train,y_train)
predictions = logreg.predict(X_test)
print(confusion_matrix(y_test,predictions))
print(classification_report(y_test,predictions))

[[84750    91]
 [ 7349    64]]
              precision    recall  f1-score   support

           0       0.92      1.00      0.96     84841
           1       0.41      0.01      0.02      7413

    accuracy                           0.92     92254
   macro avg       0.67      0.50      0.49     92254
weighted avg       0.88      0.92      0.88     92254



## 2. Random Forest

In [33]:
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train,y_train)
predictions = rf.predict(X_test)
print(confusion_matrix(y_test,predictions))
print(classification_report(y_test,predictions))

[[84824    17]
 [ 7387    26]]
              precision    recall  f1-score   support

           0       0.92      1.00      0.96     84841
           1       0.60      0.00      0.01      7413

    accuracy                           0.92     92254
   macro avg       0.76      0.50      0.48     92254
weighted avg       0.89      0.92      0.88     92254

