# 645 Final Project - Home Credit Default Risk

## Overview
-to predict how capable each applicant is able of repaying a loan

## Model
LogisticRegresion




# Function definitions

In [1]:
import mglearn

def print_grid_search_result(grid_search):
    '''Prints summary of best model from GridSearchCV object.
    
        For the best model of the grid search, print:
        - parameters 
        - cross-validation training score
        
        scores are printed with 3 decimal places.
        grid_search (sklearn GridSearchCV): Fitted GridSearchCV object
        returns: None

    '''
    print("Best parameters: {}".format(grid_search.best_params_))
    print("Best cross-validation score: {:.3f}".format(grid_search.best_score_))
def plot_grid_search_results(grid_search):
    '''For grids with 2 hyperparameters, create a heatmap plot of test scores
        grid_search (sklearn GridSearchCV): Fitted GridSearchCV object
        uses mglearn.tools.heatmap() for plotting.
        
    '''
    results = pd.DataFrame(grid_search.cv_results_)
    params = sorted(grid_search.param_grid.keys())
    assert len(params) == 2, "We can only plot two parameters."
    
    # second dimension in reshape are rows, needs to be the fast changing parameter
    scores = np.array(results.mean_test_score).reshape(len(grid_search.param_grid[params[0]]),
                                                      len(grid_search.param_grid[params[1]]))

    # plot the mean cross-validation scores
    # x-axis needs to be the fast changing parameter
    mglearn.tools.heatmap(scores, 
                          xlabel=params[1], 
                          xticklabels=grid_search.param_grid[params[1]], 
                          ylabel=params[0], 
                          yticklabels=grid_search.param_grid[params[0]],
                          cmap="viridis", fmt="%0.3f")

In [7]:
%matplotlib inline
import numpy as np # makes Python better than MATLAB for array manipulation
import pandas as pd # For reading and putting our CSV data in a shape suitable for processing 

# Traditional ML library
import sklearn 
import sklearn.metrics as metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV

from sklearn.model_selection import StratifiedKFold
from imblearn.ensemble import BalancedBaggingClassifier,BalancedRandomForestClassifier # ML for imbalanced problems
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline

import matplotlib.pylab as plt # for plotting


# Features/columns to be removed for several reasons: ethical irrelevant feature, etc
to_remove = ["job_name", "reason","channel","n_issues","ok_since","zip","gender"] 

# Read train and test data and drop columns
df_train = pd.read_csv("./Dataset/puzzle_train_dataset.csv").set_index("ids").drop(to_remove, axis=1)
df_test = pd.read_csv("./Dataset/puzzle_test_dataset.csv").set_index("ids").drop(to_remove, axis=1)

print("Nb. of train samples: %d" %df_train.shape[0])
print("Number of test samples:  %d" %df_test.shape[0])
print("Nb. features: %d" %(df_train.shape[1]-1))

# Remove samples with missing labels
df_train = df_train[pd.notnull(df_train['default'])]
df_train["default"] = df_train["default"].astype("int")

print("Nb. of train samples after removing samples with no label: %d" %df_train.shape[0])
print("Classes ratio (False/True): %f" %((df_train["default"] == 0).sum()/(df_train["default"] == 1).sum()))


# Dataset information
feats_info = pd.concat([df_train.isnull().sum(), df_train.dtypes, df_train.T.apply(lambda x: x.nunique(), axis=1)], axis=1)
feats_info.columns = ["Missing","Type","Unique"]
print("\n Dataset Information (missing features, data type, unique values)")
print(feats_info)

Nb. of train samples: 64592
Number of test samples:  35000
Nb. features: 18
Nb. of train samples after removing samples with no label: 59966
Classes ratio (False/True): 5.305573

 Dataset Information (missing features, data type, unique values)
                    Missing     Type  Unique
default                   0    int64       2
score_1                   0   object       7
score_2                   0   object      35
score_3                   0  float64      87
score_4                   0  float64   59966
score_5                   0  float64   59966
score_6                   0  float64   59966
risk_rate                 0  float64      81
amount_borrowed           0  float64   50484
borrowed_in_months        0  float64       2
credit_limit          18779  float64   26238
income                    0  float64   54273
sign                  18938   object      12
facebook_profile       5971   object       2
state                     0   object      50
real_state                0   objec

## Data Encoding and Feature Engineering
Encoding categorical data into integer format.  
Engineering new features

In [8]:
def get_encoder(df, col):
    dft = df[col].astype(str).to_frame().copy()
    dft["count"] = 1
    return dft.groupby(col).count().to_dict()["count"]
    
def encode_all(df_train, df_test, cols):
    for col in cols:
        enc = get_encoder(df_train, col)
        df_train[col] = df_train[col].astype(str).apply(lambda x: enc.get(x, -1))
        df_test[col] = df_test[col].astype(str).apply(lambda x: enc.get(x, -1))
    return df_train, df_test


# Create additional binary features flag for missing values for all features that have missing values
for col in df_test.columns:
    df_train["is_" + col + "_missing"] = df_train[col].isnull() * 1
    df_test["is_" + col + "_missing"] = df_test[col].isnull() * 1

# Represent categorical feature as a series o binary values and drop first column to avoid redundancy
df_train = pd.get_dummies(df_train, columns=['score_1','score_2','real_state'], drop_first=True)
df_test = pd.get_dummies(df_test, columns=['score_1','score_2','real_state'], drop_first=True)

encode_cols = df_train.dtypes
encode_cols = encode_cols[encode_cols == object].index.tolist()


# Encode categorical variables and replace NaN by -1
df_train, df_test = encode_all(df_train, df_test, encode_cols)
df_train, df_test = df_train.fillna(-1), df_test.fillna(-1)

# Split features and labels
X_train, y_train = df_train.drop(["default"], axis=1), df_train["default"]
X_test = df_test

# Grid Search 
Logistic Regression Classifier


In [None]:
#Do not run. 
#Takes 2hrs to complete
from sklearn.model_selection import GridSearchCV

param_grid = {
            'C':[1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100],
            'penalty':['none', 'l1', 'l2', 'elasticnet'],
            'solver':['newton-cg', 'lbfgs', 'liblinear'],
            'fit_intercept':[True,False]
            }

logreg = LogisticRegression(max_iter=2000, random_state=1)
grid_search = GridSearchCV(logreg, param_grid, cv=5, scoring='accuracy', return_train_score=True)
grid_search.fit(X_train, y_train)
print_grid_search_result(grid_search)

# Ploting model with best hyperparameters   
C = 0.1  
fit_intercetp = True  
penalty = l1  
solver = liblinear  


In [33]:
logreg = LogisticRegression(C=0.1, fit_intercept=True, penalty = 'l1', solver = 'liblinear', max_iter=2000, random_state = 1)
skf = StratifiedKFold( 5, shuffle=True, random_state=100)
cm = np.zeros((2,2))
aucs = []
for (fold, (i_train, i_test)) in enumerate(skf.split(X_train,y_train)):
    logreg.fit(X_train.iloc[i_train], y_train.iloc[i_train])
    i_pred_proba = logreg.predict_proba(X_train.iloc[i_test])
    pred = logreg.predict(X_train.iloc[i_test])
    print(i_pred_proba.shape)
    auc = metrics.roc_auc_score(y_train.iloc[i_test], i_pred_proba[:, 1])
    aucs.append(auc)
    
    cm += metrics.confusion_matrix(y_train.iloc[i_test],pred)
    print("AUC score on fold %i: %2.3f" % (fold, auc))
print("AUC: %2.3f +- %2.4f" % (np.mean(aucs), np.std(aucs)))
print("\nConfusion matrix:")
print(cm)
print("\nAccuracy %2.3f" %(cm.diagonal().sum()/cm.sum()))

logreg.fit(X_train,y_train)

TN = cm[0][0]
TP = cm[1][1]
FN = cm[1][0]
FP = cm[0][1]

accuracy = (TP+TN)/(TP+TN+FN+FP)
precision = TP/(TP+FP)
recall = TP/(TP+FN)
f1 = 2*((precision*recall)/(precision+recall))

print("Accuracy = %2.3f" %(cm.diagonal().sum()/cm.sum()))
print("Precision = %2.3f" %precision)
print("Recall = %2.3f" %recall)
print("F1-score = %2.3f" %f1)


(11994, 2)
AUC score on fold 0: 0.757
(11993, 2)
AUC score on fold 1: 0.755
(11993, 2)
AUC score on fold 2: 0.753
(11993, 2)
AUC score on fold 3: 0.757
(11993, 2)
AUC score on fold 4: 0.756
AUC: 0.756 +- 0.0015

Confusion matrix:
[[49806.   650.]
 [ 8335.  1175.]]

Accuracy 0.850
Accuracy = 0.850
Precision = 0.644
Recall = 0.124
F1-score = 0.207


## Feature Importance
Using Recursive Feature Elimination (RFE) to select most relavant in predicting the target variable features.

In [120]:
from sklearn.feature_selection import RFE
predictors = X_train
selector = RFE(logreg, n_features_to_select=1)
selector = selector.fit(predictors,y_train)

Order and print out feature ranking

In [122]:
order = selector.ranking_
feature_ranks_dict= {}
for i in selector.ranking_:
    feature_ranks_dict[i]=X_train.columns[i-1]

feature_ranks_dict_items=feature_ranks_dict.items()
sorted_dict = sorted(feature_ranks_dict_items)
print(sorted_dict)

[(1, 'score_3'), (2, 'score_4'), (3, 'score_5'), (4, 'score_6'), (5, 'risk_rate'), (6, 'amount_borrowed'), (7, 'borrowed_in_months'), (8, 'credit_limit'), (9, 'income'), (10, 'sign'), (11, 'facebook_profile'), (12, 'state'), (13, 'n_bankruptcies'), (14, 'n_defaulted_loans'), (15, 'n_accounts'), (16, 'is_score_1_missing'), (17, 'is_score_2_missing'), (18, 'is_score_3_missing'), (19, 'is_score_4_missing'), (20, 'is_score_5_missing'), (21, 'is_score_6_missing'), (22, 'is_risk_rate_missing'), (23, 'is_amount_borrowed_missing'), (24, 'is_borrowed_in_months_missing'), (25, 'is_credit_limit_missing'), (26, 'is_income_missing'), (27, 'is_sign_missing'), (28, 'is_facebook_profile_missing'), (29, 'is_state_missing'), (30, 'is_real_state_missing'), (31, 'is_n_bankruptcies_missing'), (32, 'is_n_defaulted_loans_missing'), (33, 'is_n_accounts_missing'), (34, 'score_1_4DLlLW62jReXaqbPaHp1vQ=='), (35, 'score_1_8k8UDR4Yx0qasAjkGrUZLw=='), (36, 'score_1_DGCQep2AE5QRkNCshIAlFQ=='), (37, 'score_1_e4NYDor1

# Run model on Test data
Create output csv file "predictions.csv"

In [34]:
logreg.fit(X_train, y_train)
my_predictions = pd.DataFrame(logreg.predict_proba(X_test)[:, 1], columns=["predictions"], index=X_test.index)
my_predictions.to_csv("logreg_predictions.csv")
my_predictions

Unnamed: 0_level_0,predictions
ids,Unnamed: 1_level_1
e4366223-7aa2-0904-7a47-66479ae46b2a,0.086984
c6416108-c6d7-e6be-c4b5-923dd36c8ec4,0.142029
a90d3929-86ec-2414-89ba-543776b0e82b,0.153835
c5b96a7f-389a-28d0-242d-95db05e69da0,0.550139
1b461faa-926d-565d-b15d-0b452968ac81,0.176417
...,...
cfe269ae-b893-c084-f9f5-3b91f9725b71,0.141661
2feff27a-3dcf-1e19-7583-a8eab192fd23,0.051271
601509fd-20d9-d3b8-b143-defcf5457d2c,0.139706
b0168e1c-ddbd-1b2c-acfb-f09638e1ee34,0.060648


# BalancedBaggingClassifier

In [20]:
#Hyperparameter optimization with GridSearchCV

from sklearn.model_selection import GridSearchCV
from imblearn.ensemble import BalancedBaggingClassifier
param_grid = {
            'base_estimator':[LogisticRegression(C=0.1, max_iter=2000, penalty='l1', random_state=1,solver='liblinear')],
            'n_estimators':[1, 10, 50, 100]
            }

bb_clf = BalancedBaggingClassifier(random_state=1)
grid_search = GridSearchCV(bb_clf, param_grid, cv=5, scoring='accuracy', return_train_score=True)
grid_search.fit(X_train, y_train)
print_grid_search_result(grid_search)

Best parameters: {'base_estimator': LogisticRegression(C=0.1, max_iter=2000, penalty='l1', random_state=1,
                   solver='liblinear'), 'n_estimators': 1}
Best cross-validation score: 0.693


In [30]:

from imblearn.ensemble import BalancedBaggingClassifier
bb_clf = BalancedBaggingClassifier(base_estimator=LogisticRegression(C=0.1, max_iter=2000, penalty='l1', random_state=1,solver='liblinear'), random_state=1, n_estimators=1)
skf = StratifiedKFold( 5, shuffle=True, random_state=100)
cm2 = np.zeros((2,2))
aucs2 = []
for (fold, (i_train, i_test)) in enumerate(skf.split(X_train,y_train)):
    bb_clf.fit(X_train.iloc[i_train], y_train.iloc[i_train])
    i_pred_proba = bb_clf.predict_proba(X_train.iloc[i_test])
    pred = bb_clf.predict(X_train.iloc[i_test])
    print(i_pred_proba.shape)
    auc = metrics.roc_auc_score(y_train.iloc[i_test], i_pred_proba[:, 1])
    aucs2.append(auc)
    
    cm2 += metrics.confusion_matrix(y_train.iloc[i_test],pred)
    print("AUC score on fold %i: %2.3f" % (fold, auc))
print("AUC: %2.3f +- %2.4f" % (np.mean(aucs2), np.std(aucs2)))
print("\nConfusion matrix:")
print(cm2)
print("\nAccuracy %2.3f" %(cm2.diagonal().sum()/cm2.sum()))

TN = cm2[0][0]
TP = cm2[1][1]
FN = cm2[1][0]
FP = cm2[0][1]

accuracy = (TP+TN)/(TP+TN+FN+FP)
precision = TP/(TP+FP)
recall = TP/(TP+FN)
f1 = 2*((precision*recall)/(precision+recall))

print("Accuracy = %2.3f" %(cm2.diagonal().sum()/cm2.sum()))
print("Precision = %2.3f" %precision)
print("Recall = %2.3f" %recall)
print("F1-score = %2.3f" %f1)

(11994, 2)
AUC score on fold 0: 0.754
(11993, 2)
AUC score on fold 1: 0.753
(11993, 2)
AUC score on fold 2: 0.752
(11993, 2)
AUC score on fold 3: 0.757
(11993, 2)
AUC score on fold 4: 0.754
AUC: 0.754 +- 0.0017

Confusion matrix:
[[35057. 15399.]
 [ 3065.  6445.]]

Accuracy 0.692
Accuracy = 0.692
Precision = 0.295
Recall = 0.678
F1-score = 0.411


In [32]:
logreg.fit(X_train, y_train)
my_predictions = pd.DataFrame(logreg.predict_proba(X_test)[:, 1], columns=["predictions"], index=X_test.index)
my_predictions.to_csv("bb_clf.csv")
my_predictions

Unnamed: 0_level_0,predictions
ids,Unnamed: 1_level_1
e4366223-7aa2-0904-7a47-66479ae46b2a,0.086984
c6416108-c6d7-e6be-c4b5-923dd36c8ec4,0.142029
a90d3929-86ec-2414-89ba-543776b0e82b,0.153835
c5b96a7f-389a-28d0-242d-95db05e69da0,0.550139
1b461faa-926d-565d-b15d-0b452968ac81,0.176417
...,...
cfe269ae-b893-c084-f9f5-3b91f9725b71,0.141661
2feff27a-3dcf-1e19-7583-a8eab192fd23,0.051271
601509fd-20d9-d3b8-b143-defcf5457d2c,0.139706
b0168e1c-ddbd-1b2c-acfb-f09638e1ee34,0.060648
