In [3]:
import warnings
warnings.simplefilter('ignore')

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import time
from sklearn.metrics import auc, roc_curve, classification_report
from datetime import datetime, date, time, timedelta

from scipy import stats

#import h2o
#from h2o.frame import H2OFrame
#from h2o.estimators.random_forest import H2ORandomForestEstimator
#from h2o.grid.grid_search import H2OGridSearch

%matplotlib inline
sns.set(style="white",context="talk")

# Recap
1. Build a model that is better than the bank's model.
2. Compare the bank's profitability with your model's profitability.
3. Describe the impact of the most important variables on the prediction. Also, focus on the
variable "is_employed", which describes whether the borrower is employed when she
asks for the loan. How does this variable impact the model? Explain why.
4. Are there any other variables, not in the data provided, that you'd have liked to include in
the model?

# 1. Load the data

In [4]:
# Read both raw comma-separated tables in one pass: loans first, borrowers second.
loan, borrower = (
    pd.read_csv(csv_name, sep=',')
    for csv_name in ("loan_table.csv", "borrower_table.csv")
)

In [5]:
def view(df):
    """Print the shape and column names of ``df`` and return an overview table.

    The returned frame has one column per column of ``df`` and stacks:
      * ``data_types``   - dtype of each column,
      * ``value_counts`` - number of distinct non-null values (``nunique``),
      * ``null_counts``  - number of missing values,
    followed by the first three rows of ``df`` as a sample.
    """
    print(f"The dataset consists of {df.shape[0]} rows and {df.shape[1]} columns")
    print(f"The columns are: {df.columns.tolist()}")
    # Each statistic is a Series indexed by column name; transposing turns the
    # statistics into rows so they line up with the sample rows below them.
    summary = pd.DataFrame({"data_types": df.dtypes,
                            "value_counts": df.nunique(),
                            "null_counts": df.isnull().sum()}).T
    return pd.concat([summary, df.iloc[0:3, :]], axis=0)

In [6]:
# Overview of the loan table: dtypes, distinct-value counts, null counts and the first three rows.
view(loan)

The dataset consists of 101100 rows and 5 columns
The columns are: ['loan_id', 'loan_purpose', 'date', 'loan_granted', 'loan_repaid']


Unnamed: 0,loan_id,loan_purpose,date,loan_granted,loan_repaid
data_types,int64,object,object,int64,float64
value_counts,101100,5,260,2,2
null_counts,0,0,0,0,53446
0,19454,investment,2012-03-15,0,
1,496811,investment,2012-01-17,0,
2,929493,other,2012-02-09,0,


In [7]:
# Overview of the borrower table: dtypes, distinct-value counts, null counts and the first three rows.
view(borrower)

The dataset consists of 101100 rows and 12 columns
The columns are: ['loan_id', 'is_first_loan', 'fully_repaid_previous_loans', 'currently_repaying_other_loans', 'total_credit_card_limit', 'avg_percentage_credit_card_limit_used_last_year', 'saving_amount', 'checking_amount', 'is_employed', 'yearly_salary', 'age', 'dependent_number']


Unnamed: 0,loan_id,is_first_loan,fully_repaid_previous_loans,currently_repaying_other_loans,total_credit_card_limit,avg_percentage_credit_card_limit_used_last_year,saving_amount,checking_amount,is_employed,yearly_salary,age,dependent_number
data_types,int64,int64,float64,float64,int64,float64,int64,int64,int64,int64,int64,int64
value_counts,101100,2,2,2,125,110,6845,9653,2,834,62,9
null_counts,0,0,54947,54947,0,6972,0,0,0,0,0,0
0,289774,1,,,8000,0.49,3285,1073,0,0,47,3
1,482590,0,1,0,4500,1.03,636,5299,1,13500,33,1
2,135565,1,,,6900,0.82,2085,3422,1,24500,38,8


### Merge the data

In [8]:
# Join loan outcomes with borrower attributes on the shared key.
# Each table lists every loan_id exactly once, so the join is validated as
# one-to-one: a duplicated key would otherwise silently multiply rows.
data = pd.merge(left = loan, right = borrower, how = "inner", on= "loan_id", validate = "one_to_one")
view(data)

The dataset consists of 101100 rows and 16 columns
The columns are: ['loan_id', 'loan_purpose', 'date', 'loan_granted', 'loan_repaid', 'is_first_loan', 'fully_repaid_previous_loans', 'currently_repaying_other_loans', 'total_credit_card_limit', 'avg_percentage_credit_card_limit_used_last_year', 'saving_amount', 'checking_amount', 'is_employed', 'yearly_salary', 'age', 'dependent_number']


Unnamed: 0,loan_id,loan_purpose,date,loan_granted,loan_repaid,is_first_loan,fully_repaid_previous_loans,currently_repaying_other_loans,total_credit_card_limit,avg_percentage_credit_card_limit_used_last_year,saving_amount,checking_amount,is_employed,yearly_salary,age,dependent_number
data_types,int64,object,object,int64,float64,int64,float64,float64,int64,float64,int64,int64,int64,int64,int64,int64
value_counts,101100,5,260,2,2,2,2,2,125,110,6845,9653,2,834,62,9
null_counts,0,0,0,0,53446,0,54947,54947,0,6972,0,0,0,0,0,0
0,19454,investment,2012-03-15,0,,1,,,8600,0.79,1491,6285,1,45200,42,7
1,496811,investment,2012-01-17,0,,1,,,5300,0.52,141,5793,0,0,42,5
2,929493,other,2012-02-09,0,,1,,,0,,660,3232,1,26500,60,4


In [9]:
# Show the bank's decision next to the observed repayment outcome.
data.loc[:, ["loan_granted", "loan_repaid"]]

Unnamed: 0,loan_granted,loan_repaid
0,0,
1,0,
2,0,
3,1,1.0
4,1,0.0
...,...,...
101095,1,0.0
101096,1,0.0
101097,1,1.0
101098,1,1.0


### Create the scores
If you grant the loan and it doesn't get repaid, you lose 1    
If you grant the loan and it does get repaid, you gain 1    
If you don't grant the loan, you gain 0    

In [10]:
def get_score(x,y):
    '''
    Profit score of a single loan decision.

    x: granted? (0 = loan not granted, 1 = loan granted)
    y: repaid?  (1 = repaid, 0 = not repaid; ignored when x == 0)

    Returns 0 when the loan was not granted, 1 when it was granted and
    repaid, and -1 when it was granted and not repaid.

    Raises ValueError for any other combination (for example a granted loan
    whose repayment flag is missing) instead of failing on an unassigned
    local variable.
    '''
    if x == 0:
        return 0
    if x == 1 and y == 1:
        return 1
    if x == 1 and y == 0:
        return -1
    raise ValueError(f"cannot score loan with granted={x!r}, repaid={y!r}")
    

# Score every loan from its (granted, repaid) pair, row by row.
data["score"] = [
    get_score(granted, repaid)
    for granted, repaid in zip(data.loan_granted, data.loan_repaid)
]
data["score"].value_counts()

 0    53446
 1    30706
-1    16948
Name: score, dtype: int64

### Deal with Null

In [11]:
# Missing values in these flag columns are replaced by the literal string "NA".
flag_columns = ["loan_repaid",
                "fully_repaid_previous_loans",
                "currently_repaying_other_loans"]
data[flag_columns] = data[flag_columns].fillna("NA")
#data.avg_percentage_credit_card_limit_used_last_year = data.avg_percentage_credit_card_limit_used_last_year.fillna("unknown")

In [12]:
#pip install -U scikit-learn

In [13]:
# Preview the one-hot encoded frame with the identifier and date columns left out.
data.drop(columns=["loan_id", "date"]).pipe(pd.get_dummies).head()

Unnamed: 0,loan_granted,is_first_loan,total_credit_card_limit,avg_percentage_credit_card_limit_used_last_year,saving_amount,checking_amount,is_employed,yearly_salary,age,dependent_number,...,loan_purpose_other,loan_repaid_0.0,loan_repaid_1.0,loan_repaid_NA,fully_repaid_previous_loans_0.0,fully_repaid_previous_loans_1.0,fully_repaid_previous_loans_NA,currently_repaying_other_loans_0.0,currently_repaying_other_loans_1.0,currently_repaying_other_loans_NA
0,0,1,8600,0.79,1491,6285,1,45200,42,7,...,0,0,0,1,0,0,1,0,0,1
1,0,1,5300,0.52,141,5793,0,0,42,5,...,0,0,0,1,0,0,1,0,0,1
2,0,1,0,,660,3232,1,26500,60,4,...,1,0,0,1,0,0,1,0,0,1
3,1,0,5400,0.52,3345,2764,1,15800,58,4,...,1,0,1,0,0,1,0,1,0,0
4,1,1,2900,0.76,1050,3695,1,34800,31,4,...,0,1,0,0,0,0,1,0,0,1


In [14]:
# dealing with avg_percentage_credit_card_limit_used_last_year
from sklearn.impute import KNNImputer

# One-hot encode once and reuse the result for both the imputer input and
# the output column labels (previously the encoding was computed twice).
data_encoded = pd.get_dummies(data.drop(["loan_id","date"],axis = 1))
# NOTE(review): data_encoded still contains `score` and the loan_repaid_*
# dummies, so outcome columns take part in the neighbour search. Consider
# excluding them from imputation - confirm against the modelling goals.

imputer = KNNImputer(n_neighbors=3)
imputed = imputer.fit_transform(data_encoded)
# Carry the source index over so df_imputed stays row-aligned with `data`.
df_imputed = pd.DataFrame(imputed, columns=data_encoded.columns, index=data_encoded.index)

# 2. EDA

In [15]:
# Overview of the imputed frame: dtypes, distinct-value counts, null counts and the first three rows.
view(df_imputed)

The dataset consists of 101100 rows and 25 columns
The columns are: ['loan_granted', 'is_first_loan', 'total_credit_card_limit', 'avg_percentage_credit_card_limit_used_last_year', 'saving_amount', 'checking_amount', 'is_employed', 'yearly_salary', 'age', 'dependent_number', 'score', 'loan_purpose_business', 'loan_purpose_emergency_funds', 'loan_purpose_home', 'loan_purpose_investment', 'loan_purpose_other', 'loan_repaid_0.0', 'loan_repaid_1.0', 'loan_repaid_NA', 'fully_repaid_previous_loans_0.0', 'fully_repaid_previous_loans_1.0', 'fully_repaid_previous_loans_NA', 'currently_repaying_other_loans_0.0', 'currently_repaying_other_loans_1.0', 'currently_repaying_other_loans_NA']


Unnamed: 0,loan_granted,is_first_loan,total_credit_card_limit,avg_percentage_credit_card_limit_used_last_year,saving_amount,checking_amount,is_employed,yearly_salary,age,dependent_number,...,loan_purpose_other,loan_repaid_0.0,loan_repaid_1.0,loan_repaid_NA,fully_repaid_previous_loans_0.0,fully_repaid_previous_loans_1.0,fully_repaid_previous_loans_NA,currently_repaying_other_loans_0.0,currently_repaying_other_loans_1.0,currently_repaying_other_loans_NA
data_types,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,...,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64
value_counts,2,2,125,363,6845,9653,2,834,62,9,...,2,2,2,2,2,2,2,2,2,2
null_counts,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0,0,1,8600,0.79,1491,6285,1,45200,42,7,...,0,0,0,1,0,0,1,0,0,1
1,0,1,5300,0.52,141,5793,0,0,42,5,...,0,0,0,1,0,0,1,0,0,1
2,0,1,0,0.726667,660,3232,1,26500,60,4,...,1,0,0,1,0,0,1,0,0,1


# 3. Modeling

In [16]:
# Keep only the loans that were granted, then drop the columns derived from
# the bank's decision (`loan_granted`) and from the outcome (`score`).
data_dum = df_imputed[df_imputed.loan_granted == 1].drop(["score","loan_granted"],axis =1)
# The loan_repaid_* dummies encode the target itself, so they are removed from the features.
x = data_dum.drop(['loan_repaid_1.0',"loan_repaid_NA","loan_repaid_0.0"],axis = 1)
# The target is read from `data`, not from df_imputed.
y = data.loc[data.loan_granted == 1,"loan_repaid"].astype(int)
# x and y come from two different frames: fail loudly if their rows ever stop lining up.
assert x.index.equals(y.index), "features and target rows are not aligned"

In [17]:
# hold out 20% as validation dataset for evaluation purpose
from sklearn.model_selection import train_test_split

# The split is returned as (x_train, x_valid, y_train, y_valid); the fixed seed keeps it repeatable.
split_parts = train_test_split(x, y, test_size=0.2, random_state=42)
x_train, x_valid, y_train, y_valid = split_parts

#### Random Forest

In [18]:
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# max_features='sqrt' is what the deprecated 'auto' meant for classifiers;
# 'auto' is rejected by recent scikit-learn releases.
rf_clf = RandomForestClassifier(oob_score = True, max_features='sqrt' ,random_state=10)
rf_parameters = {"n_estimators":[50,80,100], "max_depth":[3,5,7], "min_samples_split" :[2,5,8], "min_samples_leaf":[2,5,8]}
# Seed the search so the same 15 parameter combinations are sampled on every run.
rf_random_search = RandomizedSearchCV(rf_clf, rf_parameters, n_jobs = -1, cv = 2, scoring = "roc_auc", n_iter = 15, random_state = 10)
rf_random_search.fit(x_train, y_train.values.ravel())

print(rf_random_search.best_params_)
rf_clf_best = rf_random_search.best_estimator_
# This is accuracy on the training data, not on the held-out validation set.
print(rf_clf_best.score(x_train, y_train))

{'n_estimators': 100, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_depth': 7}
0.921438501691892


#### XGBoost

In [19]:
# XGBClassifier was imported twice (from xgboost.sklearn and from xgboost); one import is enough.
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score

xgb_clf = XGBClassifier()
xgb_parameters = {"n_estimators":[560,600,640], "learning_rate":[0.04,0.05], "max_depth":[4,6,8]}
# Seed the search so the same 10 parameter combinations are sampled on every run.
xgb_random_search = RandomizedSearchCV(xgb_clf, xgb_parameters, n_jobs = -1, cv = 10, scoring = "roc_auc", n_iter = 10, random_state = 10)
xgb_random_search.fit(x_train, y_train)

print(xgb_random_search.best_params_)
xgb_clf_best = xgb_random_search.best_estimator_
# This is accuracy on the training data, not on the held-out validation set.
print(xgb_clf_best.score(x_train, y_train))

{'n_estimators': 560, 'max_depth': 4, 'learning_rate': 0.04}
0.9337145555176665
