In [1]:
import numpy as np
import pandas as pd


### Basic Data Exploration

In [2]:
# Load the credit data set from credit.csv (path is relative to the working directory).
credit_df = pd.read_csv("credit.csv")

In [3]:
# Summary statistics of the numeric columns, one row per column.
credit_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
months_loan_duration,1000.0,20.903,12.058814,4.0,12.0,18.0,24.0,72.0
amount,1000.0,3271.258,2822.736876,250.0,1365.5,2319.5,3972.25,18424.0
percent_of_income,1000.0,2.973,1.118715,1.0,2.0,3.0,4.0,4.0
years_at_residence,1000.0,2.845,1.103718,1.0,2.0,3.0,4.0,4.0
age,1000.0,35.546,11.375469,19.0,27.0,33.0,42.0,75.0
existing_loans_count,1000.0,1.407,0.577654,1.0,1.0,1.0,2.0,4.0
dependents,1000.0,1.155,0.362086,1.0,1.0,1.0,1.0,2.0


In [4]:
# Peek at the first 10 rows to see the raw categorical and numeric values.
credit_df.head(10)  

Unnamed: 0,checking_balance,months_loan_duration,credit_history,purpose,amount,savings_balance,employment_duration,percent_of_income,years_at_residence,age,other_credit,housing,existing_loans_count,job,dependents,phone,default
0,< 0 DM,6,critical,furniture/appliances,1169,unknown,> 7 years,4,4,67,none,own,2,skilled,1,yes,no
1,1 - 200 DM,48,good,furniture/appliances,5951,< 100 DM,1 - 4 years,2,2,22,none,own,1,skilled,1,no,yes
2,unknown,12,critical,education,2096,< 100 DM,4 - 7 years,2,3,49,none,own,1,unskilled,2,no,no
3,< 0 DM,42,good,furniture/appliances,7882,< 100 DM,4 - 7 years,2,4,45,none,other,1,skilled,2,no,no
4,< 0 DM,24,poor,car,4870,< 100 DM,1 - 4 years,3,4,53,none,other,2,skilled,2,no,yes
5,unknown,36,good,education,9055,unknown,1 - 4 years,2,4,35,none,other,1,unskilled,2,yes,no
6,unknown,24,good,furniture/appliances,2835,500 - 1000 DM,> 7 years,3,4,53,none,own,1,skilled,1,no,no
7,1 - 200 DM,36,good,car,6948,< 100 DM,1 - 4 years,2,2,35,none,rent,1,management,1,yes,no
8,unknown,12,good,furniture/appliances,3059,> 1000 DM,4 - 7 years,2,4,61,none,own,1,unskilled,1,no,no
9,1 - 200 DM,30,critical,car,5234,< 100 DM,unemployed,4,2,28,none,own,2,management,1,no,yes


In [5]:
# (rows, columns) of the raw data frame.
credit_df.shape

(1000, 17)

In [6]:
# Same summary statistics as the transposed table above, in the default orientation
# (one row per statistic, one column per numeric feature).
credit_df.describe()


Unnamed: 0,months_loan_duration,amount,percent_of_income,years_at_residence,age,existing_loans_count,dependents
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,20.903,3271.258,2.973,2.845,35.546,1.407,1.155
std,12.058814,2822.736876,1.118715,1.103718,11.375469,0.577654,0.362086
min,4.0,250.0,1.0,1.0,19.0,1.0,1.0
25%,12.0,1365.5,2.0,2.0,27.0,1.0,1.0
50%,18.0,2319.5,3.0,3.0,33.0,1.0,1.0
75%,24.0,3972.25,4.0,4.0,42.0,2.0,1.0
max,72.0,18424.0,4.0,4.0,75.0,4.0,2.0


In [7]:
# Many columns are of dtype object (i.e. strings). They must be encoded numerically before
# modelling; this notebook one-hot encodes them with pd.get_dummies in the next section.
credit_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 17 columns):
checking_balance        1000 non-null object
months_loan_duration    1000 non-null int64
credit_history          1000 non-null object
purpose                 1000 non-null object
amount                  1000 non-null int64
savings_balance         1000 non-null object
employment_duration     1000 non-null object
percent_of_income       1000 non-null int64
years_at_residence      1000 non-null int64
age                     1000 non-null int64
other_credit            1000 non-null object
housing                 1000 non-null object
existing_loans_count    1000 non-null int64
job                     1000 non-null object
dependents              1000 non-null int64
phone                   1000 non-null object
default                 1000 non-null object
dtypes: int64(7), object(10)
memory usage: 132.9+ KB


### 1-Hot-Encoding

In [8]:
# Column dtypes: the object columns are the ones that get dummy-encoded below.
credit_df.dtypes

checking_balance        object
months_loan_duration     int64
credit_history          object
purpose                 object
amount                   int64
savings_balance         object
employment_duration     object
percent_of_income        int64
years_at_residence       int64
age                      int64
other_credit            object
housing                 object
existing_loans_count     int64
job                     object
dependents               int64
phone                   object
default                 object
dtype: object

In [9]:
# One-hot encode the feature columns; the target column "default" is excluded first
# so that it is not turned into dummy columns.
credit_df_mod = pd.get_dummies(credit_df.drop("default", axis="columns"))

In [10]:
# (rows, columns) after one-hot encoding.
credit_df_mod.shape

(1000, 44)

### Train-Test Split

In [11]:
# Feature matrix: the one-hot encoded attributes.
# Target vector: the original "default" column (string labels).
X, y = credit_df_mod, credit_df["default"]




In [12]:
# Hold out 30% of the rows as a test set; random_state pins the shuffle so the split
# is the same on every run.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=1,
)


In [13]:
# 0/1 indicator arrays for the target (1 where the label is "yes", 0 otherwise),
# as needed by metrics that expect a numeric positive class.
y_train_ind = np.where(y_train.eq("yes"), 1, 0)
y_test_ind = np.where(y_test.eq("yes"), 1, 0)

### Decision Tree Model

In [None]:
# Build a decision tree classifier that chooses splits by information gain ('entropy');
# the other common option is the Gini index.
# max_depth=5 caps the depth of the tree (no particular reason for choosing 5) and
# min_impurity_decrease=0.01 rejects splits that reduce impurity by less than 0.01.
# random_state is fixed so that ties between equally good splits are resolved the same
# way on every run and the reported metrics are reproducible.

from sklearn.tree import DecisionTreeClassifier
dt_model = DecisionTreeClassifier(criterion='entropy', max_depth=5,
                                  min_impurity_decrease=0.01, random_state=1)

In [None]:
# Fit the tree on the training split (string labels are used as the target).
dt_model.fit(X_train, y_train)

In [None]:
from IPython.display import Image
#import pydotplus as pydot
from sklearn import tree
from os import system

# Export the fitted tree in Graphviz .dot format.
# - The context manager closes the file even if the export raises.
# - class_names must contain each class exactly once, in the order the model uses
#   (dt_model.classes_). Passing the raw training labels (list(y_train)) makes the
#   exporter pick node labels from the first few rows of y_train, which mislabels nodes.
# - export_graphviz returns None when out_file is given, so its result is not kept.
with open('credit_tree.dot', 'w') as credit_tree_file:
    tree.export_graphviz(dt_model, out_file=credit_tree_file,
                         feature_names=list(X_train.columns),
                         class_names=[str(label) for label in dt_model.classes_])


# Importance of features in the tree building (the importance of a feature is computed as the
# (normalized) total reduction of the split criterion brought by that feature; it is also
# known as the Gini importance).

print (pd.DataFrame(dt_model.feature_importances_, columns = ["Imp"], index = X_train.columns))


In [None]:
# To view the tree, paste the contents of credit_tree.dot into http://webgraphviz.com/



### Predict Model Performance

In [None]:
# Hard class-label predictions of the fitted tree on both splits.
y_predict_train = dt_model.predict(X_train)
y_predict_test = dt_model.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score

In [None]:
# Confusion matrix of true test labels (first argument) against predicted labels (second).
print(confusion_matrix(y_test, y_predict_test))

In [None]:
# Accuracy on each split; a large gap between the two would indicate overfitting.
train_accuracy = accuracy_score(y_train, y_predict_train)
print("Train accuracy is", train_accuracy)
test_accuracy = accuracy_score(y_test, y_predict_test)
print("Test accuracy is", test_accuracy)

In [None]:
# Accuracy by hand: correct predictions (the diagonal of the confusion matrix) divided by
# the number of test rows. The counts are read from the matrix rather than typed in, so
# the figure stays correct when the model or the split changes.
test_conf_matrix = confusion_matrix(y_test, y_predict_test)
np.trace(test_conf_matrix) / test_conf_matrix.sum()

### In-Class assignment
Q1 : What is the test accuracy when the max_depth is set to 10?

Q2: If we predict that no one will default, what is the accuracy of the model?


### Predicting probabilities

In [None]:
# Class-membership probabilities for the test set (columns follow dt_model.classes_).
# NOTE(review): this rebinds y_predict_test, which earlier held the hard label predictions;
# re-running the confusion-matrix / accuracy cells after this one would use the
# probability matrix instead of labels.
y_predict_test = dt_model.predict_proba(X_test)

In [None]:
# Class labels in the order used for the predict_proba columns.
dt_model.classes_

In [None]:
from sklearn.metrics import roc_auc_score

In [None]:
# Second probability column = score for dt_model.classes_[1]
# (presumably the "yes" class -- confirm against the classes_ output above).
y_test_pred_prob = y_predict_test[:, 1]
# 0/1 ground truth for the test split (1 where the label is "yes").
y_test_ind = np.where(y_test.eq("yes"), 1, 0)

In [None]:
# Test-set ROC AUC of the single decision tree.
roc_auc_score(y_test_ind,y_test_pred_prob)

#                             Ensemble Learning - Bagging

In [None]:
from sklearn.ensemble import BaggingClassifier

# Bagging ensemble of 500 trees that share dt_model's configuration.
# The base estimator is passed positionally because its keyword was renamed from
# `base_estimator` to `estimator` in newer scikit-learn releases; the positional form
# works with both. random_state pins the bootstrap samples so the AUC is reproducible.
bgcl = BaggingClassifier(dt_model, n_estimators=500, random_state=1)
bgcl = bgcl.fit(X_train, y_train)


In [None]:
# Bagging ensemble's test-set probability for the class in column 1 of predict_proba.
y_predict_prob_bgc = bgcl.predict_proba(X_test)[:,1]

In [None]:
# Test-set ROC AUC of the bagging ensemble.
roc_auc_score(y_test_ind,y_predict_prob_bgc)

# Ensemble RandomForest Classifier

In [52]:
from sklearn.ensemble import RandomForestClassifier

# A deliberately small forest: 5 fully grown trees (max_depth=None), seeded for repeatability.
rfcl = RandomForestClassifier(
    n_estimators=5,
    max_depth=None,
    random_state=1,
).fit(X_train, y_train)


In [53]:
# Forest probabilities for the class in column 1 of predict_proba, on both splits.
y_pred_train = rfcl.predict_proba(X_train)[:, 1]
y_pred_test = rfcl.predict_proba(X_test)[:, 1]


In [54]:
from sklearn.metrics import roc_auc_score
# ROC AUC of the forest on the data it was trained on.
print("train auc is",roc_auc_score(y_train_ind,y_pred_train))

train auc is 0.9899955770931887


In [55]:
# ROC AUC of the forest on the held-out test split, for comparison with the train AUC.
print("test auc is",roc_auc_score(y_test_ind,y_pred_test))

test auc is 0.7237828732884156


<b> What happens as we adjust the values of n_estimators and max_depth? 
 What is the impact on train and test accuracy? </b>

## Cross Validation and Grid Search <a name="cv_grid_search"></a>

We should be careful not to use our test set for model tuning or model selection: once the test set has influenced those choices, it no longer gives an unbiased measure of how well the model generalizes.

Instead, the preferred approach is to use cross-validation on the training set for hyperparameter tuning and model selection.


In [56]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validated ROC AUC of the forest, computed on the training split only.
cross_val_results = cross_val_score(
    estimator=rfcl,
    X=X_train,
    y=y_train,
    scoring="roc_auc",
    cv=5,
)
cross_val_results

array([0.6486711 , 0.66614721, 0.6171182 , 0.75893071, 0.68753068])

In [57]:
# Mean of the per-fold AUC scores.
print("average cross validated auc is - ", np.mean(cross_val_results))

average cross validated auc is -  0.6756795789521755


In [58]:

from sklearn.model_selection import GridSearchCV

# Candidate hyperparameter values; every combination is scored with 5-fold
# cross-validated ROC AUC on the training split.
parameters= {'n_estimators' : [100,200,500], 'max_depth' : [4,6,8,10,20,None]}

# The forest is seeded so that the scores, and therefore the selected hyperparameters,
# are the same on every run; an unseeded forest can pick a different "optimum" each time.
rf_grid_search = GridSearchCV(RandomForestClassifier(random_state=1), parameters, scoring='roc_auc', cv = 5)
rf_grid_search.fit(X_train,y_train)
grid_search_results = rf_grid_search.cv_results_
print("Max score achieved",rf_grid_search.best_score_)
# best_params_ holds the winning value for each searched hyperparameter.
print("Optimal value of Max Depth Hyperparameter - ",rf_grid_search.best_params_['max_depth'])
print("Optimal value of Trees - ", rf_grid_search.best_params_['n_estimators'])

Max score achieved 0.7890764928089947
Optimal value of Max Depth Hyperparameter -  6
Optimal value of Trees -  200


In [59]:
## Understanding OOB accuracy

# With oob_score=True each training row is scored using only the trees whose bootstrap
# sample did not contain it, which gives a validation-style estimate without a separate
# hold-out set. random_state pins the bootstrap samples so the OOB figures are reproducible.
rfcl = RandomForestClassifier(n_estimators = 500, oob_score=True, random_state=1)
rfcl = rfcl.fit(X_train, y_train)

In [None]:
# Out-of-bag score of the forest fitted with oob_score=True.
rfcl.oob_score_

In [None]:
# ROC AUC from the out-of-bag class probabilities (column 1) against the 0/1 training labels.
print("OOB auc is",roc_auc_score(y_train_ind, rfcl.oob_decision_function_[:,1]))

In [None]:
# How does changing the value of max_depth impact train and test set performance?