In [1]:
#Importing required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
%matplotlib inline

In [6]:
df=pd.read_csv('UCI_Credit_Card.csv')
df.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,defaulted
0,1,20000.0,2,2,1,24,2,2,-1,-1,...,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
1,2,120000.0,2,2,2,26,-1,2,0,0,...,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0,1
2,3,90000.0,2,2,2,34,0,0,0,0,...,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,0
3,4,50000.0,2,2,1,37,0,0,0,0,...,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0,0
4,5,50000.0,1,2,1,57,-1,0,-1,0,...,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0,0


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 25 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   ID         30000 non-null  int64  
 1   LIMIT_BAL  30000 non-null  float64
 2   SEX        30000 non-null  int64  
 3   EDUCATION  30000 non-null  int64  
 4   MARRIAGE   30000 non-null  int64  
 5   AGE        30000 non-null  int64  
 6   PAY_0      30000 non-null  int64  
 7   PAY_2      30000 non-null  int64  
 8   PAY_3      30000 non-null  int64  
 9   PAY_4      30000 non-null  int64  
 10  PAY_5      30000 non-null  int64  
 11  PAY_6      30000 non-null  int64  
 12  BILL_AMT1  30000 non-null  float64
 13  BILL_AMT2  30000 non-null  float64
 14  BILL_AMT3  30000 non-null  float64
 15  BILL_AMT4  30000 non-null  float64
 16  BILL_AMT5  30000 non-null  float64
 17  BILL_AMT6  30000 non-null  float64
 18  PAY_AMT1   30000 non-null  float64
 19  PAY_AMT2   30000 non-null  float64
 20  PAY_AM

In this case , we know that there are no major data quality issues, so we'll go ahead and build the model

```

## Data Preparation and Model Building

In [8]:
#Import train-train split
from sklearn.model_selection import train_test_split

In [9]:
# Putting feature variable to X
X = df.drop('defaulted' , axis=1)

# Putting  response variable to y
y=df['defaulted']

# Splitting the data into train and test
X_train , X_test , y_train , y_test = train_test_split(X , y , 
                                            test_size=0.30 , 
                                            random_state=99)

## Default Hyperparameters

Let's first fit a random forest model with default hyperparameters.

In [10]:
# Importing Random Forest classifier from sklearn library
from sklearn.ensemble import RandomForestClassifier

#Running the random forest with default parameters
rfc=RandomForestClassifier()

In [11]:
#fit
rfc.fit(X_train , y_train)

RandomForestClassifier()

In [12]:
# Making predictions
predictions=rfc.predict(X_test)

In [13]:

# Importing classification report and confusion matrix from sklearn metrics 
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [14]:
# Let's check the report of our default model
print(classification_report(y_test , predictions))

              precision    recall  f1-score   support

           0       0.84      0.95      0.89      6978
           1       0.66      0.36      0.46      2022

    accuracy                           0.81      9000
   macro avg       0.75      0.65      0.67      9000
weighted avg       0.80      0.81      0.79      9000



In [15]:
print(accuracy_score(y_test , predictions))

0.8136666666666666


So far so good , let's now look at the list of hyperparameters which we can tune to improve model performances.

```

## Hyperparameter Tuning

The following hyperparameters are present in a random forest classifier. Note that most of these hypereparameters are actually of the decision trees that are in the forest.
```

⚫n_estimators: integer, optional (default=10): The number of trees in the forest.

⚫ criterion: string, optional (default="gini") The function to measure the quality of a split. Supported criteria       are           "gini" for the Gini impurity and "entropy" for the information gain. Note: this parameter is tree-specific.

⚫ max_features: int, float, string or None, optiona consider when looking for the best split:

        If int, then consider max_features features at each split.

        If float, then max_features is a percentage and int(max features * n_features) features
        are considered at each split. 

        If "auto", then max_features=sqrt(n_features).

        If "sqrt", then max features=sqrt(n_features) (same as "auto").

        If "log2", then max_features=log2(n_features).

        If None, then max_features=n_features.

        Note: the search for a split does not stop until at least one valid partition of the node
        samples is found, even if it requires to effectively inspect more than max features features.


⚫ max_depth: integer or None, optional (default=None) The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.

⚫ min_samples_split: int, float, optional (default=2)The minimum number of samples
    required to split an internal node:**


"If int, then consider min_samples_split as the minimum number.

■**If float, then min_samples_split is a percentage and ceil(min_samples_split, n_samples) are the minimum number of samples for each split.

⚫ min_samples_leaf: int, float, optional (default=1)The minimum number of samples 
    required to be at a leaf node:**

        ■ If int, then consider min_samples_leaf as the minimum number.

        ■ If float, then min_samples_leaf is a percentage and ceil(min_samples_leaf * n_samples) are the minimum number of samples for each node.


⚫ min_weight_fraction_leaf: float, optional (default=0)The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided.

⚫  max_leaf_nodes: int or None, optional (default=None)Grow trees with max_leaf_nodes in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes.

⚫  min_impurity_split: float, Threshold for early stopping in tree growth. A node will split if its impurity is above the threshold, otherwise it is a leaf.


```

## Tuning max_depth

Let's try to find the optimum values for max_depth and understand how the value of max_depth impacts the  overall accuracy of the ensemble

In [21]:
 #GridSearchCV to find optimal n_estimators
from sklearn. model_selection import KFold 
from sklearn.model_selection import GridSearchCV


#specify number of folds for k-fold CV
n_folds = 5

# parameters to build the model on
parameters = {"max_depth": range(2, 20 , 5)}

# instantiate the model
rf=RandomForestClassifier()

#fit the tree on training data
rf=GridSearchCV(rf, parameters, cv=n_folds, scoring="accuracy")

rf.fit(X_train , y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'max_depth': range(2, 20, 5)}, scoring='accuracy')

In [22]:
#scores of GridSearch CV
scores= rf.cv_results_
pd.DataFrame(scores).head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.680956,0.061416,0.025605,0.003426,2,{'max_depth': 2},0.809048,0.79381,0.801429,0.801429,0.800476,0.801238,0.004834,4
1,1.603813,0.096302,0.041867,0.004218,7,{'max_depth': 7},0.824524,0.818333,0.817381,0.81881,0.817857,0.819381,0.002615,2
2,2.489509,0.082413,0.058343,0.004005,12,{'max_depth': 12},0.827143,0.818571,0.81619,0.819524,0.819286,0.820143,0.003693,1
3,3.031281,0.137769,0.079876,0.014708,17,{'max_depth': 17},0.825238,0.815,0.815952,0.82119,0.817857,0.819048,0.00375,3


In [23]:

# plotting accuracies with max_depth 
plt.figure()
plt.plot(scores["param_max_depth"], 
         scores["mean_train_score"], 
         label="training accuracy")

plt.plot(scores["param_max_depth"], 
         scores ["mean_test_score"],
         label="test accuracy") 
         
plt.xlabel("max_depth")
plt.ylabel("Accuracy")
plt.legend()
plt.show()

KeyError: 'mean_train_score'

<Figure size 640x480 with 0 Axes>

You can see that as we increase the value of max_depth, both train and test scores increase till a point, but after that test score starts to decrease. The ensemble tries to overfit as we increase the max_depth.

Thus, controlling the depth of the constituent trees will help reduce overfitting in the forest.

```

## Tuning n_estimators

Let's try to find the optimum values for n_estimators and understand how the value of n_estimators Impacts the overall accuracy. Notice that we'll specify an appropriately low value of max_depth, so that the trees do not overfit.

In [24]:
 #GridSearchCV to find optimal n_estimators
from sklearn. model_selection import KFold 
from sklearn.model_selection import GridSearchCV


#specify number of folds for k-fold CV
n_folds = 5

# parameters to build the model on
parameters = {"n_estimators": range(100, 1500 , 400)}

# instantiate the model (note we are specifying a max_depth)
rf=RandomForestClassifier(max_depth=4)

#fit the tree on training data
rf=GridSearchCV(rf, parameters, cv=n_folds, scoring="accuracy")

rf.fit(X_train , y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(max_depth=4),
             param_grid={'n_estimators': range(100, 1500, 400)},
             scoring='accuracy')

In [25]:
#scores of GridSearch CV
scores= rf.cv_results_
pd.DataFrame(scores).head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,1.102431,0.068789,0.03379,0.002364,100,{'n_estimators': 100},0.82,0.809286,0.810714,0.809762,0.80881,0.811714,0.00419,4
1,7.840957,1.260335,0.25354,0.04888,500,{'n_estimators': 500},0.816429,0.809524,0.810238,0.812619,0.812857,0.812333,0.002425,1
2,10.568546,1.253172,0.254601,0.006079,900,{'n_estimators': 900},0.81619,0.808333,0.810476,0.813333,0.812619,0.81219,0.002657,2
3,18.183132,4.644759,0.546713,0.139057,1300,{'n_estimators': 1300},0.816667,0.809286,0.809762,0.811905,0.812143,0.811952,0.002614,3


In [26]:

# plotting accuracies with max_depth 
plt.figure()
plt.plot(scores["param_max_depth"], 
         scores["mean_train_score"], 
         label="training accuracy")

plt.plot(scores["param_max_depth"], 
         scores ["mean_test_score"],
         label="test accuracy") 
         
plt.xlabel("max_depth")
plt.ylabel("Accuracy")
plt.legend()
plt.show()

KeyError: 'param_max_depth'

<Figure size 640x480 with 0 Axes>

```

## Tuning max_features

Let's see how the model performance varies with max_features, which is the maximum
numbre of features considered for splitting at a node.

In [27]:
 #GridSearchCV to find optimal n_estimators
from sklearn. model_selection import KFold 
from sklearn.model_selection import GridSearchCV


#specify number of folds for k-fold CV
n_folds = 5

# parameters to build the model on
parameters = {"max_features": [4 , 8 , 14 , 20 ,24]}

# instantiate the model (note we are specifying a max_depth)
rf=RandomForestClassifier(max_depth=4)

#fit the tree on training data
rf=GridSearchCV(rf, parameters, cv=n_folds, scoring="accuracy")

rf.fit(X_train , y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(max_depth=4),
             param_grid={'max_features': [4, 8, 14, 20, 24]},
             scoring='accuracy')


Apparently, the training and test scores both seem to increase as we increase max_features, and the model doesn't seem to overfit more with increasing max_features. Think about why that might be the case.

In [None]:
#scores of GridSearch CV
scores= rf.cv_results_
pd.DataFrame(scores).head()

In [None]:

# plotting accuracies with max_depth 
plt.figure()
plt.plot(scores["param_max_depth"], 
         scores["mean_train_score"], 
         label="training accuracy")

plt.plot(scores["param_max_depth"], 
         scores ["mean_test_score"],
         label="test accuracy") 
         
plt.xlabel("max_depth")
plt.ylabel("Accuracy")
plt.legend()
plt.show()

```


## Tuning min_samples_leaf

Let's now look at the performance of the ensembles as we  vary min_samples_split

In [28]:
 #GridSearchCV to find optimal n_estimators
from sklearn. model_selection import KFold 
from sklearn.model_selection import GridSearchCV


#specify number of folds for k-fold CV
n_folds = 5

# parameters to build the model on
parameters = {"min_samples_leaf": range(200, 500 , 50)}

# instantiate the model 
rf=RandomForestClassifier()

#fit the tree on training data
rf=GridSearchCV(rf, parameters, cv=n_folds, scoring="accuracy")

rf.fit(X_train , y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'min_samples_leaf': range(200, 500, 50)},
             scoring='accuracy')

In [30]:
#scores of GridSearch CV
scores= rf.cv_results_
pd.DataFrame(scores).head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_min_samples_leaf,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,1.755539,0.088854,0.04484,0.00592,200,{'min_samples_leaf': 200},0.816667,0.80881,0.81,0.811905,0.811667,0.81181,0.002679,2
1,1.702464,0.043095,0.049835,0.005799,250,{'min_samples_leaf': 250},0.81619,0.810476,0.81,0.811905,0.811429,0.812,0.002201,1
2,1.610397,0.088764,0.043477,0.00583,300,{'min_samples_leaf': 300},0.815714,0.807619,0.807143,0.81,0.813333,0.810762,0.003307,3
3,1.720648,0.130012,0.043554,0.004292,350,{'min_samples_leaf': 350},0.814286,0.807143,0.809048,0.81119,0.809524,0.810238,0.0024,4
4,1.555837,0.118843,0.045798,0.005906,400,{'min_samples_leaf': 400},0.812619,0.800714,0.805952,0.810238,0.805714,0.807048,0.004106,5


In [31]:

# plotting accuracies with max_depth 
plt.figure()
plt.plot(scores["param_min_samples_split"], 
         scores["mean_train_score"], 
         label="training accuracy")

plt.plot(scores["param_min_samples_split"], 
         scores ["mean_test_score"],
         label="test accuracy") 
         
plt.xlabel("min_samples_split")
plt.ylabel("Accuracy")
plt.legend()
plt.show()

KeyError: 'param_min_samples_split'

<Figure size 640x480 with 0 Axes>

```

## Grid Search to Find Optimal Hyperparameters

We can now find the optimal hyperparameters using GridSearchCV

In [32]:
# Create the parameter grid based on the results of random search
param_grid = {
    'max_depth': range(4, 8, 10),
    'min_samples_leaf': range(200, 500 , 50),
    'min_samples_split': range(50, 150, 50),
    'n_estimators': range(100, 200 , 300),
    'max_features': [5,10]
}

n_folds = 3

# instantiate the model 
rf=RandomForestClassifier()

#fit the tree on training data
grid_search=GridSearchCV(estimator=rf, param_grid=param_grid, cv=n_folds, n_jobs=-1 , verbose=1)

grid_search.fit(X_train , y_train)


Fitting 3 folds for each of 24 candidates, totalling 72 fits


GridSearchCV(cv=3, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'max_depth': range(4, 8, 10), 'max_features': [5, 10],
                         'min_samples_leaf': range(200, 500, 50),
                         'min_samples_split': range(50, 150, 50),
                         'n_estimators': range(100, 200, 300)},
             verbose=1)

In [33]:
# Printing the optimal accuracy score and hyperparameters
print("best accuracy" , grid_search.best_score_)
print(grid_search.best_estimator_)

best accuracy 0.8202380952380953
RandomForestClassifier(max_depth=4, max_features=10, min_samples_leaf=200,
                       min_samples_split=100)


#### Running the model with best parameters obtained from grid search

In [34]:
# model with the best hyperparameter
from sklearn.ensemble import RandomForestClassifier
rfc=RandomForestClassifier(max_depth=4, max_features=10, min_samples_leaf=200,
                       min_samples_split=100)

In [35]:
rfc.fit(X_train , y_train)

RandomForestClassifier(max_depth=4, max_features=10, min_samples_leaf=200,
                       min_samples_split=100)

In [36]:
predictions=rfc.predict(X_test)

In [37]:

# Importing classification report and confusion matrix from sklearn metrics 
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [38]:
# Let's check the report of our default model
print(classification_report(y_test , predictions))

              precision    recall  f1-score   support

           0       0.84      0.95      0.89      6978
           1       0.69      0.36      0.47      2022

    accuracy                           0.82      9000
   macro avg       0.77      0.66      0.68      9000
weighted avg       0.81      0.82      0.80      9000



In [40]:
# Let's check the report of our default model
print(confusion_matrix(y_test , predictions))

[[6659  319]
 [1296  726]]


In [41]:
(6659+726) / (6659+726+319+1296 )

0.8205555555555556