## Problem Statement 
Before agreeing to give a loan, we want to score the
customer and assess their chances of default. If the risk is too high, we reject the application.
This process is called “credit risk scoring.”

## Download dataset 


In [None]:
## download the raw credit scoring CSV (notebook shell escape) so it can be read by file name below
!wget https://github.com/gastonstat/CreditScoring/raw/master/CreditScoring.csv

## Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline 

In [None]:
## load the downloaded CSV file into a pandas DataFrame
df = pd.read_csv(filepath_or_buffer="CreditScoring.csv")

In [None]:
## preview the first five rows of the raw data
df.head(n=5)

In [None]:
## normalize every column name to lowercase, then preview the result
df.columns = df.columns.map(str.lower)
df.head(n=5)

In [None]:
## inspect the dtype pandas assigned to each column
df.dtypes

In [None]:
## lets change the values in the status column
## by mapping the numeric codes onto readable labels.
## 'unk' is keyed on 0 here, the same code every other categorical
## mapping in this notebook uses for unknown values; with the previous
## key 3, rows coded 0 were silently mapped to NaN.
## NOTE(review): confirm against the raw CSV that status only takes 0, 1 and 2
status_values = {
    0: 'unk',
    1: 'ok',
    2: 'default',
}

## lets use the dictionary to do the mapping
df.status = df.status.map(status_values)

In [None]:
## translate the numeric home codes into readable labels
## (the list position is the numeric code: 0 -> 'unk', 1 -> 'rent', ...)
home_values = dict(enumerate(
    ['unk', 'rent', 'owner', 'private', 'ignore', 'parents', 'other']
))

df['home'] = df['home'].map(home_values)

In [None]:
## translate the numeric marital-status codes into readable labels
marital_labels = ('unk', 'single', 'married', 'widow', 'separated', 'divorced')
marital_values = {code: label for code, label in enumerate(marital_labels)}

df['marital'] = df['marital'].map(marital_values)

In [None]:
## translate the numeric records codes into readable labels
records_values = dict(zip((0, 1, 2), ('unk', 'no', 'yes')))

df['records'] = df['records'].map(records_values)

In [None]:
## translate the numeric job codes into readable labels
## (the list position is the numeric code: 0 -> 'unk', 1 -> 'fixed', ...)
job_values = dict(enumerate(['unk', 'fixed', 'parttime', 'freelance', 'others']))

df['job'] = df['job'].map(job_values)

In [None]:
## preview the data again now that the categorical codes are decoded
df.head()

In [None]:
## summary statistics of the dataset, rounded to whole numbers
df.describe().round(decimals=0)

In [None]:
## 99999999 is an artificial placeholder in these columns;
## swap it for NaN so it is treated as a missing value
MISSING_SENTINEL = 99999999

for column in ('income', 'assets', 'debt'):
    df[column] = df[column].replace(MISSING_SENTINEL, np.nan)

In [None]:
## summary statistics again, to compare with the earlier summary after the 99999999 replacement
df.describe().round()

## Target Variable analysis

In [None]:
## how many rows fall into each status label
df['status'].value_counts()

## Dataset Preparation
- Split the dataset into train, validation, and test. (60%, 20%, 20%)
- Handle missing values.
- Use one-hot encoding to encode categorical variables.
- Create the feature matrix X and the target variable y.

In [None]:
## import libaries
from sklearn.model_selection import train_test_split


In [None]:
## hold out 20% of the rows as the test set; random_state fixes the shuffle seed
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=11)

In [None]:
## split the remaining 80% again: 25% of it (20% of all rows) becomes the validation set
df_train, df_valid = train_test_split(df_train_full, test_size=0.25, random_state=11)

In [None]:
## report how many rows ended up in each split
n_train, n_valid, n_test = len(df_train), len(df_valid), len(df_test)

print(f'Train set {n_train}')
print(f'Valid set {n_valid}')
print(f'Test set {n_test}')

In [None]:
## build boolean target arrays: True where the status is 'default'
y_train = df_train['status'].eq('default').to_numpy()
y_valid = df_valid['status'].eq('default').to_numpy()

In [None]:
## remove the target column from both frames so it cannot be used as a feature
for frame in (df_train, df_valid):
    del frame['status']

In [None]:
## count the missing values per column in the training frame
df_train.isna().sum()

In [None]:
## replace every missing value in the training frame with 0
df_train = df_train.fillna(value=0)

In [None]:
## count the missing values per column in the validation frame
df_valid.isna().sum()

In [None]:
## replace every missing value in the validation frame with 0
df_valid = df_valid.fillna(value=0)

## Feature Engineering 
- Let's encode the categorical variables in our dataframe
- Implement our DictVectorizer

In [None]:
## turn each frame into a list of row dictionaries (one dict per row)
train_dict, valid_dict = (
    frame.to_dict(orient='records') for frame in (df_train, df_valid)
)

In [None]:
## import DictVectorizer
from sklearn.feature_extraction import DictVectorizer

dv = DictVectorizer(sparse=False)

## learn the feature layout (one column per numeric field / category value)
## from the training data only
X_train = dv.fit_transform(train_dict)

## reuse the fitted layout for validation data; calling fit_transform here
## would re-fit the vectorizer on the validation set, so the columns of
## X_valid (and dv.feature_names_) could stop matching X_train
X_valid = dv.transform(valid_dict)

## Decision trees
A decision tree is a data structure that encodes a series of if-then-else rules. <br>
Each node in a tree contains a condition. If the condition is satisfied, we go to the <br>
right side of the tree; otherwise, we go to the left. In the end we arrive at the final decision

In [None]:
## fit a decision tree classifier with its default hyperparameters
from sklearn.tree import DecisionTreeClassifier

dt_cl_model = DecisionTreeClassifier()
dt_cl_model.fit(X=X_train, y=y_train)

In [None]:
## lets make predictions on the training set
## (column 1 of predict_proba is kept as the score)
y_train_pred = dt_cl_model.predict_proba(X_train)[:, 1]

In [None]:
## sanity check: size of the training target array
y_train.shape

In [None]:
## sanity check: shape of the training feature matrix
X_train.shape

In [None]:
## lets evaluate our model using the AUC
from sklearn.metrics import roc_auc_score

train_auc_score = roc_auc_score(y_train, y_train_pred)

## the value is a ROC AUC, not an accuracy, so the label says AUC
print(f"Training AUC: {train_auc_score}")

In [None]:
## lets score the validation dataset with the fitted tree
## (column 1 of predict_proba is kept as the score)
y_valid_pred = dt_cl_model.predict_proba(X_valid)[:, 1]

In [None]:
## lets evaluate our model using the AUC
from sklearn.metrics import roc_auc_score

valid_auc_score = roc_auc_score(y_valid, y_valid_pred)

## the value is a ROC AUC, not an accuracy, so the label says AUC
print(f"Validation AUC: {valid_auc_score}")

### Decision tree with a limited depth (max_depth=2)

In [None]:
## fit a second decision tree, this time limited to max_depth=2
from sklearn.tree import DecisionTreeClassifier

dt_cl_model = DecisionTreeClassifier(max_depth=2)
dt_cl_model.fit(X=X_train, y=y_train)


In [None]:
## lets make predictions on the training set with the depth-limited tree
## (column 1 of predict_proba is kept as the score)
y_train_pred = dt_cl_model.predict_proba(X_train)[:, 1]

In [None]:
## lets evaluate our model using the AUC
from sklearn.metrics import roc_auc_score

train_auc_score = roc_auc_score(y_train, y_train_pred)

## the value is a ROC AUC, not an accuracy, so the label says AUC
print(f"Training AUC: {train_auc_score}")

In [None]:
## lets score the validation dataset with the depth-limited tree
## (column 1 of predict_proba is kept as the score)
y_valid_pred = dt_cl_model.predict_proba(X_valid)[:, 1]

In [None]:
## lets evaluate our model using the AUC
from sklearn.metrics import roc_auc_score

valid_auc_score = roc_auc_score(y_valid, y_valid_pred)

## the value is a ROC AUC, not an accuracy, so the label says AUC
print(f"Validation AUC: {valid_auc_score}")

#### IMPURITY
When training a decision tree model, we want to find a threshold T such that the impurity of both groups is minimal.<br>
So, the algorithm for finding T is quite simple:
- Try all possible values of T.
- For each T, split the dataset into left and right groups and measure their impurity.
- Select T that has the lowest degree of impurity.

#### STOPPING CRITERIA 
To decide if we want to continue splitting the data, we use stopping criteria — <br>
criteria that describe if we should add another split in the tree or stop. <br>
The most common stopping criteria are:
- The group is already pure.
- The tree reached the depth limit (controlled by the max_depth parameter).
- The group is too small to continue splitting (controlled by the min_samples_leaf parameter).

Let’s use this information to adjust the training algorithm:
- Find the best split:
    - For each feature try all possible threshold values.
    - Use the one with the lowest impurity.
- If the maximum allowed depth is reached, stop.
- If the group on the left is sufficiently large and it’s not pure yet, repeat on the left.
- If the group on the right is sufficiently large and it’s not pure yet, repeat on the right.

### Parameter tuning for decision tree 
Parameter tuning involves finding the best parameters of the model. <br>
This usually consists of changing the model's parameters and checking its score on the validation dataset.          

In [None]:
## sweep max_depth over a handful of values (including None)
## and print the validation AUC obtained with each one
depth_grid = [1, 2, 3, 4, 5, 6, 10, 15, 20, None]

for depth in depth_grid:
    dt = DecisionTreeClassifier(max_depth=depth).fit(X_train, y_train)

    y_pred_valid = dt.predict_proba(X_valid)[:, 1]
    valid_auc_score = roc_auc_score(y_valid, y_pred_valid)

    print(f"{depth} -> {round(valid_auc_score, 3)}")

In [None]:
## let's tune min_samples_leaf for several max_depth values,
## printing the validation AUC of every (depth, leaf) pair
leaf_grid = [1, 5, 10, 15, 20, 100, 200]

for depth in [3, 4, 5, 6]:
    for leaf in leaf_grid:
        dt = DecisionTreeClassifier(
            max_depth=depth, min_samples_leaf=leaf
        ).fit(X_train, y_train)

        y_pred_valid = dt.predict_proba(X_valid)[:, 1]
        valid_auc_score = roc_auc_score(y_valid, y_pred_valid)

        print(f"{depth} -> {leaf} -> {round(valid_auc_score, 3)}")

In [None]:
## train the final decision tree with the chosen depth and leaf size
dt = DecisionTreeClassifier(max_depth=6, min_samples_leaf=20)
dt.fit(X=X_train, y=y_train)


## Random forest 
The combination of multiple models for predictive purposes is known as ensemble learning,<br>
and a combination of models is called an ensemble. The easiest way to have different models is to train each tree on a different subset of features. <br> 
This way of putting together multiple decision trees into an ensemble is called a random forest.<br>
To train a random forest, we can do this (figure 6.26):
- Train N independent decision tree models.
- For each model, select a random subset of features, and use only them for training.
- When predicting, combine the output of N models into one.

### Training a random forest 

In [None]:
## lets implement a RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier

## NOTE: no random_state is passed here, so the score can change between runs
rf_model = RandomForestClassifier(n_estimators=10)
rf_model.fit(X_train, y_train)

## score the validation set (column 1 of predict_proba is kept as the score)
y_pred_valid = rf_model.predict_proba(X_valid)[:, 1]
valid_auc_score = roc_auc_score(y_valid, y_pred_valid)

## the value is a ROC AUC, not an accuracy, so the label says AUC
print(f"Validation AUC -> {round(valid_auc_score,2)}")

Every time we retrain the model, the score changes: it varies from 77% to 80%. <br>
The reason for this is randomization: to train a tree, we randomly select a subset of features. <br>
To make the results consistent, we need to fix the seed for the random number generator by assigning some value to the random_state parameter:

In [None]:
## same forest as before, but with a fixed random_state so reruns give the same score
rf_model = RandomForestClassifier(n_estimators=10, random_state=3)
rf_model.fit(X_train, y_train)

## score the validation set (column 1 of predict_proba is kept as the score)
y_pred_valid = rf_model.predict_proba(X_valid)[:, 1]
valid_auc_score = roc_auc_score(y_valid, y_pred_valid)

## the value is a ROC AUC, not an accuracy, so the label says AUC
print(f"Validation AUC -> {round(valid_auc_score,2)}")

The number of trees in the ensemble is an important parameter, and it influences <br>
the performance of the model. Usually, a model with more trees is better than a model <br>
with fewer trees. On the other hand, adding too many trees is not always helpful. <br>
To see how many trees we need, we can iterate over different values for n_estimators <br>
and see its effect on AUC:

In [None]:
## record the validation AUC for forests of 10, 20, ..., 200 trees
aucs = []

for n_trees in range(10, 201, 10):
    rf_model = RandomForestClassifier(
        n_estimators=n_trees, random_state=3
    ).fit(X_train, y_train)

    y_pred_valid = rf_model.predict_proba(X_valid)[:, 1]
    valid_auc_score = roc_auc_score(y_valid, y_pred_valid)

    print(f"{n_trees} - {round(valid_auc_score, 2)}")
    aucs.append(valid_auc_score)

In [None]:
## visualize the results: validation AUC against the number of trees
tree_counts = range(10, 201, 10)
plt.plot(tree_counts, aucs)

plt.title('Number of trees vs AUC')
plt.xlabel('Number of trees')
plt.ylabel('AUC')
plt.show()

### Parameter tuning for random forest
A random forest ensemble consists of multiple decision trees, so the most important
parameters we need to tune for random forest are the same:
- max_depth
- min_samples_leaf


In [None]:
## collect the validation AUC for every (max_depth, n_estimators) pair;
## all_aucs maps each depth to its list of AUCs
all_aucs = {}

for depth in [5, 10, 20]:
    print(f"depth: {depth}")

    ## AUCs for the current depth, one entry per n_estimators value
    aucs = []

    for n in range(10, 201, 10):
        rf_model = RandomForestClassifier(
            n_estimators=n, max_depth=depth, random_state=3
        ).fit(X_train, y_train)

        ## evaluate the forest on the validation set
        y_pred_valid = rf_model.predict_proba(X_valid)[:, 1]
        valid_auc_score = roc_auc_score(y_valid, y_pred_valid)

        print(f"{n} -> {round(valid_auc_score, 2)}")
        aucs.append(valid_auc_score)

    ## store the finished list under its depth
    all_aucs[depth] = aucs
    print()
        
        

In [None]:
## one AUC curve per max_depth value, plotted against the number of trees
num_trees = list(range(10, 201, 10))

for depth in (5, 10, 20):
    plt.plot(num_trees, all_aucs[depth], label=f'depth={depth}')

plt.legend()
plt.show()

In [None]:
## collect the validation AUC for every (min_samples_leaf, n_estimators) pair
## at max_depth=10; all_aucs maps each leaf size to its list of AUCs
all_aucs = {}

for leaf in [3, 5, 10]:
    print(f"min_samples_leaf: {leaf}")

    ## AUCs for the current leaf size, one entry per n_estimators value
    aucs = []

    for n in range(10, 201, 10):
        rf_model = RandomForestClassifier(
            n_estimators=n,
            max_depth=10,
            min_samples_leaf=leaf,
            random_state=3,
        ).fit(X_train, y_train)

        ## evaluate the forest on the validation set
        y_pred_valid = rf_model.predict_proba(X_valid)[:, 1]
        valid_auc_score = roc_auc_score(y_valid, y_pred_valid)

        print(f"{n} -> {round(valid_auc_score, 3)}")
        aucs.append(valid_auc_score)

    ## store the finished list under its leaf size
    all_aucs[leaf] = aucs
    print()
        
        

In [None]:
## one AUC curve per min_samples_leaf value, plotted against the number of trees
num_trees = list(range(10, 201, 10))

for leaf in (3, 5, 10):
    plt.plot(num_trees, all_aucs[leaf], label=f'min_samples_leaf={leaf}')

plt.legend()
plt.show()

###  Final Random Forest model

In [None]:
## final random forest with the selected hyperparameters
rf_model = RandomForestClassifier(n_estimators=80, max_depth=10, min_samples_leaf=3, random_state=3)
rf_model.fit(X_train, y_train)

## lets evaluate the model on the validation set
y_pred_valid = rf_model.predict_proba(X_valid)[:, 1]
valid_auc_score = roc_auc_score(y_valid, y_pred_valid)

## the value is a ROC AUC on a 0-1 scale: label it as AUC and drop the
## misleading "%" suffix
print(f"Validation AUC: -> {round(valid_auc_score, 3)}")
        

## Gradient boosting 
Gradient boosting trains models sequentially, where each next model tries to fix errors from the previous one:
- Train the first model
- Look at the errors it makes 
- Train another model that fixes these errors
- Look at the errors again, repeat sequentially. 

### XGBoost: Extreme gradient boosting 
Before we can train an XGBoost model, we need to wrap our data into DMatrix — a
special data structure for finding splits efficiently.
When creating an instance of DMatrix, we pass three parameters:
- X_train: the feature matrix
- y_train: the target variable
- feature_names: the names of features in X_train

In [None]:
## lets implement xgboost
import xgboost as xgb 

In [None]:
## wrap the training and validation matrices in DMatrix objects,
## labelling the columns with the vectorizer's feature names
feature_names = dv.feature_names_

dtrain = xgb.DMatrix(data=X_train, label=y_train, feature_names=feature_names)
dvalid = xgb.DMatrix(data=X_valid, label=y_valid, feature_names=feature_names)

In [None]:
## lets specify the parameters for training
## NOTE(review): 'silent' was superseded by 'verbosity' in newer XGBoost
## releases - confirm against the installed version
xgb_params = {
    'eta': 0.3,
    'max_depth': 6,
    ## key was misspelled 'min_child_weigh', so the setting was never applied
    'min_child_weight': 1,

    'objective': 'binary:logistic',
    'nthread': 8,
    'seed': 1,
    'silent': 1
}

In [None]:
## train a first booster with 10 boosting rounds
xgb_model = xgb.train(
    params=xgb_params,
    dtrain=dtrain,
    num_boost_round=10,
)

In [None]:
## lets create our predictions for the validation DMatrix
y_pred_valid = xgb_model.predict(dvalid)

In [None]:
## lets compute the validation AUC
valid_auc_score = roc_auc_score(y_valid, y_pred_valid)

## the value is a ROC AUC, not an accuracy, so the label says AUC
print(f"Validation AUC {round(valid_auc_score, 3)}")

### Model performance monitoring 
To get an idea of how AUC changes as the number of trees grows, we can use a watchlist
— a built-in feature in XGBoost for monitoring model performance.
A watchlist is a Python list with tuples. Each tuple contains a DMatrix and its name

In [None]:
## pair each DMatrix with the name it is reported under during training
watchlist = list(zip((dtrain, dvalid), ('train', 'valid')))

In [None]:
## training parameters, now with AUC as the evaluation metric
xgb_params = dict(
    eta=0.3,
    max_depth=6,
    min_child_weight=1,
    objective='binary:logistic',
    eval_metric='auc',
    nthread=8,
    seed=1,
    silent=1,
)

In [None]:
## train for 100 rounds, reporting the watchlist metrics every 10 rounds
xgb_model = xgb.train(
    params=xgb_params,
    dtrain=dtrain,
    num_boost_round=100,
    evals=watchlist,
    verbose_eval=10,
)

In [None]:
## same parameters as before, but with eta set to 0.1
xgb_params = dict(
    eta=0.1,
    max_depth=6,
    min_child_weight=1,
    objective='binary:logistic',
    eval_metric='auc',
    nthread=8,
    seed=1,
    silent=1,
)

In [None]:
## train for 500 rounds, reporting the watchlist metrics every 10 rounds
xgb_model = xgb.train(
    params=xgb_params,
    dtrain=dtrain,
    num_boost_round=500,
    evals=watchlist,
    verbose_eval=10,
)

#### OTHER PARAMETERS TUNING 

In [None]:
## training parameters with max_depth set to 3 and eta kept at 0.1
xgb_params = dict(
    eta=0.1,
    max_depth=3,
    min_child_weight=1,
    objective='binary:logistic',
    eval_metric='auc',
    nthread=8,
    seed=1,
    silent=1,
)

In [None]:
## train for 500 rounds, reporting the watchlist metrics every 10 rounds
xgb_model = xgb.train(
    params=xgb_params,
    dtrain=dtrain,
    num_boost_round=500,
    evals=watchlist,
    verbose_eval=10,
)

### Testing the final model 
We’re almost ready to use it for risk scoring. We still need to do two things before we can use it:
- Retrain the final model on both train and validation datasets combined. We no longer need the validation dataset, so we can use more data for training, which will make the model slightly better.
- Test the model on the test set. This is the part of data we kept aside from the beginning. Now we use it to make sure the model didn’t overfit and performs well on completely unseen data. <br>

The next steps are:

- Apply the same preprocessing to df_train_full and df_test as we did to df_train and df_valid. As a result, we get the feature matrices X_train and X_test as well as our target variables y_train and y_test.
- Train a model on the combined dataset with the parameters we selected previously.
- Apply the model to the test data to get the test predictions.
- Verify that the model performs well and doesn’t overfit.

In [None]:
## rebuild the boolean targets from the full training frame and the test frame:
## True where the status is 'default'
y_train = df_train_full['status'].eq('default').to_numpy()
y_test = df_test['status'].eq('default').to_numpy()

In [None]:
## remove the target column from both frames so it cannot be used as a feature
for frame in (df_train_full, df_test):
    del frame['status']

In [None]:
## replace all missing values with zeros, then turn each frame
## into a list of row dictionaries
dict_train, dict_test = (
    frame.fillna(0).to_dict(orient='records')
    for frame in (df_train_full, df_test)
)

In [None]:
## use one-hot encoding to get the feature matrices
dict_vect = DictVectorizer(sparse=False)

## learn the feature layout from the combined training data only
X_train = dict_vect.fit_transform(dict_train)

## reuse the fitted layout for the test data; calling fit_transform here
## would re-fit the vectorizer on the test set, so the columns of X_test
## (and dict_vect.feature_names_) could stop matching X_train
X_test = dict_vect.transform(dict_test)

In [None]:
## wrap the full-train and test matrices in DMatrix objects,
## labelling the columns with the vectorizer's feature names
final_feature_names = dict_vect.feature_names_

dtrain = xgb.DMatrix(data=X_train, label=y_train, feature_names=final_feature_names)
dtest = xgb.DMatrix(data=X_test, label=y_test, feature_names=final_feature_names)


In [None]:
## parameters for the final model
## NOTE(review): 'silent' was superseded by 'verbosity' in newer XGBoost
## releases - confirm against the installed version
xgb_params = {
    'eta': 0.1,
    'max_depth': 3,
    ## key was misspelled 'min_child_weigh', so the setting was never applied
    'min_child_weight': 1,

    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'nthread': 8,
    'seed': 1,
    'silent': 1
}

## number of boosting rounds used for the final model
num_trees = 60

In [None]:
## train the final booster on the combined train + validation data
xgb_model = xgb.train(
    params=xgb_params,
    dtrain=dtrain,
    num_boost_round=num_trees,
)

In [None]:
## lets evaluate the performance of the model on the test set
y_pred_test = xgb_model.predict(dtest)

## the value is the ROC AUC on the test set, so it gets its own variable
## (instead of overwriting valid_auc_score) and is labelled as AUC
test_auc_score = roc_auc_score(y_test, y_pred_test)
print(f"Test AUC {round(test_auc_score, 3)}")