In [31]:
# Shell escape: install scikit-learn, which later cells import from
!pip install scikit-learn

# Data handling (pandas, numpy) and plotting (seaborn, matplotlib)
import pandas
import numpy
import seaborn
from matplotlib import pyplot
# The inline-plotting magic below is left disabled (it is only a comment here)
#matplotlib inline

In [32]:
# Read the raw credit-scoring data from disk and preview the first rows
csv_path = "/mlbc/chapter-06-credit-risk/CreditScoring.csv"
dataframe = pandas.read_csv(csv_path)
dataframe.head()

In [33]:
# Normalise every column header to lower case so columns can be referenced uniformly
dataframe.columns = dataframe.columns.map(str.lower)
dataframe.head()

In [34]:
# Lookup table from the numeric status code to its label.
# The list position is the numeric code, so code 0 decodes to "unk".
status_values = dict(enumerate(["unk", "ok", "default"]))

dataframe["status"] = dataframe["status"].map(status_values)
dataframe.head()

In [35]:
# Lookup tables from numeric codes to labels for the categorical columns.
# Each list is indexed by the numeric code, so position 0 is always "unk".
home_values = dict(enumerate(
    ["unk", "rent", "owner", "private", "ignored", "parents", "other"]
))
marital_values = dict(enumerate(
    ["unk", "single", "married", "widow", "separated", "divorced"]
))
records_values = dict(enumerate(["unk", "no", "yes"]))
job_values = dict(enumerate(["unk", "fixed", "parttime", "freelance", "others"]))

# Decode each categorical column with its own lookup table
for column_name, code_labels in [
    ("home", home_values),
    ("marital", marital_values),
    ("records", records_values),
    ("job", job_values),
]:
    dataframe[column_name] = dataframe[column_name].map(code_labels)

dataframe.head()

In [36]:
# Summary statistics of the dataframe, rounded to whole numbers
summary_stats = dataframe.describe()
summary_stats.round()

In [37]:
# 99999999 is treated as a placeholder in these columns and is swapped for NaN
sentinel_columns = ["income", "assets", "debt"]
dataframe[sentinel_columns] = dataframe[sentinel_columns].replace(
    to_replace=99999999, value=numpy.nan
)

dataframe.describe().round()

Check to see if any of the loans are in an ambiguous or unknown state. Turns out there's just one.

In [38]:
# Number of records for each loan status
dataframe["status"].value_counts()

Remove the "unk" (unknown) values from the status, since we don't know whether or not they paid the loan back.

In [39]:
# Keep only the rows whose status is something other than "unk"
known_status = dataframe["status"] != "unk"
dataframe = dataframe[known_status]

dataframe["status"].value_counts()

Next we'll split the dataset into train, validation, and test sets

In [40]:
from sklearn.model_selection import train_test_split

# The same seed is used for both splits so the partition is repeatable
split_options = {"random_state": 11}

# 20% of the rows are held out for testing ...
df_train_full, df_test = train_test_split(dataframe, test_size=0.2, **split_options)

# ... and 25% of the remaining 80% (20% of all rows) is used for validation
df_train, df_validation = train_test_split(df_train_full, test_size=0.25, **split_options)

tuple(len(part) for part in (df_train, df_validation, df_test))

`status` is our target variable and we can call it `y` since it's our dependent variable

In [41]:
# Build the boolean targets: True where the loan defaulted, False otherwise.
# Both targets are converted to plain NumPy arrays so they have the same type
# and neither carries a pandas index.
y_train = (df_train.status == "default").values
y_validation = (df_validation.status == "default").values

# remove the 'status' column from the datasets so the target is not used as a feature
del df_train["status"]
del df_validation["status"]

In [42]:
# Fill every missing value with 0 in both feature sets
df_train, df_validation = (
    frame.fillna(0) for frame in (df_train, df_validation)
)

We need to one-hot encode the categorical features, where a value of `1` is "hot" and a value of `0` is "cold". We can do this with the `DictVectorizer` from scikit-learn.

In [43]:
from sklearn.feature_extraction import DictVectorizer

# One dictionary per row, which is the input format DictVectorizer works on
dict_train = df_train.to_dict("records")
dict_validation = df_validation.to_dict("records")

# Learn the feature layout from the training rows only, then apply
# that same layout to both the training and validation rows
dictVectorizer = DictVectorizer(sparse=False)
dictVectorizer.fit(dict_train)

X_train = dictVectorizer.transform(dict_train)
X_val = dictVectorizer.transform(dict_validation)

A decision tree is simply a collection of `if..else` statements that lead to final conclusions. It is similar in structure to a binary tree where the leaves are the resulting answers.

In [44]:
# scikit-learn's decision tree classifier
from sklearn.tree import DecisionTreeClassifier

# Build a tree with default settings and train it on the training set
decision_tree = DecisionTreeClassifier().fit(X_train, y_train)

# show the fitted estimator as the cell output
decision_tree

In [45]:
# use scikit to get the AUC score
from sklearn.metrics import roc_auc_score

# class probabilities for the training rows; column 1 is kept as the score
train_probabilities = decision_tree.predict_proba(X_train)
y_pred = train_probabilities[:, 1]

# AUC of the actual targets against the predicted scores
roc_auc_score(y_train, y_pred)

`1.0` represents a perfect score, which is "great". However, we also need to check the score on the validation set.

In [46]:
# Score the same tree on the validation rows
validation_probabilities = decision_tree.predict_proba(X_val)
y_pred = validation_probabilities[:, 1]
roc_auc_score(y_true=y_validation, y_score=y_pred)

The validation set only scored about 66%, meaning the model cannot "generalize": it does not perform well on data other than the set it was trained on. This can happen when the model is too specific and "memorizes" the training data.

We can solve this by enforcing a level of simplicity on the model, so it cannot just match to the set perfectly.

In [47]:
from sklearn.tree import export_text

# cap the depth at 2 so the tree cannot grow too large
decision_tree = DecisionTreeClassifier(max_depth=2).fit(X_train, y_train)

# render the learned rules as text, labelled with the vectorizer's feature names
learned_feature_names = dictVectorizer.feature_names_
text_tree = export_text(decision_tree, feature_names=learned_feature_names)

print(text_tree)

In [48]:
# AUC of the depth-limited tree on the training set ...
y_pred = decision_tree.predict_proba(X_train)[:, 1]
auc_train = roc_auc_score(y_true=y_train, y_score=y_pred)
print("train auc", auc_train)

# ... and on the validation set
y_pred = decision_tree.predict_proba(X_val)[:, 1]
auc_validation = roc_auc_score(y_true=y_validation, y_score=y_pred)
print("validation auc", auc_validation)

Worse score on the training set and only a slightly better score on the validation set, which indicates the model is now more generalized, rather than specific to a single set of data.

However, there are other parameters besides `max_depth` that we can tune:
<https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html>.

We can use a loop to find the best `max_depth` from a list of potential candidates.

In [49]:
# Train one tree per candidate depth and report its validation AUC
candidate_depths = [1, 2, 3, 4, 5, 6, 10, 15, 20, None]

for depth in candidate_depths:
    decision_tree = DecisionTreeClassifier(max_depth=depth).fit(X_train, y_train)

    y_pred = decision_tree.predict_proba(X_val)[:, 1]
    auc = roc_auc_score(y_validation, y_pred)

    # depth is right-aligned in a 4-character column, AUC shown with 3 decimals
    print(f"{depth!s:>4} -> {auc:.3f}")

With this loop, we can see that the optimal values are `4`, `5`, and `6`.

We can then use another loop to figure out the best `min_samples_leaf` value.

In [50]:
# For each shortlisted depth, try several minimum leaf sizes and report the validation AUC
shortlisted_depths = [4, 5, 6]
candidate_leaf_sizes = [1, 5, 10, 15, 20, 50, 100, 200]

for depth in shortlisted_depths:
    print(f"depth {depth}")

    for leaf_size in candidate_leaf_sizes:
        decision_tree = DecisionTreeClassifier(
            max_depth=depth, min_samples_leaf=leaf_size
        ).fit(X_train, y_train)

        y_pred = decision_tree.predict_proba(X_val)[:, 1]
        auc = roc_auc_score(y_validation, y_pred)

        print(f"{leaf_size} -> {auc:.3f}")

    print()  # blank line between depth groups

The best combination turns out to be `max_depth=6` and `min_samples_leaf=15`. We can now use them to train the model again.

In [51]:
# Retrain the tree with the chosen depth and minimum leaf size
tuned_tree_params = {"max_depth": 6, "min_samples_leaf": 15}
decision_tree = DecisionTreeClassifier(**tuned_tree_params)
decision_tree.fit(X_train, y_train)

## Random Forest and Ensemble Learning

Combining the predictions of multiple, different models is called "ensemble learning". The results of all the models' predictions can be aggregated and used to form a final prediction which is often more accurate than a single prediction.

You can train models on different features (e.g. `assets`, `debt`, `records` in the current scenario).

In [52]:
# scikit-learn's random forest classifier
from sklearn.ensemble import RandomForestClassifier

# a forest of 10 trees (n_estimators is the number of trees), trained on the training set
rand_forest = RandomForestClassifier(n_estimators=10).fit(X_train, y_train)

# validation AUC of the forest
forest_probabilities = rand_forest.predict_proba(X_val)
y_pred = forest_probabilities[:, 1]
roc_auc_score(y_validation, y_pred)

The answer shown above will be more or less random each time you train the forest since the library utilizes a randomizer. You can utilize a random seed in order to get consistent results.

In [53]:
# same forest as before, but with a fixed random seed
rand_forest = RandomForestClassifier(n_estimators=10, random_state=3).fit(X_train, y_train)

# validation AUC of the seeded forest
forest_probabilities = rand_forest.predict_proba(X_val)
y_pred = forest_probabilities[:, 1]
roc_auc_score(y_validation, y_pred)

We can again use loops in order to tune parameters.

We'll tune the `n_estimators` parameter this way.

In [54]:
# candidate forest sizes: 10, 20, 30, ..., 190, 200
tree_counts = range(10, 201, 10)

aucs = []  # validation AUC for each forest size, in the same order as tree_counts

for n in tree_counts:
    random_forest = RandomForestClassifier(n_estimators=n, random_state=3).fit(X_train, y_train)

    y_pred = random_forest.predict_proba(X_val)[:, 1]
    auc = roc_auc_score(y_validation, y_pred)

    print(f"{n} -> {auc:.3f}")

    aucs.append(auc)

# plot AUC against the number of trees to see the best value visually
pyplot.plot(tree_counts, aucs)

Naturally, the random forest has additional parameters we can tune: <https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html>.

We will now tune `max_depth` first.

In [None]:
# validation AUCs per max_depth: {depth: [auc for 10 trees, auc for 20 trees, ...]}
all_aucs = {}

for depth in [5, 10, 15]:
    print(f"depth {depth}")

    # register the list up front; it is filled in by the inner loop
    all_aucs[depth] = aucs = []

    for n in range(10, 201, 10):
        random_forest = RandomForestClassifier(
            n_estimators=n, max_depth=depth, random_state=1
        ).fit(X_train, y_train)

        y_pred = random_forest.predict_proba(X_val)[:, 1]
        auc = roc_auc_score(y_validation, y_pred)

        print(f"{n} -> {auc:.3f}")

        aucs.append(auc)

    print()  # blank line between depth groups

In [None]:
# x axis: the forest sizes that were evaluated (10, 20, ..., 200)
num_trees = list(range(10, 201, 10))

# one curve per evaluated depth
for plotted_depth in (5, 10, 15):
    pyplot.plot(num_trees, all_aucs[plotted_depth], label=f"depth={plotted_depth}")

pyplot.legend()

We can see that, ideally, a depth of `10` should be used. With that, we can now tune the `min_samples_leaf` value.

In [None]:
# validation AUCs per min_samples_leaf: {leaf_size: [auc for 10 trees, auc for 20 trees, ...]}
all_aucs = {}

for leaf_size in [3, 5, 10]:
    print(f"min samples {leaf_size}")

    # register the list up front; it is filled in by the inner loop
    all_aucs[leaf_size] = aucs = []

    for n in range(10, 201, 10):
        random_forest = RandomForestClassifier(
            n_estimators=n,
            max_depth=10,
            min_samples_leaf=leaf_size,
            random_state=1,
        ).fit(X_train, y_train)

        y_pred = random_forest.predict_proba(X_val)[:, 1]
        auc = roc_auc_score(y_validation, y_pred)

        print(f"{n} -> {auc:.3f}")

        aucs.append(auc)

    print()  # blank line between leaf-size groups

In [None]:
# x axis: the forest sizes that were evaluated (10, 20, ..., 200)
num_trees = list(range(10, 201, 10))

# one curve per evaluated minimum leaf size
for plotted_leaf_size in (3, 5, 10):
    pyplot.plot(
        num_trees,
        all_aucs[plotted_leaf_size],
        label=f"leaf_size={plotted_leaf_size}",
    )

pyplot.legend()

Thus we can determine the "optimal" parameters for our forest are:

- `max_depth = 10`
- `min_samples_leaf = 5`
- `n_estimators = 200`

In [None]:
# we can now train the optimal model using the parameters selected above

random_forest = RandomForestClassifier(n_estimators=200, max_depth=10, min_samples_leaf=5, random_state=1)

# Constructing the estimator does not train it; fit it on the training set so
# `random_forest` is ready for prediction (the fitted estimator is the cell output).
random_forest.fit(X_train, y_train)

## Gradient boosting

Random forests are exactly that, random, but you can combine models a little more intelligently using "boosting". Gradient boosting is where each model is trained to correct the errors of the previous one.

In [None]:
# Shell escape: install the xgboost package for the gradient boosting section
!pip install xgboost