In [None]:
# In this notebook we will look at how to improve the performance of machine learning models
# by trying to identify optimal values for some parameters that guide the learning process.
# For example, as we have seen, we can adjust the maximum depth of a decision tree.
# But how do we know how to set it to maximize accuracy, and how it interacts with, for example,
# the minimum split, another parameter? This process is called (hyper)parameter tuning,
# and it is a crucial part of the machine learning model building process.

In [None]:
# Start with the main libraries, also for logistic regression and decision trees
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Classification performance evaluation
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix

# Logistic regression
from sklearn.linear_model import LogisticRegression

# Decision trees
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

In [None]:
# The dataset describes credit card owners. The task is to build a model that
# identifies the customers who will not pay back their credit next month,
# which makes this a binary classification problem.
credit = pd.read_csv(filepath_or_buffer='credit-card-full.csv')

# Expected: 30000 data points, made up of 1 ID column, 23 predictors, and 1 binary outcome
print(credit.shape)

# Preview the first rows as the cell output
credit.head()

In [None]:
# Build a first model with logistic regression.
# Predictors are the columns at positions 1..23 (everything after the ID column);
# the outcome is the default indicator for next month.
feature_columns = credit.columns[1:24]
X = credit[feature_columns]
y = credit['default payment next month']

# Create the training and test sets, with 25% of the data held out for testing.
# The fixed random_state keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.25,
)

# Initialize the model with its default settings
credit_logistic = LogisticRegression()

# As the message below shows, the algorithm could not construct a good enough model
credit_logistic.fit(X_train, y_train)

In [None]:
# This happens sometimes, but we need at least a first model before we can fine-tune it.
# As the message from the previous fit suggests, there are two things to try first:
# increase the number of iterations, or change the solver.
# We do not scale the predictors here, although that is an option: scaling can speed up
# model building, so that fewer iterations are required.
# NOTE(review): the original text states that unscaled results are still correct; with the
# default L2 penalty, scaling can also change the fitted coefficients - confirm.

# Raise the iteration limit to 500; fit() returns the fitted estimator itself
credit_logistic = LogisticRegression(max_iter=500).fit(X_train, y_train)

# The model is constructed now; show it as the cell output
credit_logistic

In [None]:
# Check the performance on the held-out test set
pred_logistic = credit_logistic.predict(X_test)
logistic_confusion = confusion_matrix(y_test, pred_logistic)
print(logistic_confusion)

# The results are very bad: all of the default cases are misclassified.
# The report below comes with a warning because some measures cannot be calculated
# when one of the classes is never predicted.
logistic_report = classification_report(y_test, pred_logistic)
print(logistic_report)

In [None]:
# With the default options we cannot build a good model, so some attributes have to change.
# A basic approach is to change values one by one.
# An intuitive first step here is to tell the model that class 1 should carry more importance.
# We can do this with class_weight, as we did with decision trees.
# The 'balanced' option takes into account that one of the classes is smaller
# and adds extra importance based on the class proportions in the data.
credit_logistic = LogisticRegression(class_weight='balanced', max_iter=500).fit(X_train, y_train)
pred_logistic = credit_logistic.predict(X_test)

# While the accuracy decreased, many default customers are now correctly identified.
# Even if we know how to optimize parameters, we still need to understand the business problem:
# here we would need to know how much the different misclassifications cost the bank.
# Is it worth having almost 3000 false alarms to identify approx. 75% of default cases?
for evaluate in (confusion_matrix, classification_report):
    print(evaluate(y_test, pred_logistic))

In [None]:
# Another important parameter to try is C (the inverse of the regularization strength):
# intuitively it prevents overfitting, but it can also significantly improve performance.
logistic_with_c = LogisticRegression(C=0.1, class_weight='balanced', max_iter=500)
credit_logistic = logistic_with_c.fit(X_train, y_train)
pred_logistic = credit_logistic.predict(X_test)

# If you play around with the code and try different values for C, you will see that
# it does not really change the final performance.
c_confusion = confusion_matrix(y_test, pred_logistic)
print(c_confusion)
c_report = classification_report(y_test, pred_logistic)
print(c_report)

In [None]:
# If we want to try multiple options for a parameter, we can iterate over them.
# For example, there are different penalty functions available (also related to overfitting),
# and we can check which one leads to the best performance.

penalty_list = ['l1', 'l2', 'elasticnet', 'none']
for pen in penalty_list:
    # Only model construction and fitting are guarded: scikit-learn reports an unsupported
    # penalty/solver combination (or an invalid parameter value) as a ValueError.
    # Catching just that keeps real problems (KeyboardInterrupt, a wrong report key, ...)
    # visible instead of mislabelling them as a solver limitation.
    try:
        credit_logistic = LogisticRegression(max_iter = 500, class_weight = 'balanced', penalty = pen)
        credit_logistic.fit(X_train,y_train)
    except ValueError as err:
        print('We cannot use', pen, 'with the current solver.')
        # Show the library's own explanation so the reader sees the actual reason
        print('  Reason:', err)
        continue

    pred_logistic = credit_logistic.predict(X_test)
    rep_cred = classification_report(y_test,pred_logistic, output_dict = True)
    # Print relevant information in each step; '1' is the default class in the report dict
    print('With penalty', pen, 'recall for default class is', rep_cred['1']['recall'],
          'and accuracy is', rep_cred['accuracy'])

# It seems it does not make a difference

In [None]:
# We could iterate over different parameters, or nest loops over possible combinations, etc.
# But Python offers a structured and convenient way to do that: grid search.
# The main idea is that we specify a set of values of interest for each parameter;
# a model is built for each combination, and the best one is selected as the optimal model.

# We need a new function for this
from sklearn.model_selection import GridSearchCV

# Start by initializing the model
model = LogisticRegression()

# Candidate values: number of iterations, C, and class weights
iterations = [500, 600, 700, 800]
c_values = [0.01, 0.1, 1, 10, 100]
weights = ['balanced', {0:0.1, 1:0.9}]

# The grid is a dictionary keyed by the parameter names as defined in LogisticRegression.
# There are 4x5x2=40 possible combinations, i.e. 40 different models will be tested.
grid = {
    'max_iter': iterations,
    'C': c_values,
    'class_weight': weights,
}

# Specify the grid search: the first argument is the initial model (estimator), the second
# is the dictionary above (param_grid). scoring selects the performance measure to
# optimize; here we try recall.
grid_search = GridSearchCV(model, grid, scoring='recall')

# Fit the data.
# Note: the grid search automatically employs cross-validation, with 5 folds by default,
# so it is run on all of the data rather than on a training/test split
# (as we have seen with cross-validation in BA1).
# NOTE(review): X includes the rows of X_test, so after this search the earlier test set
# is no longer unseen data for the selected parameters.
grid_result = grid_search.fit(X, y)

# Print out the best result.
# It tells us which parameters we need to choose to obtain the model with the best possible recall.
# You can try different settings and also change scoring to, e.g., accuracy.
print("Best result is", grid_result.best_score_, 'using', grid_result.best_params_)

In [None]:
# We know the idea now, so let's apply it to decision trees
credit_tree = DecisionTreeClassifier(random_state=42)

# Candidate values for this search:
# - the optimality criterion
# - the maximum depth of the tree
# - the class weights
criterion = ['gini', 'entropy']
max_depth = [2,4,6,8,10,12]
weights = ['balanced', {0:0.1, 1:0.9}]

# Define the grid: 2x6x2=24 possible models
grid = {
    'criterion': criterion,
    'max_depth': max_depth,
    'class_weight': weights,
}

# Specify the grid search.
# This time we use a different scoring method, AUC, which captures different perspectives
# at the same time.
grid_search = GridSearchCV(credit_tree, grid, scoring='roc_auc')

# Run the cross-validated search on all of the data
grid_result = grid_search.fit(X, y)

# Print out the best result
print("Best result is", grid_result.best_score_, 'using', grid_result.best_params_)

In [None]:
# Let's try the best model.
# NOTE(review): the parameter values below are presumably copied from the output of the
# decision-tree grid search; re-check them if the data or the grid changes.
credit_tree = DecisionTreeClassifier(max_depth = 6, criterion = 'gini', class_weight= {0:0.1, 1:0.9}, random_state = 42)

# Fit on the training split only and evaluate on the held-out test split.
# Scoring a model on the same rows it was fitted on reports training performance,
# which overstates how well the tree will do on new customers.
# The parameters themselves were chosen by cross-validation over all rows, so this
# estimate can still be slightly optimistic.
tree_model = credit_tree.fit(X_train, y_train)
pred_tree = tree_model.predict(X_test)
print(confusion_matrix(y_test, pred_tree))
print(classification_report(y_test, pred_tree))