---
# Model Evaluation and Selection
---

**Content**:

- Regression

- Classification

- Hyperparameter Tuning

- ROC Curve

- Lift Curve

- SMOTE Classification


---
---

In [None]:
# load general dependencies
from collections import Counter
import numpy as np
import os
import pandas as pd
from pathlib import Path
import re
import requests


# visualization
import matplotlib.pyplot as plt
import seaborn as sns

# tuning
from hyperopt import STATUS_OK
from hyperopt import fmin
from hyperopt import tpe
from hyperopt import hp
from hyperopt import STATUS_OK
from hyperopt import Trials
from hyperopt import space_eval

# data preparation
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

# feature selection
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import f_classif

# regression metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import explained_variance_score
from sklearn.metrics import d2_tweedie_score

# classification metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

# regression models
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor

# classification models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier


In [None]:
# load and refresh custom functions

# Reload the local `utils` module before importing its helpers so that edits
# made to utils.py are picked up without restarting the kernel.
import importlib
import utils
importlib.reload(utils)

from utils import adaboost_wrapper
from utils import roc_wrapper
from utils import plot_cumulative_gains
from utils import calc_cumulative_gains
from utils import plot_lift_chart

In [None]:
# pandas display settings: show up to 200 rows, every column, and up to
# 100 characters per cell when a frame is rendered

display_options = {
    'display.max_rows': 200,
    'display.max_columns': None,
    'display.max_colwidth': 100,
}

for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

In [None]:
# matplotlib color settings

# Switch: set to False to keep matplotlib's current style for all figures.
dark_plot_theme = True

if dark_plot_theme:
    # apply the built-in 'dark_background' style to every following plot
    plt.style.use('dark_background')


In [None]:
# prepare the working directory: all paths are relative to the folder the
# notebook is started from

cwd = Path('.')

# folder holding the input data and the feature file inside it
ipath = cwd.joinpath('data')
ifile = ipath.joinpath('features.csv')

# create the data folder on first use; an existing folder is left untouched
ipath.mkdir(exist_ok=True)


# Load the Dataset

In [None]:
# import raw dataset

cindex = ['id']

# read the csv with `id` as index, then order the columns by their label
raw_frame = pd.read_csv(ifile, index_col=cindex)
data = raw_frame.sort_index(axis=1)

# Regression

---

## Feature Selection

In [None]:
# score every numeric column against the price with a univariate F-test
target = 'price'
y = data[target]
x = data.drop(columns=[target]).select_dtypes(include=np.number)

# NOTE(review): the selector is fitted on the complete dataset, i.e. before
# any train/test split, so the test rows influence the feature ranking.
# Confirm this is acceptable for the reported scores.
selector = SelectKBest(score_func=f_regression, k=3)
features_selected = selector.fit_transform(x, y)

In [None]:
# one row per candidate feature: rounded F-score, column name and p-value
selection_dict = {
    'score': [round(score, 3) for score in selector.scores_],
    'name': selector.feature_names_in_,
    'p-value': selector.pvalues_,
}

selection = pd.DataFrame(selection_dict)

# show the 15 highest-scoring features
ranked_selection = selection.sort_values('score', ascending=False)
display(ranked_selection.head(15))

In [None]:
# prepare data

# number of top-ranked features that are fed to the models
n = 15
ranking = selection.sort_values('score', ascending=False)
vs = ranking['name'].head(n).values

x, y = data[vs], data[target]

# hold out 20% of the rows for testing; fixed seed for a repeatable split
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

## Model Application

---

In [None]:
# regression algorithms and metrics

# shared seed for every estimator that accepts one
rs = 42

# models to compare, keyed by the name shown in the result table
regs = {
    'LinearRegression': LinearRegression(),
    'DecisionTreeRegressor': DecisionTreeRegressor(random_state=rs),
    'RandomForestRegressor': RandomForestRegressor(random_state=rs),
    'AdaBoostRegressor': AdaBoostRegressor(random_state=rs),
    'GradientBoostingRegressor': GradientBoostingRegressor(random_state=rs),
}

# metric functions, each called as metric(y_true, y_pred)
rmetrics = {
    'r2_score': r2_score,
    'mean_squared_error': mean_squared_error,
    'explained_variance_score': explained_variance_score,
    'd2_tweedie_score': d2_tweedie_score,
}

In [None]:
# regression calculation: fit every model on the training split and score
# its predictions for the test split with each metric

regression_data = {}
for model_name, regressor in regs.items():
    regressor.fit(xtrain, ytrain)
    ypred = regressor.predict(xtest)

    regression_data[model_name] = {
        metric_name: metric_fn(ytest, ypred)
        for metric_name, metric_fn in rmetrics.items()
    }

# transpose to one row per model and list the best r2 first
regression_results = pd.DataFrame(regression_data)
display(regression_results.T.sort_values('r2_score', ascending=False))


**Hyperparameter Tuning**

---

- improve AdaBoostRegressor

In [None]:
# goal: minimize the objective function

def objective(params):
    """Loss function handed to hyperopt's ``fmin`` for the regression search.

    Forwards the sampled ``params`` and the notebook-level train/test split
    to ``adaboost_wrapper`` and uses the negated second element of its result
    (bound to ``r2``) as the loss, so minimizing the loss maximizes that score.

    NOTE(review): xtest/ytest are passed to the wrapper; presumably the score
    is computed on them, in which case the tuned score is optimistic because
    the hold-out data steers the search. Verify against utils.adaboost_wrapper.

    Returns
    -------
    dict
        ``{'loss': -r2, 'status': STATUS_OK}``.
    """
    _, r2, _ = adaboost_wrapper(params, xtrain, xtest, ytrain, ytest)
    loss = -r2
    return dict(loss=loss, status=STATUS_OK)

In [None]:
# build the parameter space
# The dict keys equal the hyperopt labels; 'problem' is a fixed entry that is
# passed through to the objective unchanged.

params = {
    'max_depth': hp.choice('max_depth', range(2, 10)),
    'min_samples_leaf': hp.choice('min_samples_leaf', range(1, 250)),
    'learning_rate': hp.uniform('learning_rate', 0.01, 10),
    'n_estimators': hp.choice('n_estimators', range(5, 100)),
    'problem': 'regression',
}

# record trials
trials = Trials()

# minimize with fmin
# return_argmin=False is passed so that `best` can be handed to
# adaboost_wrapper directly in the next cell.
# NOTE(review): no random state is passed to fmin, so repeated runs may
# propose different trials.
search_settings = dict(
    fn=objective,
    space=params,
    algo=tpe.suggest,
    max_evals=100,
    trials=trials,
    return_argmin=False,
)
best = fmin(**search_settings)

print(best)

In [None]:
# show best metrics

# Re-run the wrapper with the best parameter set returned by fmin; the first
# returned element is not needed here and is discarded.
_, r2, mse = adaboost_wrapper(best, xtrain, xtest, ytrain, ytest)
print(f'r2={r2:.3f}, mse={mse:.3f}')

In [None]:
# extract tested parameters
#
# `misc['vals']` holds the raw hyperopt assignment of every trial, each value
# wrapped in a one-element list. For `hp.choice` dimensions that raw value is
# the *index* of the chosen option, not the option itself (index 0 of
# `range(2, 10)` means max_depth=2). `space_eval` maps the assignment back
# onto the search space, so the recorded values are the ones that were
# actually passed to the objective.

values = [t['misc']['vals'] for t in trials.trials]

# one list per tuned parameter, in trial order
trial_params = {k: [] for k in values[0]}

for vals in values:
    # unwrap the one-element lists into a plain label -> value assignment
    point = {k: v[0] for k, v in vals.items()}
    # relies on the keys of `params` being identical to the hyperopt labels
    resolved = space_eval(params, point)
    for k in trial_params:
        trial_params[k].append(resolved[k])

In [None]:
# plot tested parameters: the loss per trial on top, followed by one panel
# for every tuned parameter

figsize = (12, 8)
n_panels = len(trial_params) + 1
fig, axs = plt.subplots(n_panels, figsize=figsize)

losses = trials.losses()
loss_ax = axs[0]
loss_ax.plot(losses)
loss_ax.set_ylabel('loss')

# remaining panels: value of each parameter over the trials
for ax, (param_name, param_values) in zip(axs[1:], trial_params.items()):
    ax.plot(param_values)
    ax.set_ylabel(param_name)


# Classification

---

- Logistic Regression
- Decision Tree
- Random Forest
- Ada Boost Tree
- Gradient Boosting Tree

- define the classification problem
- airbnb more expensive than the 75th percentile of all prices
- short excursus
- show the ROC curve and the AUC score
- validate on test set
- use SMOTE

In [None]:
def get_classification_target(target, percentile=75):
    """Binarize a numeric target at one of its own percentiles.

    Parameters
    ----------
    target : numpy.ndarray or pandas.Series
        Numeric values to binarize (the notebook passes the price column).
    percentile : float, default 75
        Percentile of ``target`` (0-100) used as threshold. The default
        reproduces the previously hard-coded upper-quartile split.

    Returns
    -------
    Same container type as ``target``
        1 where the value is strictly greater than the threshold, else 0.
        A Series input keeps its index.
    """
    threshold = np.percentile(target, percentile)
    return (target > threshold).astype(int)

## Feature Selection

In [None]:
# score every numeric column against the binary price label with an
# ANOVA F-test
target = 'price'
y = get_classification_target(data[target])
x = data.drop(columns=[target]).select_dtypes(include=np.number)

# NOTE(review): the selector is fitted on the complete dataset, i.e. before
# any train/test split, so the test rows influence the feature ranking.
# Confirm this is acceptable for the reported scores.
selector = SelectKBest(score_func=f_classif, k=3)
features_selected = selector.fit_transform(x, y)

In [None]:
# one row per candidate feature: rounded F-score, column name and p-value
selection_dict = {
    'score': [round(score, 3) for score in selector.scores_],
    'name': selector.feature_names_in_,
    'p-value': selector.pvalues_,
}

selection = pd.DataFrame(selection_dict)

# show the 10 highest-scoring features
ranked_selection = selection.sort_values('score', ascending=False)
display(ranked_selection.head(10))

In [None]:
# prepare data

# number of top-ranked features that are fed to the classifiers
n = 5
ranking = selection.sort_values('score', ascending=False)
vs = ranking['name'].head(n).values

x = data[vs]
y = get_classification_target(data[target])

# hold out 20% of the rows for testing; fixed seed for a repeatable split
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)


## Model Application

---

In [None]:
# classification algorithms and metrics

# shared seed for every estimator
rs = 42

# models to compare, keyed by the name shown in the result table
classifiers = {
    'LogisticRegression': LogisticRegression(random_state=rs),
    'DecisionTreeClassifier': DecisionTreeClassifier(random_state=rs),
    'RandomForestClassifier': RandomForestClassifier(random_state=rs),
    'AdaBoostClassifier': AdaBoostClassifier(random_state=rs),
    'GradientBoostingClassifier': GradientBoostingClassifier(random_state=rs),
}

# metric functions, each called as metric(y_true, y_pred)
cmetrics = {
    'accuracy_score': accuracy_score,
    'f1_score': f1_score,
    'precision_score': precision_score,
    'recall_score': recall_score,
}

In [None]:
# classification: fit every classifier on the training split and score its
# predictions for the test split with each metric

classification_data = {}
for model_name, classifier in classifiers.items():
    classifier.fit(xtrain, ytrain)
    ypred = classifier.predict(xtest)

    classification_data[model_name] = {
        metric_name: metric_fn(ytest, ypred)
        for metric_name, metric_fn in cmetrics.items()
    }

# transpose to one row per classifier and list the best f1 first
test_results = pd.DataFrame(classification_data)
display(test_results.T.sort_values('f1_score', ascending=False))


**Hyperparameter Tuning**

---

- improve AdaBoostClassifier

In [None]:
# goal: minimize the objective function

def objective(params):
    """Loss function handed to hyperopt's ``fmin`` for the classifier search.

    Forwards the sampled ``params`` and the notebook-level train/test split
    to ``adaboost_wrapper`` and uses the negated second element of its result
    (bound to ``f1``) as the loss, so minimizing the loss maximizes that score.

    NOTE(review): xtest/ytest are passed to the wrapper; presumably the score
    is computed on them, in which case the tuned score is optimistic because
    the hold-out data steers the search. Verify against utils.adaboost_wrapper.

    Returns
    -------
    dict
        ``{'loss': -f1, 'status': STATUS_OK}``.
    """
    _, f1, _ = adaboost_wrapper(params, xtrain, xtest, ytrain, ytest)
    loss = -f1
    return dict(loss=loss, status=STATUS_OK)

In [None]:
# build the parameter space
# The dict keys equal the hyperopt labels; 'problem' is a fixed entry that is
# passed through to the objective unchanged.

params = {
    'max_depth': hp.choice('max_depth', range(2, 10)),
    'min_samples_leaf': hp.choice('min_samples_leaf', range(1, 250)),
    'learning_rate': hp.uniform('learning_rate', 0.01, 10),
    'n_estimators': hp.choice('n_estimators', range(5, 100)),
    'problem': 'classification',
}

# record trials
trials = Trials()

# minimize with fmin
# return_argmin=False is passed so that `best` can be handed to
# adaboost_wrapper directly in the next cell.
# NOTE(review): no random state is passed to fmin, so repeated runs may
# propose different trials.
search_settings = dict(
    fn=objective,
    space=params,
    algo=tpe.suggest,
    max_evals=100,
    trials=trials,
    return_argmin=False,
)
best = fmin(**search_settings)

In [None]:
# show best

print(best)
# Re-run the wrapper with the best parameter set returned by fmin. The
# returned `model` is kept: the cells below call predict / predict_proba on it.
model, f1, acc = adaboost_wrapper(best, xtrain, xtest, ytrain, ytest)
print(f'f1={f1:.3f}, acc={acc:.3f}')

In [None]:
# extract tested parameters
#
# `misc['vals']` holds the raw hyperopt assignment of every trial, each value
# wrapped in a one-element list. For `hp.choice` dimensions that raw value is
# the *index* of the chosen option, not the option itself (index 0 of
# `range(2, 10)` means max_depth=2). `space_eval` maps the assignment back
# onto the search space, so the recorded values are the ones that were
# actually passed to the objective.

values = [t['misc']['vals'] for t in trials.trials]

# one list per tuned parameter, in trial order
trial_params = {k: [] for k in values[0]}

for vals in values:
    # unwrap the one-element lists into a plain label -> value assignment
    point = {k: v[0] for k, v in vals.items()}
    # relies on the keys of `params` being identical to the hyperopt labels
    resolved = space_eval(params, point)
    for k in trial_params:
        trial_params[k].append(resolved[k])

In [None]:
# plot tested parameters: the loss per trial on top, followed by one panel
# for every tuned parameter

figsize = (12, 8)
n_panels = len(trial_params) + 1
fig, axs = plt.subplots(n_panels, figsize=figsize)

losses = trials.losses()
loss_ax = axs[0]
loss_ax.plot(losses)
loss_ax.set_ylabel('loss')

# remaining panels: value of each parameter over the trials
for ax, (param_name, param_values) in zip(axs[1:], trial_params.items()):
    ax.plot(param_values)
    ax.set_ylabel(param_name)


In [None]:
# Predictions of the tuned model for the test split: predicted labels and the
# per-class output of predict_proba, both consumed by the ROC and lift cells.
ypred = model.predict(xtest)
yprob = model.predict_proba(xtest)

**roc**

---

In [None]:
# plot the roc
# roc_wrapper comes from the local utils module; its return value is bound to
# `_` so that it is not echoed as cell output.
_ = roc_wrapper(ytest, ypred, yprob)

**lift**

---

In [None]:
# calculate lift

# one row per test sample: true label, predicted label and the second column
# of the predict_proba output
tmp = pd.DataFrame({
    'actual': ytest,
    'pred': ypred,
    'prob': yprob[:, 1],
})

# the helper receives the frame plus the names of the three columns
lift = calc_cumulative_gains(tmp, 'actual', 'pred', 'prob')

In [None]:

# draw the cumulative gains chart from the table computed in the previous cell
plot_cumulative_gains(lift)

In [None]:
# draw the lift chart from the same table
plot_lift_chart(lift)

## Classification with SMOTE

---

Synthetic Minority Oversampling Technique

In [None]:
# prepare data using SMOTE
#
# The hold-out split is made first and SMOTE is fitted on the training part
# only. Resampling the complete dataset before splitting lets test rows, and
# synthetic points interpolated from them, end up in the training set, which
# inflates the test metrics.

xtrain_orig, xtest, ytrain_orig, ytest = train_test_split(x, y, test_size=0.2, random_state=42)

print('Original dataset shape %s' % Counter(y))

sm = SMOTE(random_state=42)

# x_smote / y_smote hold the oversampled *training* data; the test split
# keeps its original, untouched class distribution
x_smote, y_smote = sm.fit_resample(xtrain_orig, ytrain_orig)

xtrain, ytrain = x_smote, y_smote

print('Resampled dataset shape %s' % Counter(y_smote))

In [None]:
# classification with smote: refit every classifier on the resampled
# training data and score its predictions for the test split

classification_data = {}
for model_name, classifier in classifiers.items():
    classifier.fit(xtrain, ytrain)
    ypred = classifier.predict(xtest)

    classification_data[model_name] = {
        metric_name: metric_fn(ytest, ypred)
        for metric_name, metric_fn in cmetrics.items()
    }

# transpose to one row per classifier and list the best f1 first
test_results = pd.DataFrame(classification_data)
display(test_results.T.sort_values('f1_score', ascending=False))

---
---
---