---
# Classification
---

**Content**

- Logistic Regression

- Decision Trees

- Ensemble Methods

**Additional Material**

- very cool visualization from [r2d3](http://www.r2d3.us/visual-intro-to-machine-learning-part-1/)
---

In [None]:
# Notebook-wide display configuration (plot theme + pandas display options).
#
# This cell is executed before the main import cell, so it imports the two
# libraries it configures itself; without these imports a fresh
# "Restart Kernel -> Run All" stops here with a NameError on plt / pd.
import matplotlib.pyplot as plt
import pandas as pd

# set to False to keep matplotlib's default (light) style
dark_plot_theme = True

if dark_plot_theme:
    plt.style.use('dark_background')


# pandas display settings
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 200)

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pathlib import Path
import seaborn as sns
import sys

# utils
from sklearn.model_selection import train_test_split
from sklearn.tree import plot_tree
from sklearn.utils.validation import check_is_fitted

# models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

# metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix


In [None]:
# load and refresh custom functions

# reload the local "utils" module so that it is re-executed even if it was
# already imported in this kernel session
import importlib
import utils
importlib.reload(utils)

# (re-)bind the helper names from the reloaded module
from utils import get_dichotomous
from utils import roc_wrapper

In [None]:
# collects the metrics of every experiment, keyed by experiment name

exps = dict()

# Load Feature Data

assume: no nulls, no outliers (z>3)

In [None]:
# prepare the directory and load the data

# the data directory lives next to the notebook (relative to the working dir)
cwd = Path()
ipath = cwd.joinpath('data')
ipath.mkdir(exist_ok=True)

# read the feature table, using the "id" column as index
ifile = ipath.joinpath('features.csv')
data = pd.read_csv(ifile, index_col=['id'])

data.head(3)

# Classification Target

In [None]:
# remove imputations from the target variable

var = 'review_scores_rating'

# keep only the rows whose two imputation-flag columns for "var" sum to zero
imputation_flags = data[f'imp_z_{var}'].add(data[f'imp_{var}'])
mask = imputation_flags.eq(0)

cdata = data.loc[mask].dropna()

target = 'top_rating'

if target not in cdata:
    # share (in percent) of rows with a rating of exactly 100
    rel = cdata[var].eq(100).sum() / len(cdata[var]) * 100

    # TODO:
    # Define the new target
    # Select only the highest ratings
    # And save as type "int"

    # TIP:
    # cdata[target] = ...

    cdata[target] = cdata[var].eq(100).astype('uint8') # REMOVE

    # the raw rating must not stay in the frame once the target is derived from it
    cdata = cdata.drop(columns=var)

In [None]:
# CHECK

# the expected number of positive targets is 6043
target_ok = cdata[target].sum() == 6043
print('*** passed, well done!' if target_ok else '*** something went wrong, try again')

In [None]:
# plot basis of the new target

fig, axs = plt.subplots(1, 2, figsize=(10, 5))

# left: all rows, right: only the rows selected by "mask"
for frame, ax in zip((data, data[mask]), axs):
    sns.histplot(data=frame, x=var, ax=ax)

print(f'Top Ratings: {rel:.2f} %')

# Logistic Regression
---

$logreg = \frac{1}{1+\exp(-z)}$

$z = \beta_0 + x_{i1}\beta_1 + ...$

In [None]:
# prepare the dataset

# select numeric only
numeric = cdata.select_dtypes(include=np.number)

# drop dichotomous features (keep it simple), but never the target itself
cols = [name for name in get_dichotomous(numeric) if name != target]
ldata = numeric.drop(columns=cols)

ldata


In [None]:
# train test split

x, y = ldata.drop(columns=target), ldata[target]

xtrain, xtest, ytrain, ytest = train_test_split(x, y, random_state=0)


# TODO
# Instantiate "LogisticRegression"
# with the "liblinear" solver.
# And fit the model
# to the training data using ".fit"

# TIP:
# logreg = ...

logreg = LogisticRegression(random_state=0, solver='liblinear') # REMOVE
logreg = logreg.fit(xtrain, ytrain) # REMOVE


In [None]:
# CHECK
# Verifies that "logreg" is a LogisticRegression instance and has been fitted.

passed = []

if isinstance(logreg, LogisticRegression):
    print('*** passed, well done!')
else:
    print('*** something went wrong, try again')
    passed.append(False)

try:
    check_is_fitted(logreg)
    print('*** passed, well done!')
except Exception:
    # any error raised by check_is_fitted counts as a failed check; catching
    # Exception (instead of a bare "except:") still lets KeyboardInterrupt and
    # SystemExit propagate so the cell can be interrupted
    print('*** something went wrong, try again')
    passed.append(False)


In [None]:
# TODO:
# Use "logreg" to make predictions
# and get the decision probabilities

# TIP:
# ypred = 
# yprob = 

ypred, yprob = logreg.predict(xtest), logreg.predict_proba(xtest) # REMOVE

In [None]:
# CHECK
# Both the predictions and the probabilities must be numpy arrays.

passed = []

for arr in (ypred, yprob):
    if not isinstance(arr, np.ndarray):
        print('*** something went wrong, try again')
        passed.append(False)
    else:
        print('*** passed, well done!')

In [None]:
# TODO
# Visualize the results.
# Apply the provided "roc_wrapper" convenience function
# with the signature: "roc_wrapper(ytest, ypred, yprob)"
# and save the results in the exps dict.

# TIP:
# exps['logreg_1'] = ...

roc_result = roc_wrapper(ytest, ypred, yprob) # REMOVE
exps['logreg_1'] = roc_result # REMOVE

improve that!

---

In [None]:
# correlation

# absolute correlation between the non-dichotomous columns
cor = ldata.drop(columns=get_dichotomous(ldata)).corr().abs()

# same matrix, but every entry that is not > 0.7 is set to 0
strong = cor.where(cor > 0.7, other=0)

fig, axs = plt.subplots(1, 2, figsize=(16, 6))
heatmap_kwargs = dict(annot=False, cmap=plt.cm.Blues, vmin=0, vmax=1)

# left: absolute correlation, right: absolute correlation > 0.7
for matrix, ax in zip((cor, strong), axs):
    sns.heatmap(matrix, ax=ax, **heatmap_kwargs)

In [None]:
# drop the correlated feature

if 'host_id' in ldata:
    # TODO
    # "host_id" and "years_registered" are correlated
    # Remove "host_id" from "ldata"

    # TIP:
    # result = ldata...

    result = ldata.drop('host_id', axis=1) # REMOVE
else:
    # "host_id" is not (or no longer) part of "ldata": keep "result" defined
    # so that the CHECK cell does not fail with a NameError
    result = ldata

In [None]:
# CHECK

if 'host_id' not in result:
    # adopt the reduced frame so that the following cells really train
    # without "host_id" (previously this assignment sat in the failure branch,
    # so the feature was never removed from "ldata")
    ldata = result
    print('*** passed, well done!')
else:
    print('*** something went wrong, try again')

In [None]:
# train test split

x, y = ldata.drop(columns=target), ldata[target]
xtrain, xtest, ytrain, ytest = train_test_split(x, y, random_state=0)

# apply logistic regression

logreg = LogisticRegression(random_state=0, solver='liblinear').fit(xtrain, ytrain)

# make predictions

ypred, yprob = logreg.predict(xtest), logreg.predict_proba(xtest)


In [None]:
# plot the ROC via the helper and keep its return value as experiment "logreg_2"

roc_result = roc_wrapper(ytest, ypred, yprob)
exps['logreg_2'] = roc_result

In [None]:
# plot confusion matrix for the default threshold

# TODO
# Use the sklearn function "confusion_matrix"
# to extract the: TrueNegatives, FalsePositives, FalseNegatives and TruePositives
# Use the "np.ravel" function on the output of "confusion_matrix"

# TIP:
# tn, fp, fn, tp = 

cm = confusion_matrix(ytest, ypred, normalize='all') # REMOVE
tn, fp, fn, tp = np.ravel(cm) # REMOVE


In [None]:
# CHECK

# the matrix is normalized, so the true-negative share must be below 1
print('*** passed, well done!' if tn < 1 else '*** something went wrong, try again')

In [None]:
# confusion matrix laid out as rows = predicted (1, 0), columns = actual (1, 0)
cells = [[tp, fp],
         [fn, tn]]
labels = [1, 0]

fig, ax = plt.subplots()
sns.heatmap(cells, cmap='Blues', vmax=1, annot=True, xticklabels=labels, yticklabels=labels, ax=ax)

# move the x axis (ticks and label) to the top of the matrix
ax.xaxis.tick_top()
ax.xaxis.set_label_position('top')
ax.set(xlabel='Actual', ylabel='Predicted');

## Only Imputation Flags

---

In [None]:
# filter to include imputation flags only

# matches every column whose name starts with "imp"
pattern = '^imp.*'

# TODO
# Filter "cdata" for the regex in "pattern"
# save the result in "tmp"

# TIP:
# tmp = cdata...

tmp = cdata.filter(regex=pattern) # REMOVE

# add target back in
tmp = tmp.join(cdata[[target]])

In [None]:
# CHECK

# expected: 17874 rows and 27 columns (flag columns plus the target)
shape_ok = tmp.shape == (17874, 27)
print('*** passed, well done!' if shape_ok else '*** something went wrong, try again')

In [None]:
# train test split

x, y = tmp.drop(columns=target), tmp[target]
xtrain, xtest, ytrain, ytest = train_test_split(x, y, random_state=0)


# apply logistic regression

logreg = LogisticRegression(random_state=0, solver='liblinear').fit(xtrain, ytrain)


# make predictions

ypred, yprob = logreg.predict(xtest), logreg.predict_proba(xtest)

# plot roc

roc_result = roc_wrapper(ytest, ypred, yprob)
exps['logreg_3'] = roc_result

---

## Decision Trees

In [None]:
# train test split (back on "ldata", the numeric feature frame)

x, y = ldata.drop(columns=target), ldata[target]

xtrain, xtest, ytrain, ytest = train_test_split(x, y, random_state=0)

In [None]:
# tree

# Fully grown decision tree (no depth / leaf-size limits).
# random_state is fixed so the fitted tree - and with it the leaf/depth CHECK
# further down - is reproducible across runs, consistent with the seeded
# train/test split and logistic regression above.
dt = DecisionTreeClassifier(random_state=0)
dt.fit(xtrain, ytrain)

ypred = dt.predict(xtest)
yprob = dt.predict_proba(xtest)

In [None]:
# plot the ROC via the helper and keep its return value as experiment "dt_1"

roc_result = roc_wrapper(ytest, ypred, yprob)
exps['dt_1'] = roc_result

In [None]:
# TODO:
# Extract the number of tree leaves
# and the tree depth from "dt"

# TIP:
# leaves = dt...
# depth = dt...

leaves, depth = dt.get_n_leaves(), dt.get_depth() # REMOVE

for label, value in (("number of leaves:", leaves), ("depth of the tree:", depth)):
    print(label, value)

In [None]:
# CHECK
# The unrestricted tree is expected to stay below 1700 leaves and depth 35.

for within_limit in (leaves < 1700, depth < 35):
    if within_limit:
        print('*** passed, well done!')
    else:
        print('*** something went wrong, try again')

In [None]:
# tree

# TODO:
# Instantiate and fit a "DecisionTreeClassifier"
# Set "max_depth" to 5 and "min_samples_leaf" to 100

# TIP:
# dt = ...
# dt...

# random_state is fixed so that ties between equally good splits are always
# resolved the same way and the experiment is reproducible
dt = DecisionTreeClassifier(max_depth=5, min_samples_leaf=100, random_state=0) # REMOVE
dt.fit(xtrain, ytrain) # REMOVE

ypred = dt.predict(xtest)
yprob = dt.predict_proba(xtest)

In [None]:
# CHECK
# Verifies that "dt" is a fitted DecisionTreeClassifier and that the
# predictions / probabilities are numpy arrays.

passed = []

if isinstance(dt, DecisionTreeClassifier):
    print('*** passed, well done!')
else:
    print('*** something went wrong, try again')
    passed.append(False)

try:
    check_is_fitted(dt)
    print('*** passed, well done!')
except Exception:
    # any error raised by check_is_fitted counts as a failed check; catching
    # Exception (instead of a bare "except:") still lets KeyboardInterrupt and
    # SystemExit propagate so the cell can be interrupted
    print('*** something went wrong, try again')
    passed.append(False)

if isinstance(ypred, np.ndarray):
    print('*** passed, well done!')
else:
    print('*** something went wrong, try again')
    passed.append(False)

if isinstance(yprob, np.ndarray):
    print('*** passed, well done!')
else:
    print('*** something went wrong, try again')
    passed.append(False)

In [None]:
# report the size of the restricted tree
for label, getter in (("number of leaves:", dt.get_n_leaves), ("depth of the tree:", dt.get_depth)):
    print(label, getter())

In [None]:
# plot the ROC via the helper and keep its return value as experiment "dt_2"

roc_result = roc_wrapper(ytest, ypred, yprob)
exps['dt_2'] = roc_result

In [None]:
# draw the first levels (max_depth=2) of the fitted tree "dt"
figsize = (12, 10)
plt.figure(figsize=figsize)
# plot_tree documents "fontsize" as an int; a string such as 'x-large' is not a
# supported value (NOTE(review): recent scikit-learn versions appear to reject
# it during parameter validation - confirm). 14 is roughly 'x-large' at
# matplotlib's default base font size.
plot_tree(dt, max_depth=2, filled=True, fontsize=14)
plt.show()

---

## Ensemble Methods

**Random Forests**

In [None]:
# number of base estimators shared by all ensemble models below
n_estimators = 1_000

In [None]:
# TODO:
# Fit and predict "RandomForestClassifier"
# Set "n_estimators" in "RandomForestClassifier"
# to the "n_estimators" variable. 

# TIP
#rf = ...
#rf...

#ypred = ...
#yprob = ...

# random_state is fixed: a random forest bootstraps rows and samples features,
# so without a seed the metrics change on every run
rf = RandomForestClassifier(n_estimators=n_estimators, random_state=0) # REMOVE
rf.fit(xtrain, ytrain) # REMOVE

ypred = rf.predict(xtest) # REMOVE
yprob = rf.predict_proba(xtest) # REMOVE

In [None]:
# CHECK
# Verifies that "rf" is a fitted RandomForestClassifier and that the
# predictions / probabilities are numpy arrays.

passed = []

if isinstance(rf, RandomForestClassifier):
    print('*** passed, well done!')
else:
    print('*** something went wrong, try again')
    passed.append(False)

try:
    check_is_fitted(rf)
    print('*** passed, well done!')
except Exception:
    # any error raised by check_is_fitted counts as a failed check; catching
    # Exception (instead of a bare "except:") still lets KeyboardInterrupt and
    # SystemExit propagate so the cell can be interrupted
    print('*** something went wrong, try again')
    passed.append(False)

if isinstance(ypred, np.ndarray):
    print('*** passed, well done!')
else:
    print('*** something went wrong, try again')
    passed.append(False)

if isinstance(yprob, np.ndarray):
    print('*** passed, well done!')
else:
    print('*** something went wrong, try again')
    passed.append(False)

In [None]:
# plot the ROC via the helper and keep its return value as experiment "rf"

roc_result = roc_wrapper(ytest, ypred, yprob)
exps['rf'] = roc_result

**Ada Boost**

---

In [None]:
# fit
# random_state is fixed so the boosting run is reproducible, consistent with
# the seeded train/test split and logistic regression above
ab = AdaBoostClassifier(n_estimators=n_estimators, random_state=0)
ab.fit(xtrain, ytrain)

# predict
ypred = ab.predict(xtest)
yprob = ab.predict_proba(xtest)


In [None]:
# plot the ROC via the helper and keep its return value as experiment "ab"

roc_result = roc_wrapper(ytest, ypred, yprob)
exps['ab'] = roc_result

**Gradient Boosted Trees**

---

In [None]:
# fit
# random_state is fixed so the boosting run is reproducible, consistent with
# the seeded train/test split and logistic regression above
gb = GradientBoostingClassifier(n_estimators=n_estimators, random_state=0)
gb.fit(xtrain, ytrain)

# predict
ypred = gb.predict(xtest)
yprob = gb.predict_proba(xtest)

In [None]:
# plot the ROC via the helper and keep its return value as experiment "gb"

roc_result = roc_wrapper(ytest, ypred, yprob)
exps['gb'] = roc_result

**Summary**

---

In [None]:
# one column per experiment; transposed for display so each experiment is a row
metric_summary = pd.DataFrame(data=exps)
metric_summary.transpose()

**Feature Importance Summary**

---

In [None]:
# gini feature importance
# Collects "feature_importances_" of every tree-based model into one table
# ("imp_summary": one row per model, one column per feature) and displays it
# with features as rows, sorted by the gradient boosting importance.

models = {}
models['dt'] = dt
models['rf'] = rf
models['ab'] = ab
models['gb'] = gb

# importance vector per model, labelled with the feature names
importances = {}

for k, v in models.items():

    # TODO:
    # Extract "feature_names_in_"
    # and "feature_importances_"
    # from the model in variable "v"

    # TIP
    # names = v...
    # imp = v...
    
    names = v.feature_names_in_ # REMOVE
    imp = v.feature_importances_ # REMOVE

    importances[k] = pd.Series(imp, index=names)

# build the frame once instead of concatenating onto an empty DataFrame inside
# the loop (repeated copying, and concatenation with empty frames is deprecated
# in recent pandas versions)
imp_summary = pd.DataFrame(importances).T

imp_summary.T.sort_values('gb', ascending=False)
