# Classification

Content

- Logistic Regression

    - ...

- Single Decision Trees

    - ...

- Tree Ensemble Methods

ToDo:

- LogReg: ROC curve only from the missing values

- LogReg: The problem with correlated values

Additional Material:

- very cool visualization from [r2d3](http://www.r2d3.us/visual-intro-to-machine-learning-part-1/)

# Central Concepts

**Logistic Regression**

$logreg = \frac{1}{1+\exp(-z)}$

$z = \beta_0 + x_{i1}\beta_1 + ...$


---
Content from the slides

Decision Tree
- Classification or Regression: Explain how!
- CART algorithm
- Purity of a sample? Gini-Index, Entropy, Chi-square, Information Gain
- Pruning

Ensemble Models
- Random Forests
- AdaBoost
- GradientBoostedTrees
---


In [None]:
# The notebook's import cell comes after this one, so the two modules used
# here are imported locally to keep a top-to-bottom run from failing with a
# NameError.
import matplotlib.pyplot as plt
import pandas as pd

# plot theme: set to False to keep matplotlib's default style
dark_plot_theme = True

if dark_plot_theme:
    plt.style.use('dark_background')


# pandas display settings

pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', None)  # None: no limit on displayed columns
pd.set_option('display.max_colwidth', 200)

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pathlib import Path
import seaborn as sns
import sys

from sklearn.model_selection import train_test_split

# models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# classification metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix

from sklearn.metrics import ConfusionMatrixDisplay


In [None]:
# reload ./utils.py

import importlib
import utils
# re-execute the already imported module so that edits to utils.py are
# picked up without restarting the kernel
importlib.reload(utils)
# import after the reload so the name is bound to the reloaded function
from utils import get_dichotomous

# Load Feature Data

assume: no nulls, no outliers (z>3)

In [None]:
# prepare the directory and load the data

# Path() without arguments is the relative path '.'
cwd = Path()

ipath = cwd / 'data'

# create ./data if it does not exist yet; exist_ok=True avoids an error
# when the directory is already there
ipath.mkdir(exist_ok=True)

ifile = ipath / 'features.csv'
# the 'id' column becomes the row index
data = pd.read_csv(ifile, index_col=['id'])

# preview the first three rows
data.head(3)

# Classification Target

In [None]:
# Build the binary target from the rating column, using only rows whose
# rating was not imputed.

var = 'review_scores_rating'
target = 'top_rating'

# rows where the two imputation columns of the rating sum to zero
mask = (data[f'imp_z_{var}'] + data[f'imp_{var}']) == 0

# apply the mask and discard rows that still contain missing values
cdata = data[mask].dropna()

if var in cdata:
    is_top = cdata[var] == 100

    # share of ratings equal to 100, in percent
    rel = is_top.sum() / len(cdata[var]) * 100

    # 1 for a rating of exactly 100, 0 otherwise
    cdata[target] = is_top.astype('uint8')

    # the raw rating is removed once the target has been derived from it
    cdata = cdata.drop(var, axis=1)

In [None]:
# plot basis of the new target: rating histogram of all rows (left) and of
# the rows selected by the mask (right)

fig, axs = plt.subplots(1, 2, figsize=(10, 5))

for ax, frame in zip(axs, (data, data[mask])):
    sns.histplot(data=frame, x=var, ax=ax)

print(f'Top Ratings: {rel:.2f} %')

# Logistic Regression

In [None]:
# numeric columns only
ldata = cdata.select_dtypes(include=np.number)

# drop dichotomous (keep it simple)
cols = get_dichotomous(ldata)
ldata = ldata.drop(columns=cols)

# re-attach the target if the step above removed it
# (presumably it counts as dichotomous — verify against utils.get_dichotomous)
if target not in ldata.columns:
    ldata = ldata.join(cdata[target])


In [None]:
# show the column labels that remain in ldata
ldata.columns

In [None]:
# train test split (fixed seed so the split is reproducible)

y = ldata[target]
x = ldata.drop(columns=target)

xtrain, xtest, ytrain, ytest = train_test_split(x, y, random_state=0)


# apply logistic regression

logreg = LogisticRegression(solver='liblinear', random_state=0)
logreg = logreg.fit(xtrain, ytrain)  # fit() returns the fitted estimator


# make predictions: class probabilities and hard class labels

yprob = logreg.predict_proba(xtest)
ypred = logreg.predict(xtest)

In [None]:
# quick look: sum of the predicted labels, i.e. the number of test rows
# predicted as class 1 for a 0/1 target

ypred.sum()

In [None]:
# plot ROC curve and predicted probabilities

def roc_wrapper(ytest, yprob):
    """Plot the ROC curve next to a histogram of the class-1 probabilities.

    Parameters
    ----------
    ytest : array-like
        True binary labels.
    yprob : 2-D array with at least two columns
        Class probabilities, e.g. the output of ``predict_proba``; column 1
        is used as the score for class 1.

    Returns
    -------
    None
        The figure is created as a side effect.
    """
    # probabilities the model predicts a 1
    prob_pos = yprob[:, 1]

    # calculate the ROC values (the thresholds are not needed for the plot)
    fpr, tpr, _ = roc_curve(ytest, prob_pos)

    fig, axs = plt.subplots(1, 2, figsize=(10, 5))

    # plot ROC curve together with the diagonal of a random classifier
    axs[0].plot(fpr, tpr, label='tpr')
    axs[0].plot([0, 1], [0, 1], label='chance')
    axs[0].set_aspect('equal')
    axs[0].set_xlabel('false positive rate')
    axs[0].set_ylabel('true positive rate')
    # without legend() the line labels are never displayed
    axs[0].legend()

    # distribution of the class-1 probabilities on a fixed [0, 1] x [0, 1] frame
    sns.histplot(prob_pos, ax=axs[1], stat='probability')
    axs[1].set_xlabel('predicted probability of class 1')
    axs[1].set_xlim(0, 1)
    axs[1].set_ylim(0, 1)

In [None]:
# ROC curve and probability histogram for the current test predictions
roc_wrapper(ytest, yprob)

In [None]:
## correlation

#cor = np.abs(ldata.drop(get_dichotomous(ldata), axis=1).corr())

# absolute correlation
#fig, axs = plt.subplots(1, 2, figsize=(16, 6))
#sns.heatmap(cor, annot=False, cmap=plt.cm.Blues, vmin=0, vmax=1, ax=axs[0]);

# absolute correlation > 0.7
#sns.heatmap(cor.where(cor>0.7, other=0), annot=False, cmap=plt.cm.Blues, vmin=0, vmax=1, ax=axs[1]);

This result is bad. As a next step, the `host_id` feature is dropped and the model is fitted again.

In [None]:
# drop the correlated feature

# errors='ignore' keeps the cell re-runnable: once 'host_id' has been removed,
# a second run would otherwise raise a KeyError
ldata = ldata.drop(columns='host_id', errors='ignore')

In [None]:
# train test split (fixed seed so the split is reproducible)

y = ldata[target]
x = ldata.drop(columns=target)
xtrain, xtest, ytrain, ytest = train_test_split(x, y, random_state=0)


# apply logistic regression

logreg = LogisticRegression(solver='liblinear', random_state=0)
logreg = logreg.fit(xtrain, ytrain)  # fit() returns the fitted estimator


# make predictions: class probabilities and hard class labels

yprob = logreg.predict_proba(xtest)
ypred = logreg.predict(xtest)

# plot roc

roc_wrapper(ytest, yprob)

In [None]:
# calculate metrics

# printed label -> scoring function, each called with (ytest, ypred)
cmetrics = {
    'accuracy_score': accuracy_score,
    'f1_score': f1_score,
    'precision_score': precision_score,
    'recall_score': recall_score,
}

for k, v in cmetrics.items():
    metric = v(ytest, ypred)
    # label left-aligned in 16 characters, score with three decimals
    print(f'{k:16} {metric:.3f}')

In [None]:


# area under the ROC curve, computed from the class-1 probabilities
auc = roc_auc_score(ytest, yprob[:, 1])

print(f'AUC: {auc:.2f}')

## Confusion Matrix for one threshold

In [None]:
# plot confusion matrix
# ypred holds the hard labels from predict(), i.e. the default decision
# threshold (probability 0.5 for a binary logistic regression)

# normalize='all' turns the counts into fractions of all test samples
tn, fp, fn, tp = confusion_matrix(ytest, ypred, normalize='all').ravel()

fig, ax = plt.subplots()

# rows: predicted class, columns: actual class, class 1 first;
# vmin=0 together with vmax=1 pins the colour scale to the full [0, 1]
# range instead of letting the lower end depend on the data
sns.heatmap(
    [[tp, fp], [fn, tn]],
    cmap='Blues', vmin=0, vmax=1, annot=True,
    xticklabels=[1, 0], yticklabels=[1, 0], ax=ax,
)

# put the 'Actual' axis on top
ax.xaxis.tick_top()
ax.xaxis.set_label_position('top')
ax.set_xlabel('Actual')
ax.set_ylabel('Predicted');

## Only Imputation Flags

In [None]:
# filter for imputation flags: keep only columns whose name starts with 'imp'

pattern = '^imp.*'

# select the matching columns and add target back in
ldata = cdata.filter(regex=pattern).join(cdata[target])

In [None]:
# train test split (fixed seed so the split is reproducible)

y = ldata[target]
x = ldata.drop(columns=target)
xtrain, xtest, ytrain, ytest = train_test_split(x, y, random_state=0)


# apply logistic regression

logreg = LogisticRegression(solver='liblinear', random_state=0)
logreg = logreg.fit(xtrain, ytrain)  # fit() returns the fitted estimator


# make predictions: class probabilities and hard class labels

yprob = logreg.predict_proba(xtest)
ypred = logreg.predict(xtest)

# plot roc

roc_wrapper(ytest, yprob)

In [None]:
# accuracy is computed from the hard class predictions, AUC from the
# class-1 probabilities
acc = accuracy_score(ytest, ypred)
auc = roc_auc_score(ytest, yprob[:, 1])

print(f'AUC: {auc:.2f}')
print(f'ACC: {acc:.2f}')

---

# Decision Trees

---

# Ensemble Methods