# Classification



In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from sklearn import datasets

from sklearn.naive_bayes import GaussianNB

from sklearn.model_selection import train_test_split

In [None]:
# Fetch version 2 of the 'higgs' dataset from OpenML through scikit-learn.
higgs = datasets.fetch_openml(name='higgs', version=2)

In [None]:
# Wrap the feature data in a DataFrame whose columns carry the dataset's
# feature names, so columns can be selected by name below.
df = pd.DataFrame(higgs.data, columns=higgs.feature_names)

In [None]:
df.head()

In [None]:
df.iloc[-1]

In [None]:
df.columns

In [None]:
# Use only the seven `m_*` columns as features. The final row is dropped
# from both X and y so they stay aligned.
# NOTE(review): presumably the last row is incomplete (see the
# `df.iloc[-1]` inspection above) -- confirm.
feature_columns = ['m_jj', 'm_jjj', 'm_lv', 'm_jlv', 'm_bb', 'm_wbb', 'm_wwbb']
X = df.loc[:, feature_columns].iloc[:-1].values

# `np.int` was deprecated in NumPy 1.20 and removed in 1.24 (it now raises
# AttributeError); the builtin `int` is the documented replacement.
y = higgs.target[:-1].astype(int)

In [None]:
y

Binary Classification: Two Classes (0 and 1)

In [None]:
X.shape, y.shape

In [None]:
# Hold out 20% of the samples for testing. The fixed seed makes the split,
# and therefore every score and curve computed from it, reproducible
# across notebook runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=.8, random_state=0,
)

In [None]:
# Show the GaussianNB documentation. `help()` is used instead of the
# IPython-only `GaussianNB?` syntax so the cell is also valid plain Python.
help(GaussianNB)

In [None]:
GNB = GaussianNB()

In [None]:
GNB.fit(X_train, y_train)

In [None]:
GNB.score(X_train, y_train)

In [None]:
GNB.score(X_test, y_test)

In [None]:
help(GNB.score)

In [None]:
# Predicted class labels for the held-out test set.
y_pred = GNB.predict(X_test)

y_pred

In [None]:
y_test

## Confusion Matrix

Definition:

$$\mathrm{ CM = \left(\begin{matrix}
 \mathrm{tn} & \mathrm{fp} \\ 
 \mathrm{fn} & \mathrm{tp}
\end{matrix}\right) }$$

Where

    tp: True Positive values

    fp: False Positive values

    tn: True Negative values

    fn: False Negative values

In [None]:
y_pred == 0

In [None]:
# Boolean masks over the test-set predictions.
p = (y_pred == 1)  # predicted positive (class 1)
n = (y_pred == 0)  # predicted negative (class 0)

In [None]:
# Boolean masks over the true test-set labels.
rp = (y_test == 1)  # actually positive (class 1)
rn = (y_test == 0)  # actually negative (class 0)

In [None]:
# Each count is the size of the overlap between a predicted-class mask and
# an actual-class mask.
tp, tn = (p & rp).sum(), (n & rn).sum()  # correct: true positives, true negatives
fp, fn = (p & rn).sum(), (n & rp).sum()  # errors: false positives, false negatives

In [None]:
tp, tn, fp, fn

In [None]:
(tp + tn + fp + fn) == len(y_test)

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
# Confusion matrix from scikit-learn, to cross-check the manual counts.
# The comparison cell below indexes it as [[tn, fp], [fn, tp]].
cm = confusion_matrix(y_test, y_pred)

cm

In [None]:
# Compare the manually computed counts with the matching confusion-matrix
# entries; all four comparisons are expected to be True.
tp == cm[1, 1], tn == cm[0, 0], fp == cm[0, 1], fn == cm[1, 0]

In [None]:
y_test

In [None]:
y_proba = GNB.predict_proba(X_test)

y_proba

In [None]:
# Keep only the second column of the probability matrix.
# NOTE(review): presumably this is P(class 1), given the 0/1 labels --
# confirm against `GNB.classes_`.
# This rebinds `y_proba` from 2-D to 1-D, so running the cell a second time
# without recomputing `predict_proba` will fail on the `[:, 1]` index.
y_proba = y_proba[:, 1]

y_proba

In [None]:
# Show the np.unique documentation. `help()` is used instead of the
# IPython-only `np.unique?` syntax so the cell is also valid plain Python.
help(np.unique)

In [None]:
# Sorted predicted probabilities, used below as candidate decision thresholds.
probas = np.sort(y_proba)

In [None]:
probas

In [None]:
probas.shape

In [None]:
# Keep every 10th sorted probability so fewer confusion matrices have to be
# computed in the threshold scan below. Re-running this cell subsamples
# `probas` again.
probas = probas[::10]

probas.shape

In [None]:
# One confusion matrix per candidate threshold, stacked along axis 0.
# A sample is predicted positive when its probability is strictly above
# the threshold.
cms = np.stack(
    [confusion_matrix(y_test, y_proba > cutoff) for cutoff in probas]
)

In [None]:
# Slice the stacked matrices into one count series per outcome, following
# the [[tn, fp], [fn, tp]] layout of each 2x2 matrix.
tns, fps = cms[:, 0, 0], cms[:, 0, 1]
fns, tps = cms[:, 1, 0], cms[:, 1, 1]

In [None]:
# Plot how each confusion-matrix count changes with the decision threshold.
plt.plot(probas, tps, label='True Positive')
plt.plot(probas, fps, label='False Positive')
plt.plot(probas, tns, label='True Negative')
plt.plot(probas, fns, label='False Negative')

plt.legend()

# The axis label was previously misspelled 'Treshold', and the y axis had
# no label at all.
plt.xlabel('Threshold')
plt.ylabel('Count')

Accuracy:

$$ \mathrm{Accuracy = \frac{tp+tn}{tp+tn+fp+fn}} $$

In [None]:
# Accuracy at each threshold: correct predictions over all predictions.
acc = (tps + tns) / (tps + fps + tns + fns)

In [None]:
plt.plot(probas, acc)

In [None]:
# Highest accuracy over the sampled thresholds, and the threshold that
# achieves it.
acc.max(), probas[acc.argmax()]

## Receiver Operating Characteristic Curve (ROC curve)

True Positive Rate:

$$ \mathrm{TPR = \frac{tp}{tp + fn}} $$

False Positive Rate:

$$ \mathrm{FPR = \frac{fp}{fp + tn}} $$

In [None]:
from sklearn.metrics import roc_curve

In [None]:
# ROC curve points for the GNB probabilities. The third return value (the
# thresholds, bound to the misspelled name `treshold`) is not used in the
# cells visible here.
fpr, tpr, treshold = roc_curve(y_test, y_proba)

In [None]:
# ROC curve for the Gaussian Naive Bayes classifier.
plt.xlabel('FPR')
plt.ylabel('TPR')

plt.plot(fpr, tpr, label='GNB')

plt.legend()

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
KNR = KNeighborsClassifier(n_neighbors=5)

In [None]:
KNR.fit(X_train, y_train)

In [None]:
# Probability column at index 1 from the k-nearest-neighbours classifier,
# selected the same way as for GNB.
y_proba_knr = KNR.predict_proba(X_test)[:, 1]

In [None]:
# Overlay the ROC curves of both classifiers on the same axes. GNB is
# drawn last, so `fpr`/`tpr` end up holding the GNB curve.
for name, scores in (('KNR', y_proba_knr), ('GNB', y_proba)):
    fpr, tpr, _ = roc_curve(y_test, scores)
    plt.plot(fpr, tpr, label=name)

plt.xlabel('FPR')
plt.ylabel('TPR')

plt.legend()

Score: Area Under the ROC Curve (AUC)

Perfect Classifier: AUC = 1

In [None]:
from sklearn.metrics import roc_auc_score

In [None]:
# Area under the ROC curve, in the order (GNB, KNR).
roc_auc_score(y_test, y_proba), roc_auc_score(y_test, y_proba_knr)

## Precision-Recall Curve

Precision:

$$ \mathcal{P} = \mathrm{\frac{tp}{tp + fp}} $$

Recall (True Positive Rate):

$$ \mathcal{R} = \mathrm{\frac{tp}{tp + fn}} $$

In [None]:
from sklearn.metrics import precision_recall_curve

In [None]:
# Precision-recall curves for both classifiers. The results are bound to
# `precision`/`recall` rather than `y`/`x`, because assigning to `y` would
# overwrite the label vector defined earlier in the notebook.
precision, recall, _ = precision_recall_curve(y_test, y_proba_knr)

plt.plot(recall, precision, label='KNR')

precision, recall, _ = precision_recall_curve(y_test, y_proba)

plt.plot(recall, precision, label='GNB')

plt.xlabel('Recall')
plt.ylabel('Precision')

plt.legend()

Score: Average Precision (approximately the Area Under the Precision-Recall Curve)

$$ \text{AP} = \sum_n (\mathcal{R}_n - \mathcal{R}_{n-1}) \mathcal{P}_n $$

In [None]:
from sklearn.metrics import average_precision_score

In [None]:
# Average precision, in the order (GNB, KNR).
average_precision_score(y_test, y_proba), average_precision_score(y_test, y_proba_knr)