# MNIST classification (Cont'd)

In [None]:
import numpy as np
import os

# to make this notebook's output stable across runs:
# seed NumPy's global random number generator once, before anything draws from it
np.random.seed(42)

# MNIST

**Warning:** since Scikit-Learn 0.24, `fetch_openml()` returns a Pandas `DataFrame` by default. To avoid this and keep the same code as in the book, we use `as_frame=False`.

In [None]:
from sklearn.datasets import fetch_openml

# Fetch version 1 of the 'mnist_784' dataset from OpenML.
# as_frame=False keeps the data as arrays instead of a Pandas DataFrame.
mnist = fetch_openml(name='mnist_784', version=1, as_frame=False)

# Show which fields the returned container exposes.
mnist.keys()

In [None]:
# Features (pixel values) and labels, pulled out of the fetched dataset.
X = mnist["data"]
y = mnist["target"]

In [None]:
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt

some_digit = X[0]
some_digit_image = some_digit.reshape(28, 28)
plt.imshow(some_digit_image, cmap=mpl.cm.binary)
plt.axis("off")
plt.show()

In [None]:
# Label of the first sample (the same sample X[0] that was plotted as an image)
y[0]

In [None]:
# Convert the labels to unsigned 8-bit integers so they can be compared with
# numbers later on (e.g. `y_train == 5`).
# NOTE(review): presumably fetch_openml returns the labels as strings — confirm from the y[0] output.
y = y.astype(np.uint8)

In [None]:
# The first 60,000 samples form the training set; everything after that is the test set.
X_train, X_test = X[:60000], X[60000:]
y_train, y_test = y[:60000], y[60000:]

In [None]:
# scale the input
from sklearn.preprocessing import StandardScaler

# Standardize each feature: centre it on zero (mean) with a standard deviation of 1.
# The statistics are learned from the training set only and then applied to both splits.
sc = StandardScaler().fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Multi-class baseline: a single decision tree trained on the original digit labels.
# random_state is fixed so that refitting produces the same tree.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)


In [None]:
from sklearn.metrics import classification_report

# Evaluate the decision tree on the held-out test set.
y_test_pred = dt.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_test_pred))

In [None]:
from sklearn.metrics import confusion_matrix

# Rows are the true classes, columns are the predicted classes.
confusion_matrix(y_true=y_test, y_pred=y_test_pred)

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Same multi-class task as the decision tree, now with a random forest.
# random_state is fixed so that refitting produces the same forest.
rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_train, y_train)

In [None]:
from sklearn.metrics import classification_report

# Evaluate the random forest on the held-out test set.
# Note: this rebinds y_test_pred, which previously held the decision tree's predictions.
y_test_pred = rfc.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_test_pred))

In [None]:
from sklearn.metrics import confusion_matrix

# Rows are the true classes, columns are the predicted classes.
confusion_matrix(y_true=y_test, y_pred=y_test_pred)

### Simplify to a binary classifier
Implement a _5 detector_.

Prepare a data set for binary classification: 5 or not 5.

In [None]:
# Boolean targets for the 5-detector: True where the label is 5, False otherwise.
y_train_5, y_test_5 = (y_train == 5), (y_test == 5)

Experiment with a decision tree classifier.


In [None]:
from sklearn.tree import DecisionTreeClassifier

# Binary "5 vs. not-5" decision tree.
# random_state is pinned, like for the other classifiers in this notebook. Without it
# the tree draws from NumPy's global random state, so the fitted tree (and every metric
# computed from it) would depend on what was executed before this cell and would change
# when the cell is re-run.
dt_clf = DecisionTreeClassifier(random_state=42)
dt_clf.fit(X_train, y_train_5)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Predictions of the 5-detector on the test set.
y_test_5_pred = dt_clf.predict(X_test)

# Report each metric on its own line, in the order accuracy, precision, recall.
for _label, _metric in (
    ("Accuracy score: ", accuracy_score),
    ("Precision score: ", precision_score),
    ("Recall score: ", recall_score),
):
    print(_label, _metric(y_test_5, y_test_5_pred))

In [None]:
from sklearn.metrics import confusion_matrix

# 2x2 confusion matrix of the 5-detector: rows are the true class (not-5, 5),
# columns are the predicted class (not-5, 5).
CM = confusion_matrix(y_test_5, y_test_5_pred)
print(CM)

# Flattening row by row yields the four cells in the order
# [0][0], [0][1], [1][0], [1][1], i.e. TN, FP, FN, TP.
TN, FP, FN, TP = CM.ravel()

In [None]:
# One count per line, in the order TN, FN, TP, FP.
print(TN, FN, TP, FP, sep="\n")

#### Task 1. Calculate True Positive Rate (recall) and False Positive Rate 

[predict_proba() in decision tree](https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html?highlight=predict_proba#sklearn.tree.DecisionTreeClassifier.predict_proba)

In [None]:
# Full predict_proba output: one row per test sample, one column per class.
# It is stored under its own name so that `y_test_5_scores` only ever holds the
# 1-D positive-class scores that roc_curve expects. Reusing that name for this
# 2-D matrix would break roc_curve if the following cell were skipped.
y_test_5_proba = dt_clf.predict_proba(X_test)
y_test_5_proba

In [None]:
# We are interested in the positive ("is a 5") column, so we take all rows and only the second column.
# NOTE(review): this relies on the column order following dt_clf.classes_ (False, then True) — confirm with dt_clf.classes_.
y_test_5_scores = dt_clf.predict_proba(X_test)[:,1]
y_test_5_scores

In [None]:
# roc_curve was never imported in this notebook; without this import the cell
# raises NameError on a fresh kernel.
from sklearn.metrics import roc_curve

# False-positive and true-positive rates of the 5-detector at each score threshold.
fpr, tpr, thresholds = roc_curve(y_test_5, y_test_5_scores)
print(thresholds)
print(tpr)
print(fpr)

In [None]:
# auc was never imported in this notebook; without this import the cell
# raises NameError on a fresh kernel.
from sklearn.metrics import auc

# calculate AUC: the area under the (fpr, tpr) curve
roc_auc = auc(fpr, tpr)

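Now plot the ROC curve from the `fpr` and `tpr` values computed above. The curve has only 3 points because a fully grown decision tree typically predicts probabilities of exactly 0 or 1, which leaves a single meaningful threshold between the two end points.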

In [None]:
# Plot ROC
fig, ax = plt.subplots()

# The classifier's curve, with its AUC shown in the legend entry.
ax.plot(fpr, tpr, color='darkorange', lw=2,
        label='ROC curve (AUC = %0.2f)' % roc_auc)
# Dashed diagonal from (0, 0) to (1, 1) as a visual reference.
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')

ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])  # a little headroom above TPR = 1
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve')
ax.legend(loc="lower right")
plt.show()

#### Task 2. Experiment with sklearn logistic regression classifier
[LogisticRegression()](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html)

You will train a logistic regression classifier for the 5 and not-5 classification problem, and plot the ROC curve. 

In [None]:
from sklearn.linear_model import LogisticRegression