<a href="https://colab.research.google.com/github/marziyeh-sa/Machine_Learning_Fall2020/blob/main/Hands_On/HO2_Task_Classification_ML2020.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

![alt text](Capture8.png "Title")

###  <font color=red>Working with MNIST Dataset</font> ###

__In this question, you are given the MNIST dataset, which consists of 70,000 images of digits handwritten by high school students and employees of the US Census Bureau.__

You are supposed to perform a binary classification task, separating the digit 4 from all other digits (0, 1, 2, 3, 5, 6, 7, 8, 9). The data can be downloaded using the code below:

In [None]:
import numpy as np
import matplotlib.pyplot as plt

In [None]:
from sklearn.datasets import fetch_openml

# This will take less than a minute to run.

# as_frame=False forces plain NumPy arrays. Newer scikit-learn releases return a
# pandas DataFrame for this dataset by default, which breaks the positional
# indexing (`X[i].reshape(28, 28)`) used by the cells below.
X, y = fetch_openml('mnist_784', version=1, return_X_y=True, as_frame=False)

### <font color=green>1:</font> ###

__In the first step, you are going to get acquainted with the dataset.__

Plot three randomly selected digits from the dataset.

In [None]:
# Work on a NumPy view of the pixels so positional row indexing is valid even
# if X was loaded as a pandas DataFrame.
digits = np.asarray(X)

# randint samples from the half-open interval [low, high); low must be 0,
# otherwise the very first sample could never be selected.
rnd = np.random.randint(0, len(digits), 3)

for idx in rnd:
    # Each sample is a flat vector of 784 pixels -> reshape to a 28x28 image.
    img = digits[idx].reshape((28,28))
    plt.imshow(img, cmap="Greys")
    plt.show()


### <font color=green>2:</font> ###
__In the second step, you should build your dataset. Do as follows:__
* Split your training and testing dataset (80% and 20%).
* Set the random_state to 10.
* As your goal is to identify digit 4, create the target vectors for this classification task  <font color=red>(note that you are training a binary classifier)</font>

In [None]:
# split data 
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=10,
)



In [None]:
# fetch_openml delivers the MNIST labels as strings ('0' ... '9'). Comparing
# them with the integer 4 yields an all-False vector, so cast to int first.
y_train_4 = (np.asarray(y_train).astype(int) == 4)
y_test_4 = (np.asarray(y_test).astype(int) == 4)
print (y_train_4)

### <font color=green>3:</font> ###
__Do the classification task using a <font color=red>Gaussian Naive Bayes Classifier</font>__

In [None]:
from sklearn.naive_bayes import GaussianNB   

# Gaussian Naive Bayes with scikit-learn's default settings.
clf = GaussianNB()

# Train on the training split against the binary "is this a 4?" target.
clf.fit(X=X_train, y=y_train_4)

In [None]:
y_pred = clf.predict(X_test)

from sklearn.metrics import accuracy_score

# The classifier was fitted on y_train_4, so its predictions must be scored
# against the matching binary test target, not the original 10-class labels.
print("The accuracy of the model is: %.1f%%" % (accuracy_score(y_test_4, y_pred)*100))

### <font color=green>4:</font> ###
__Calculate the following values:__
* Confusion Matrix
* Accuracy
* Sensitivity
* Specificity
* Precision
* Recall

In [None]:
# Confusion Matrix

from sklearn.metrics import confusion_matrix

# Compare the predictions with the binary target they were trained for.
# Rows are the true class, columns the predicted class, ordered [not 4, 4].
confusion_mtx = confusion_matrix(y_test_4, y_pred)

print(confusion_mtx)

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import itertools

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Draw a confusion matrix as an annotated heat map.

    Parameters
    ----------
    cm : array-like of shape (n_classes, n_classes)
        Confusion matrix; rows are true labels, columns are predicted labels.
    classes : sequence
        Tick labels, one per class, in the same order as the rows of `cm`.
    normalize : bool, default False
        If True, every row is divided by its sum so cells show per-class rates.
    title : str
        Figure title.
    cmap : matplotlib colormap
        Colormap used for the heat map.

    Returns
    -------
    None. The figure is drawn on a new matplotlib figure.
    """
    cm = np.asarray(cm)
    if normalize:
        # Normalise BEFORE drawing so the colours and the cell text agree.
        # Rows without any sample are left at 0 instead of dividing by zero.
        row_totals = cm.sum(axis=1, keepdims=True)
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)

    plt.figure(figsize = (5,5))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=90)
    plt.yticks(tick_marks, classes)

    # Integer counts are printed as-is, rates with two decimals.
    fmt = 'd' if np.issubdtype(cm.dtype, np.integer) else '.2f'
    # Cells darker than half of the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Lay out after the axis labels exist so they are taken into account.
    plt.tight_layout()

In [None]:
# The task is binary (digit 4 vs. everything else), so there are exactly two
# classes. Order follows the sorted boolean labels: False ("not 4"), True ("4").
class_names = ['not 4', '4']

In [None]:
# Render the confusion matrix computed above with the chosen tick labels.
plot_confusion_matrix(cm=confusion_mtx, classes=class_names)

In [None]:
from sklearn.metrics import classification_report

# Report against the binary target the classifier was trained on. The two
# target names follow the sorted boolean labels: False -> 'not 4', True -> '4'.
print(classification_report(y_test_4, y_pred, target_names=['not 4', '4']))

In [None]:
# One-vs-rest counts for every class, derived from the confusion matrix.
# Each quantity is an array with one entry per class (row/column of the matrix).
TP = np.diag(confusion_mtx)                  # correctly predicted as the class
FP = confusion_mtx.sum(axis=0) - TP          # predicted as the class, but wrong
FN = confusion_mtx.sum(axis=1) - TP          # belongs to the class, but missed
TN = confusion_mtx.sum() - (FP + FN + TP)    # everything else

# Shared denominators.
actual_pos = TP + FN
actual_neg = TN + FP
predicted_pos = TP + FP
predicted_neg = TN + FN

# True positive rate (a.k.a. sensitivity, hit rate, recall).
Sensitivity = TP / actual_pos
# True negative rate.
Specificity = TN / actual_neg
# Positive predictive value.
Precision = TP / predicted_pos
# Recall is the same quantity as sensitivity.
Recall = TP / actual_pos
# Negative predictive value.
NPV = TN / predicted_neg
# False positive rate (fall-out).
FPR = FP / actual_neg
# False negative rate.
FNR = FN / actual_pos
# False discovery rate.
FDR = FP / predicted_pos

# Overall accuracy.
ACC = (TP + TN) / (actual_pos + actual_neg)

for caption, values in (
    ('Accuracy = ', ACC),
    ('Sensitivity = ', Sensitivity),
    ('Specificity = ', Specificity),
    ('Precision = ', Precision),
    ('Recall = ', Recall),
):
    print(caption, values)


### <font color=green>5:</font> ###
__Now plot <font color=red>the ROC curve</font>, then calculate <font color=red>Area Under ROC (AUROC)</font>__

In [None]:

# Numeric copy of the original 10-class labels (fetch_openml yields strings).
# np.float was removed from NumPy, so use an explicit dtype.
yt = np.asarray(y_test).astype(np.float64)

# Binary ground truth for the ROC analysis: 1.0 where the digit is a 4.
y_t = (yt == 4).astype(np.float32)

# ROC curves need continuous scores, not hard predictions. Column 1 of
# predict_proba is the probability of the positive class (classes_ are sorted,
# i.e. [False, True] for a boolean target).
y_p = clf.predict_proba(X_test)[:, 1]

print (y_test)
print(yt)
print(y_p)

In [None]:
from sklearn.metrics import  roc_curve
from sklearn.metrics import roc_auc_score

def plot_ROC (y_test , y_pred):
    """Plot the ROC curve of a binary classifier, annotated with its AUC.

    Parameters
    ----------
    y_test : array-like
        Binary ground-truth labels. A single-column DataFrame or (n, 1)
        array is accepted and flattened.
    y_pred : array-like
        Scores for the positive class (e.g. predicted probabilities).

    Side effects
    ------------
    Saves the figure as 'Log_ROC' in the working directory and shows it.
    """
    # Flatten so column-shaped inputs do not trigger shape warnings/errors.
    y_true = np.ravel(y_test)
    y_score = np.ravel(y_pred)

    # Binary problem: the multi-class options are not applicable here.
    roc_auc = roc_auc_score(y_true, y_score)
    # calculate roc curve
    fpr, tpr, thresholds = roc_curve(y_true, y_score)

    plt.figure()
    plt.plot(fpr, tpr, label='Gaussian Naive Bayes (area = %0.2f)' % roc_auc)
    # Diagonal = performance of a random classifier.
    plt.plot([0 , 1],[0 , 1], 'r--')
    plt.ylim([0.0 ,1.05])
    plt.xlim([0.0 ,1.0])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc = "lower right")
    plt.savefig('Log_ROC')
    # The original referenced plt.show without calling it, which is a no-op.
    plt.show()


In [None]:
import pandas as pd

# Wrap the ground-truth vector in a single-column DataFrame and display it.
yy = pd.DataFrame(data=y_t)
yy

In [None]:

# Draw the ROC curve for the ground truth / score pair prepared above.
plot_ROC(y_test=yy, y_pred=y_p)

In [None]:
from sklearn.metrics import roc_curve

# y_train_5 / y_scores were never defined in this notebook (NameError).
# Use the binary test target and the positive-class probability instead.
fpr, tpr, thresholds = roc_curve(y_test_4, clf.predict_proba(X_test)[:, 1])

def plot_roc_curve(fpr, tpr, label=None):
    """Draw a ROC curve on a new 12x8 figure, with the chance diagonal.

    fpr, tpr : sequences of false/true positive rates (x and y coordinates).
    label    : optional label attached to the curve's line.
    """
    plt.figure(figsize=(12, 8))
    axes = plt.gca()
    axes.set_title('ROC curve')
    # The classifier's curve, followed by the dashed diagonal of a random guess.
    axes.plot(fpr, tpr, linewidth=2, label=label)
    axes.plot([0, 1], [0, 1], "k--")
    axes.set_xlim([0, 1])
    axes.set_ylim([0, 1])
    axes.set_xlabel('False positive rate')
    axes.set_ylabel('True positive rate')

# Plot the curve computed above and render the figure.
plot_roc_curve(fpr=fpr, tpr=tpr)
plt.show()

In [None]:
from sklearn.metrics import make_scorer
from sklearn.metrics import roc_auc_score
# Wrap roc_auc_score as a scorer object that is evaluated on predicted
# probabilities; multi_class='ovo' is forwarded to roc_auc_score.
# NOTE(review): `myscore` is not used by any other cell visible here.
# NOTE(review): newer scikit-learn releases appear to replace `needs_proba`
# with `response_method` — TODO confirm against the installed version.
myscore = make_scorer(roc_auc_score, multi_class='ovo',needs_proba=True)
myscore

In [None]:
# "No skill" baseline: the same constant score for every sample.
ns_probs = [0 for _ in range(len(y_t))]
# calculate scores (binary problem, so no multi-class options are needed)
ns_auc = roc_auc_score(y_t, ns_probs)
lr_auc = roc_auc_score(y_t, y_p)
# summarize scores
print('No Skill: ROC AUC=%.3f' % (ns_auc))
# The model evaluated here is Gaussian Naive Bayes, not logistic regression.
print('Gaussian NB: ROC AUC=%.3f' % (lr_auc))
# calculate roc curves
ns_fpr, ns_tpr, _ = roc_curve(y_t, ns_probs)
lr_fpr, lr_tpr, _ = roc_curve(y_t, y_p)
# plot the roc curve for the model
# (matplotlib.pyplot is imported as `plt` in this notebook; `pyplot` was undefined)
plt.plot(ns_fpr, ns_tpr, linestyle='--', label='No Skill')
plt.plot(lr_fpr, lr_tpr, marker='.', label='Gaussian NB')
# axis labels
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
# show the legend
plt.legend()
# show the plot
plt.show()

In [None]:
# Area under the ROC curve for each class of the binary classifier, treating
# that class as the positive one. The original cell imported a module that
# does not exist in scikit-learn, indexed the 1-D prediction vector with
# [:, 1], and tried to add a float to a list.
class_probs = clf.predict_proba(X_test)   # one column per entry of clf.classes_
truth = np.asarray(y_test_4)

roc = {}
for column, label in enumerate(clf.classes_):
    roc[label] = roc_auc_score(truth == label, class_probs[:, column])
roc

In [None]:
# calculate AUC
# Use the binary target and the positive-class probability: the original call
# mixed the 10-class string labels with hard boolean predictions.
auc = roc_auc_score(y_test_4, clf.predict_proba(X_test)[:, 1])
print('AUC: %.3f' % auc)

### <font color=green>6:</font> ###
__Finally, apply a 5-fold cross validation, and report your <font color=red>mean and std values</font>.__

In [None]:
from sklearn.model_selection import cross_val_score

# Cross-validate the same binary task as above (digit 4 vs. rest). Passing the
# raw `y` would silently evaluate a 10-class problem instead. Labels arrive as
# strings, hence the cast before the comparison.
y_is_4 = (np.asarray(y).astype(int) == 4)

acc = cross_val_score(clf, X, y_is_4, cv=5, scoring='accuracy')
print(acc)

print("Our accuracy is: %.2f%% +- %.2f%%" %(np.mean(acc)*100,np.std(acc)*100))

### <font color=green>7:</font> ###
__How are your results?__

__Is <font color=red>Gaussian Naive Bayes Classifier</font> a good one for this problem?__

 خیر؛ همان‌گونه که مشاهده می‌کنیم، دقت این طبقه‌بند بسیار پایین و خطای آن بالاست. با مشاهدهٔ خطای اعتبارسنجی (validation) نیز می‌بینیم که خطا بالاست.

### __Repeat all the steps using <font color=green>Nearest Centroid Classifier</font> and <font color=green>Logistic Regression Classifier</font>.__ ### 
* You just need to make some changes to step 3!
* You should report all the values in steps 4, 5, and 6

### __Which one is better? Report it.__ ###

_________________________________________________________