# MNIST Classifier

In this notebook you will create both, an mnist tabular dataset and a classifier.

## 1.- import the Operating System (os) module in python and any other library you need

In [5]:
import os
from PIL import Image
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.svm import SVC
import time
from sklearn import model_selection

## 2.- As you can see each class has its own folder (Do it only for train). 

    - Iterate folder by folder ( os.listdir() )
    - Inside each folder: 
        1.- Read the image
        2.- Reshape it into a flat array (784,)
        3.- Save the data into a pandas dataframe apending the column name as the class
    - Save the data into a CSV

    Note: if it takes to long try doing only 100 images per folder for the CSV.

In [6]:
def img_to_csv():
    # Create an empty dataframe
    df = pd.DataFrame()
    # Base Directory Path
    base_dir = './dataSet/trainingSet/trainingSet/'
    for folder in os.listdir(base_dir):
        #print(folder + "\n")
        # Creating a frame of the images with np.zeros 
        img_frame = np.zeros((len(os.listdir(base_dir+"/"+folder)), 785))
        for num, images in enumerate(os.listdir(base_dir+"/"+folder)):
            #print(num, images)
            # Open the images of 28x28 pixels
            img = Image.open(base_dir+str(folder)+"/"+images)
            img_arr = np.array(img, dtype=float)
            # Change the shape to make single dimension using flatten
            img_arr = img_arr.flatten()
            # Adding target labels in the image frame at first
            img_frame[num][0] = int(folder)
            # Adding remaining features in the image frame
            img_frame[num][1:] = img_arr
            # Converting the array into dataframe
        img_frame = pd.DataFrame(img_frame)
        # Concatenating with each iteration of the target variables/folders [0-9]
        df = pd.concat([df, img_frame])
    df.to_csv("train.csv", index=False)
    return df

## 3.- Load the CSV

In [8]:
df = img_to_csv()

In [9]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,775,776,777,778,779,780,781,782,783,784
0,0.0,3.0,0.0,0.0,3.0,7.0,3.0,0.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4183,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4184,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4185,9.0,0.0,3.0,6.0,1.0,0.0,2.0,2.0,0.0,0.0,...,4.0,7.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4186,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
X = df.iloc[:,1:]
y = df.iloc[:,0]

## 4.- Create a dictionary of models (No preprocessing needed, it has already been done).
    
    Include both, tree models and mult models.

In [13]:
classifiers = {
    "Decision Tree": DecisionTreeClassifier(),
    "GaussianNB": GaussianNB(),
    "Random Forest": RandomForestClassifier(),
    "AdaBoost": AdaBoostClassifier(),
    "SVM": SVC(),
    "KNN": KNeighborsClassifier(),
    "LightGBM": LGBMClassifier()   
}

## 5.- Using either cross validation or stratification find out which is the best model
    - Base your code on the previous two days examples

In [14]:
skf = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

In [15]:
results = []
for name, model in classifiers.items():
    print("Training "+name+" Model")
    start_time = time.time()
    prediction = model_selection.cross_val_predict(model, X, y, cv=skf)
    total_time = time.time() - start_time
    print("Done Training "+name+" Model")
    
    results.append({
        'ModelName': name,
        'Accuracy': metrics.accuracy_score(y, prediction)*100,
        'Bal Acc.': metrics.balanced_accuracy_score(y, prediction)*100,
        'Time': total_time})
    
pd.DataFrame(results)

Training Decision Tree Model
Done Training Decision Tree Model
Training GaussianNB Model
Done Training GaussianNB Model
Training Random Forest Model
Done Training Random Forest Model
Training AdaBoost Model
Done Training AdaBoost Model
Training SVM Model
Done Training SVM Model
Training KNN Model
Done Training KNN Model
Training LightGBM Model
Done Training LightGBM Model


Unnamed: 0,ModelName,Accuracy,Bal Acc.,Time
0,Decision Tree,83.97381,83.790762,78.87711
1,GaussianNB,62.690476,61.976554,5.595452
2,Random Forest,95.92619,95.897088,148.903469
3,AdaBoost,72.071429,71.736004,235.50824
4,SVM,97.557143,97.545225,1117.095925
5,KNN,96.62619,96.574708,93.208538
6,LightGBM,97.1,97.091736,655.01292


## Optional: Can you rotate an image?

In [None]:
img = Image.open('./dataSet/trainingSet/trainingSet/2/img_10000.jpg')

In [None]:
img_rotate = img.rotate(180)

In [None]:
img_rotate.show()