# MNIST Classifier

In this notebook you will create both, an mnist tabular dataset and a classifier.

## 1.- import the Operating System (os) module in python and any other library you need

In [1]:
import os
from PIL import Image
import numpy as np
import pandas as pd

## 2.- As you can see each class has its own folder (Do it only for train). 

    - Iterate folder by folder ( os.listdir() )
    - Inside each folder: 
        1.- Read the image
        2.- Reshape it into a flat array (784,)
        3.- Save the data into a pandas dataframe apending the column name as the class
    - Save the data into a CSV

    Note: if it takes to long try doing only 100 images per folder for the CSV.

In [2]:
directory = 'D:/AI-Course/AI-Engineering/AI-Engineering/Chapter 2/12. Images/archive/trainingSet/trainingSet/'

dir_list = os.listdir(directory)                  # determine the number of folder in the given directory

df1 = pd.DataFrame()                              # initial DF

for file in dir_list:
    imgs = os.listdir(directory+file)             # reading from directory --> Files, the images.

    arr = np.zeros((len(imgs),785))               # builds an array with 0 according to the dimensions of image and 785 (1+784)
    for i,img in enumerate(imgs):
        imag = Image.open(directory+file+'/'+img) # in iteration loop, open every image
        arry = np.array(imag,dtype=float)         
        # print(arry.shape)                       # to get the actual shape
        arry = arry.flatten()                     # changes the shape to (784,)
        arr[i,:784]=arry                          # reading from first column till 783 column
        arr[i,784]=int(file)                      # reading the last column 784
    df2 = pd.DataFrame(data=arr)                  # building a temporary DF
    df = pd.concat([df1,df2])                     # merging the results in original DF
    df1 = df
    #print(df.shape)

df1.to_csv('data.csv',index=False, header = False)

## 3.- Load the CSV

In [3]:
values = pd.read_csv ('data.csv',header = None)
X = values.iloc[:,:-1]
y = values.iloc[:,-1]
values.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,775,776,777,778,779,780,781,782,783,784
0,3.0,0.0,0.0,3.0,7.0,3.0,0.0,3.0,0.0,11.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 4.- Create a dictionary of models (No preprocessing needed, it has already been done).
    
    Include both, tree models and mult models.

In [4]:
from sklearn.ensemble         import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree             import DecisionTreeClassifier
from catboost                 import CatBoostClassifier
from xgboost                  import XGBClassifier
from lightgbm                 import LGBMClassifier
from sklearn.svm              import SVC
from sklearn.linear_model     import LogisticRegression
from sklearn.neighbors        import KNeighborsClassifier
from sklearn.model_selection  import train_test_split
from sklearn                  import metrics
import time

tree_classifiers = {
  "Random Forest":        RandomForestClassifier(n_estimators=100),
  "AdaBoost":             AdaBoostClassifier(n_estimators=100),
  "Skl GBM":              GradientBoostingClassifier(n_estimators=100),
  "Decision Tree":        DecisionTreeClassifier(),
  "CatBoost":             CatBoostClassifier(n_estimators=100),
  "XGBoost":              XGBClassifier(n_estimators=100),
  "LightGBM":             LGBMClassifier(n_estimators=100),
  "SVM":                  SVC(),
  "Logistic Regression":  LogisticRegression(),
  "KNN":                  KNeighborsClassifier(n_neighbors=3)
}


## 5.- Using either cross validation or stratification find out which is the best model
    - Base your code on the previous two days examples

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=0, stratify=y)


def predictions(data):
    results = pd.DataFrame(columns=['Model','Accuracy[%]','Bal Accuracy [%]','Time [s]'])

    for model_name, model in tree_classifiers.items():
        start_time = time.time()

        model.fit(X_train,y_train) 

        pred = model.predict(data) # get the predicitons using x values

        total_time = time.time() - start_time

        results = results.append({"Model":model_name, "Accuracy[%]": metrics.accuracy_score(y_test, pred)*100, "Bal Accuracy [%]": metrics.balanced_accuracy_score(y_test, pred)*100,"Time [s]":total_time},ignore_index=True)
    return results
final_results = predictions(X_test)
final_results

Learning rate set to 0.5
0:	learn: 1.4307139	total: 6.14s	remaining: 10m 8s
1:	learn: 1.0650109	total: 10.8s	remaining: 8m 48s
2:	learn: 0.8672597	total: 14.7s	remaining: 7m 55s
3:	learn: 0.7117051	total: 18.7s	remaining: 7m 29s
4:	learn: 0.6363522	total: 22.8s	remaining: 7m 12s
5:	learn: 0.5581984	total: 26.8s	remaining: 6m 59s
6:	learn: 0.5070194	total: 30.8s	remaining: 6m 48s
7:	learn: 0.4589989	total: 34.7s	remaining: 6m 39s
8:	learn: 0.4270370	total: 38.8s	remaining: 6m 31s
9:	learn: 0.3959495	total: 42.7s	remaining: 6m 24s
10:	learn: 0.3674313	total: 46.7s	remaining: 6m 17s
11:	learn: 0.3421192	total: 50.7s	remaining: 6m 12s
12:	learn: 0.3208680	total: 54.8s	remaining: 6m 6s
13:	learn: 0.3016118	total: 59.1s	remaining: 6m 3s
14:	learn: 0.2904509	total: 1m 3s	remaining: 5m 57s
15:	learn: 0.2765036	total: 1m 7s	remaining: 5m 51s
16:	learn: 0.2673874	total: 1m 10s	remaining: 5m 46s
17:	learn: 0.2594287	total: 1m 14s	remaining: 5m 41s
18:	learn: 0.2534564	total: 1m 18s	remaining: 5m 





STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Unnamed: 0,Model,Accuracy[%],Bal Accuracy [%],Time [s]
0,Random Forest,95.833333,95.80515,43.020005
1,AdaBoost,73.130952,72.836665,175.419226
2,Skl GBM,93.785714,93.725907,3130.605723
3,Decision Tree,83.940476,83.73995,19.304379
4,CatBoost,93.833333,93.790959,390.651377
5,XGBoost,96.821429,96.810514,561.856497
6,LightGBM,97.202381,97.186757,103.048588
7,SVM,97.547619,97.524004,138.337261
8,Logistic Regression,91.72619,91.634824,9.015657
9,KNN,96.25,96.194329,13.198211


## Optional: Can you rotate an image?