# TP2: Modelado sobre dataset Fashion MNIST

## Librerías

Fuente: https://pjreddie.com/projects/mnist-in-csv/

In [2]:
import pandas as pd
import os
import gzip
import numpy as np
from tqdm import tqdm
import requests

from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import roc_auc_score

Correr una sola vez para descargar todos los datos y luego comentar todo el bloque.

In [2]:
# One-off download of the four Fashion-MNIST .gz files into ./data.
# The code below is intentionally kept commented out so that "Run All" does not
# download the data again: uncomment it, run it once, then comment it back out.
#
# def download_gz_from_url(url, output_folder): 
#     file_name = url.split("/")[-1]
#     r = requests.get(url, stream=True)
#     with open(output_folder+"/"+file_name, 'wb') as f:
#         for chunk in r.raw.stream(1024, decode_content=False):
#             if chunk:
#                 f.write(chunk)

# output_folder= "./data"
                
# from pathlib import Path
# Path(output_folder).mkdir(parents=True, exist_ok=True)
                
# urls = ["http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz", #x_train
#         "http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-labels-idx1-ubyte.gz", #y_train
#         "http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz", #x_test
#         "http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-labels-idx1-ubyte.gz" #y_test
#        ]
                
# for url in tqdm(urls):
#     download_gz_from_url(url,output_folder)

## Carga de Datos

In [3]:
# Load the 'train' and 't10k' (test) splits from the local data/ folder using
# the project's reader module.
import mnist_fashion_reader as mnist

X_train, y_train = mnist.load_mnist('data/', kind='train') # returns friendly pandas objects (per the original author's note)
X_test, y_test = mnist.load_mnist('data/', kind='t10k') # returns friendly pandas objects (per the original author's note)

## Punto 1: Exploración del dataset

Realizar una breve descripción del dataset. ¿Qué cantidad de datos tiene este conjunto? ¿Cuántos atributos? ¿Qué rangos de valores toman los atributos? ¿Qué cantidad de clases hay para clasificar? ¿Están balanceadas? Realizar un gráfico donde se muestran ejemplos de las clases (Puede ir a un anexo).

### Cantidad de datos del conjunto

In [4]:
# Put the labels next to the features of each split (column-wise), then stack
# both splits row-wise into a single frame covering the whole dataset.
df_entrenamiento = pd.concat(
    [X_train.reset_index(drop=True), y_train],
    axis="columns",
)
df_test = pd.concat(
    [X_test.reset_index(drop=True), y_test],
    axis="columns",
)
df_completo = pd.concat([df_entrenamiento, df_test], axis="index")

In [5]:
# (rows, columns) of the training split, the test split and their union.
print(
    'Entrenamiento:', df_entrenamiento.shape,
    '  +  ',
    'Test:', df_test.shape,
    '  -->  ',
    'Completo:', df_completo.shape,
)

Entrenamiento: (60000, 785)   +   Test: (10000, 785)   -->   Completo: (70000, 785)


In [6]:
# Number of rows in the combined frame (one row per image).
print(f"El dataset completo posee {df_completo.shape[0]} registros. Cada registro corresponde a una imagen del dataset.")

El dataset completo posee 70000 registros. Cada registro corresponde a una imagen del dataset.


### Atributos

#### Cantidad de atributos

In [7]:
# Every column except the label column counts as an attribute.
print(f"Cada atributo representa el valor de cada uno de los pixeles de cada imagen. Dado que cada imagen tiene un formato de 28 x 28 pixeles en escala de grises, el dataset posee {df_completo.shape[1] - 1} atributos por registro, correspondientes al valor en escala de gris de cada pixel que compone la imagen")

print("+ UN (1) atributo con valores de 0 a 9 correspondiente a las clases a clasificar" )

Cada atributo representa el valor de cada uno de los pixeles de cada imagen. Dado que cada imagen tiene un formato de 28 x 28 pixeles en escala de grises, el dataset posee 784 atributos por registro, correspondientes al valor en escala de gris de cada pixel que compone la imagen
+ UN (1) atributo con valores de 0 a 9 correspondiente a las clases a clasificar


#### Rangos de valores

In [8]:
# Value range of the attributes only. The last column of df_completo is the
# class label (it was concatenated after the feature columns), so it is left
# out to keep the labels from leaking into the reported attribute range.
max_val = df_completo.iloc[:, :-1].max().max()
min_val = df_completo.iloc[:, :-1].min().min()

In [9]:
# Report the observed minimum and maximum attribute values.
print(f"El valor mínimo de los atributos es {min_val} , y el máximo es {max_val}. Cada valor es un integer entre {min_val} y {max_val}")

El valor mínimo de los atributos es 0 , y el máximo es 255. Cada valor es un integer entre 0 y 255


#### Clases a clasificar

In [10]:
# Sorted array of the distinct values found in the label column.
clases = np.unique(df_completo[0])

clases

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=uint8)

In [11]:
# `clases` is a 1-D array, so its size is the number of distinct classes.
print(f"El dataset está compuesto por {clases.size} clases")

El dataset está compuesto por 10 clases


##### N de clases

In [12]:
# List the distinct class labels.
print(f"Las clases son: {clases}")

Las clases son: [0 1 2 3 4 5 6 7 8 9]


##### Balanceo de clases

In [13]:
# Absolute ("Cant") and relative ("%") frequency of each class label over the
# complete dataset, shown side by side and ordered by label.
Q = df_completo[0].value_counts().rename("Cant")
P = df_completo[0].value_counts(normalize=True).rename("%")

pd.concat((Q, P), axis="columns").sort_index()


Unnamed: 0,Cant,%
0,7000,0.1
1,7000,0.1
2,7000,0.1
3,7000,0.1
4,7000,0.1
5,7000,0.1
6,7000,0.1
7,7000,0.1
8,7000,0.1
9,7000,0.1


##### Visualización 

## Separación en desarrollo y testeo

Separación de los datos respetando la división en desarrollo y testeo presente en el GitHub oficial del dataset.

In [14]:
# (rows, columns) of the development features.
X_train.shape

(60000, 784)

In [15]:
# Shape of the development labels.
y_train.shape

(60000,)

In [16]:
# (rows, columns) of the held-out test features.
X_test.shape

(10000, 784)

In [17]:
# Shape of the held-out test labels.
y_test.shape

(10000,)

In [18]:
# Absolute and relative class frequencies of the development labels,
# ordered by label, with the index axis named 'train'.
Q1 = y_train.value_counts().rename("Cant")
P1 = y_train.value_counts(normalize=True).rename("%")

pd.concat((Q1, P1), axis="columns").sort_index().rename_axis('train')

Unnamed: 0_level_0,Cant,%
train,Unnamed: 1_level_1,Unnamed: 2_level_1
0,6000,0.1
1,6000,0.1
2,6000,0.1
3,6000,0.1
4,6000,0.1
5,6000,0.1
6,6000,0.1
7,6000,0.1
8,6000,0.1
9,6000,0.1


In [19]:
# Absolute and relative class frequencies of the test labels,
# ordered by label, with the index axis named 'test'.
Q2 = y_test.value_counts().rename("Cant")
P2 = y_test.value_counts(normalize=True).rename("%")

pd.concat((Q2, P2), axis="columns").sort_index().rename_axis('test')

Unnamed: 0_level_0,Cant,%
test,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1000,0.1
1,1000,0.1
2,1000,0.1
3,1000,0.1
4,1000,0.1
5,1000,0.1
6,1000,0.1
7,1000,0.1
8,1000,0.1
9,1000,0.1


## Separación del conjunto de desarrollo en entrenamiento y validación

Conjunto de desarrollo (85.71% del total):
* x_train, y_train = 80% del conjunto de desarrollo (entrenamiento)
* x_val, y_val = 20% del conjunto de desarrollo (validación)

Conjunto de testing (14.29% del total):
* X_test, y_test = Conjunto held-out de testing

In [5]:
from sklearn.model_selection import train_test_split
seed = 42

# This cell rebinds `y_train` to the 80% training labels. Re-running it as-is
# would pair the full `X_train` with the already-split labels and fail on the
# length mismatch. Remember the full development labels under their own name
# while `y_train` still has one label per row of `X_train`, so the cell can be
# executed any number of times with the same result.
if len(y_train) == len(X_train):
    y_desarrollo = y_train

# Stratified 80/20 split of the development set into training and validation.
x_train, x_val, y_train, y_val = train_test_split(
    X_train,
    y_desarrollo,
    test_size=0.2,
    random_state=seed,
    stratify=y_desarrollo,
)

## Entrenamiento inicial de modelos

Entrenamiento de 5 modelos con sus hiperparámetros por defecto

In [7]:
# Adapted from https://stackoverflow.com/questions/39685740/calculate-sklearn-roc-auc-score-for-multi-class
def roc_auc_score_multiclass(actual_class, pred_class, average = "macro"):
    """Compute a one-vs-rest ROC AUC for every class present in `actual_class`.

    For each class, the true and the predicted labels are binarised (1 for
    that class, 0 for anything else) and handed to `roc_auc_score`.

    Parameters
    ----------
    actual_class : iterable of hashable labels
        Ground-truth class labels. It is iterated more than once.
    pred_class : iterable of labels
        Predicted class labels (hard predictions), in the same order as
        `actual_class`. It is iterated more than once.
    average : str, default "macro"
        Forwarded unchanged to `roc_auc_score`.

    Returns
    -------
    dict
        Maps each distinct label of `actual_class` to its one-vs-rest score.
    """
    roc_auc_dict = {}
    for per_class in set(actual_class):
        # Compare with the current class directly. Testing membership in "all
        # the other true classes" would mark a predicted label that never
        # occurs in `actual_class` as a hit for *every* class, and it scans a
        # list for each element.
        new_actual_class = [1 if x == per_class else 0 for x in actual_class]
        new_pred_class = [1 if x == per_class else 0 for x in pred_class]

        roc_auc_dict[per_class] = roc_auc_score(
            new_actual_class, new_pred_class, average=average
        )

    return roc_auc_dict

### Boosting

#### GradientBoostingClassifier

In [22]:
# Baseline gradient boosting model with default hyperparameters.
# random_state is fixed (same value as the other baseline models) so that the
# fitted model and the metrics below are reproducible across runs.
gbc = GradientBoostingClassifier(random_state=42)
gbc.fit(x_train, y_train)
y_pred = gbc.predict(x_val)

# Accuracy on the training and validation splits, the per-class report, and the
# per-class one-vs-rest ROC AUC computed from the hard predictions.
print("Accuracy training : {:.3f}".format(gbc.score(x_train, y_train)))
print("Accuracy val: {:.3f}".format(gbc.score(x_val, y_val)))
print(classification_report(y_val,y_pred))
gbc_roc_auc_multiclass = roc_auc_score_multiclass(y_val,y_pred)
print(f"ROC Score multiclass (macro) : {gbc_roc_auc_multiclass}")

Accuracy training : 0.908
Accuracy val: 0.875
              precision    recall  f1-score   support

           0       0.82      0.85      0.83      1200
           1       0.99      0.97      0.98      1200
           2       0.76      0.78      0.77      1200
           3       0.89      0.90      0.89      1200
           4       0.77      0.80      0.79      1200
           5       0.97      0.95      0.96      1200
           6       0.70      0.64      0.67      1200
           7       0.94      0.95      0.94      1200
           8       0.97      0.97      0.97      1200
           9       0.95      0.95      0.95      1200

    accuracy                           0.88     12000
   macro avg       0.87      0.88      0.87     12000
weighted avg       0.87      0.88      0.87     12000

ROC Score multiclass (macro) : {0: 0.913101851851852, 1: 0.9844907407407407, 2: 0.8776851851851851, 3: 0.9411574074074075, 4: 0.8869907407407407, 5: 0.9756481481481483, 6: 0.8025925925925925, 7: 

### Bagging

#### Random Forest Classifier

In [23]:
# Baseline random forest: fit on the training split, predict the validation
# split and compute the per-class one-vs-rest ROC AUC from those predictions.
rf = RandomForestClassifier(random_state=42).fit(x_train, y_train)
y_pred = rf.predict(x_val)
rfc_roc_auc_multiclass = roc_auc_score_multiclass(y_val, y_pred)

# One line per metric, in the same order as before.
print(
    "Accuracy training : {:.3f}".format(rf.score(x_train, y_train)),
    "Accuracy val: {:.3f}".format(rf.score(x_val, y_val)),
    classification_report(y_val, y_pred),
    f"ROC Score multiclass (macro) : {rfc_roc_auc_multiclass}",
    sep="\n",
)

Accuracy training : 1.000
Accuracy val: 0.883
              precision    recall  f1-score   support

           0       0.83      0.87      0.85      1200
           1       0.99      0.97      0.98      1200
           2       0.77      0.81      0.79      1200
           3       0.88      0.91      0.89      1200
           4       0.77      0.82      0.80      1200
           5       0.96      0.96      0.96      1200
           6       0.75      0.62      0.68      1200
           7       0.94      0.94      0.94      1200
           8       0.96      0.97      0.97      1200
           9       0.95      0.95      0.95      1200

    accuracy                           0.88     12000
   macro avg       0.88      0.88      0.88     12000
weighted avg       0.88      0.88      0.88     12000

ROC Score multiclass (macro) : {0: 0.9253240740740741, 1: 0.984212962962963, 2: 0.8905555555555557, 3: 0.9473148148148148, 4: 0.8965740740740739, 5: 0.9797222222222222, 6: 0.8005555555555555, 7: 

#### Bagging decision trees

Alternativa al random forest

In [8]:
# Bagging ensemble built on a decision tree base estimator, as an alternative
# to the random forest; evaluated on the validation split.
dt = DecisionTreeClassifier(random_state=42)
bdt = BaggingClassifier(dt, random_state=42)

bdt.fit(x_train, y_train)
y_pred = bdt.predict(x_val)

print("Accuracy training : {:.3f}".format(bdt.score(x_train, y_train)))
print("Accuracy val: {:.3f}".format(bdt.score(x_val, y_val)))
print(classification_report(y_val,y_pred))
# Stored under its own name so it does not overwrite the random forest's
# `rfc_roc_auc_multiclass` scores.
bdt_roc_auc_multiclass = roc_auc_score_multiclass(y_val,y_pred)
print(f"ROC Score multiclass (macro) : {bdt_roc_auc_multiclass}")

Accuracy training : 0.994
Accuracy val: 0.858
              precision    recall  f1-score   support

           0       0.79      0.85      0.82      1200
           1       0.98      0.97      0.97      1200
           2       0.73      0.80      0.77      1200
           3       0.86      0.88      0.87      1200
           4       0.76      0.74      0.75      1200
           5       0.94      0.94      0.94      1200
           6       0.70      0.58      0.63      1200
           7       0.91      0.93      0.92      1200
           8       0.95      0.96      0.95      1200
           9       0.95      0.92      0.94      1200

    accuracy                           0.86     12000
   macro avg       0.86      0.86      0.86     12000
weighted avg       0.86      0.86      0.86     12000

ROC Score multiclass (macro) : {0: 0.9139351851851851, 1: 0.9845833333333334, 2: 0.8843055555555556, 3: 0.932175925925926, 4: 0.8558796296296296, 5: 0.9677777777777777, 6: 0.7762037037037037, 7: 

### Árbol de decisión

In [24]:
# Baseline single decision tree: fit on the training split, predict the
# validation split and compute the per-class one-vs-rest ROC AUC.
ad = DecisionTreeClassifier(random_state=42).fit(x_train, y_train)
y_pred = ad.predict(x_val)
ad_roc_auc_multiclass = roc_auc_score_multiclass(y_val, y_pred)

# One line per metric, in the same order as before.
print(
    "Accuracy training : {:.3f}".format(ad.score(x_train, y_train)),
    "Accuracy val: {:.3f}".format(ad.score(x_val, y_val)),
    classification_report(y_val, y_pred),
    f"ROC Score multiclass (macro) : {ad_roc_auc_multiclass}",
    sep="\n",
)

Accuracy training : 1.000
Accuracy val: 0.797
              precision    recall  f1-score   support

           0       0.74      0.75      0.75      1200
           1       0.94      0.95      0.95      1200
           2       0.66      0.68      0.67      1200
           3       0.83      0.78      0.80      1200
           4       0.67      0.66      0.66      1200
           5       0.91      0.90      0.90      1200
           6       0.55      0.56      0.55      1200
           7       0.88      0.88      0.88      1200
           8       0.91      0.92      0.92      1200
           9       0.90      0.89      0.90      1200

    accuracy                           0.80     12000
   macro avg       0.80      0.80      0.80     12000
weighted avg       0.80      0.80      0.80     12000

ROC Score multiclass (macro) : {0: 0.8625, 1: 0.9734722222222222, 2: 0.8216666666666668, 3: 0.8800462962962963, 4: 0.8097685185185185, 5: 0.9443981481481482, 6: 0.752361111111111, 7: 0.9337037037

### Naive Bayes

In [25]:
# Baseline multinomial naive Bayes: fit on the training split, predict the
# validation split and compute the per-class one-vs-rest ROC AUC.
nb = MultinomialNB().fit(x_train, y_train)
y_pred = nb.predict(x_val)
nb_roc_auc_multiclass = roc_auc_score_multiclass(y_val, y_pred)

# One line per metric, in the same order as before.
print(
    "Accuracy training : {:.3f}".format(nb.score(x_train, y_train)),
    "Accuracy val: {:.3f}".format(nb.score(x_val, y_val)),
    classification_report(y_val, y_pred),
    f"ROC Score multiclass (macro) : {nb_roc_auc_multiclass}",
    sep="\n",
)

Accuracy training : 0.666
Accuracy val: 0.668
              precision    recall  f1-score   support

           0       0.75      0.79      0.77      1200
           1       0.98      0.89      0.94      1200
           2       0.62      0.60      0.61      1200
           3       0.67      0.89      0.76      1200
           4       0.46      0.63      0.53      1200
           5       0.69      0.16      0.26      1200
           6       0.32      0.16      0.21      1200
           7       0.60      0.91      0.72      1200
           8       0.89      0.83      0.86      1200
           9       0.69      0.83      0.75      1200

    accuracy                           0.67     12000
   macro avg       0.67      0.67      0.64     12000
weighted avg       0.67      0.67      0.64     12000

ROC Score multiclass (macro) : {0: 0.879861111111111, 1: 0.9462962962962963, 2: 0.77875, 3: 0.9185648148148148, 4: 0.7739351851851851, 5: 0.5744444444444444, 6: 0.5597222222222221, 7: 0.922175925

## Métricas relevantes para el problema en el conjunto de validación

### Métricas

### Justificación

## Exploración de modelo Random Forest

## Exploración de modelo Gradient Boosting Machine

## Análisis de mejores modelos obtenidos de Random Forest y Gradient Boosting Machine

### Evaluación con conjunto de test

### Análisis de resultados en base a métricas elegidas

#### Análisis de métricas

### Conclusión sobre mejor modelo

### Matriz de confusión de mejor modelo

## Estabilidad del modelo frente al ruido