# Práctica 2. Métricas de evaluación

In [1]:
from keras.datasets import mnist
import numpy as np

(x_train, y_train), (x_test, y_test) = mnist.load_data()

np.random.seed(32)

# Se obtiene una muestra con igual número de: cincos vs no cincos
# para el conjunto de entrenamiento y el de test
idx_y_train_5 = np.where(y_train == 5)[0]
idx_y_train_n5 = np.where(y_train != 5)[0]
idx_y_train_n5 = np.random.choice(idx_y_train_n5, idx_y_train_5.shape[0], replace=False)

idx_y_train = np.concatenate([idx_y_train_5, idx_y_train_n5])
np.random.shuffle(idx_y_train)

x_train = x_train[idx_y_train]
y_train = y_train[idx_y_train]
print(x_train.shape)

idx_y_test_5 = np.where(y_test == 5)[0]
idx_y_test_n5 = np.where(y_test != 5)[0]
idx_y_test_n5 = np.random.choice(idx_y_test_n5, idx_y_test_5.shape[0], replace=False)

idx_y_test = np.concatenate([idx_y_test_5, idx_y_test_n5])
np.random.shuffle(idx_y_test)

x_test = x_test[idx_y_test]
y_test = y_test[idx_y_test]
print(x_test.shape)

y_train_binary = (y_train == 5).astype(int)
y_test_binary = (y_test == 5).astype(int)

(10842, 28, 28)
(1784, 28, 28)


In [2]:
from keras.models import Sequential
from keras.layers import Flatten, Dense
from keras.metrics import F1Score


def create_model(x_train, y_train, epoch, batch_size, validation_split):
  '''Creates the perceptron model.'''

  model = Sequential([
      Flatten(input_shape=(28, 28)),
      Dense(1)
  ])

  model.compile(loss='binary_crossentropy', metrics=['accuracy'])
  model.fit(x_train, y_train,
            epochs=epoch,
            batch_size=batch_size,
            validation_split=validation_split)

  return model


In [3]:
from sklearn.metrics import confusion_matrix, \
precision_score, recall_score,f1_score, accuracy_score

import pandas as pd

def get_metrics(model, x_test, y_test) -> pd.DataFrame:
  '''Calculates precision, recall and f1-score for
  both labels and stores the results in a DataFrame.'''

  y_pred = (model.predict(x_test) >= 0.5).astype(int)[:,0]

  accuracy = accuracy_score(y_test, y_pred)

  precision = precision_score(y_test, y_pred, pos_label=1)
  accuracy = accuracy_score(y_test, y_pred)
  recall = recall_score(y_test, y_pred, pos_label=1)
  f1_s = f1_score(y_test, y_pred, pos_label=1)

  precision_0 = precision_score(y_test, y_pred, pos_label=0)
  recall_0 = recall_score(y_test, y_pred, pos_label=0)
  f1_s_0 = f1_score(y_test, y_pred, pos_label=0)

  results = {
      'accuracy': accuracy,
      'precision': [precision, precision_0],
      'recall': [recall, recall_0],
      'f1_s': [f1_s, f1_s_0]
  }

  return pd.DataFrame(data=results)

## Entrenamiento con los hiperparámetros dados

In [4]:
model = create_model(x_train, y_train_binary, epoch=4, batch_size=2000, validation_split=0.9)
metrics_model1 = get_metrics(model, x_test, y_test_binary)
print(metrics_model1)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
   accuracy  precision    recall      f1_s
0  0.552691   0.643293  0.236547  0.345902
1  0.552691   0.532280  0.868834  0.660136


In [5]:
model = create_model(x_train, y_train_binary, epoch=20, batch_size=2000, validation_split=0.9)
metrics_model2 = get_metrics(model, x_test, y_test_binary)
print(metrics_model2)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
   accuracy  precision    recall      f1_s
0  0.574552   0.548364  0.845291  0.665196
1  0.574552   0.662592  0.303812  0.416603


In [6]:
model = create_model(x_train, y_train_binary, epoch=4, batch_size=2, validation_split=0.9)
metrics_model3 = get_metrics(model, x_test, y_test_binary)
print(metrics_model3)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
   accuracy  precision    recall      f1_s
0  0.581278   0.611026  0.447309  0.516505
1  0.581278   0.564103  0.715247  0.630746


In [7]:
model = create_model(x_train, y_train_binary, epoch=4, batch_size=2000, validation_split=0.01)
metrics_model4 = get_metrics(model, x_test, y_test_binary)
print(metrics_model4)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
   accuracy  precision    recall      f1_s
0  0.648543   0.699849  0.520179  0.596785
1  0.648543   0.618198  0.776906  0.688525


A continuación, un resumen de los resultados por cada modelo:´

In [8]:
print('[ 1 ]  Modelo con epoch=4, batch_size=2000, validation_split=0.9:')
print(metrics_model1, '\n')

print('[ 2 ] Modelo con epoch=20, batch_size=2000, validation_split=0.9:')
print(metrics_model2, '\n')

print('[ 3 ] Modelo con epoch=4, batch_size=2, validation_split=0.9:')
print(metrics_model3, '\n')

print('[ 4 ] Modelo con epoch=4, batch_size=2000, validation_split=0.01:')
print(metrics_model4, '\n')


[ 1 ]  Modelo con epoch=4, batch_size=2000, validation_split=0.9:
   accuracy  precision    recall      f1_s
0  0.552691   0.643293  0.236547  0.345902
1  0.552691   0.532280  0.868834  0.660136 

[ 2 ] Modelo con epoch=20, batch_size=2000, validation_split=0.9:
   accuracy  precision    recall      f1_s
0  0.574552   0.548364  0.845291  0.665196
1  0.574552   0.662592  0.303812  0.416603 

[ 3 ] Modelo con epoch=4, batch_size=2, validation_split=0.9:
   accuracy  precision    recall      f1_s
0  0.581278   0.611026  0.447309  0.516505
1  0.581278   0.564103  0.715247  0.630746 

[ 4 ] Modelo con epoch=4, batch_size=2000, validation_split=0.01:
   accuracy  precision    recall      f1_s
0  0.648543   0.699849  0.520179  0.596785
1  0.648543   0.618198  0.776906  0.688525 



Podemos observar que en algunos modelos, ya sea para la clase 1 o la clase 0, el resultado de una métrica mejora, por ejemplo _precision_, pero disminuye el _recall_, como la clase 0 del modelo 3. De los cuatro modelos, el que tuve mejores resultados en promedio fue el 4, pues vio más datos de entrenamiento, en comparación a los demás.

## Optimizando métricas

In [9]:
model = create_model(x_train, y_train_binary, epoch=20, batch_size=32, validation_split=0.2)
metrics_model5 = get_metrics(model, x_test, y_test_binary)
print(metrics_model5)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
   accuracy  precision    recall      f1_s
0  0.861547   0.853231  0.873318  0.863158
1  0.861547   0.870264  0.849776  0.859898


Este modelo tuvo mejor resultado en promedio que los demás, destacando principalmente _f1-score_ y el _accuracy_.