In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn

import time

## 1 Análisis de dtos

In [175]:
# Leer los datos
data = pd.read_excel('Data_Ejercicio_Lab3_winequality-white.xlsx')
data.head(5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [176]:
# Preprocesamiento
# Describir los tipos de datos  y la cantidad de datos y la cantidad de nulos
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4898 entries, 0 to 4897
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         4898 non-null   float64
 1   volatile acidity      4898 non-null   float64
 2   citric acid           4898 non-null   float64
 3   residual sugar        4898 non-null   float64
 4   chlorides             4898 non-null   float64
 5   free sulfur dioxide   4898 non-null   float64
 6   total sulfur dioxide  4898 non-null   float64
 7   density               4898 non-null   float64
 8   pH                    4898 non-null   float64
 9   sulphates             4898 non-null   float64
 10  alcohol               4898 non-null   float64
 11  quality               4898 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 459.3 KB


In [177]:
#cuantos datos distintos hay en qualyty y cuantos hay de cada uno
data['quality'].value_counts()

quality
6    2198
5    1457
7     880
8     175
4     163
3      20
9       5
Name: count, dtype: int64

## 2 Limpieza de datos

In [178]:
pd.DataFrame({"variable": data.isna().sum().index, "número_valores_ausentes": data.isna().sum().values}).sort_values(by="número_valores_ausentes")

Unnamed: 0,variable,número_valores_ausentes
0,fixed acidity,0
1,volatile acidity,0
2,citric acid,0
3,residual sugar,0
4,chlorides,0
5,free sulfur dioxide,0
6,total sulfur dioxide,0
7,density,0
8,pH,0
9,sulphates,0


## 3 Preprocesamiento de los datos

### Separación del conjunto de datos

In [179]:
# Separamos la variable a predecir del resto de variables 
y = data['quality'].copy()
x = data.drop('quality', axis=1)
print(x.shape)
print(y.shape)

(4898, 11)
(4898,)


In [180]:
# Separamos el conjunto de datos en entrenamiento y prueba
# Usamos 15% de los datos para el conjunto de test
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.15, shuffle=True)

print('x_train:', x_train.shape)
print('y_train:', y_train.shape)
print('x_test:', x_test.shape)
print('y_test:', y_test.shape)

x_train: (4163, 11)
y_train: (4163,)
x_test: (735, 11)
y_test: (735,)


In [181]:
# Escalamos los datos usando StandarScaler
sc = StandardScaler()

# Realizamos el ajuste y transform sobre los datos de entrenamiento, pero solamente transformaamos los datos de prueba
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

In [182]:
# Separamos el conjunto de entrenamiento en conjuntos de entrenamiento y validación
# Usamos 20% de estos datos para la validación
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.20, shuffle=True)

print('x_train:', x_train.shape)
print('y_train:', y_train.shape)
print('x_val:', x_val.shape)
print('y_val:', y_val.shape)

x_train: (3330, 11)
y_train: (3330,)
x_val: (833, 11)
y_val: (833,)


In [183]:
# Utilizando estandarización los datos tienen un valor medio de 0 y desviación estandar 1
print("Valor medio entrenamiento: ", x_train[:,1].mean())
print("Valor desviación estandar entrenamiento: ", x_train[:,1].std())

print("Valor medio prueba: ", x_test[:,1].mean())
print("Valor desviación estandar prueba: ", x_test[:,1].std())

Valor medio entrenamiento:  0.003876391762554952
Valor desviación estandar entrenamiento:  1.0117485846302292
Valor medio prueba:  -0.0299398535031803
Valor desviación estandar prueba:  0.991834793552751


In [184]:
y

0       6
1       6
2       6
3       6
4       6
       ..
4893    6
4894    5
4895    6
4896    7
4897    6
Name: quality, Length: 4898, dtype: int64

## Modelo

In [185]:
# Dataset and DataLoader: define the function for the dataset
class WineDataset(Dataset):
    def __init__(self, x, y):
        self.x = torch.tensor(x, dtype=torch.float32)
        self.y = torch.tensor(np.array(y).astype(np.float32), dtype=torch.float32)
    
    def __len__(self):
        # Devolver el número de ejemplos en el conjunto de datos
        return self.x.shape[0]
    
    def __getitem__(self, idx):
        # Devolver un ejemplo del dataset con la indice 'idx'
        return self.x[idx], self.y[idx]

In [186]:
# Definimos los datasets para el entrenamiento, validación y prueba
train_dataset = WineDataset(x_train, y_train)
val_dataset = WineDataset(x_val, y_val)
test_dataset = WineDataset(x_test, y_test)

In [187]:
# Ejemplo de datos con índice 11
train_dataset.x[11]

tensor([ 0.2937,  0.8058,  0.5428, -0.8667, -0.5897, -0.5448, -0.4718, -1.5544,
         0.0777, -0.3497,  2.4293])

In [188]:
train_dataset.y[11]

tensor(7.)

In [189]:
batch_size = 64 # Especificar este hiperparametro

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True) 
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

In [190]:
class WineQualityModel(nn.Module):
    def __init__(self, input_shape):
        super(WineQualityModel, self).__init__()
        self.fc1 = nn.Linear(input_shape, 64)
        self.fc2 = nn.Linear(64, 128)
        self.fc3 = nn.Linear(128, 7) # 7 classes for quality

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)  # No activation here, CrossEntropyLoss will be used
        return x


In [191]:
# Vemos un batch de datos (un consiste en 64 ejemplos de datos)
for i_batch, (x_train, y_train) in enumerate(train_dataloader):
    print("número de batch :", i_batch)
    print("x_train en i_batch : ", x_train.size())
    print("y_train en i_batch :", y_train.size())
    break

número de batch : 0
x_train en i_batch :  torch.Size([64, 11])
y_train en i_batch : torch.Size([64])


In [192]:
model = WineQualityModel(input_shape=x.shape[1])
print(model)

WineQualityModel(
  (fc1): Linear(in_features=11, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=128, bias=True)
  (fc3): Linear(in_features=128, out_features=7, bias=True)
)


In [193]:
learning_rate = 0.01 # Especificar la tasa de aprendizaje
epochs = 100
# Especificar el optimizador con la tasa de aprendizaje y parametros del modelo
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Especificar la función de coste   
loss_fn = nn.CrossEntropyLoss()


In [194]:
def train(model, train_dataloader, optimizer, loss_fn):
    model.train()
    epoch_loss = 0
    for i_batch, (x_train, y_train) in enumerate(train_dataloader):
        
        # Pon a cero los gradientes para cada batch
        optimizer.zero_grad()

        # Calcula la salida (predicciones) del modelo para los ejemplos del batch
        predicciones = model(x_train)
        
        # Calcula el coste (función de coste con los valores predichos y reales)
        batch_loss = loss_fn(predicciones, y_train)
        
        # Calcula los gradientes
        batch_loss.backward()
        
        # Ajuste los pesos
        optimizer.step()

        correct_batch = (y_train.reshape(-1,1) == predicciones.round()).sum()


        epoch_loss += batch_loss.item()
        correct_train += correct_batch.detach().cpu().numpy() # Mover los datos al CPU
        elements += len(y_train)
        
    
    loss_train = epoch_loss / i_batch
    acc_train = correct_train / elements
    
    return loss_train, acc_train

In [195]:
def evaluation(model, val_dataloader, loss_fn):
    model.eval()
    epoch_loss = 0
    with torch.no_grad():

        for i_batch, (x_val, y_val) in enumerate(val_dataloader):

            # Calcula la salida (predicciones) del modelo para los ejemplos del batch
            predicciones = model(x_val)

            # Calcula el coste (función de coste con los valores reales y predichos)
            batch_loss = loss_fn(predicciones, y_val.reshape(-1,1))
            # Guarda las metricas
            epoch_loss += batch_loss.item()
    
    # Calcular la metrica para la época entera
    loss_val = epoch_loss / i_batch
    
    return loss_val


In [196]:
def training_evaluation_loop(epochs, model, train_dataloader, val_dataloader, optimizer, loss_fn):
    # Empty loss and accuracy lists to track values
    start = time.time()

    loss_values_train = []
    loss_values_val = []

    for epoch in range(epochs):
        # Llama la función para el entrenamiento 
        loss_train = train(model, train_dataloader, optimizer, loss_fn)
        loss_values_train.append(loss_train)
        # Llama la función para la evaluación
        loss_val = evaluation(model, val_dataloader, loss_fn)
        loss_values_val.append(loss_val)

        # Imprime cada 10 épocas loss_train y loss_val
        if (epoch+1) % 10 == 0:
            print(f'Epoch {epoch+1}/{epochs} - loss_train: {loss_train:.4f} - loss_val: {loss_val:.4f}')
        
    end = time.time()
    total_time = end - start

    print(f'Total training time: {total_time}')

    return loss_values_train, loss_values_val

In [197]:
from sklearn.metrics import mean_squared_error, precision_score

def predictions(model, test_dataloader):
    predictions = []
    real_values = []

    model.eval()
    with torch.no_grad():
        for x_test, y_test in test_dataloader: 
            # Calculate the output (predictions) of the model for the batch examples
            outputs = model(x_test)
            predictions.append(outputs.round().numpy())
            real_values.append(y_test.numpy())

    predictions = np.vstack(predictions)
    real_values = np.hstack(real_values)

    # Calculate the precision metric
    precision = precision = precision_score(real_values, predictions.argmax(axis=1), average='weighted')

    # Print the precision metric
    print(f"The precision is: {precision}")

    return predictions, real_values

In [198]:
predictions, true_labels = predictions(model, test_dataloader)

The precision is: 0.0


  _warn_prf(average, modifier, msg_start, len(result))
