# Práctico 2 - Redes en escalera avanzadas

Este práctico es similar al práctico 1, pero agrega un paso extra: el uso de redes en escalera avanzadas, ya sean Redes Convolucionales o Redes Recurrentes.

Se les dará, como base, el mismo conjunto de datos de la competencia "PetFinder" que se trabajó para el práctico 1, con el agregado de, en este caso, utilizar la descripción como un feature extra y todo el procesamiento que ello requiere.

Ahora bien, no es el único conjunto de datos que pueden trabajar. Si tienen un conjunto propio de datos que quieran utilizar y dicho conjunto se preste para el uso de alguna red en escalera avanzada (e.g. conjuntos que tengan imágenes o texto), son libres de hacerlo.


# Resolución

In [8]:
import os
import mlflow
import numpy
import pandas
import tensorflow as tf

from sklearn.model_selection import train_test_split
from tensorflow.keras import layers, models
import time

In [9]:
# Name of the dataframe column that holds the label the model predicts.
TARGET_COL = 'AdoptionSpeed'

In [10]:
def process_features(df, one_hot_columns, numeric_columns, embedded_columns, test=False,
                     nlabels=None):
    """Build the model inputs (and, unless ``test``, one-hot targets) from ``df``.

    Args:
        df: dataframe holding the feature columns and, when ``test`` is false,
            the ``TARGET_COL`` label column.
        one_hot_columns: mapping ``column name -> max_value``. Each column is
            one-hot encoded into ``max_value`` slots, value ``v`` going to
            slot ``v - 1``.
        numeric_columns: names of the columns passed through
            ``tf.keras.utils.normalize`` and reshaped into a single column.
        embedded_columns: mapping whose keys name the columns forwarded
            unchanged (raw values) under their own key in ``features``.
        test: when true no targets are built and ``None`` is returned instead.
        nlabels: optional number of target classes. When omitted it is derived
            from the largest label found in ``df``.

    Returns:
        A ``(features, targets)`` tuple. ``features`` is a dict with the key
        ``'direct_features'`` (all one-hot and numeric columns stacked
        horizontally) plus one key per embedded column. ``targets`` is the
        one-hot encoded ``TARGET_COL`` or ``None`` when ``test`` is true.
    """
    n_rows = len(df)
    direct_features = []

    # Create one hot encodings.
    for one_hot_col, max_value in one_hot_columns.items():
        width = int(max_value)
        slots = numpy.asarray(df[one_hot_col] - 1, dtype='int')
        # Values outside 1..max_value (e.g. 0, or values unseen when max_value
        # was computed) get an all-zero row. Indexing with them directly would
        # either wrap around to the last slot (negative index) or raise
        # IndexError (index past the end).
        in_range = (slots >= 0) & (slots < width)
        encoded = numpy.zeros((n_rows, width), dtype='float32')
        encoded[numpy.flatnonzero(in_range), slots[in_range]] = 1
        direct_features.append(encoded)

    for col_name in numeric_columns:
        direct_features.append(tf.keras.utils.normalize(df[col_name].values).reshape(-1, 1))

    # Concatenate all features that don't need further embedding into a single matrix.
    if direct_features:
        direct_matrix = numpy.hstack(direct_features)
    else:
        # hstack rejects an empty list; keep a well-formed (rows, 0) matrix.
        direct_matrix = numpy.zeros((n_rows, 0), dtype='float32')
    features = {'direct_features': direct_matrix}

    # Create embedding columns - nothing to do here. We will use the zero embedding for OOV
    for embedded_col in embedded_columns:
        features[embedded_col] = df[embedded_col].values

    if test:
        return features, None

    labels = df[TARGET_COL]
    if nlabels is None:
        # Counting distinct labels breaks when a class is absent from this
        # subset; the largest label gives the same width whenever every class
        # is present and still works otherwise.
        nlabels = int(labels.max()) + 1
    # Convert labels to one-hot encodings
    targets = tf.keras.utils.to_categorical(labels, nlabels)

    return features, targets

In [11]:
def load_dataset(dataset_dir, batch_size, test_size=0.2, random_state=None):
    """Read ``train.csv`` and ``test.csv`` from ``dataset_dir`` and split off a dev set.

    Args:
        dataset_dir: directory containing ``train.csv`` and ``test.csv``.
        batch_size: unused here; kept so existing callers keep working.
        test_size: fraction (or absolute count) of ``train.csv`` rows held out
            as the dev split, forwarded to ``train_test_split``.
        random_state: seed forwarded to ``train_test_split``; pass a fixed
            value to get the same train/dev split on every run.

    Returns:
        A ``(dataset, dev_dataset, test_dataset)`` tuple of dataframes: the
        training split, the held-out dev split and the contents of ``test.csv``.
    """
    train_path = os.path.join(dataset_dir, 'train.csv')
    test_path = os.path.join(dataset_dir, 'test.csv')

    # Read train dataset and hold out part of it as the dev split.
    dataset, dev_dataset = train_test_split(
        pandas.read_csv(train_path), test_size=test_size, random_state=random_state)

    test_dataset = pandas.read_csv(test_path)

    print('Training samples {}, test_samples {}'.format(
        dataset.shape[0], test_dataset.shape[0]))

    return dataset, dev_dataset, test_dataset

In [12]:
# Load the train/dev/test dataframes from the working directory and count the
# distinct labels present in the training split.
batch_size = 32
dataset_dir = "./"
dataset, dev_dataset, test_dataset = load_dataset(dataset_dir, batch_size)
nlabels = len(dataset[TARGET_COL].unique())

Training samples 8465, test_samples 4411


In [13]:
# Notebook display of the training split returned by load_dataset.
dataset

Unnamed: 0,Type,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,FurLength,Vaccinated,Dewormed,Sterilized,Health,Quantity,Fee,State,Description,AdoptionSpeed,PID
9201,1,24,307,0,1,2,7,0,2,1,1,1,1,1,1,0,41401,"Jack Jack, once had a so-called home. His owne...",3,13017
4039,2,1,283,0,3,1,6,7,2,1,2,2,2,1,2,0,41326,"They are so cute and adorable. They are twin, ...",1,5714
5921,2,2,266,292,1,1,7,0,2,1,2,2,2,1,1,0,41325,- Very active - Cute - Friendly - 3rd from 5 s...,2,8343
6764,1,1,307,307,1,1,0,0,2,2,2,1,2,1,1,0,41326,For Adoption,1,9512
5427,1,1,307,0,2,5,0,0,2,1,2,2,2,1,1,0,41326,Healthy Puppy for adoption. Available for adop...,1,7660
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10178,1,2,307,0,2,1,2,7,2,1,2,2,2,1,1,0,41326,Mylo is among three siblings found by a friend...,2,14406
2993,2,2,266,0,3,2,3,0,2,1,2,2,2,1,3,0,41326,3 ginger kitten was found abandoned in front o...,1,4241
867,2,5,266,0,2,1,3,5,2,1,1,3,2,1,3,0,41326,"Super playful, Super adorable, Really cute and...",1,1239
202,2,6,265,266,2,2,5,0,2,2,2,2,2,1,1,0,41401,"dengan berat hatinya, sy terpaksa let go vincc...",1,291


### A continuación aplicamos one_hot_encoding a los features seleccionados.

In [19]:
# Columns fed to the model as one-hot vectors, each mapped to the largest value
# found in the training split (used as the width of its one-hot encoding).
one_hot_columns = {
    one_hot_col: dataset[one_hot_col].max()
    for one_hot_col in ['Gender', 'Color1', 'Sterilized', 'Vaccinated', 'Health', 'FurLength', 'Age', 'Fee', 'Breed1']
}
embedded_columns = {}
numeric_columns = []

# Encode the three splits with the same column configuration.
X_train, y_train = process_features(dataset, one_hot_columns, numeric_columns, embedded_columns)
direct_features_input_shape = (X_train['direct_features'].shape[1],)
X_dev, y_dev = process_features(dev_dataset, one_hot_columns, numeric_columns, embedded_columns)
X_test, _ = process_features(
    test_dataset, one_hot_columns, numeric_columns, embedded_columns, test=True)

batch_size = 32
# Keras ignores fit(shuffle=...) when the input is a tf.data.Dataset, so the
# shuffling has to happen in the input pipeline, before batching.
train_ds = (tf.data.Dataset.from_tensor_slices((X_train, y_train))
            .shuffle(buffer_size=len(y_train))
            .batch(batch_size))
dev_ds = tf.data.Dataset.from_tensor_slices((X_dev, y_dev)).batch(batch_size)
test_ds = tf.data.Dataset.from_tensor_slices(X_test).batch(batch_size)

# Build the classifier: dropout -> dense(16, relu) -> dropout -> softmax.
tf.keras.backend.clear_session()
inputs = []

direct_features_input = layers.Input(shape=direct_features_input_shape, name='direct_features')
inputs.append(direct_features_input)
features = direct_features_input
dropout1 = layers.Dropout(0.1)(features)
flatten = layers.Flatten()(dropout1)
dense1 = layers.Dense(16, activation='relu')(flatten)
dropout2 = layers.Dropout(0.1)(dense1)
output_layer = layers.Dense(nlabels, activation='softmax')(dropout2)

model = models.Model(inputs=inputs, outputs=output_layer)

model.compile(loss='categorical_crossentropy', optimizer='adam',
              metrics=['accuracy'])
model.summary()

# mlflow is already imported at the top of the file.
mlflow.set_experiment("selected_features")

with mlflow.start_run(nested=True):
    # Log model hyperparameters first
    mlflow.log_param('one_hot_columns', one_hot_columns)

    # Train
    epochs = 100
    history = model.fit(train_ds, epochs=epochs)

    # Evaluate on the held-out dev split (the message below still says "Test").
    loss, accuracy = model.evaluate(dev_ds)
    print("*** Test loss: {} - accuracy: {}".format(loss, accuracy))
    mlflow.log_metric('epochs', epochs)
    mlflow.log_metric('loss', loss)
    mlflow.log_metric('accuracy', accuracy)

    # Predict the test split and write a timestamped submission file.
    predictions = model.predict(test_ds)
    labels = numpy.argmax(predictions, axis=-1)
    timestr = time.strftime("%Y%m%d-%H%M%S")
    submission = pandas.DataFrame(list(zip(test_dataset["PID"], labels)), columns=["PID", "AdoptionSpeed"])
    filename = "./submissions/submission_" + timestr + ".csv"
    # to_csv does not create missing directories.
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    submission.to_csv(filename, header=True, index=False)
    mlflow.log_param('filename', filename)

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
direct_features (InputLayer) [(None, 3584)]            0         
_________________________________________________________________
dropout (Dropout)            (None, 3584)              0         
_________________________________________________________________
flatten (Flatten)            (None, 3584)              0         
_________________________________________________________________
dense (Dense)                (None, 16)                57360     
_________________________________________________________________
dropout_1 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 5)                 85        
Total params: 57,445
Trainable params: 57,445
Non-trainable params: 0
_________________________________________________________

KeyboardInterrupt: 