# Searching for good hyperparameters for the basic model

In [1]:
# Reload imported code automatically.
%load_ext autoreload
%autoreload 2

In [2]:
import copy
import os
import sys
from typing import List, Tuple

import matplotlib
import matplotlib.pyplot as plt
import optuna
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from optuna.trial import TrialState
from sklearn.model_selection import train_test_split

sys.path.append("..")
from src import simplemodel, training, utils
from src.training import Vocabulary

In [3]:
matplotlib.rc("font", **{"size": 14})

## Params

In [4]:
# Data parameters.
DF_PATH = os.path.join("..", "data", "carsWithImageCleaned.csv")
MODEL_NAME = "firstmodel-tuned"
HELPER_FILENAME = f"{MODEL_NAME}-helper.pkl"

# Model parameters.
COLS_TO_SCALE = ["Anul", "Km", "Putere (CP)", "Capacitate cilindrica (cm3)", "Numar de portiere", "Consum (l/100km)"]
COLS_NORMAL = ["Fara accident in istoric", "Carte de service", "Filtru de particule", "Inmatriculat", "Primul proprietar"]
COLS_TO_EMBED = ["Marca", "Model", "Combustibil", "Cutie de viteze", "Tip Caroserie", "Culoare", "Tractiune"]

# Training parameters.
TRAIN_SIZE = 0.73
DEV_SIZE = 0.1
BATCH_SIZE = 128
EPOCHS = 60

# Search parameters.
TRIALS = 100
TIMEOUT = 600

# Miscellaneous.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device {DEVICE}")
TORCH_SEED = 13
torch.manual_seed(TORCH_SEED)

Using device cuda


<torch._C.Generator at 0x7f19546de8b0>

## Loading and preparing the data

In [5]:
df = pd.read_csv(DF_PATH, index_col=0)

# Remove cars which are outliers when considering their price.
df = df.drop(df[~utils.inlier_mask(df["Pret (EUR)"])].index)

print(f"Using {len(df)} samples")
df.head()

Using 10985 samples


Unnamed: 0,Url,Autovit Id,Pret (EUR),Oferit de,Categorie,Marca,Model,Anul,Km,Combustibil,...,Tip Caroserie,Numar de portiere,Culoare,Fara accident in istoric,Carte de service,Tractiune,Filtru de particule,Inmatriculat,Primul proprietar,Consum (l/100km)
0,https://www.autovit.ro/anunt/suzuki-vitara-1-6...,7049990250,16999.0,Proprietar,Autoturisme,Suzuki,Vitara,2018,26000,Benzina,...,SUV,5,Negru,True,True,Nu e mentionat,False,False,False,5.8
1,https://www.autovit.ro/anunt/toyota-auris-1-8-...,7049960669,12500.0,Proprietar,Autoturisme,Toyota,Auris,2015,239465,Hibrid,...,Combi,5,Gri,True,True,Nu e mentionat,False,False,False,3.5
2,https://www.autovit.ro/anunt/skoda-octavia-1-6...,7049895868,14994.0,Firma,Autoturisme,Skoda,Octavia,2020,100000,Diesel,...,Sedan,5,Albastru,True,True,Fata,True,True,True,4.7
3,https://www.autovit.ro/anunt/ford-focus-1-6-td...,7049990021,5000.0,Proprietar,Autoturisme,Ford,Focus,2012,245000,Diesel,...,Combi,5,Alb,True,True,Fata,True,True,False,5.1
4,https://www.autovit.ro/anunt/opel-insignia-ID7...,7049990234,17800.0,Proprietar,Autoturisme,Opel,Insignia,2017,65000,Diesel,...,Sedan,4,Alb,True,False,Fata,False,True,False,4.7


Form the train/dev/test datasets.

In [6]:
df_train, df_test = train_test_split(df, train_size=TRAIN_SIZE + DEV_SIZE, random_state=TORCH_SEED)
df_train, df_dev = train_test_split(df_train, train_size=TRAIN_SIZE, random_state=TORCH_SEED)
print(f"Training set shape: {df_train.shape}")
print(f"Dev set shape:      {df_dev.shape}")
print(f"Test set shape:     {df_test.shape}")

Training set shape: (6655, 23)
Dev set shape:      (2462, 23)
Test set shape:     (1868, 23)


Prepare the data to be used with PyTorch.

In [7]:
input_helper = simplemodel.make_input_helper(
    cols_to_scale=df_train[COLS_TO_SCALE + ["Pret (EUR)"]],
    cols_to_embed=df_train[COLS_TO_EMBED],
)

# Training tensors.
inputs_train, indices_train = simplemodel.make_inputs(
    input_helper,
    cols_to_scale=df_train[COLS_TO_SCALE],
    cols_normal=df_train[COLS_NORMAL],
    cols_to_embed=df_train[COLS_TO_EMBED],
)
inputs_train, indices_train = inputs_train.to(DEVICE), indices_train.to(DEVICE)

prices_train = torch.tensor(df_train["Pret (EUR)"].values)
prices_train /= input_helper.maxes["Pret (EUR)"]
prices_train = prices_train.to(DEVICE)

# Dev tensors.
inputs_dev, indices_dev = simplemodel.make_inputs(
    input_helper,
    cols_to_scale=df_dev[COLS_TO_SCALE],
    cols_normal=df_dev[COLS_NORMAL],
    cols_to_embed=df_dev[COLS_TO_EMBED],
)
inputs_dev, indices_dev = inputs_dev.to(DEVICE), indices_dev.to(DEVICE)

prices_dev = torch.tensor(df_dev["Pret (EUR)"].values)
prices_dev /= input_helper.maxes["Pret (EUR)"]
prices_dev = prices_dev.to(DEVICE)

# Test tensors.
inputs_test, indices_test = simplemodel.make_inputs(
    input_helper,
    cols_to_scale=df_test[COLS_TO_SCALE],
    cols_normal=df_test[COLS_NORMAL],
    cols_to_embed=df_test[COLS_TO_EMBED],
)
inputs_test, indices_test = inputs_test.to(DEVICE), indices_test.to(DEVICE)

prices_test = torch.tensor(df_test["Pret (EUR)"].values)
prices_test /= input_helper.maxes["Pret (EUR)"]
prices_test = prices_test.to(DEVICE)

Build batched dataloaders to iterate easily through the dataset.

In [8]:
dataset_train = torch.utils.data.TensorDataset(inputs_train, indices_train, prices_train)
loader_train = torch.utils.data.DataLoader(dataset_train, batch_size=BATCH_SIZE)

dataset_dev = torch.utils.data.TensorDataset(inputs_dev, indices_dev, prices_dev)
loader_dev = torch.utils.data.DataLoader(dataset_dev, batch_size=BATCH_SIZE)

dataset_test = torch.utils.data.TensorDataset(inputs_test, indices_test, prices_test)
loader_test = torch.utils.data.DataLoader(dataset_test, batch_size=BATCH_SIZE)

## Training a model

The code below implements the trials run by Optuna.

You can change it to control the space that gets explored.

In [9]:
def build_model(trial: optuna.trial.Trial) -> simplemodel.SimpleModel:
    """Construct a model with hyperparameters suggested by Optuna."""
    embedding_dim = trial.suggest_int("embedding_dim", 2, 24)

    n_hidden_layers = trial.suggest_int("n_hidden_layers", 1, 3)
    hidden_sizes = [
        trial.suggest_int(f"hidden_size{i}", 4, 128)
        for i in range(n_hidden_layers)
    ]

    return simplemodel.SimpleModel(
        input_size=inputs_train.shape[1],
        vocab_lens=[len(input_helper.vocabs[col]) for col in COLS_TO_EMBED],
        embedding_dim=embedding_dim,
        hidden_sizes=hidden_sizes,
    )


def custom_loss(pred: torch.Tensor, real: torch.Tensor) -> torch.Tensor:
    """
    Compute the mean absolute (real) difference between predicted and real
    prices.
    """
    scale_coeff = input_helper.maxes["Pret (EUR)"]
    diffs = pred * scale_coeff - real * scale_coeff
    return diffs.abs().mean()


def objective(trial: optuna.trial.Trial) -> float:
    """Train the model and return the score Optuna should optimize."""
    model = build_model(trial).to(DEVICE)

    # loss_function = nn.MSELoss()
    loss_function = custom_loss

    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=trial.suggest_float("lr", 1e-5, 1e-1, log=True),
    )

    for epoch in range(EPOCHS):
        model.train()
        for x_inputs, x_indices, y in loader_train:
            x_inputs, x_indices, y = x_inputs.to(DEVICE), x_indices.to(DEVICE), y.to(DEVICE)
            optimizer.zero_grad()
            out = model(x_inputs, x_indices).view(-1)
            loss = loss_function(out, y)
            loss.backward()
            optimizer.step()

        model.eval()
        loss_valid = 0
        with torch.no_grad():
            # for x_inputs, x_indices, y in loader_dev:
            #     x_inputs, x_indices, y = x_inputs.to(DEVICE), x_indices.to(DEVICE), y.to(DEVICE)
            #     out = model(x_inputs, x_indices).view(-1)
            #     loss = loss_function(out, y)
            #     loss_valid += loss.item()
            out = model(inputs_dev, indices_dev).view(-1)
            loss_valid = custom_loss(out, prices_dev).item()

        trial.report(loss_valid, epoch)
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

    trial.set_user_attr(key="best_model", value=model)
    return loss_valid

## Running the hyperparameter search (Optuna study)

In [10]:
%%time

def save_best_model(study: optuna.study.Study, trial: optuna.trial.Trial) -> None:
    """Store the best model in the study, to use it later."""
    if study.best_trial.number == trial.number:
        study.set_user_attr(key="best_model", value=trial.user_attrs["best_model"])


study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=TRIALS, timeout=TIMEOUT, callbacks=[save_best_model])
best_model = study.user_attrs["best_model"]

pruned_trials = study.get_trials(deepcopy=False, states=[TrialState.PRUNED])
complete_trials = study.get_trials(deepcopy=False, states=[TrialState.COMPLETE])
best_trial = study.best_trial

print("Study statistics: ")
print(f"  Number of finished trials: {len(study.trials)}")
print(f"  Number of pruned trials: {len(pruned_trials)}")
print(f"  Number of complete trials: {len(complete_trials)}")

print("Best trial:")
print(f"  Value: {best_trial.value}")
print("  Params: ")
for key, value in best_trial.params.items():
    print(f"    {key}: {value}")
print()

[32m[I 2023-02-04 00:32:25,488][0m A new study created in memory with name: no-name-76988ad9-9aa2-436a-bdb4-fb3ce55f49c1[0m
[32m[I 2023-02-04 00:32:38,096][0m Trial 0 finished with value: 4192.058512352583 and parameters: {'embedding_dim': 5, 'n_hidden_layers': 3, 'hidden_size0': 110, 'hidden_size1': 127, 'hidden_size2': 16, 'lr': 0.003982249320510584}. Best is trial 0 with value: 4192.058512352583.[0m
[32m[I 2023-02-04 00:32:47,327][0m Trial 1 finished with value: 5542.365112824644 and parameters: {'embedding_dim': 5, 'n_hidden_layers': 1, 'hidden_size0': 80, 'lr': 4.9165183612027845e-05}. Best is trial 0 with value: 4192.058512352583.[0m
[32m[I 2023-02-04 00:32:56,545][0m Trial 2 finished with value: 8913.72624802152 and parameters: {'embedding_dim': 6, 'n_hidden_layers': 1, 'hidden_size0': 15, 'lr': 1.834048241080843e-05}. Best is trial 0 with value: 4192.058512352583.[0m
[32m[I 2023-02-04 00:33:08,685][0m Trial 3 finished with value: 8767.03718045134 and parameters: {

Study statistics: 
  Number of finished trials: 100
  Number of pruned trials: 75
  Number of complete trials: 25
Best trial:
  Value: 3416.3032383443824
  Params: 
    embedding_dim: 4
    n_hidden_layers: 2
    hidden_size0: 113
    hidden_size1: 25
    lr: 0.07631054992663061

CPU times: user 5min 12s, sys: 885 ms, total: 5min 13s
Wall time: 5min 14s


In [11]:
optuna.visualization.plot_intermediate_values(study).show()
optuna.visualization.plot_optimization_history(study).show()
optuna.visualization.plot_param_importances(study).show()
optuna.visualization.plot_slice(study).show()
optuna.visualization.plot_parallel_coordinate(study).show()

## Evaluating the best model

### Price differences

In [12]:
@torch.no_grad()
def prediction_errors(inputs: torch.Tensor, indices: torch.Tensor, prices: torch.Tensor) -> None:
    coeff = input_helper.maxes["Pret (EUR)"]
    predicted = best_model(inputs.to(DEVICE), indices.to(DEVICE)).view(-1) * coeff
    real = prices.to(DEVICE).view(-1) * coeff
    diffs = predicted - real

    print(f"Total absolute difference:  {diffs.abs().sum():.5f}")
    print(f"Mean absolute difference:   {diffs.abs().mean():.5f}")
    print(f"Median absolute difference: {diffs.abs().median():.5f}")
    print()


print("Training set:")
prediction_errors(inputs_train, indices_train, prices_train)
print("Dev set:")
prediction_errors(inputs_dev, indices_dev, prices_dev)
print("Test set:")
prediction_errors(inputs_test, indices_test, prices_test)

Training set:
Total absolute difference:  21363191.85839
Mean absolute difference:   3210.09645
Median absolute difference: 2166.54144

Dev set:
Total absolute difference:  8410938.57280
Mean absolute difference:   3416.30324
Median absolute difference: 2349.06622

Test set:
Total absolute difference:  6496319.07971
Mean absolute difference:   3477.68687
Median absolute difference: 2260.67422



### Predicting prices for unseen inputs

In [13]:
# Making a prediction for unseen input.
# Real input taken from https://www.autovit.ro/anunt/audi-a5-ID7H7Azi.html.
# The real price is 29 990 EUR.

fictional = pd.DataFrame({
    "Oferit de": "Proprietar",
    "Marca": "Audi",
    "Model": "A5",
    "Anul": 2019,
    "Km": 155_000.0,
    "Combustibil": "Diesel",
    "Putere (CP)": 190,
    "Capacitate cilindrica (cm3)": 1998,
    "Cutie de viteze": "Automata",
    "Tip Caroserie": "Sedan",
    "Numar de portiere": 4,
    "Culoare": "Maro",
    "Fara accident in istoric": True,
    "Carte de service": True,
    "Tractiune": "Fata",
    "Filtru de particule": True,
    "Inmatriculat": False,
    "Primul proprietar": False,
    "Consum (l/100km)": 4.8,
}, index=[0])

fictional_inputs, fictional_indices = simplemodel.make_inputs(
    input_helper,
    cols_to_scale=fictional[COLS_TO_SCALE],
    cols_normal=fictional[COLS_NORMAL],
    cols_to_embed=fictional[COLS_TO_EMBED],
)
fictional_inputs = fictional_inputs.to(DEVICE)
fictional_indices = fictional_indices.to(DEVICE)

best_model.eval()
with torch.no_grad():
    prediction = best_model(fictional_inputs, fictional_indices).item()

predicted_price = prediction * input_helper.maxes["Pret (EUR)"]
print(f"Predicted price: {predicted_price}")


Predicted price: 24865.04383922501


## Saving the model

In [14]:
utils.store_model_weights(best_model, MODEL_NAME)
utils.store_model_helper(input_helper, HELPER_FILENAME)

Saved model state dict to '../models/firstmodel-tuned-statedict.pt'
Pickled the object to '../models/firstmodel-tuned-helper.pkl'
