In [1]:
import os
import random

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler


In [2]:
import plotly.express as px
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode

In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
from pytorch_tabular import TabularModel
from pytorch_tabular.config import (
    DataConfig,
    OptimizerConfig,
    TrainerConfig,
    ExperimentConfig,
)

## Load Data

In [5]:
# Name of the label column; reused below for the DataConfig target, the
# histogram, and when reading predictions back out of the result frame.
target_col = "price"

In [6]:
# Load the encoded training set, discarding every row that has a missing value.
df = pd.read_csv("data/train_data_encoded.csv").dropna()

# `X` is the same object as `df`; popping the label leaves only the feature
# columns in it and hands the label back as the Series `y`.
X = df
y = X.pop(target_col)

In [7]:
# X.pop("ID_PRODUCT")
# NOTE(review): the prediction table further down lists this column as
# lower-case `id_product`, so the pop above would need that spelling if re-enabled.
# Sanity check: (rows, columns) of the feature frame after the label was removed.
X.shape

(90333, 25)

In [8]:
# Confirms no missing values remain: dropping NaN rows again should give the
# same shape as the cell above.
X.dropna().shape

(90333, 25)

In [9]:
# Feature groups passed to DataConfig below.
# (Alternative that was tried: cont_cols = X.columns.tolist())
cont_cols = [
    "hours_online",
    "date_sold_day_sin",
    "date_sold_day_cos",
    "date_sold_dayofweek_sin",
    "date_sold_dayofweek_cos",
    "date_sold_month_sin",
    "date_sold_month_cos",
    "date_sold_year",
]
cat_cols = [
    "id_universe",
    "id_category",
    "id_subcat",
    "id_sub_subcat",
    "id_brand",
    "id_model",
    "id_color",
    "id_material",
    "id_condition",
    "vintage",
    "id_bracelet",
    "id_box",
    "id_mechanism",
    "id_size_type",
    "geo2_seller",
    "order_currency",
]

# Put the label back onto the feature frame so the split frames carry the
# column that DataConfig names as `target`.
X[target_col] = y

# Two successive 80/20 splits: 20% test, then 20% of the remainder as
# validation (roughly 64% train / 16% validation / 20% test overall).
df_train, df_test = train_test_split(X, test_size=0.2, random_state=42)
df_train, df_valid = train_test_split(df_train, test_size=0.2, random_state=42)

In [10]:
# Distribution of the target column within the training split.
px.histogram(df_train, x=target_col, title="Histogram")

In [11]:
# Describe the dataset for pytorch_tabular: the label column and which feature
# columns are continuous vs. categorical.
data_config = DataConfig(
    target=[target_col],
    continuous_cols=cont_cols,
    categorical_cols=cat_cols,
    # Options tried during experimentation and left disabled:
    # continuous_feature_transform="quantile_uniform"
    # continuous_feature_transform="quantile_normal"
    # normalize_continuous_features=False
)
# Echo the resolved config so the remaining defaults are visible in the output.
data_config

DataConfig(target=['price'], continuous_cols=['hours_online', 'date_sold_day_sin', 'date_sold_day_cos', 'date_sold_dayofweek_sin', 'date_sold_dayofweek_cos', 'date_sold_month_sin', 'date_sold_month_cos', 'date_sold_year'], categorical_cols=['id_universe', 'id_category', 'id_subcat', 'id_sub_subcat', 'id_brand', 'id_model', 'id_color', 'id_material', 'id_condition', 'vintage', 'id_bracelet', 'id_box', 'id_mechanism', 'id_size_type', 'geo2_seller', 'order_currency'], date_columns=[], encode_date_columns=True, validation_split=0.2, continuous_feature_transform=None, normalize_continuous_features=True, quantile_noise=0, num_workers=0, categorical_dim=16, continuous_dim=8)

In [74]:
# Training-loop settings: batch size, epoch budget and learning-rate search.
trainer_config = TrainerConfig(
    auto_lr_find=True, # Runs the LRFinder to automatically derive a learning rate
    batch_size=512,
    max_epochs=10,
    # early_stopping_patience=2,
    # No metric is given to monitor; per the pytorch_tabular docs this should
    # disable early stopping, so all `max_epochs` run — TODO confirm for the installed version.
    early_stopping=None,
    # gpus=-1,  # index of the GPU to use. -1 means all available GPUs, None means CPU
)
# Echo the resolved config so the remaining defaults are visible in the output.
trainer_config

TrainerConfig(batch_size=512, fast_dev_run=False, max_epochs=10, min_epochs=1, max_time=None, gpus=None, accumulate_grad_batches=1, auto_lr_find=True, auto_select_gpus=True, check_val_every_n_epoch=1, gradient_clip_val=0.0, overfit_batches=0.0, deterministic=False, profiler=None, early_stopping=None, early_stopping_min_delta=0.001, early_stopping_mode='min', early_stopping_patience=3, checkpoints='valid_loss', checkpoints_path='saved_models', checkpoints_name=None, checkpoints_mode='min', checkpoints_save_top_k=1, load_best=True, track_grad_norm=-1)

In [75]:
# Earlier OneCycleLR variant, kept for reference. NOTE(review): `epochs` and
# `steps_per_epoch` are not defined in the visible cells, so it cannot be re-enabled as-is.
# optimizer_config = OptimizerConfig(lr_scheduler="OneCycleLR", lr_scheduler_params={"max_lr":0.005, "epochs": epochs, "steps_per_epoch":steps_per_epoch})

# Active setup: ReduceLROnPlateau with a patience of 3; the optimizer itself is
# left at the library default (shown in the echoed config).
optimizer_config = OptimizerConfig(
    lr_scheduler="ReduceLROnPlateau",
    lr_scheduler_params={"patience":3},
    # NOTE(review): the echoed config reports the default monitor as 'valid_loss';
    # the name "val_loss" below differs from it — verify before re-enabling.
    # lr_scheduler_monitor_metric="val_loss",
)
optimizer_config


OptimizerConfig(optimizer='Adam', optimizer_params={'weight_decay': 0, 'amsgrad': False}, lr_scheduler='ReduceLROnPlateau', lr_scheduler_params={'patience': 3}, lr_scheduler_monitor_metric='valid_loss')

In [84]:
# from model_test import MyAwesomeModelConfig, MyAwesomeRegressionModel
from model_dex import DexModelConfig, DexModel


In [94]:
# Hyper-parameters for the project-local DexModel (imported from model_dex).
# The meaning of each field is defined by DexModelConfig, which is not visible here.
model_config = DexModelConfig(
    task="regression",
    layers="512-128-64",  # presumably the hidden-layer widths — confirm in model_dex
    # loss="PoissonNLLLoss",
    loss="L1Loss",
    activation="ReLU",
    batch_norm_continuous_input=True,
    use_batch_norm=False,
    embedding_dropout=0.3,
    dropout=0.3
)

# Bundle the data/model/optimizer/trainer configs into one TabularModel;
# `model_callable` supplies the custom model class to use.
tabular_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config,
    model_callable=DexModel
)


In [95]:
# tabular_model.config

## Training the Model

In [96]:
# from sklearn.preprocessing import PowerTransformer

# Train on the training split and validate on the validation split.
# NOTE(review): the log saved under this cell includes a progress line with
# negative losses (loss=-767). An L1 loss cannot be negative, so that line
# presumably comes from an earlier run with the commented-out PoissonNLLLoss —
# re-run the notebook top to bottom to refresh the outputs.
tabular_model.fit(
    train=df_train,
    validation=df_valid,
    # Optional target transform that was tried and left disabled:
    # target_transform=PowerTransformer(method="box-cox"),  # "yeo-johnson" "box-cox" "quantile_normal" "quantile_uniform"
)


Global seed set to 42
2021-12-07 11:32:42.583 | INFO     | model_dex:__init__:161 - embedding categorical dims: 354
2021-12-07 11:32:42.591 | INFO     | model_dex:_build_network:166 - building embedding layers
2021-12-07 11:32:42.597 | INFO     | model_dex:_build_network:169 - building continuous layers
2021-12-07 11:32:42.599 | INFO     | model_dex:_build_network:173 - building backbone
2021-12-07 11:32:42.600 | INFO     | model_dex:__init__:108 - provided embedding model dims: [[6, 3], [14, 7], [90, 45], [280, 50], [2286, 50], [2140, 50], [46, 23], [73, 37], [7, 4], [3, 2], [20, 10], [8, 4], [6, 3], [222, 50], [18, 9], [13, 7]]
2021-12-07 11:32:42.601 | INFO     | model_dex:__init__:110 - embedding categorical dims: 354
2021-12-07 11:32:42.608 | INFO     | model_dex:_build_network:116 - building linear layers
2021-12-07 11:32:42.611 | INFO     | model_dex:_build_network:123 - adding dropout to embedding layer: 0.3
2021-12-07 11:32:42.629 | INFO     | model_dex:_build_network:126 - ad

                                                              

Global seed set to 42


Epoch 9: 100%|██████████| 142/142 [12:02<00:00, 16.79s/it, loss=137, valid_loss=141.0, valid_mean_squared_error=7.44e+4, train_loss=147.0, train_mean_squared_error=6.72e+4]
Epoch 9: 100%|██████████| 142/142 [00:04<00:00, 19.07it/s, loss=-767, valid_loss=-649., valid_mean_squared_error=1.03e+5, train_loss=-817., train_mean_squared_error=9.8e+4]

## Evaluation

In [97]:
# Score the trained model on the held-out test split; the recorded output is a
# dict of test metrics (mean squared error).
result = tabular_model.evaluate(df_test)



The dataloader, test dataloader 0, does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` (try 8 which is the number of cpus on this machine) in the `DataLoader` init to improve performance.



Testing: 100%|██████████| 36/36 [00:00<00:00, 61.21it/s]
--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_mean_squared_error': 101263.2421875,
 'test_mean_squared_error_0': 101263.2421875}
--------------------------------------------------------------------------------


## Predictions

In [98]:
# Predict on the test split. As the output below shows, the returned frame
# holds the input columns plus a `<target>_prediction` column.
pred_df = tabular_model.predict(df_test)
pred_df.head()

Generating Predictions...: 100%|██████████| 36/36 [00:00<00:00, 51.93it/s]


Unnamed: 0,hours_online,date_sold_day_sin,date_sold_day_cos,date_sold_dayofweek_sin,date_sold_dayofweek_cos,date_sold_month_sin,date_sold_month_cos,date_sold_year,id_universe,id_category,...,vintage,id_bracelet,id_box,id_mechanism,id_size_type,geo2_seller,order_currency,id_product,price,price_prediction
10715,-0.863823,0.101168,-0.994869,-0.8660254,-0.5,0.5,-0.8660254,0.5,1.0,4.0,...,1.0,0.0,0.0,0.0,0.0,8.0,4.0,12801280.0,45.0,3.247004
89998,0.162434,0.485302,-0.874347,0.8660254,-0.5,0.5,-0.8660254,0.5,1.0,4.0,...,0.0,0.0,0.0,0.0,0.0,8.0,9.0,12990957.0,237.0,3.348106
69997,1.944323,-0.299363,-0.954139,1.224647e-16,-1.0,-1.0,-1.83697e-16,0.5,1.0,1.0,...,0.0,0.0,0.0,0.0,146.0,7.0,11.0,6453975.0,125.0,3.282322
77622,0.167783,0.790776,-0.612106,1.224647e-16,-1.0,-0.5,-0.8660254,0.0,1.0,4.0,...,0.0,0.0,0.0,0.0,0.0,5.0,11.0,10252176.0,109.0,3.209151
87045,-0.509495,0.101168,-0.994869,0.8660254,0.5,-1.0,-1.83697e-16,0.5,1.0,2.0,...,0.0,0.0,0.0,0.0,152.0,5.0,11.0,10328963.0,253.0,3.322331


In [99]:
# Show actual vs. predicted target side by side for a random sample of rows.
# The two columns are selected by name: the former regex f"{target_col}*" was
# the unanchored pattern "pric" followed by "e*", which would also pick up any
# other column whose name merely contains "pric".
# `random_state` pins the sample so the displayed rows are reproducible.
(
    pred_df[[target_col, f"{target_col}_prediction"]]
    .sample(20, random_state=42)
    .astype(int)  # drops the decimals purely for a compact display
)

Unnamed: 0,price,price_prediction
15740,550,3
11660,170,3
69982,255,3
58215,562,3
19998,155,3
54105,92,3
10212,41,3
62102,31,3
3381,25,3
1194,37,3


In [102]:
def print_metrics(y_true, y_pred, tag):
    """Print the mean squared error and mean absolute error of a prediction.

    Parameters
    ----------
    y_true, y_pred : array-like
        Ground-truth and predicted values. pandas Series/DataFrames, NumPy
        arrays and plain sequences (list, tuple) are accepted; each one is
        flattened to 1-D before scoring.
    tag : str
        Label printed in front of each metric name, e.g. "Test".

    Returns
    -------
    None
        The metrics are only printed, formatted to four decimals.
    """
    # np.asarray unwraps pandas containers and also accepts plain sequences,
    # which have neither `.values` nor `.ndim`; ravel() flattens (n, 1) columns.
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    print(f"{tag} MSE: {mse:.4f} | {tag} MAE: {mae:.4f}")

In [103]:
# Recompute MSE/MAE directly from the prediction frame as a cross-check of the
# MSE reported by `evaluate` above.
print_metrics(
    y_true=pred_df[target_col],
    y_pred=pred_df[f"{target_col}_prediction"],
    tag="Test"
)

Test MSE: 101263.2383 | Test MAE: 198.0030
Epoch 9: 100%|██████████| 142/142 [00:14<00:00,  5.87it/s, loss=-767, valid_loss=-649., valid_mean_squared_error=1.03e+5, train_loss=-817., train_mean_squared_error=9.8e+4]

## Visualization

In [40]:
# import scipy.stats as ss

# def plot_normal(x_range, mu=0, sigma=1, cdf=False, **kwargs):
#     '''
#     Plots the normal distribution function for a given x range
#     If mu and sigma are not provided, standard normal is plotted
#     If cdf=True cumulative distribution is plotted
#     Passes any keyword arguments to matplotlib plot function
#     '''
#     x = x_range
#     if cdf:
#         y = ss.norm.cdf(x, mu, sigma)
#     else:
#         y = ss.norm.pdf(x, mu, sigma)
#     return x,y

In [44]:
# import torch
# from torch import nn

# from torch.autograd import Variable
# from torch.distributions import Categorical

# def get_pdf(idx):
#     row = pred_df.iloc[idx]
#     pi = torch.from_numpy(row.filter(regex="pi_").values).unsqueeze(0)
#     mu = torch.from_numpy(row.filter(regex="mu_").values).unsqueeze(0)
#     sigma = torch.from_numpy(row.filter(regex="sigma_").values).unsqueeze(0)
#     softmax_pi = nn.functional.gumbel_softmax(pi, tau=1, dim=-1)
#     categorical = Categorical(softmax_pi)
#     pis = categorical.sample().unsqueeze(1)
#     sigma = sigma.gather(1, pis).item()
#     mu = mu.gather(1, pis).item()
#     x = np.linspace(row[f'{target_col}_prediction'].item() * 0.1, row[f'{target_col}_prediction'].item() * 1.9, 1000)
#     return plot_normal(x, mu=mu, sigma=sigma)

In [45]:
# idxs = [2, 173, 412, 365]

In [37]:
# traces = []
# for idx in idxs:
#     x,y = get_pdf(idx)
#     trace = go.Scatter(
#             name=f'House_{idx}',
#             x=x,
#             y=y,
#             mode='lines',
#             # line=dict(color='rgba(246, 76, 114, 1)'),
#         )
#     traces.append(trace)

# fig = go.Figure(traces)
# fig.update_layout(
#     yaxis_title='P(MEDV)',
#     xaxis_title='MEDV',
# #     yaxis_range=[-0.2,1],
#     title='PDFs of different Products',
#     hovermode="x"
# )
# fig.show()

## Saving and Reloading the Model

In [48]:
# File the pickled TabularModel is written to and later read back from
# (relative to the notebook's working directory).
PATH = "models/model1.pt"

In [49]:
# torch.save does not create missing parent directories, so make sure the
# target folder exists before writing (the "." fallback covers a bare filename).
os.makedirs(os.path.dirname(PATH) or ".", exist_ok=True)

# Serialises the whole TabularModel object (not just the weights) to PATH.
torch.save(tabular_model, PATH)

In [52]:
# Load
model = torch.load(PATH)
model.evaluate(df_test)


The dataloader, test dataloader 0, does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` (try 4 which is the number of cpus on this machine) in the `DataLoader` init to improve performance.



Testing: 100%|██████████| 34/34 [00:03<00:00, 11.14it/s]
--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_mean_squared_error': 73922.421875,
 'test_mean_squared_error_0': 73922.421875}
--------------------------------------------------------------------------------


[{'test_mean_squared_error': 73922.421875,
  'test_mean_squared_error_0': 73922.421875}]