In [1]:
import os
import torch
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error


In [None]:
# Send a "training started" notification through the `tt` helper.
# NOTE(review): `tt` is not imported in any cell visible here — confirm where it
# comes from, otherwise this cell fails on a fresh kernel.
# (Plain string: the original f-string had no placeholders.)
tt.msg_telegram(text="TRAINING START -> OK ✅")

In [2]:
import plotly.express as px

In [3]:
# Enable IPython's autoreload extension; mode 2 reloads all imported modules
# before each cell executes, so edits to local .py files are picked up.
%load_ext autoreload
%autoreload 2

In [4]:

# Show the NumPy version in use.
np.__version__

'1.21.4'

In [25]:
from pytorch_tabular import TabularModel
from pytorch_tabular.models import CategoryEmbeddingModelConfig
from pytorch_tabular.config import (
    DataConfig,
    OptimizerConfig,
    TrainerConfig,
    ExperimentConfig,
)


## Load Data

In [6]:
# Name of the target (label) column in the data.
target_col = "price"

In [7]:
# Load the training set and drop every row that contains a missing value.
df = pd.read_csv("data/train_data.csv").dropna()

# Split features / target WITHOUT mutating `df`. Using `df.pop(...)` here would
# strip the target column from `df` itself (and `X = df` would only be an alias),
# yet later cells pass `df` to `px.histogram(..., x=target_col)` and
# `tabular_model.fit(df)`, both of which need the target column to be present.
y = df[target_col]
X = df.drop(columns=[target_col])

In [8]:
# X.pop("ID_PRODUCT")
# Feature-frame dimensions as (rows, columns).
X.shape

(92329, 26)

In [9]:
# Shape after dropping rows with missing values, for comparison with `X.shape`.
X.dropna().shape

(92329, 26)

In [29]:
# Model inputs.

# Continuous (numeric) feature columns.
NUM_COLS = [
    "hours_online",
    "date_sold_day_sin",
    "date_sold_day_cos",
    "date_sold_dayofweek_sin",
    "date_sold_dayofweek_cos",
    "date_sold_month_sin",
    "date_sold_month_cos",
    "date_sold_year",
]

# Categorical feature columns.
CAT_COLS = [
    "id_universe",
    "id_category",
    "id_subcat",
    "id_sub_subcat",
    "id_brand",
    "id_model",
    "id_color",
    "id_material",
    "id_condition",
    "vintage",
    "id_bracelet",
    "id_box",
    "id_mechanism",
    "id_size_type",
    "geo2_seller",
    "order_currency",
]

# Column the model is trained to predict.
TARGET_COL = "price"


In [1]:
# Plot the distribution of the target column.
# NOTE(review): requires `df` to still contain the `target_col` column when this cell runs.
px.histogram(df, x=target_col, title="Histogram")

In [30]:
# Data pipeline settings: which columns are the target / continuous / categorical
# inputs, how much data to hold out for validation, and how continuous features
# are transformed.
DATA_CONFIG = DataConfig(
    target=[TARGET_COL],
    continuous_cols=NUM_COLS,
    categorical_cols=CAT_COLS,
    validation_split=0.15,
    continuous_feature_transform="quantile_normal",
    normalize_continuous_features=True,
    # os.cpu_count() returns None when the CPU count cannot be determined;
    # fall back to 0 (load in the main process) rather than passing None.
    num_workers=os.cpu_count() or 0,
)
# Display the resolved configuration.
DATA_CONFIG


DataConfig(target=['price'], continuous_cols=[], categorical_cols=['id_sub_subcat'], date_columns=[], encode_date_columns=True, validation_split=0.15, continuous_feature_transform='quantile_normal', normalize_continuous_features=True, quantile_noise=0, num_workers=8, categorical_dim=1, continuous_dim=0)

In [31]:
# Training-loop settings: batch size, epoch bounds, automatic LR search, and
# reloading of the best checkpoint after training.
TRAINER_CONFIG = TrainerConfig(
    auto_lr_find=True,  # Runs the LRFinder to automatically derive a learning rate
    batch_size=512,
    max_epochs=5,
    min_epochs=2,
    load_best=True,
    # track_grad_norm=2,
    # gpus=1,
)
# Display the resolved configuration (including library defaults).
TRAINER_CONFIG


TrainerConfig(batch_size=512, fast_dev_run=False, max_epochs=5, min_epochs=2, max_time=None, gpus=None, accumulate_grad_batches=1, auto_lr_find=True, auto_select_gpus=True, check_val_every_n_epoch=1, gradient_clip_val=0.0, overfit_batches=0.0, deterministic=False, profiler=None, early_stopping='valid_loss', early_stopping_min_delta=0.001, early_stopping_mode='min', early_stopping_patience=3, checkpoints='valid_loss', checkpoints_path='saved_models', checkpoints_name=None, checkpoints_mode='min', checkpoints_save_top_k=1, load_best=True, track_grad_norm=-1)

In [32]:
# optimizer_config = OptimizerConfig(lr_scheduler="OneCycleLR", lr_scheduler_params={"max_lr":0.005, "epochs": epochs, "steps_per_epoch":steps_per_epoch})
# Optimizer left at the library default; LR schedule is ReduceLROnPlateau.
# NOTE(review): the scheduler patience (7) is larger than max_epochs=5 set in
# TRAINER_CONFIG, so the scheduler looks unable to ever reduce the LR — confirm
# the intended values.
OPTIMIZER_CONFIG = OptimizerConfig(lr_scheduler="ReduceLROnPlateau", lr_scheduler_params={"patience": 7},)
# Display the resolved configuration.
OPTIMIZER_CONFIG


OptimizerConfig(optimizer='Adam', optimizer_params={'weight_decay': 0, 'amsgrad': False}, lr_scheduler='ReduceLROnPlateau', lr_scheduler_params={'patience': 7}, lr_scheduler_monitor_metric='valid_loss')

In [33]:

# Model settings: a category-embedding model configured for regression.
# NOTE(review): TRAINER_CONFIG sets auto_lr_find=True, so `learning_rate` below is
# presumably replaced by the LR finder's suggestion — confirm.
MODEL_CONFIG = CategoryEmbeddingModelConfig(
    task="regression",
    layers="512-128-64",  # Number of nodes in each layer
    activation="LeakyReLU",  # Activation applied between layers
    learning_rate=1e-3,
    # loss="BCELoss",
    embedding_dropout=0.3,
    batch_norm_continuous_input=False,
    dropout=0.3,
    # metrics=["accuracy", "f1_score"],
    # metrics_params=[{"num_classes": 2}, {"average": "micro"}],
)

# Display the resolved configuration (including library defaults).
MODEL_CONFIG

CategoryEmbeddingModelConfig(task='regression', embedding_dims=None, learning_rate=0.001, loss='MSELoss', metrics=['mean_squared_error'], metrics_params=[{}], target_range=None, seed=42, layers='512-128-64', batch_norm_continuous_input=False, activation='LeakyReLU', embedding_dropout=0.3, dropout=0.3, use_batch_norm=False, initialization='kaiming', _module_src='category_embedding', _model_name='CategoryEmbeddingModel', _config_name='CategoryEmbeddingModelConfig')

In [34]:
# Assemble the model from the four configuration objects defined in the notebook.
tabular_model = TabularModel(
    data_config=DATA_CONFIG,
    model_config=MODEL_CONFIG,
    optimizer_config=OPTIMIZER_CONFIG,
    trainer_config=TRAINER_CONFIG,
)


## Training the Model

In [35]:
# from sklearn.preprocessing import PowerTransformer

# Train the model on `df`.
# NOTE(review): `df` is expected to include the target column configured in
# DATA_CONFIG — confirm it has not been removed by an earlier cell.
tabular_model.fit(df)


Global seed set to 42

Checkpoint directory saved_models exists and is not empty.

GPU available: False, used: False
TPU available: False, using: 0 TPU cores

  | Name             | Type                | Params
---------------------------------------------------------
0 | embedding_layers | ModuleList          | 14.3 K
1 | backbone         | FeedForwardBackbone | 100 K 
2 | output_layer     | Linear              | 65    
3 | loss             | MSELoss             | 0     
---------------------------------------------------------
114 K     Trainable params
0         Non-trainable params
114 K     Total params
0.458     Total estimated model params size (MB)
Global seed set to 42
Finding best initial lr:  98%|█████████▊| 98/100 [00:01<00:00, 49.46it/s]
LR finder stopped early after 98 steps due to diverging loss.
Restored states from the checkpoint file at /Users/piotr.laczkowski/Desktop/PROJECTS/dex_pytorch/lr_find_temp_model.ckpt
Learning rate set to 5.75439937337157e-07

  | Name     

                                                                      

Global seed set to 42


Epoch 4: 100%|██████████| 182/182 [01:59<00:00,  1.41s/it, loss=3.62e+05, valid_loss=7.92e+5, valid_mean_squared_error=7.92e+5, train_loss=1.25e+5, train_mean_squared_error=4.88e+5]    

In [38]:
# Persist the whole fitted TabularModel object.
# torch.save does not create missing parent directories, so make sure the
# target directory exists first.
os.makedirs("models", exist_ok=True)
torch.save(tabular_model, "models/small_model.pt")


## Evaluation

In [62]:
# Evaluate the trained model on `df_test` and keep the returned metrics.
# NOTE(review): `df_test` is not defined in any cell visible here — confirm where
# it is loaded, otherwise this cell fails on a fresh kernel.
result = tabular_model.evaluate(df_test)



The dataloader, test dataloader 0, does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` (try 8 which is the number of cpus on this machine) in the `DataLoader` init to improve performance.



Testing: 100%|██████████| 36/36 [00:00<00:00, 72.23it/s]
--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_mean_squared_error': 70869.75, 'test_mean_squared_error_0': 70869.75}
--------------------------------------------------------------------------------
Epoch 9: 100%|██████████| 142/142 [00:15<00:00,  2.86it/s, loss=137, valid_loss=141.0, valid_mean_squared_error=7.44e+4, train_loss=147.0, train_mean_squared_error=6.72e+4]

## Predictions

In [63]:
# Run inference on `df_test`, then preview the first rows of the returned frame.
pred_df = tabular_model.predict(df_test)
pred_df.head()

Generating Predictions...: 100%|██████████| 36/36 [00:00<00:00, 73.38it/s]


Unnamed: 0,hours_online,date_sold_day_sin,date_sold_day_cos,date_sold_dayofweek_sin,date_sold_dayofweek_cos,date_sold_month_sin,date_sold_month_cos,date_sold_year,id_universe,id_category,...,vintage,id_bracelet,id_box,id_mechanism,id_size_type,geo2_seller,order_currency,id_product,price,price_prediction
10715,-0.863823,0.101168,-0.994869,-0.8660254,-0.5,0.5,-0.8660254,0.5,1.0,4.0,...,1.0,0.0,0.0,0.0,0.0,8.0,4.0,12801280.0,45.0,104.74839
89998,0.162434,0.485302,-0.874347,0.8660254,-0.5,0.5,-0.8660254,0.5,1.0,4.0,...,0.0,0.0,0.0,0.0,0.0,8.0,9.0,12990957.0,237.0,115.297653
69997,1.944323,-0.299363,-0.954139,1.224647e-16,-1.0,-1.0,-1.83697e-16,0.5,1.0,1.0,...,0.0,0.0,0.0,0.0,146.0,7.0,11.0,6453975.0,125.0,119.937683
77622,0.167783,0.790776,-0.612106,1.224647e-16,-1.0,-0.5,-0.8660254,0.0,1.0,4.0,...,0.0,0.0,0.0,0.0,0.0,5.0,11.0,10252176.0,109.0,111.639824
87045,-0.509495,0.101168,-0.994869,0.8660254,0.5,-1.0,-1.83697e-16,0.5,1.0,2.0,...,0.0,0.0,0.0,0.0,152.0,5.0,11.0,10328963.0,253.0,114.860733


In [64]:
# Compare actual vs. predicted target for a random sample of rows.
# Columns are selected by name: a pattern like f"{target_col}*" is a glob-style
# wildcard, which as a regex means "pric" followed by zero or more "e" and only
# matches the intended columns by accident. `random_state` makes the sample
# repeatable across runs.
pred_df[[target_col, f"{target_col}_prediction"]].sample(20, random_state=42).astype(int)

Unnamed: 0,price,price_prediction
15740,550,111
11660,170,111
69982,255,111
58215,562,115
19998,155,111
54105,92,114
10212,41,97
62102,31,111
3381,25,105
1194,37,101


In [65]:
def print_metrics(y_true, y_pred, tag):
    """Print the MSE and MAE of predictions against the ground truth.

    Args:
        y_true: Ground-truth values; a pandas DataFrame/Series, a NumPy array,
            or any other array-like (e.g. a list). Flattened to 1-D before scoring.
        y_pred: Predicted values; same accepted types as ``y_true``.
        tag: Label prefixed to each metric in the printed line (e.g. ``"Test"``).

    Returns:
        None. One line with both metrics, formatted to four decimals, is
        written to stdout.
    """
    # np.asarray accepts pandas objects, ndarrays and plain sequences alike, so
    # list inputs no longer fail on a missing `.ndim` attribute; ravel() flattens
    # single-column 2-D inputs to 1-D.
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    print(f"{tag} MSE: {mse:.4f} | {tag} MAE: {mae:.4f}")

In [66]:
# Score the predictions: actual target column vs. its `<target>_prediction` column.
print_metrics(
    y_true=pred_df[target_col],
    y_pred=pred_df[f"{target_col}_prediction"],
    tag="Test"
)

Test MSE: 70869.7495 | Test MAE: 138.0806


In [40]:
# import scipy.stats as ss

# def plot_normal(x_range, mu=0, sigma=1, cdf=False, **kwargs):
#     '''
#     Plots the normal distribution function for a given x range
#     If mu and sigma are not provided, standard normal is plotted
#     If cdf=True cumulative distribution is plotted
#     Passes any keyword arguments to matplotlib plot function
#     '''
#     x = x_range
#     if cdf:
#         y = ss.norm.cdf(x, mu, sigma)
#     else:
#         y = ss.norm.pdf(x, mu, sigma)
#     return x,y

In [44]:
# import torch
# from torch import nn

# from torch.autograd import Variable
# from torch.distributions import Categorical

# def get_pdf(idx):
#     row = pred_df.iloc[idx]
#     pi = torch.from_numpy(row.filter(regex="pi_").values).unsqueeze(0)
#     mu = torch.from_numpy(row.filter(regex="mu_").values).unsqueeze(0)
#     sigma = torch.from_numpy(row.filter(regex="sigma_").values).unsqueeze(0)
#     softmax_pi = nn.functional.gumbel_softmax(pi, tau=1, dim=-1)
#     categorical = Categorical(softmax_pi)
#     pis = categorical.sample().unsqueeze(1)
#     sigma = sigma.gather(1, pis).item()
#     mu = mu.gather(1, pis).item()
#     x = np.linspace(row[f'{target_col}_prediction'].item() * 0.1, row[f'{target_col}_prediction'].item() * 1.9, 1000)
#     return plot_normal(x, mu=mu, sigma=sigma)

In [45]:
# idxs = [2, 173, 412, 365]

In [37]:
# traces = []
# for idx in idxs:
#     x,y = get_pdf(idx)
#     trace = go.Scatter(
#             name=f'House_{idx}',
#             x=x,
#             y=y,
#             mode='lines',
#             # line=dict(color='rgba(246, 76, 114, 1)'),
#         )
#     traces.append(trace)

# fig = go.Figure(traces)
# fig.update_layout(
#     yaxis_title='P(MEDV)',
#     xaxis_title='MEDV',
# #     yaxis_range=[-0.2,1],
#     title='PDFs of different Products',
#     hovermode="x"
# )
# fig.show()

## Saving the model

In [67]:
# File the fitted model is saved to and reloaded from.
PATH = "models/model_regression.pt"

In [70]:
# Persist the whole fitted TabularModel object to PATH.
# torch.save does not create missing parent directories, so ensure the parent
# exists first ("." covers a PATH with no directory component).
os.makedirs(os.path.dirname(PATH) or ".", exist_ok=True)
torch.save(tabular_model, PATH)

In [52]:
# Load
model = torch.load(PATH)
model.evaluate(df_test)


The dataloader, test dataloader 0, does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` (try 4 which is the number of cpus on this machine) in the `DataLoader` init to improve performance.



Testing: 100%|██████████| 34/34 [00:03<00:00, 11.14it/s]
--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_mean_squared_error': 73922.421875,
 'test_mean_squared_error_0': 73922.421875}
--------------------------------------------------------------------------------


[{'test_mean_squared_error': 73922.421875,
  'test_mean_squared_error_0': 73922.421875}]