In [1]:
import os
import random
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler


In [2]:
import plotly.express as px
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode

In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
from pytorch_tabular import TabularModel
from pytorch_tabular.models import (
    CategoryEmbeddingMDNConfig,
    MixtureDensityHeadConfig,
)
from pytorch_tabular.config import (
    DataConfig,
    OptimizerConfig,
    TrainerConfig,
    ExperimentConfig,
)

## Load Data

In [8]:
# Name of the regression target column; it is popped from the loaded frame as `y`
# and passed as the model target in the data config.
target_col = "PRICE"

In [23]:
# Placeholder for an optional column subset; it is empty and not used in this cell.
sel_cols = []

# Read the pre-encoded dataset and discard every row containing a missing value.
df = pd.read_csv("data/data_encoded.csv").dropna()

# Detach the target from the frame. `X` is the very same object as `df`
# (now without the target column), not a copy.
X = df
y = X.pop(target_col)

In [24]:
# X.pop("ID_PRODUCT")
# (rows, columns) of the feature frame after NaN rows were dropped and the target was popped.
X.shape

(172409, 26)

In [25]:
# Sanity check: rows with missing values were already removed when loading,
# so this shape should equal X.shape.
X.dropna().shape

(172409, 26)

In [26]:
# Feature columns handed to the model as continuous inputs.
cont_cols = [
    "HOURS_ONLINE",
    "DATE_SOLD_day_sin",
    "DATE_SOLD_day_cos",
    "DATE_SOLD_dayofweek_sin",
    "DATE_SOLD_dayofweek_cos",
    "DATE_SOLD_month_sin",
    "DATE_SOLD_month_cos",
    "DATE_SOLD_year",
]

# Feature columns handed to the model as categorical inputs.
cat_cols = [
    "ID_UNIVERSE",
    "ID_CATEGORY",
    "ID_SUBCAT",
    "ID_SUB_SUBCAT",
    "ID_BRAND",
    "ID_MODEL",
    "ID_COLOR",
    "ID_MATERIAL",
    "ID_MATERIAL_TYPE",
    "ID_CONDITION",
    "VINTAGE",
    "ID_BRACELET",
    "ID_BOX",
    "ID_MECHANISM",
    "ID_SIZE_TYPE",
    "GEO2_SELLER",
    "ORDER_CURRENCY",
]

# Put the label back onto the frame so every split carries features and target together.
X[target_col] = y

# Hold out 20% of the rows for testing, then 20% of the remainder for validation.
df_train, df_test = train_test_split(X, test_size=0.2, random_state=42)
df_train, df_valid = train_test_split(df_train, test_size=0.2, random_state=42)

In [27]:
# Distribution of the target column in the training split.
px.histogram(df_train, x=target_col, title="Histogram")

In [28]:
# Training hyperparameters used by the trainer and MDN head configs.
epochs = 50
batch_size = 1024
# In the visible cells this is only referenced by the commented-out OneCycleLR scheduler.
# NOTE(review): the 0.9 factor is unexplained — confirm its intent before re-enabling that scheduler.
steps_per_epoch = int((len(df_train)//batch_size) * 0.9)
# Number of Gaussian components of the mixture density head (passed as `num_gaussian`).
nr_components = 4

In [29]:
from pytorch_tabular.utils import get_gaussian_centers

# Derive `nr_components` initial centres from the training targets only;
# the result is later passed as `mu_bias_init` to the MDN head config.
mu_init = get_gaussian_centers(df_train[target_col], n_components=nr_components)
mu_init

[90.7467546128432, 793.5858795097431, 332.921168813427, 1688.0384180790882]

In [32]:
# Column roles: which column is the target and which features are continuous / categorical.
data_config = DataConfig(
    target=[target_col],
    continuous_cols=cont_cols,
    categorical_cols=cat_cols,
    # Alternatives left disabled:
    # continuous_feature_transform="quantile_uniform",
    # continuous_feature_transform="quantile_normal",
    # normalize_continuous_features=False,
)

# Training loop settings.
trainer_config = TrainerConfig(
    auto_lr_find=True,  # Runs the LRFinder to automatically derive a learning rate
    batch_size=batch_size,
    max_epochs=epochs,
    early_stopping_patience=15,
    # early_stopping=None,
    # gpus=-1,  # index of the GPU to use. -1 means all available GPUs, None means CPU
)

# Learning-rate schedule. Disabled alternative:
# optimizer_config = OptimizerConfig(lr_scheduler="OneCycleLR", lr_scheduler_params={"max_lr":0.005, "epochs": epochs, "steps_per_epoch":steps_per_epoch})
optimizer_config = OptimizerConfig(
    lr_scheduler="ReduceLROnPlateau",
    # lr_scheduler_params={"patience":3}
)

# Mixture density head: one Gaussian per component, with the component means
# initialised from the centres computed on the training targets (`mu_init`).
mdn_config = MixtureDensityHeadConfig(
    num_gaussian=nr_components,
    weight_regularization=2,
    lambda_pi=10,
    lambda_sigma=1,
    mu_bias_init=mu_init,
    # mu_bias_init=[0.3, 0.7]
)

# Backbone + MDN model.
# `target_range` is deliberately not set: the MDN model does not use it and the
# library logs "MDN does not use target range. Ignoring it." when it is supplied.
model_config = CategoryEmbeddingMDNConfig(
    task="regression",
    mdn_config=mdn_config,
    layers="512-256-64-32",  # Number of nodes in each layer
    activation="ReLU",  # Activation between each layers
    learning_rate=1e-3,
    batch_norm_continuous_input=True,
    use_batch_norm=True,
    dropout=0.25,
    embedding_dropout=0.25,
    initialization="kaiming",
)

# NOTE(review): `experiment_config` is built here but is not passed to `TabularModel`
# below, so these logging settings have no effect as written. Pass
# `experiment_config=experiment_config` to the constructor if tracking is wanted.
experiment_config = ExperimentConfig(
    project_name="PyTorch VC First Tests",
    run_name="vc_first_dex_run",
    exp_watch="parameters",
    exp_log_freq=2,
    log_target="tensorboard",
    log_logits=False,
)

# Assemble the model from the individual configs.
tabular_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config,
)

## Training the Model

In [22]:
from sklearn.preprocessing import PowerTransformer

In [33]:
# Train on the training split and use the validation split for monitoring.
# The target transform below is left disabled.
tabular_model.fit(
    train=df_train,
    validation=df_valid,
    # target_transform=PowerTransformer(method="box-cox"),  # "yeo-johnson" "box-cox" "quantile_normal" "quantile_uniform" 
)

Global seed set to 42
MDN does not use target range. Ignoring it.

Checkpoint directory saved_models exists and is not empty.

GPU available: False, used: False
TPU available: False, using: 0 TPU cores

  | Name                   | Type                | Params
---------------------------------------------------------------
0 | embedding_layers       | ModuleList          | 326 K 
1 | normalizing_batch_norm | BatchNorm1d         | 16    
2 | backbone               | FeedForwardBackbone | 343 K 
3 | mdn                    | MixtureDensityHead  | 392   
4 | loss                   | MSELoss             | 0     
---------------------------------------------------------------
669 K     Trainable params
0         Non-trainable params
669 K     Total params
2.679     Total estimated model params size (MB)

The dataloader, val dataloader 0, does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` (try 4 which is the number of cpus on thi

                                                                      

Global seed set to 42


Epoch 49: 100%|██████████| 135/135 [00:35<00:00,  1.02it/s, loss=12.9, valid_loss=13.10, valid_mean_squared_error=7.71e+4, train_loss=12.90, train_mean_squared_error=7.33e+4]

## Predictions

In [34]:
# Predict on the held-out test split, requesting the 0.25 / 0.5 / 0.75 quantiles.
# NOTE(review): `n_samples` is presumably the number of draws from the predicted mixture
# per row used to estimate prediction and quantiles — confirm against the pytorch_tabular docs.
# NOTE(review): `ret_logits=True` appears to add the backbone_features_* columns seen in the
# output, and presumably also the pi_/mu_/sigma_ columns that `get_pdf` reads — confirm.
pred_df = tabular_model.predict(df_test, quantiles=[0.25,0.5,0.75], n_samples=100, ret_logits=True)
pred_df.head()

Generating Predictions...: 100%|██████████| 34/34 [00:03<00:00,  9.42it/s]


Unnamed: 0,HOURS_ONLINE,DATE_SOLD_day_sin,DATE_SOLD_day_cos,DATE_SOLD_dayofweek_sin,DATE_SOLD_dayofweek_cos,DATE_SOLD_month_sin,DATE_SOLD_month_cos,DATE_SOLD_year,ID_UNIVERSE,ID_CATEGORY,...,backbone_features_22,backbone_features_23,backbone_features_24,backbone_features_25,backbone_features_26,backbone_features_27,backbone_features_28,backbone_features_29,backbone_features_30,backbone_features_31
47180,0.339729,0.485302,-0.874347,-2.449294e-16,1.0,-1.0,-1.83697e-16,0.5,1.0,3.0,...,9.711349,26.0709,16.090479,0.0,57.697639,65.032875,0.0,150.130478,0.0,68.757851
13255,-0.077968,-0.201299,0.97953,-0.8660254,0.5,0.5,0.8660254,1.0,1.0,1.0,...,8.224525,27.829685,0.0,1.768472,27.838287,69.119537,0.0,160.986053,13.846747,76.294212
23134,-0.07029,0.571268,0.820763,0.0,1.0,0.5,-0.8660254,1.0,1.0,1.0,...,7.116515,26.7234,0.0,0.99474,20.370617,65.17955,0.0,163.071457,17.891754,76.937012
118001,1.056486,0.651373,-0.758758,-2.449294e-16,1.0,-0.5,-0.8660254,0.5,1.0,1.0,...,12.20401,33.829006,21.828615,0.0,55.80088,65.493309,1.063031,151.465897,0.0,66.347191
141015,2.144841,0.101168,-0.994869,0.8660254,0.5,-1.0,-1.83697e-16,0.5,1.0,2.0,...,9.0185,24.647177,12.170988,0.0,54.129765,66.600609,0.0,155.845825,2.860414,70.531281


In [55]:
# Show the target next to its prediction/quantile columns for a random sample of rows.
# Columns are selected by literal prefix rather than by a regex, so characters in
# `target_col` are never interpreted as a pattern (a trailing "*" in a regex means
# "zero or more of the previous character", not a glob wildcard).
pred_df.loc[:, pred_df.columns.str.startswith(target_col)].sample(20).astype(int)

Unnamed: 0,PRICE,PRICE_prediction,PRICE_q25,PRICE_q50,PRICE_q75
113720,220,201,58,194,300
22645,30,128,10,122,263
34907,1077,278,158,275,421
114831,886,119,36,117,206
25006,55,220,74,188,387
95148,321,190,54,215,353
147515,21,206,98,223,343
169051,360,210,63,220,349
127719,45,215,62,231,391
67518,110,141,-25,146,314


In [38]:
def print_metrics(y_true, y_pred, tag):
    """Print the mean squared error and mean absolute error of a set of predictions.

    Args:
        y_true: Ground-truth values. May be a pandas DataFrame/Series, a NumPy
            array, or any other array-like such as a list.
        y_pred: Predicted values; same accepted types as ``y_true``.
        tag: Label printed in front of each metric name (e.g. ``"Test"``).

    Returns:
        None. The metrics are written to stdout as
        ``"<tag> MSE: <value> | <tag> MAE: <value>"`` with four decimals.
    """
    # np.asarray accepts pandas objects, arrays and plain sequences alike;
    # ravel flattens single-column 2-D inputs so both arguments end up 1-D.
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    print(f"{tag} MSE: {mse:.4f} | {tag} MAE: {mae:.4f}")

In [39]:
# Error of the point prediction column against the true target on the test split.
print_metrics(
    y_true=pred_df[target_col],
    y_pred=pred_df[f"{target_col}_prediction"],
    tag="Test"
)

Test MSE: 74021.9927 | Test MAE: 163.2061


## Visualization

In [40]:
import scipy.stats as ss

def plot_normal(x_range, mu=0, sigma=1, cdf=False, **kwargs):
    '''
    Evaluates a normal distribution on the given x values and returns the points.

    x_range: values at which the distribution is evaluated
    mu, sigma: mean and standard deviation (standard normal by default)
    cdf: if True the cumulative distribution is evaluated, otherwise the density
    **kwargs: accepted for call compatibility; not used by this function

    Returns the tuple (x_range, y). Nothing is drawn here; the caller does the plotting.
    '''
    evaluate = ss.norm.cdf if cdf else ss.norm.pdf
    return x_range, evaluate(x_range, mu, sigma)

In [44]:
import torch
from torch import nn

from torch.autograd import Variable
from torch.distributions import Categorical

def get_pdf(idx):
    '''
    Returns (x, y) points of a Gaussian density for one row of pred_df.

    The row at positional index idx supplies the mixture parameters through its
    pi_*, mu_* and sigma_* columns. One component is drawn at random
    (Gumbel-softmax over the pi values, then a categorical sample) and the
    normal pdf of that single component is evaluated on 1000 evenly spaced
    points between 0.1x and 1.9x the row's "<target_col>_prediction" value.

    NOTE(review): because the component is sampled, repeated calls can return
    different curves, and the curve is one component's density rather than the
    density of the full mixture.
    '''
    record = pred_df.iloc[idx]

    def _param_tensor(pattern):
        # Values of the columns matching `pattern`, as a (1, n) tensor.
        return torch.from_numpy(record.filter(regex=pattern).values).unsqueeze(0)

    pi = _param_tensor("pi_")
    mu = _param_tensor("mu_")
    sigma = _param_tensor("sigma_")

    # Draw one component index; shaped (1, 1) so it can index via gather.
    component_probs = nn.functional.gumbel_softmax(pi, tau=1, dim=-1)
    chosen = Categorical(component_probs).sample().unsqueeze(1)

    chosen_sigma = sigma.gather(1, chosen).item()
    chosen_mu = mu.gather(1, chosen).item()

    centre = record[f'{target_col}_prediction'].item()
    grid = np.linspace(centre * 0.1, centre * 1.9, 1000)
    return plot_normal(grid, mu=chosen_mu, sigma=chosen_sigma)

In [45]:
# Positional row indices into pred_df of the test examples whose densities are plotted.
idxs = [2, 173, 412, 365]

In [53]:
# Build one line trace per selected test row from the (x, y) points returned by get_pdf.
# Trace names and axis titles are derived from the actual target column, so the
# figure is labelled for this dataset rather than with unrelated placeholder names.
traces = []
for idx in idxs:
    x, y = get_pdf(idx)
    trace = go.Scatter(
        name=f'Product_{idx}',
        x=x,
        y=y,
        mode='lines',
        # line=dict(color='rgba(246, 76, 114, 1)'),
    )
    traces.append(trace)

fig = go.Figure(traces)
fig.update_layout(
    yaxis_title=f'P({target_col})',
    xaxis_title=target_col,
    # yaxis_range=[-0.2,1],
    title='PDFs of different Products',
    hovermode="x"
)
fig.show()

## Saving the model

In [48]:
# File the trained model object is saved to and later loaded from (relative to the working directory).
PATH = "models/model1.pt"

In [49]:
# torch.save does not create missing parent directories, so make sure the target
# folder exists before serialising the whole TabularModel object.
os.makedirs(os.path.dirname(PATH) or ".", exist_ok=True)
torch.save(tabular_model, PATH)

In [52]:
# Load the model object previously written to PATH with torch.save and
# evaluate it on the test split.
# NOTE(review): torch.load unpickles arbitrary Python objects — only load files from a trusted source.
model = torch.load(PATH)
model.evaluate(df_test)


The dataloader, test dataloader 0, does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` (try 4 which is the number of cpus on this machine) in the `DataLoader` init to improve performance.



Testing: 100%|██████████| 34/34 [00:03<00:00, 11.14it/s]
--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_mean_squared_error': 73922.421875,
 'test_mean_squared_error_0': 73922.421875}
--------------------------------------------------------------------------------


[{'test_mean_squared_error': 73922.421875,
  'test_mean_squared_error_0': 73922.421875}]