In [1]:
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
import random
import numpy as np
import pandas as pd
import os

In [2]:
from pytorch_tabular import TabularModel
from pytorch_tabular.models import TabNetModelConfig
from pytorch_tabular.config import (
    DataConfig,
    OptimizerConfig,
    TrainerConfig,
    ExperimentConfig,
)

In [3]:
# Load the three generated CSV files and merge them into a single DataFrame.
# Split the columns by type (categorical/numeric), keeping their names.
def loadData():
    """Build the modelling frame from the three generated CSV files.

    The mean-rating, genre and tag-relevance tables are read from
    ``../csv_genereted/`` and merged on ``movieId`` (genre, then tag
    relevance, then mean rating) using pandas' default join. The ``movieId``
    key is dropped from the merged result.

    Returns:
        tuple: ``(final_df, num_col_names, cat_col_names)``.
            ``num_col_names`` holds the column names found at positions
            1 up to (but not including) 1149 of the merged frame, and
            ``cat_col_names`` is a one-element list with the name of the
            first column.
    """
    ratings = pd.read_csv('../csv_genereted/mean_rating.csv')
    genres = pd.read_csv('../csv_genereted/movies_genre.csv')
    tag_relevance = pd.read_csv('../csv_genereted/movies_tag_relevance.csv')

    merged = genres.merge(tag_relevance, on='movieId')
    merged = merged.merge(ratings, on='movieId')
    final_df = merged.drop(columns='movieId')

    all_columns = final_df.columns.tolist()

    # The first column is the only one reported as categorical.
    cat_col_names = [all_columns[0]]
    # NOTE(review): 1149 is a fixed upper bound; presumably it stops right
    # before the target column of the full dataset - confirm if the CSV
    # layout ever changes, otherwise the target could leak into the features.
    num_col_names = all_columns[1:1149]

    return final_df, num_col_names, cat_col_names


In [4]:
# Utility Function to Print the metrics 
def print_metrics(y_true, y_pred, tag):
    """Print and return the MSE and MAE between targets and predictions.

    Args:
        y_true: Ground-truth values. Any array-like is accepted (DataFrame,
            Series, ndarray, list or tuple); it is flattened to 1-D.
        y_pred: Predicted values; handled the same way as ``y_true``.
        tag: Label printed in front of each metric name.

    Returns:
        tuple: ``(val_mse, val_mae)`` as computed by ``mean_squared_error``
        and ``mean_absolute_error``.
    """
    # np.asarray handles pandas objects and plain sequences alike, and
    # ravel() flattens (n, 1)-shaped columns so both inputs end up 1-D.
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    val_mse = mean_squared_error(y_true, y_pred)
    val_mae = mean_absolute_error(y_true, y_pred)
    print(f"{tag} MSE: {val_mse} | {tag} MAE: {val_mae}")
    return val_mse, val_mae

In [5]:
# Load the merged data, then split it twice with the same seed:
# data -> (train, test), and that train part -> (train, val).
# No test_size is passed, so train_test_split uses its default proportion.
data, num_col_names, cat_col_names = loadData()        # merged DataFrame plus the numeric / categorical column name lists
target_col = ['rating']                                # the 'rating' column is the regression target
train, test = train_test_split(data, random_state=42)  # hold out the test set
train, val = train_test_split(train, random_state=42)  # carve a validation set out of the remaining training rows

In [6]:
# Describe the data layout for pytorch_tabular: the target column and which
# feature columns are continuous vs. categorical.
data_config = DataConfig(
    target=target_col, 
    continuous_cols=num_col_names, # numeric feature columns
    categorical_cols=cat_col_names, # categorical feature columns
)

In [7]:
# Configure the training process.
trainer_config = TrainerConfig(
    auto_lr_find = True,  # Runs the LRFinder to automatically derive a learning rate
    batch_size = 1024,
    max_epochs = 500,  # upper bound on the number of training epochs
    min_epochs = 100,  # lower bound: the fit log shows a stop signal being ignored until this is met
    accelerator = "auto"  # let the trainer choose the device (the recorded run reports no GPU/TPU available)
)

In [8]:
# Optimizer and learning-rate scheduler settings.
# No arguments are passed, so the library defaults are used.
optimizer_config = OptimizerConfig()

In [9]:
# Select the model to train (TabNet) and its hyperparameters.
# Only the task is set here; every other hyperparameter keeps its default.
model_config = TabNetModelConfig(
    task="regression"
)

In [10]:
# Bundle the data, model, optimizer and trainer configurations into the
# TabularModel object that is trained, evaluated and saved in the cells below.
tabular_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    optimizer_config=optimizer_config,
    trainer_config=trainer_config,
)


2023-02-10 11:21:32,733 - {pytorch_tabular.tabular_model:102} - INFO - Experiment Tracking is turned off


In [11]:
# Train the model on `train`, using `val` as the validation set.
# By default, EarlyStopping is enabled and monitors validation loss with a patience of 3 epochs.
# NOTE(review): trainer_config sets min_epochs=100, so a stop signal raised before
# that point is ignored (see the "Training will continue..." message in the output).
tabular_model.fit(train=train, validation=val)

Global seed set to 42
2023-02-10 11:21:36,989 - {pytorch_tabular.tabular_model:465} - INFO - Preparing the DataLoaders
2023-02-10 11:21:37,017 - {pytorch_tabular.tabular_datamodule:286} - INFO - Setting up the datamodule for regression task
2023-02-10 11:21:39,249 - {pytorch_tabular.tabular_model:508} - INFO - Preparing the Model: TabNetModel
2023-02-10 11:21:39,407 - {pytorch_tabular.tabular_model:264} - INFO - Preparing the Trainer
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
2023-02-10 11:21:39,661 - {pytorch_tabular.tabular_model:558} - INFO - Auto LR Find Started
  rank_zero_warn(
  rank_zero_warn(


Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_steps=100` reached.
Learning rate set to 0.13182567385564073
Restoring states from the checkpoint path at c:\Users\Admin\Desktop\DataAnalytics\data-analytics-project\TabNet\.lr_find_ea28e25b-6c35-40d1-b532-ebbaf15a35f3.ckpt
Restored all states from the checkpoint file at c:\Users\Admin\Desktop\DataAnalytics\data-analytics-project\TabNet\.lr_find_ea28e25b-6c35-40d1-b532-ebbaf15a35f3.ckpt
2023-02-10 11:22:35,759 - {pytorch_tabular.tabular_model:560} - INFO - Suggested LR: 0.13182567385564073. For plot and detailed analysis, use `find_learning_rate` method.
2023-02-10 11:22:35,763 - {pytorch_tabular.tabular_model:566} - INFO - Training Started


Output()

Trainer was signaled to stop but the required `min_epochs=100` or `min_steps=None` has not been met. Training will continue...


2023-02-10 11:40:12,485 - {pytorch_tabular.tabular_model:568} - INFO - Training the model completed
2023-02-10 11:40:12,486 - {pytorch_tabular.tabular_model:1207} - INFO - Loading the best model


<pytorch_lightning.trainer.trainer.Trainer at 0x1228e5df0d0>

In [12]:
# Evaluate the trained model on the held-out test set, using the same
# metrics/loss that were used during training. The result is kept in `result`.
result = tabular_model.evaluate(test)

  rank_zero_warn(


Output()

In [13]:
# Use the trained model to predict on the test set. The returned DataFrame
# contains the input columns plus a `rating_prediction` column (see the preview).
pred_df = tabular_model.predict(test)
pred_df.head()

Output()

Unnamed: 0,title,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,...,1121,1122,1123,1124,1125,1126,1127,1128,rating,rating_prediction
6459,Going in Style (1979),0,0,0,1,0,0,1,0,0,...,0.01525,0.1435,0.34225,0.21975,0.0195,0.019,0.0645,0.0135,3.46,3.634493
10323,Nanny McPhee Returns (a.k.a. Nanny McPhee and ...,0,0,1,1,1,0,0,0,0,...,0.13675,0.017,0.13675,0.115,0.0205,0.08,0.068,0.0125,3.012953,2.911516
12589,Our Brand Is Crisis (2015),0,0,0,1,0,0,1,0,0,...,0.0055,0.0235,0.18075,0.04225,0.027,0.007,0.22275,0.029,2.960396,3.064149
8732,Tenacious D in The Pick of Destiny (2006),1,0,0,1,0,0,0,0,0,...,0.012,0.01875,0.1235,0.06275,0.031,0.008,0.1215,0.0205,3.22492,2.964866
7359,Play Time (a.k.a. Playtime) (1967),0,0,0,1,0,0,0,0,0,...,0.01625,0.1305,0.20975,0.0595,0.04575,0.0095,0.075,0.019,3.898664,3.773359


In [19]:
# Persist the trained model under "examples/tabnet".
tabular_model.save_model("examples/tabnet")
# Left disabled: example of reloading the saved model later.
# NOTE(review): confirm the loader name against the installed pytorch_tabular version.
#loaded_model = TabularModel.load_from_checkpoint("examples/tabnet")

In [18]:
# Report MSE, MAE and R2 of the predictions against the true test-set ratings.
print("--- Test Metrics ---")
# NOTE: despite the val_* names these are test-set metrics; tag="" leaves the
# printed metric labels without a prefix.
val_mse_1, val_mae_1 = print_metrics(test['rating'], pred_df["rating_prediction"], tag="")
print(" R2: ", r2_score(test['rating'], pred_df['rating_prediction']))

--- Test Metrics ---
 MSE: 0.043966363347998964 |  MAE: 0.1651230036436534
 R2:  0.803270316811896
