In [1]:
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
import random
import numpy as np
import pandas as pd
import os

In [1]:
from pytorch_tabular import TabularModel
from pytorch_tabular.models import TabNetModelConfig
from pytorch_tabular.config import (
    DataConfig,
    OptimizerConfig,
    TrainerConfig,
    ExperimentConfig,
)

In [None]:
def preprocess(X):
    """Standardize ``X`` with a freshly fitted ``StandardScaler``.

    Parameters
    ----------
    X : array-like
        Data passed straight to ``StandardScaler.fit_transform``.

    Returns
    -------
    The transformed data produced by ``fit_transform``.
    """
    # A new scaler is created and fitted on every call, so the scaling
    # statistics always come from the ``X`` passed in.
    scaler = StandardScaler()
    # Return the result explicitly; without this the function yields None
    # and the scaled data is lost.
    return scaler.fit_transform(X)

In [2]:
def loadData(data_dir='../csv_genereted', num_col_end=1149):
    """Load the three movie CSVs, join them on ``movieId`` and list feature columns.

    Parameters
    ----------
    data_dir : str, optional
        Directory containing ``mean_rating.csv``, ``movies_genre.csv`` and
        ``movies_tag_relevance.csv``. Defaults to the original location.
    num_col_end : int, optional
        Exclusive end index of the continuous-feature slice taken from the
        merged frame's columns (after ``movieId`` is dropped). Defaults to
        the previously hard-coded value 1149.

    Returns
    -------
    tuple
        ``(final_df, num_col_names, cat_col_names)`` where ``final_df`` is the
        merged frame without ``movieId``, ``num_col_names`` is the list of
        columns at positions ``1:num_col_end`` and ``cat_col_names`` is a
        one-element list holding the first column name.
    """
    mean_rating = pd.read_csv(os.path.join(data_dir, 'mean_rating.csv'))
    movie_genre = pd.read_csv(os.path.join(data_dir, 'movies_genre.csv'))
    movie_tag_relevance = pd.read_csv(os.path.join(data_dir, 'movies_tag_relevance.csv'))

    # Join order fixes the column order: genre columns, then tag relevance,
    # then the mean-rating columns.
    final_df = (
        movie_genre
        .merge(movie_tag_relevance, on='movieId')
        .merge(mean_rating, on='movieId')
        .drop('movieId', axis=1)
    )

    df_col = list(final_df.columns)

    # NOTE(review): this positional slice presumably stops just before the
    # target column; if the merged frame has fewer than ``num_col_end``
    # columns the slice runs to the end and would include it - verify.
    num_col_names = df_col[1:num_col_end]
    # The first column is the only one treated as categorical.
    cat_col_names = [df_col[0]]

    return final_df, num_col_names, cat_col_names


In [4]:
def print_metrics(y_true, y_pred, tag):
    """Print and return the MSE and MAE between targets and predictions.

    Parameters
    ----------
    y_true, y_pred : array-like
        Ground-truth and predicted values. DataFrames, Series, ndarrays and
        plain sequences are accepted; each is flattened to one dimension
        before scoring.
    tag : str
        Label placed in front of each metric name in the printed line.

    Returns
    -------
    tuple
        ``(mse, mae)`` as returned by ``mean_squared_error`` and
        ``mean_absolute_error``.
    """
    # np.asarray handles pandas objects and plain lists alike, so inputs
    # without ``.values`` / ``.ndim`` no longer raise AttributeError.
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()

    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    print(f"{tag} MSE: {mse} | {tag} MAE: {mae}")
    return mse, mae

In [3]:
# Load the merged table together with its continuous / categorical column lists.
data, num_col_names, cat_col_names = loadData()
target_col = ['rating']

# Two successive seeded splits: the first holds out the test set, the second
# divides what is left into training and validation sets.
train_val, test = train_test_split(data, random_state=42)
train, val = train_test_split(train_val, random_state=42)

In [22]:
# `target` should always be a list. Multi-targets are only supported for
# regression; multi-task classification is not implemented.
data_config = DataConfig(
    categorical_cols=cat_col_names,
    continuous_cols=num_col_names,
    target=target_col,
)

In [23]:
trainer_config = TrainerConfig(
    accelerator="auto",
    max_epochs=100,
    batch_size=1024,
    # Run the LRFinder so the learning rate is derived automatically.
    auto_lr_find=True,
)

In [24]:
# Optimizer settings: no arguments are passed, so the library defaults apply.
optimizer_config = OptimizerConfig()

In [25]:

# TabNet set up for a regression task; no other option is overridden.
model_config = TabNetModelConfig(task="regression")



In [26]:
# Assemble the model from the four configuration objects defined above.
tabular_model = TabularModel(
    trainer_config=trainer_config,
    optimizer_config=optimizer_config,
    model_config=model_config,
    data_config=data_config,
)


2023-02-06 18:28:28,561 - {pytorch_tabular.tabular_model:102} - INFO - Experiment Tracking is turned off


In [27]:
# Train on the training split, passing `val` as the validation set.
tabular_model.fit(train=train, validation=val)
# Score the fitted model on the held-out test split and collect its predictions.
result = tabular_model.evaluate(test)
pred_df = tabular_model.predict(test)
# Save the fitted model under "examples/basic".
tabular_model.save_model("examples/basic")
# Kept for reference - reloading the saved model:
#loaded_model = TabularModel.load_from_checkpoint("examples/basic")

Global seed set to 42
2023-02-06 18:28:36,352 - {pytorch_tabular.tabular_model:465} - INFO - Preparing the DataLoaders
2023-02-06 18:28:36,401 - {pytorch_tabular.tabular_datamodule:286} - INFO - Setting up the datamodule for regression task
2023-02-06 18:28:39,286 - {pytorch_tabular.tabular_model:508} - INFO - Preparing the Model: TabNetModel
2023-02-06 18:28:39,815 - {pytorch_tabular.tabular_model:264} - INFO - Preparing the Trainer
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
2023-02-06 18:28:40,264 - {pytorch_tabular.tabular_model:558} - INFO - Auto LR Find Started
  rank_zero_warn(
  rank_zero_warn(


Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_steps=100` reached.
Learning rate set to 0.13182567385564073
Restoring states from the checkpoint path at c:\Users\Admin\Desktop\DA\TabNet\.lr_find_1d2b87fb-652e-4c53-a4d2-4d98890b1006.ckpt
Restored all states from the checkpoint file at c:\Users\Admin\Desktop\DA\TabNet\.lr_find_1d2b87fb-652e-4c53-a4d2-4d98890b1006.ckpt
2023-02-06 18:29:49,363 - {pytorch_tabular.tabular_model:560} - INFO - Suggested LR: 0.13182567385564073. For plot and detailed analysis, use `find_learning_rate` method.
2023-02-06 18:29:49,365 - {pytorch_tabular.tabular_model:566} - INFO - Training Started


Output()

2023-02-06 18:31:53,085 - {pytorch_tabular.tabular_model:568} - INFO - Training the model completed
2023-02-06 18:31:53,086 - {pytorch_tabular.tabular_model:1207} - INFO - Loading the best model
  rank_zero_warn(


Output()

Output()

In [28]:
# Report MSE / MAE of the rating predictions on the held-out test split.
print("Ratings")
val_mse_1, val_mae_1 = print_metrics(
    y_true=test['rating'],
    y_pred=pred_df["rating_prediction"],
    tag="Holdout",
)

Ratings
Holdout MSE: 0.07217625828614503 | Holdout MAE: 0.20495625652494112


In [None]:
# R^2 of the predictions on the held-out test split. The ground truth is read
# from the 'rating' column, the same target used for training and for the
# MSE / MAE report; 'result' is not referenced anywhere else in the notebook.
print("R2_score: ", r2_score(test['rating'], pred_df['rating_prediction']))