# Example 0. Training a Matrix Factorization in ToR[e]cSys

In [1]:
from sklearn.model_selection import train_test_split
import torch.utils.data
import torecsys as trs

In [2]:
# get samples data from movielens as a example
_, _, ratings_df, _ = trs.data.sample_data.load_ml_data(
    size="latest-small", directory="./data", force=True)

In [3]:
ratings_df.head(3)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224


In [4]:
# set hyperparameters of model
user_size = ratings_df.userId.max() + 1
item_size = ratings_df.movieId.max() + 1
embed_size = 16
num_fields = 2

In [5]:
# split data into training set and testing set
train_df, test_df = train_test_split(ratings_df, test_size=0.1)

# initialize training and testing dataset
columns = ["userId", "movieId", "rating"]
train_set = trs.data.dataset.DataFrameToDataset(train_df, columns=columns)
test_set = trs.data.dataset.DataFrameToDataset(test_df, columns=columns)

In [6]:
# define inputs' schema and collect_fn for dataloader
schema = {
    "user_id": ["userId", "indices"],
    "movie_id": ["movieId", "indices"],
    "labels": ["rating", "values"]
}
data_loader = trs.data.dataloader.CollateFunction(schema=schema, device='cuda')

In [7]:
# initialize training and testing dataloader
train_dl = torch.utils.data.DataLoader(
    train_set,
    batch_size=1024,
    shuffle=True,
    num_workers=0,
    collate_fn=data_loader.to_tensor
)

test_dl = torch.utils.data.DataLoader(
    test_set,
    batch_size=64,
    shuffle=False,
    num_workers=0,
    collate_fn=data_loader.to_tensor
)

In [18]:
# initialize embedding fields
feat_inputs_embedding = trs.inputs.base.MultiIndicesEmbedding(
    1, [user_size, item_size]
)

# define schema of wrapper and initialize InputsWrapper
feat_inputs_embedding.set_schema(inputs=['user_id', 'movie_id'])

# initialize inputs wrapper
schema = { 'emb_inputs': feat_inputs_embedding }
inputs = trs.inputs.Inputs(schema=schema)

In [None]:
# TODO: add evaluation methods to pipeline
occurrence = dict(ratings_df['movieId'].value_counts())
k = 0
metric = trs.metrics.Novelty(occurrence=occurrence, k=k, num_users=user_size)

In [19]:
pipeline = trs.trainer.TorecsysPipeline() \
    .set_model('MatrixFactorizationModel') \
    .set_inputs(inputs) \
    .set_criterion('MSELoss', reduction='mean') \
    .set_optimizer('SGD', lr=1e-4) \
    .set_target_fields('labels') \
    .build_sequential()

In [20]:
print(pipeline.summary())

+------------------------------------------+
|     Name                 Value           |
| Objective       Click Through Rate       |
| Inputs          Inputs                   |
| Model           MatrixFactorizationModel |
| Reg norm        None                     |
| Reg lambda      None                     |
| Loss            MSELoss                  |
| Optimizer       SGD                      |
| Target fields   labels                   |
+------------------------------------------+


In [21]:
trainer = trs.trainer.TorecsysTrainer(gpus=1)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores


In [None]:
trainer.fit(pipeline, train_dl, test_dl)

  rank_zero_warn(f'you passed in a {loader_name} but have no {step_name}. Skipping {stage} loop')
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name        | Type                     | Params
---------------------------------------------------------
0 | _model      | MatrixFactorizationModel | 0     
1 | _inputs     | Inputs                   | 194 K 
2 | _criterion  | MSELoss                  | 0     
3 | _sequential | Sequential               | 194 K 
---------------------------------------------------------
194 K     Trainable params
0         Non-trainable params
194 K     Total params
0.777     Total estimated model params size (MB)


+------------------------------------------+
|     Name                 Value           |
| Objective       Click Through Rate       |
| Inputs          Inputs                   |
| Model           MatrixFactorizationModel |
| Reg norm        None                     |
| Reg lambda      None                     |
| Loss            MSELoss                  |
| Optimizer       SGD                      |
| Target fields   labels                   |
+------------------------------------------+


  rank_zero_warn(


Training: 0it [00:00, ?it/s]