In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import torch
from torch import nn
from torch import optim
from torch.utils.data import Dataset, DataLoader
from typing import Tuple
from collections import defaultdict

In [2]:
df = pd.read_csv('./datasets/ml-1m/ratings.dat', delimiter='::', header=None)
df.columns = ['UserID', 'MovieID', 'Rating', 'Timestamp']
df = df.drop(columns=['Timestamp'])
df.head()

  """Entry point for launching an IPython kernel.


Unnamed: 0,UserID,MovieID,Rating
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5


In [3]:
df.MovieID.max()

3952

In [4]:
rating_matrix = df.pivot(index='UserID', columns='MovieID', values='Rating')
n_users, n_movies = rating_matrix.shape
print(f'num of users: {n_users}  num of items: {n_movies}')

sparcity = rating_matrix.notna().sum().sum() / (n_users * n_movies)
print(f'Sparcity: {sparcity:0.2%}')

rating_matrix

num of users: 6040  num of items: 3706
Sparcity: 4.47%


MovieID,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,2.0,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6036,,,,2.0,,3.0,,,,,...,,,,,,,,,,
6037,,,,,,,,,,,...,,,,,,,,,,
6038,,,,,,,,,,,...,,,,,,,,,,
6039,,,,,,,,,,,...,,,,,,,,,,


In [5]:
class MatrixFactorization(nn.Module):
    def __init__(self, n_users, n_item, k=20):
        super().__init__()
        self.user_factors = nn.Embedding(n_users, k, sparse=True)
        self.item_factors = nn.Embedding(n_item, k, sparse=True)

    def forward(self, user, item):
        return (self.user_factors(user) * self.item_factors(item)).sum(1)

In [6]:
class MovieLens1mDataset(Dataset):
    USER_ID = 0
    MOVIE_ID = 1
    RATING = 2

    def __init__(self, rating_path: str) -> None:
        super().__init__()
        self.df = pd.read_csv(rating_path, delimiter='::', header=None)
        self.df.columns = ['UserID', 'MovieID', 'Rating', 'Timestamp']
        self.df = self.df.drop(columns=['Timestamp'])

    def __getitem__(self, index: int) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        values = self.df.iloc[index].values
        # minus 1 to use user_id and movie_id for matrix indexes
        user_id = values[self.USER_ID] - 1
        movie_id = values[self.MOVIE_ID] - 1
        target = np.float32(values[self.RATING])
        return user_id, movie_id, target

    def __len__(self) -> int:
        return len(self.df)

In [7]:
dataset = MovieLens1mDataset('./datasets/ml-1m/ratings.dat')
n_train = int(len(dataset)*0.7)
n_val = len(dataset) - n_train
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [n_train, n_val])

  


In [8]:
BATCH_SIZE = 64

train_dataloader = DataLoader(
    train_dataset, 
    batch_size=BATCH_SIZE, 
    shuffle=True,
    pin_memory=True
)

val_dataloader = DataLoader(
    val_dataset, 
    batch_size=1, 
    shuffle=True,
    pin_memory=True # faster read
)

dataloaders = dict(train=train_dataloader, val=val_dataloader)

In [9]:
def train_model(model, dataloaders: dict, n_epoch: int, optimizer, criterion):
    loss_results = defaultdict(list)

    for epoch in range(n_epoch):
        loss_per_epoch = dict(train=0, val=0)

        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()
            else:
                model.eval()
        
            for users, items, targets in tqdm(dataloaders[phase]):
                # 勾配を初期化
                optimizer.zero_grad()
                
                # 学習時のみ勾配を計算
                with torch.set_grad_enabled(phase == 'train'):
                    preds = model(users, items)
                    loss = criterion(preds, targets)
                    loss_per_epoch[phase] += loss

                    if phase == 'train':
                        loss.backward()
                        optimizer.step()
                    
        loss_results[phase].append(loss_per_epoch[phase])
        
        print(f"[epoch {epoch+1}] train loss: {loss_per_epoch['train']}   val loss: {loss_per_epoch['val']}")

In [10]:
n_users, n_items = dataset.df.UserID.max(), dataset.df.MovieID.max()
matrix_factorization = MatrixFactorization(n_users, n_items, k=20)
criterion = nn.MSELoss()
optimizer = optim.SparseAdam(matrix_factorization.parameters(), lr=1e-2)
n_epoch = 10

train_model(matrix_factorization, dataloaders, n_epoch, optimizer, criterion)

100%|██████████| 10940/10940 [02:08<00:00, 85.03it/s]
100%|██████████| 300063/300063 [02:22<00:00, 2100.11it/s]
  0%|          | 1/10940 [00:00<32:45,  5.57it/s]

[epoch 1] train loss: 137863.203125   val loss: 730286.75


100%|██████████| 10940/10940 [02:19<00:00, 78.60it/s]
100%|██████████| 300063/300063 [02:02<00:00, 2444.76it/s]
  0%|          | 3/10940 [00:00<07:00, 26.01it/s]

[epoch 2] train loss: 14481.7509765625   val loss: 413263.6875


100%|██████████| 10940/10940 [02:04<00:00, 87.93it/s] 
100%|██████████| 300063/300063 [02:02<00:00, 2457.49it/s]
  0%|          | 2/10940 [00:00<09:36, 18.97it/s]

[epoch 3] train loss: 10399.3056640625   val loss: 363408.40625


100%|██████████| 10940/10940 [02:03<00:00, 88.64it/s] 
100%|██████████| 300063/300063 [02:04<00:00, 2407.35it/s]
  0%|          | 4/10940 [00:00<04:59, 36.57it/s]

[epoch 4] train loss: 9458.337890625   val loss: 343843.46875


100%|██████████| 10940/10940 [01:56<00:00, 94.23it/s] 
100%|██████████| 300063/300063 [01:51<00:00, 2699.06it/s]
  0%|          | 4/10940 [00:00<04:46, 38.11it/s]

[epoch 5] train loss: 9033.9375   val loss: 333403.9375


100%|██████████| 10940/10940 [02:00<00:00, 90.44it/s] 
100%|██████████| 300063/300063 [01:58<00:00, 2522.89it/s]
  0%|          | 6/10940 [00:00<03:35, 50.76it/s]

[epoch 6] train loss: 8765.46875   val loss: 326865.03125


100%|██████████| 10940/10940 [02:04<00:00, 88.20it/s] 
100%|██████████| 300063/300063 [01:54<00:00, 2626.80it/s]
  0%|          | 3/10940 [00:00<06:38, 27.44it/s]

[epoch 7] train loss: 8561.6259765625   val loss: 321965.5


100%|██████████| 10940/10940 [01:52<00:00, 97.10it/s] 
100%|██████████| 300063/300063 [01:57<00:00, 2551.96it/s]
  0%|          | 5/10940 [00:00<03:47, 48.11it/s]

[epoch 8] train loss: 8393.1181640625   val loss: 318977.65625


100%|██████████| 10940/10940 [01:49<00:00, 100.12it/s]
100%|██████████| 300063/300063 [01:49<00:00, 2732.87it/s]
  0%|          | 7/10940 [00:00<02:50, 64.02it/s]

[epoch 9] train loss: 8240.7080078125   val loss: 316371.90625


100%|██████████| 10940/10940 [01:50<00:00, 99.32it/s] 
100%|██████████| 300063/300063 [01:54<00:00, 2619.13it/s]

[epoch 10] train loss: 8115.3759765625   val loss: 314187.78125





In [25]:
matrix_factorization.eval()
users, items, targets = next(iter(dataloaders['val']))
print(f'predict: {matrix_factorization(users, items).detach().numpy()} target: {targets.numpy()}')

predict: [5.0426493] target: [5.]
