In [1]:
import os
import numpy as np # linear algebra
import pandas as pd 
from scipy import sparse
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error,mean_absolute_error
import matplotlib.pyplot as plt
import seaborn as sns

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import SparseAdam,Adam,Adagrad,SGD
#from livelossplot import PlotLosses
from tqdm import tqdm

In [2]:
COLS = ['user_id', 'movie_id', 'rating', 'timestamp']
train_data = pd.read_csv("/kaggle/input/movielens-100k-dataset/ml-100k/u1.base",sep='\t', names=COLS).drop(columns=['timestamp']).astype(int)
test_data = pd.read_csv("/kaggle/input/movielens-100k-dataset/ml-100k/u1.test",sep='\t', names=COLS).drop(columns=['timestamp']).astype(int)
n_users, n_items = 1000,2000

In [3]:
train_data

Unnamed: 0,user_id,movie_id,rating
0,1,1,5
1,1,2,3
2,1,3,4
3,1,4,3
4,1,5,3
...,...,...,...
79995,943,1067,2
79996,943,1074,4
79997,943,1188,3
79998,943,1228,3


In [4]:
train_data.movie_id.nunique()

1650

In [5]:
class Matrixfactorization(nn.Module):
    def __init__(self, n_users, n_items, n_dim=20):
        super().__init__()
        self.user_emb = nn.Embedding( n_users, n_dim)
        self.item_emb = nn.Embedding( n_items, n_dim)
        self.user_emb.weight.data.uniform_(0, 0.05)
        self.item_emb.weight.data.uniform_(0, 0.05)
        
    def forward(self, u, v):
        # print(0, u, v)
        u = self.user_emb(u)
        v = self.item_emb(v)
        # print(u.shape, v.shape)
        out = (u*v).sum(1)   
        # print(1, out.shape)
        return out

In [6]:
def train_epocs(model, train_data, epochs=10, lr=1e-3, wd=0.0, unsqueeze=False):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
    avg = []
    states = {}
    model.train()
    for i in tqdm(range(epochs)):
        for it in range(len(train_data)//BATCH_SIZE):
            #---------------SETUP BATCH DATA-------------
            df = train_data.sample(frac=BATCH_SIZE/len(train_data))
            users = torch.LongTensor(df.user_id.values) # .cuda()
            items = torch.LongTensor(df.movie_id.values) #.cuda()
            ratings = torch.FloatTensor(df.rating.values) #.cuda()
            if unsqueeze:
                ratings = ratings.unsqueeze(1)
            y_hat = model(users, items)
            loss = F.mse_loss(y_hat, ratings)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            avg.append(loss.item())
            # print(loss.item()) 
        print(f"EPOCH {i+1}:",sum(avg)/len(avg))
        avg = []
        states[i+1] = model.state_dict()

In [7]:
def test_loss(model, test_data, unsqueeze=False):
    model.eval()
    users = torch.LongTensor(test_data.user_id.values) #.cuda()
    items = torch.LongTensor(test_data.movie_id.values) #.cuda()
    ratings = torch.FloatTensor(test_data.rating.values) #.cuda()
    if unsqueeze:
        ratings = ratings.unsqueeze(1)
    y_hat = model(users, items)
    loss = F.mse_loss(y_hat, ratings)
    print("test loss %.3f " % loss.item())

In [8]:
BATCH_SIZE = 32
model = Matrixfactorization(n_users,n_items)
print(model)
train_epocs(model, train_data, epochs=10, lr=0.1)
test_loss(model, test_data, )

Matrixfactorization(
  (user_emb): Embedding(1000, 20)
  (item_emb): Embedding(2000, 20)
)


 10%|█         | 1/10 [00:08<01:13,  8.16s/it]

EPOCH 1: 106.0292085407257


 20%|██        | 2/10 [00:16<01:04,  8.05s/it]

EPOCH 2: 87.8136522693634


 30%|███       | 3/10 [00:24<00:56,  8.08s/it]

EPOCH 3: 93.07011737632752


 40%|████      | 4/10 [00:32<00:48,  8.12s/it]

EPOCH 4: 85.55709312820434


 50%|█████     | 5/10 [00:40<00:40,  8.16s/it]

EPOCH 5: 86.23493764343262


 60%|██████    | 6/10 [00:48<00:32,  8.03s/it]

EPOCH 6: 89.98783042945861


 70%|███████   | 7/10 [00:56<00:24,  8.02s/it]

EPOCH 7: 87.9396920463562


 80%|████████  | 8/10 [01:04<00:15,  7.98s/it]

EPOCH 8: 94.84231297340393


 90%|█████████ | 9/10 [01:12<00:07,  7.97s/it]

EPOCH 9: 92.47773809318542


100%|██████████| 10/10 [01:20<00:00,  8.00s/it]

EPOCH 10: 92.20681475753784
test loss 74.020 



