In [None]:
import json
import os
from typing import List

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader

In [None]:
# Pick the compute backend in order of preference: CUDA, then Apple MPS,
# and finally the CPU when no accelerator is reported as available.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device} device")

Using cpu device


In [None]:
!wget https://files.grouplens.org/datasets/movielens/ml-100k.zip

--2023-08-19 07:36:31--  https://files.grouplens.org/datasets/movielens/ml-100k.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4924029 (4.7M) [application/zip]
Saving to: ‘ml-100k.zip’


2023-08-19 07:36:32 (5.10 MB/s) - ‘ml-100k.zip’ saved [4924029/4924029]



In [None]:
!unzip /content/ml-100k.zip

Archive:  /content/ml-100k.zip
   creating: ml-100k/
  inflating: ml-100k/allbut.pl       
  inflating: ml-100k/mku.sh          
  inflating: ml-100k/README          
  inflating: ml-100k/u.data          
  inflating: ml-100k/u.genre         
  inflating: ml-100k/u.info          
  inflating: ml-100k/u.item          
  inflating: ml-100k/u.occupation    
  inflating: ml-100k/u.user          
  inflating: ml-100k/u1.base         
  inflating: ml-100k/u1.test         
  inflating: ml-100k/u2.base         
  inflating: ml-100k/u2.test         
  inflating: ml-100k/u3.base         
  inflating: ml-100k/u3.test         
  inflating: ml-100k/u4.base         
  inflating: ml-100k/u4.test         
  inflating: ml-100k/u5.base         
  inflating: ml-100k/u5.test         
  inflating: ml-100k/ua.base         
  inflating: ml-100k/ua.test         
  inflating: ml-100k/ub.base         
  inflating: ml-100k/ub.test         


In [None]:
# Load the pipe-delimited user table (first column becomes the index) and turn
# every remaining column into an integer code.
user = (
    pd.read_table('ml-100k/u.user', header=None, delimiter='|', index_col=0)
    .rename(columns={1: "age", 2: "gender", 3: "job", 4: "zip"})
    .assign(
        # Keep only the leading zip character; codes that start with a letter
        # are rare, so they are all bucketed as 9.
        zip=lambda frame: frame['zip'].map(
            lambda code: code[0] if code[0].isdigit() else 9
        ),
        # Binary gender flag: 'M' -> 0, anything else -> 1.
        gender=lambda frame: frame['gender'].map(
            lambda value: 0 if value == 'M' else 1
        ),
        # Occupation string -> integer category code.
        job=lambda frame: frame['job'].astype('category').cat.codes,
    )
    .astype('int')
)
user

Unnamed: 0_level_0,age,gender,job,zip
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,24,0,19,8
2,53,1,13,9
3,23,0,20,3
4,24,0,19,4
5,33,1,13,1
...,...,...,...,...
939,26,1,18,3
940,32,0,0,0
941,20,0,18,9
942,48,1,10,7


In [None]:
# Genre labels come from the first column of u.genre; they name the last 19
# columns of u.item, which are kept as the genre indicator table.
genre_name = pd.read_table('ml-100k/u.genre', header=None, delimiter='|')[0].values
genre = (
    pd.read_table('ml-100k/u.item', header=None, delimiter='|', encoding='latin-1', index_col=0)
    .iloc[:, -19:]
    .set_axis(genre_name, axis=1)
)
genre  # 1 indicates the item belongs to that category

Unnamed: 0_level_0,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
4,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1678,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1679,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0
1680,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0
1681,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
# Column names for the tab-separated rating files (the files have no header).
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']

# Both rating files share the same layout, so they share the reader options.
ratings_read_options = dict(sep='\t', names=r_cols, encoding='latin-1')
ratings_base = pd.read_csv('ml-100k/ua.base', **ratings_read_options)
# NOTE(review): ratings_test is loaded here but not referenced by the other
# cells visible in this notebook; the evaluation split is carved out of
# ratings_base instead. Confirm whether that is intended.
ratings_test = pd.read_csv('ml-100k/ua.test', **ratings_read_options)
ratings_base

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,1,1,5,874965758
1,1,2,3,876893171
2,1,3,4,878542960
3,1,4,3,876893119
4,1,5,3,889751712
...,...,...,...,...
90565,943,1047,2,875502146
90566,943,1074,4,888640250
90567,943,1188,3,888640250
90568,943,1228,3,888640275




In [None]:
# u.item has no header row: five descriptive columns followed by one 0/1
# indicator column per genre.
item_meta_cols = ['movie id', 'movie title', 'release date', 'video release date', 'IMDb URL']
item_genre_cols = [
    'unknown', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]
i_cols = item_meta_cols + item_genre_cols

items = pd.read_csv('ml-100k/u.item', sep='|', names=i_cols, encoding='latin-1')
items

Unnamed: 0,movie id,movie title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,1678,Mat' i syn (1997),06-Feb-1998,,http://us.imdb.com/M/title-exact?Mat%27+i+syn+...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1678,1679,B. Monkey (1998),06-Feb-1998,,http://us.imdb.com/M/title-exact?B%2E+Monkey+(...,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
1679,1680,Sliding Doors (1998),01-Jan-1998,,http://us.imdb.com/Title?Sliding+Doors+(1998),0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1680,1681,You So Crazy (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?You%20So%20Cr...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# One row per rating, enriched first with the rater's attributes (joined on the
# user table's index) and then with the rated movie's columns.
df = (
    ratings_base
    .merge(user, left_on="user_id", right_index=True)
    .merge(items, left_on="movie_id", right_on="movie id")
)

In [None]:
class CustomDataset(Dataset):
    """Map-style dataset that serves rows of a DataFrame as tensors.

    Each item is ``([user_tensor, item_tensor], label_tensor)`` where the three
    tensors hold the values of the ``user_features``, ``item_features`` and
    ``label`` columns for one row.

    Args:
        df: Source frame containing every requested column.
        user_features: Column names fed to the user side of the model.
        item_features: Column names fed to the item side of the model.
        label: Column name(s) holding the training target.
    """

    def __init__(self, df:pd.DataFrame, user_features:List, item_features:List, label:List):
        # Despite their names, these attributes hold the full selected column
        # blocks (as NumPy arrays), not just the id columns.
        self.rating = df[label].values
        self.movie_id = df[item_features].values
        self.user_id = df[user_features].values

    def __len__(self):
        # Number of rows in the source frame.
        return len(self.user_id)

    def __getitem__(self, idx):
        # Build fresh tensors for the requested row, user side first.
        inputs = [
            torch.tensor(block[idx])
            for block in (self.user_id, self.movie_id)
        ]
        target = torch.tensor(self.rating[idx])
        return inputs, target

In [None]:
# Hold out 20% of the merged ratings for evaluation. The fixed random_state
# makes the split (and therefore the reported losses) repeatable across
# kernel restarts; change the seed to draw a different split.
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

In [None]:
# Columns fed to each tower and the regression target.
user_features = ["user_id", "age"]
item_features = ["movie_id","Action"]
label = ["rating"]

# Single place to tune the mini-batch size used by both loaders.
BATCH_SIZE = 64

# Training batches are reshuffled every epoch.
train_dataloader = DataLoader(
    CustomDataset(train_data, user_features, item_features, label),
    batch_size=BATCH_SIZE,
    shuffle=True,
)

# Evaluation keeps a fixed order: the test loop averages per-batch losses and
# the final batch may be smaller, so shuffling would make the reported number
# vary from run to run for the same model.
test_dataloader = DataLoader(
    CustomDataset(test_data, user_features, item_features, label),
    batch_size=BATCH_SIZE,
    shuffle=False,
)

In [None]:
# Load the model hyper-parameters. The file must sit next to the notebook; it
# is expected to provide at least `user_dense`, `item_dense` and `activation`,
# which the model reads and no cell in this notebook sets.
# The encoding is explicit because open() otherwise falls back to a
# locale-dependent default.
with open('config.json', 'r', encoding='utf-8') as f:
  conf = json.load(f)

In [None]:
# The embedding tables are indexed directly by the raw ids, which start at 1,
# so each table needs (largest id + 1) rows. Deriving the sizes from the loaded
# tables (943 users / 1682 movies for ml-100k) avoids hard-coded counts.
conf["user_embedding_num"]= int(user.index.max()) + 1
conf["item_embedding_num"]= int(items["movie id"].max()) + 1
# Width of each embedding vector.
conf["user_embedding_dim"]= 100
conf["item_embedding_dim"]= 100
conf

{'user_embedding_num': 944,
 'user_embedding_dim': 100,
 'item_embedding_num': 1683,
 'item_embedding_dim': 100,
 'user_dense': [50, 60, 20],
 'item_dense': [50, 60, 20],
 'activation': 'relu'}

In [None]:
class TwoTower(nn.Module):
    """Two-tower rating model.

    Each side (user / item) embeds its integer feature columns, concatenates
    the embeddings and passes them through a stack of Linear + activation
    layers. The predicted score of a sample is the dot product of its user
    vector and its item vector.

    Args:
        conf: Mapping with the keys
            ``user_embedding_num`` / ``item_embedding_num`` (rows per table),
            ``user_embedding_dim`` / ``item_embedding_dim`` (vector width),
            ``user_dense`` / ``item_dense`` (hidden layer widths) and
            ``activation`` (one of the names in ``_ACTIVATIONS``).
            Optional ``user_feature_num`` / ``item_feature_num`` give the
            number of feature columns per side; when absent, the lengths of
            the notebook-level ``user_features`` / ``item_features`` lists
            are used.

    Raises:
        ValueError: If ``conf["activation"]`` is not a supported name.
    """

    # Supported activation names -> module class.
    _ACTIVATIONS = {"relu": nn.ReLU, "tanh": nn.Tanh, "sigmoid": nn.Sigmoid}

    def __init__(self,conf):
        super(TwoTower,self).__init__()
        self.user_embedding_num = conf["user_embedding_num"]
        self.user_embedding_dim = conf["user_embedding_dim"]

        self.item_embedding_num = conf["item_embedding_num"]
        self.item_embedding_dim = conf["item_embedding_dim"]
        self.user_embedding_dim2 = 200
        self.item_embedding_dim2 = 200

        # Number of feature columns per side. Prefer the explicit config
        # entries; fall back to the notebook globals for existing configs.
        n_user_feats = conf.get("user_feature_num")
        if n_user_feats is None:
            n_user_feats = len(user_features)
        n_item_feats = conf.get("item_feature_num")
        if n_item_feats is None:
            n_item_feats = len(item_features)

        # The first layer consumes the concatenation of one embedding per feature.
        self.user_dense = [n_user_feats*self.user_embedding_dim ,*conf["user_dense"]]
        self.item_dense = [n_item_feats*self.item_embedding_dim ,*conf["item_dense"]]

        self.activation = conf["activation"]
        if self.activation not in self._ACTIVATIONS:
            # Fail with a clear message instead of an unbound-variable error
            # in the middle of layer construction.
            raise ValueError(
                f"Unsupported activation {self.activation!r}; "
                f"expected one of {sorted(self._ACTIVATIONS)}"
            )

        # Experimental second embedding stage; disabled.
        self.model2 = False
        self.flatten = nn.Flatten()

        # NOTE(review): every column of X[0] / X[1] is looked up in the same
        # table, so non-id features share rows with the ids - confirm intended.
        self.user_embedding = nn.Embedding(self.user_embedding_num,self.user_embedding_dim)
        self.user_tower = self._build_tower(self.user_dense)

        self.item_embedding = nn.Embedding(self.item_embedding_num,self.item_embedding_dim)
        self.item_tower = self._build_tower(self.item_dense)

        if self.model2:
            # NOTE(review): this path indexes an embedding with the truncated
            # float tower outputs; it is never enabled here - verify before use.
            self.user_embed2 = nn.Embedding(int(self.user_dense[-1]),self.user_embedding_dim2)
            self.item_embed2 = nn.Embedding(int(self.item_dense[-1]),self.item_embedding_dim2)

    def _build_tower(self, widths):
        """Stack a Linear + activation pair for each consecutive pair of widths."""
        activation_cls = self._ACTIVATIONS[self.activation]
        layers = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            layers.append(nn.Linear(fan_in, fan_out))
            layers.append(activation_cls())
        return nn.Sequential(*layers)

    def forward(self,X):
        """Score a batch.

        Args:
            X: Pair ``(user_features, item_features)`` of integer index
                tensors, each with the batch on dimension 0.

        Returns:
            Tensor of shape ``(batch, 1)`` with one score per sample.
        """
        user_embed = self.flatten(self.user_embedding(X[0]))
        item_embed = self.flatten(self.item_embedding(X[1]))

        user  = self.user_tower(user_embed)
        item  = self.item_tower(item_embed)
        if self.model2:
            user = self.user_embed2(user.long())
            item = self.item_embed2(item.long())

        # Per-sample dot product. Flattening the whole batch into one vector
        # would collapse all samples into a single scalar, which then silently
        # broadcasts against the (batch, 1) targets in the loss.
        score = (self.flatten(user) * self.flatten(item)).sum(dim=1, keepdim=True)
        return score


In [None]:
# Build the model and move its parameters to the selected device; the training
# and evaluation loops move every batch there, and a model left on the CPU
# would fail with a device mismatch as soon as an accelerator is available.
model = TwoTower(conf).to(device)
model

TwoTower(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (user_embedding): Embedding(944, 100)
  (user_tower): Sequential(
    (0): Linear(in_features=200, out_features=50, bias=True)
    (1): ReLU()
    (2): Linear(in_features=50, out_features=60, bias=True)
    (3): ReLU()
    (4): Linear(in_features=60, out_features=20, bias=True)
    (5): ReLU()
  )
  (item_embedding): Embedding(1683, 100)
  (item_tower): Sequential(
    (0): Linear(in_features=200, out_features=50, bias=True)
    (1): ReLU()
    (2): Linear(in_features=50, out_features=60, bias=True)
    (3): ReLU()
    (4): Linear(in_features=60, out_features=20, bias=True)
    (5): ReLU()
  )
)

In [None]:
# Step size for plain stochastic gradient descent.
LEARNING_RATE = 1e-3

# Ratings are treated as a regression target, hence mean squared error.
optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE)
loss_fn = nn.MSELoss()

In [None]:
def train(dataloader, model, loss_fn, optimizer):
    """Run one optimisation pass over ``dataloader``.

    Prints the dataset size once, then the current batch loss and the number
    of samples processed so far every 100 batches.

    Args:
        dataloader: Yields ``(X, y)`` where ``X`` is a pair of feature tensors
            (user side, item side) and ``y`` holds the targets.
        model: Module called as ``model((X_user, X_item))``.
        loss_fn: Callable ``loss_fn(pred, target)`` returning a scalar tensor.
        optimizer: Optimizer bound to ``model``'s parameters.
    """
    size = len(dataloader.dataset)
    print(size)
    model.train()

    # Send batches to wherever the model lives rather than relying on a
    # notebook-level variable, so inputs and weights can never disagree.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    for batch, (X, y) in enumerate(dataloader):
        X, y = (X[0].to(target_device), X[1].to(target_device)), y.to(target_device)

        # Compute prediction error
        pred = model(X)
        loss = loss_fn(pred, y.float())

        # Backpropagation
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if batch % 100 == 0:
            # X is the (user, item) pair, so len(X) is always 2; the number of
            # samples in the batch comes from the target tensor.
            loss, current = loss.item(), (batch + 1) * len(y)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

In [None]:
def test(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = (X[0].to(device), X[1].to(device)), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y.float()).item()
    test_loss /= num_batches
    print(f"Test Error: \n , Avg loss: {test_loss:>8f} \n")

In [None]:
def fit(model, loss_fn, optimizer, train_dataloader, test_dataloader, epochs=5):
    """Alternate one training pass and one evaluation pass for ``epochs`` rounds.

    Prints a numbered header before each round and "Done!" after the last one.
    """
    for epoch_no in range(1, epochs + 1):
        print(f"Epoch {epoch_no}\n-------------------------------")
        # Update the weights on the training data, then measure held-out loss.
        train(train_dataloader, model, loss_fn, optimizer)
        test(test_dataloader, model, loss_fn)
    print("Done!")

In [None]:
# Train and evaluate for the default number of epochs.
fit(
    model=model,
    loss_fn=loss_fn,
    optimizer=optimizer,
    train_dataloader=train_dataloader,
    test_dataloader=test_dataloader,
)

Epoch 1
-------------------------------
72456
loss: 1.336738  [    2/72456]


  return F.mse_loss(input, target, reduction=self.reduction)


loss: 2.673838  [  202/72456]
loss: 2.036902  [  402/72456]
loss: 1.176746  [  602/72456]
loss: 1.280070  [  802/72456]
loss: 1.071999  [ 1002/72456]
loss: 1.092917  [ 1202/72456]
loss: 1.681434  [ 1402/72456]
loss: 1.273935  [ 1602/72456]
loss: 1.191783  [ 1802/72456]
loss: 1.531917  [ 2002/72456]
loss: 1.126245  [ 2202/72456]


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Test Error: 
 , Avg loss: 1.658055 

Epoch 2
-------------------------------
72456
loss: 1.130953  [    2/72456]
loss: 1.437302  [  202/72456]
loss: 1.328029  [  402/72456]
loss: 1.234233  [  602/72456]
loss: 1.404228  [  802/72456]
loss: 1.432846  [ 1002/72456]
loss: 1.492566  [ 1202/72456]
loss: 1.397462  [ 1402/72456]
loss: 1.245881  [ 1602/72456]
loss: 1.540769  [ 1802/72456]
loss: 1.293558  [ 2002/72456]
loss: 1.467107  [ 2202/72456]
Test Error: 
 , Avg loss: 1.573842 

Epoch 3
-------------------------------
72456
loss: 1.283821  [    2/72456]
loss: 1.037575  [  202/72456]
loss: 1.591590  [  402/72456]
loss: 1.123113  [  602/72456]
loss: 1.375001  [  802/72456]
loss: 1.148336  [ 1002/72456]
loss: 1.580252  [ 1202/72456]
loss: 1.274748  [ 1402/72456]
loss: 1.381471  [ 1602/72456]
loss: 1.397665  [ 1802/72456]
loss: 1.354832  [ 2002/72456]
loss: 1.207889  [ 2202/72456]
Test Error: 
 , Avg loss: 1.546526 

Epoch 4
-------------------------------
72456
loss: 1.485845  [    2/72456]
l