In [2]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import torch.nn as nn
import torch.optim as optim

# Load data
# movies_df = pd.read_csv('/home1/chenpaul/EE557/MovieLens/movie.csv')
ratings_df = pd.read_csv('/home1/chenpaul/EE557/MovieLens/rating.csv')
# genome_scores_df = pd.read_csv('/home1/chenpaul/EE557/MovieLens/genome_scores.csv')


In [3]:
ratings_df

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40
...,...,...,...,...
20000258,138493,68954,4.5,2009-11-13 15:42:00
20000259,138493,69526,4.5,2009-12-03 18:31:48
20000260,138493,69644,3.0,2009-12-07 18:10:57
20000261,138493,70286,5.0,2009-11-13 15:42:24


In [4]:

# Dataset class
class MovieLensDataset(Dataset):
    # def __init__(self, ratings_df, user_features_df, movie_features_df):
    #     self.user_features = torch.tensor(user_features_df.loc[ratings_df['userId']].values, dtype=torch.float32)
    #     self.movie_features = torch.tensor(movie_features_df.loc[ratings_df['movieId']].values, dtype=torch.float32)
    #     self.ratings = torch.tensor(ratings_df['rating'].values, dtype=torch.float32)

    # def __len__(self):
    #     return len(self.ratings)

    # def __getitem__(self, idx):
    #     return self.user_features[idx], self.movie_features[idx], self.ratings[idx]
    def __init__(self, df:pd.DataFrame, user_features, item_features, label):
        self.user_id = df[user_features].values
        self.movie_id = df[item_features].values
        self.rating = df[label].values

    def __len__(self):
        return len(self.user_id)

    def __getitem__(self, idx):

        user_id = self.user_id[idx]
        movie_id = self.movie_id[idx]
        rating = self.rating[idx]

        return [torch.tensor(user_id), torch.tensor(movie_id)], torch.tensor(rating)


In [5]:
# Specify the number of unique userIds and movieIds to keep
max_user_id = 10000  # for example, top 1000 users
max_movie_id = 5000  # for example, top 500 movies

# Get the top userIds and movieIds
filtered_df = ratings_df[(ratings_df['userId'] < max_user_id) & (ratings_df['movieId'] < max_movie_id)]



In [6]:
from scipy.sparse import csr_matrix

# rating_matrix = ratings_df.pivot_table(index='userId', columns='movieId', values='rating')
# rating_matrix = csr_matrix((ratings_df['rating'], (ratings_df['userId'], ratings_df['movieId'])))
# Load movie titles
df_title = pd.read_csv('/home1/chenpaul/EE557/MovieLens/movie.csv')#, encoding="ISO-8859-1", header=None, names=['movieId', 'Year', 'Name'], on_bad_lines='skip')
df_title.set_index('movieId', inplace=True)

# Merge ratings with movie titles
df_title.index = df_title.index.astype('int')
df_two_tower = filtered_df.merge(df_title, left_on='movieId', right_index=True)

In [7]:
print(df_two_tower['userId'].nunique())
print(df_two_tower['movieId'].nunique())

9998
4835


In [8]:
user_features = ["userId"]
item_features = ["movieId"]
label = ["rating"]
train_data, test_data = train_test_split(df_two_tower, test_size=0.2)

train_dataloader = torch.utils.data.DataLoader(MovieLensDataset(
        train_data,
        user_features,
        item_features,
        label), batch_size=64, shuffle=False)

test_dataloader = torch.utils.data.DataLoader(MovieLensDataset(
      test_data,
      user_features,
      item_features,
      label), batch_size=64, shuffle=False)

In [9]:
train_data

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
110658,761,267,1.0,1998-08-15 01:29:33,Major Payne (1995),Comedy
130568,892,1545,3.0,2001-01-02 21:21:23,Ponette (1996),Drama
752723,5015,3593,1.0,2008-10-28 23:18:57,Battlefield Earth (2000),Action|Sci-Fi
780022,5192,3082,4.0,2015-01-31 14:10:49,"World Is Not Enough, The (1999)",Action|Adventure|Thriller
44334,343,2797,4.0,2004-06-27 16:08:38,Big (1988),Comedy|Drama|Fantasy|Romance
...,...,...,...,...,...,...
691218,4587,434,1.5,2006-01-28 10:19:44,Cliffhanger (1993),Action|Adventure|Thriller
1385870,9403,3100,3.0,2000-01-17 03:40:56,"River Runs Through It, A (1992)",Drama
1429202,9653,356,3.0,1999-09-18 10:18:39,Forrest Gump (1994),Comedy|Drama|Romance|War
458699,3138,1221,4.0,1997-05-26 08:17:00,"Godfather: Part II, The (1974)",Crime|Drama


In [10]:
class TwoTowerModel(nn.Module):
    def __init__(self, user_embedding_num, user_embedding_dim, item_embedding_num, item_embedding_dim):
        super(TwoTowerModel, self).__init__()
        self.user_embedding = nn.Embedding(user_embedding_num, user_embedding_dim)
        self.item_embedding = nn.Embedding(item_embedding_num, item_embedding_dim)
        self.flatten = nn.Flatten()

        # User Tower
        self.user_tower = nn.Sequential(
            nn.Linear(user_embedding_dim, 50),
            nn.ReLU(),
            nn.Linear(50, 20),
            nn.ReLU()
        )

        # Item Tower
        self.item_tower = nn.Sequential(
            nn.Linear(item_embedding_dim, 50),
            nn.ReLU(),
            nn.Linear(50, 20),
            nn.ReLU()
        )

    def forward(self, X):
        user_embed = self.flatten(self.user_embedding(X[0]))
        item_embed = self.flatten(self.item_embedding(X[1]))

        user = self.user_tower(user_embed)
        item = self.item_tower(item_embed)
        score = torch.mul(user, item).sum(1)
        return score


In [11]:
model = TwoTowerModel(max_user_id+1, 300, max_movie_id+1, 300)

In [12]:
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = torch.device("cpu")
model = model.to(device)

In [13]:
model

TwoTowerModel(
  (user_embedding): Embedding(10001, 300)
  (item_embedding): Embedding(5001, 300)
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (user_tower): Sequential(
    (0): Linear(in_features=300, out_features=50, bias=True)
    (1): ReLU()
    (2): Linear(in_features=50, out_features=20, bias=True)
    (3): ReLU()
  )
  (item_tower): Sequential(
    (0): Linear(in_features=300, out_features=50, bias=True)
    (1): ReLU()
    (2): Linear(in_features=50, out_features=20, bias=True)
    (3): ReLU()
  )
)

In [14]:
criterion = nn.MSELoss()  # Using Mean Squared Error Loss
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [15]:
def train(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    print(size)
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        X, y = (X[0].to(device).long(), X[1].to(device).long()), y.to(device)

        # Compute prediction error
        pred = model(X)
        loss = loss_fn(pred, y.float())

        # Backpropagation
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if batch % 1000 == 0:
            loss, current = loss.item(), (batch + 1) * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")
def test(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = (X[0].to(device).long(), X[1].to(device).long()), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y.float()).item()
    test_loss /= num_batches
    print(f"Test Error: \n , Avg loss: {test_loss:>8f} \n")
def fit(model, loss_fn, optimizer, train_dataloader, test_dataloader, epochs=5):
    for t in range(epochs):
        print(f"Epoch {t+1}\n-------------------------------")
        train(train_dataloader, model, loss_fn, optimizer)
        test(test_dataloader, model, loss_fn)
    print("Done!")
fit(model, criterion, optimizer, train_dataloader, test_dataloader,)

Epoch 1
-------------------------------
913523


  return F.mse_loss(input, target, reduction=self.reduction)


loss: 11.769237  [    2/913523]
loss: 1.438290  [ 2002/913523]
loss: 1.186611  [ 4002/913523]
loss: 1.271615  [ 6002/913523]
loss: 0.858905  [ 8002/913523]
loss: 1.358911  [10002/913523]
loss: 0.769211  [12002/913523]
loss: 1.047344  [14002/913523]
loss: 1.288637  [16002/913523]
loss: 1.472014  [18002/913523]
loss: 0.929650  [20002/913523]
loss: 1.011623  [22002/913523]
loss: 1.138462  [24002/913523]
loss: 0.951426  [26002/913523]
loss: 1.349313  [28002/913523]


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Test Error: 
 , Avg loss: 1.134348 

Epoch 2
-------------------------------
913523
loss: 1.605229  [    2/913523]
loss: 1.356050  [ 2002/913523]
loss: 1.135738  [ 4002/913523]
loss: 1.250906  [ 6002/913523]
loss: 0.826204  [ 8002/913523]
loss: 1.316488  [10002/913523]
loss: 0.749931  [12002/913523]
loss: 0.993272  [14002/913523]
loss: 1.293151  [16002/913523]
loss: 1.501116  [18002/913523]
loss: 0.927680  [20002/913523]
loss: 1.064257  [22002/913523]
loss: 1.119059  [24002/913523]
loss: 0.975105  [26002/913523]
loss: 1.339210  [28002/913523]
Test Error: 
 , Avg loss: 1.128046 

Epoch 3
-------------------------------
913523
loss: 1.567344  [    2/913523]
loss: 1.355565  [ 2002/913523]
loss: 1.138258  [ 4002/913523]
loss: 1.247387  [ 6002/913523]
loss: 0.819436  [ 8002/913523]
loss: 1.316662  [10002/913523]
loss: 0.750191  [12002/913523]
loss: 0.993463  [14002/913523]
loss: 1.292635  [16002/913523]
loss: 1.501451  [18002/913523]
loss: 0.928525  [20002/913523]
loss: 1.057030  [22002/913