<a href="https://colab.research.google.com/github/manjitullal/recommendation/blob/main/collaborative_filtering_nn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Collaborative Filtering with Neural Networks

In this notebook we will write a matrix factorization model in pytorch to solve a recommendation problem. Then we will write a more general neural model for the same problem.

The MovieLens dataset (ml-latest-small) describes 5-star rating and free-text tagging activity from MovieLens, a movie recommendation service. It contains 100004 ratings and 1296 tag applications across 9125 movies. https://grouplens.org/datasets/movielens/. To get the data:

`wget http://files.grouplens.org/datasets/movielens/ml-latest-small.zip`

## MovieLens dataset

In [1]:
from pathlib import Path
import pandas as pd
import numpy as np

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
PATH = '/content/drive/MyDrive/datasets/ml-latest-small/'

In [5]:
data = pd.read_csv(PATH+"ratings.csv")

In [6]:
data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


### Encoding data
We enconde the data to have contiguous ids for users and movies. You can think about this as a categorical encoding of our two categorical variables userId and movieId.

In [7]:
# split train and validation before encoding
np.random.seed(3)
msk = np.random.rand(len(data)) < 0.8
train = data[msk].copy()
val = data[~msk].copy()

In [8]:
# here is a handy function modified from fast.ai
def proc_col(col, train_col=None):
    """Encodes a pandas column with continous ids. 
    """
    if train_col is not None:
        uniq = train_col.unique()
    else:
        uniq = col.unique()
    name2idx = {o:i for i,o in enumerate(uniq)}
    return name2idx, np.array([name2idx.get(x, -1) for x in col]), len(uniq)

In [9]:
def encode_data(df, train=None):
    """ Encodes rating data with continous user and movie ids. 
    If train is provided, encodes df with the same encoding as train.
    """
    df = df.copy()
    for col_name in ["userId", "movieId"]:
        train_col = None
        if train is not None:
            train_col = train[col_name]
        _,col,_ = proc_col(df[col_name], train_col)
        df[col_name] = col
        df = df[df[col_name] >= 0]
    return df

In [10]:
# encoding the train and validation data
df_train = encode_data(train)
df_val = encode_data(val, train)

## Embedding layer

In [12]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [13]:
# an Embedding module containing 10 user or item embedding size 3
# embedding will be initialized at random
embed = nn.Embedding(10, 3)

In [14]:
# given a list of ids we can "look up" the embedding corresponing to each id
a = torch.LongTensor([[1,2,0,4,5,1]])
embed(a)

tensor([[[-0.7416,  0.9429, -0.0751],
         [-0.0500, -0.7619, -0.6172],
         [-0.0184,  0.8003,  1.4948],
         [-0.0960,  0.2742, -0.4801],
         [-0.5618, -0.1589,  0.3603],
         [-0.7416,  0.9429, -0.0751]]], grad_fn=<EmbeddingBackward>)

## Matrix factorization model

In [15]:
class MF(nn.Module):
    def __init__(self, num_users, num_items, emb_size=100):
        super(MF, self).__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.item_emb = nn.Embedding(num_items, emb_size)
        self.user_emb.weight.data.uniform_(0, 0.05)
        self.item_emb.weight.data.uniform_(0, 0.05)
        
    def forward(self, u, v):
        u = self.user_emb(u)
        v = self.item_emb(v)
        return (u*v).sum(1)   

## Training MF model

In [16]:
num_users = len(df_train.userId.unique())
num_items = len(df_train.movieId.unique())
print(num_users, num_items) 

610 8998


In [17]:
model = MF(num_users, num_items, emb_size=100) # .cuda() if you have a GPU

In [18]:
def train_epocs(model, epochs=10, lr=0.01, wd=0.0, unsqueeze=False):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
    model.train()
    for i in range(epochs):
        users = torch.LongTensor(df_train.userId.values) # .cuda()
        items = torch.LongTensor(df_train.movieId.values) #.cuda()
        ratings = torch.FloatTensor(df_train.rating.values) #.cuda()
        if unsqueeze:
            ratings = ratings.unsqueeze(1)
        y_hat = model(users, items)
        loss = F.mse_loss(y_hat, ratings)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        print(loss.item()) 
    test_loss(model, unsqueeze)

In [19]:
# Here is what unsqueeze does
ratings = torch.FloatTensor(df_train.rating.values)
print(ratings.shape)
ratings = ratings.unsqueeze(1) # .cuda()
print(ratings.shape)

torch.Size([80450])
torch.Size([80450, 1])


In [30]:
def test_loss(model, unsqueeze=False):
    model.eval()
    users = torch.LongTensor(df_val.userId.values) #.cuda()
    items = torch.LongTensor(df_val.movieId.values) #.cuda()
    ratings = torch.FloatTensor(df_val.rating.values) #.cuda()
    if unsqueeze:
        ratings = ratings.unsqueeze(1)
    y_hat = model(users, items)
    loss = F.mse_loss(y_hat, ratings)
    print("test loss %.3f " % loss.item())

In [31]:
train_epocs(model, epochs=10, lr=0.1)

12.913166999816895
4.8541460037231445
2.57963490486145
3.1019914150238037
0.848253071308136
1.8178449869155884
2.651867151260376
2.1297454833984375
1.0853337049484253
0.9746541976928711
test loss 1.850 


## Prediction

This is straigthforward, we pick a user and find the movies not rated by the user. Convert the user and the list of non rated movies as tensors and then pass to the model to get the output.

In [39]:
# predict 

movies_ratedby_1 = data[data.userId == 1]['movieId'].values
movies_notratedby_1 = list(set(data.movieId.values.tolist()) - set(movies_ratedby_1.tolist()))

print(len(movies_ratedby_1), len(movies_notratedby_1))

232 9492


In [42]:
# movies_notratedby_1[0:5]
# [2, 4, 5, 7, 8]

user = torch.LongTensor([1])
items = torch.LongTensor([2, 4, 5, 7, 8])
predicted_rating = model(user, items)
print(predicted_rating)

tensor([4.2586, 4.4711, 4.4432, 3.7018, 3.9871], grad_fn=<SumBackward1>)


In [None]:
train_epocs(model, epochs=15, lr=0.01)

1.6432433128356934
1.004770278930664
0.711879551410675
0.6606624126434326
0.7254241108894348
0.8038312792778015
0.8437075614929199
0.8357229828834534
0.7934749722480774
0.7378419041633606
0.6878331899642944
0.6556430459022522
0.6445807218551636
0.6497317552566528
0.6612839698791504
test loss 0.821 


In [None]:
train_epocs(model, epochs=15, lr=0.01)

0.6693901419639587
0.6312034130096436
0.6390069723129272
0.614273190498352
0.6052837371826172
0.6137663722038269
0.6116158366203308
0.5968126058578491
0.5848029255867004
0.5829773545265198
0.5840793251991272
0.5791772603988647
0.5685186386108398
0.5582433342933655
0.5519680380821228
test loss 0.759 


In [None]:
model.parameters

<bound method Module.parameters of MF_bias(
  (user_emb): Embedding(610, 100)
  (user_bias): Embedding(610, 1)
  (item_emb): Embedding(8998, 100)
  (item_bias): Embedding(8998, 1)
)>

In [None]:
len(train.userId.unique())

610

In [None]:
movies_watched = train[train.userId == 1].movieId.values
movies_not_watched = list(set(train.movieId.values.flatten()) - set(movies_watched.flatten()))

## MF with bias

In [None]:
class MF_bias(nn.Module):
    def __init__(self, num_users, num_items, emb_size=100):
        super(MF_bias, self).__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.user_bias = nn.Embedding(num_users, 1)
        self.item_emb = nn.Embedding(num_items, emb_size)
        self.item_bias = nn.Embedding(num_items, 1)
        self.user_emb.weight.data.uniform_(0,0.05)
        self.item_emb.weight.data.uniform_(0,0.05)
        self.user_bias.weight.data.uniform_(-0.01,0.01)
        self.item_bias.weight.data.uniform_(-0.01,0.01)
        
    def forward(self, u, v):
        U = self.user_emb(u)
        V = self.item_emb(v)
        b_u = self.user_bias(u).squeeze()
        b_v = self.item_bias(v).squeeze()
        return (U*V).sum(1) +  b_u  + b_v

In [None]:
model = MF_bias(num_users, num_items, emb_size=100) #.cuda()

In [None]:
train_epocs(model, epochs=10, lr=0.05, wd=1e-5)

12.908324241638184
9.145543098449707
4.37777853012085
1.1558018922805786
2.4738709926605225
3.7419700622558594
2.444751262664795
1.0767680406570435
0.8169639706611633
1.3199241161346436
test loss 2.070 


In [None]:
train_epocs(model, epochs=10, lr=0.01, wd=1e-5)

1.894714117050171
1.3260996341705322
0.9358387589454651
0.7452532649040222
0.7224956154823303
0.7773878574371338
0.8229359984397888
0.8220500946044922
0.7816600203514099
0.7278069853782654
test loss 0.798 


In [None]:
train_epocs(model, epochs=10, lr=0.001, wd=1e-5)

0.6854904890060425
0.6712726354598999
0.6593731045722961
0.6496144533157349
0.6417774558067322
0.635627269744873
0.6309210658073425
0.6274157762527466
0.6248804330825806
0.6231045126914978
test loss 0.751 


Note that these models are susceptible to weight initialization, optimization algorithm and regularization.

## Neural Network Model

In [None]:
# Note here there is no matrix multiplication, we could potentially make the embeddings of different sizes.
# Here we could get better results by keep playing with regularization.
    
class CollabFNet(nn.Module):
    def __init__(self, num_users, num_items, emb_size=100, n_hidden=10):
        super(CollabFNet, self).__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.item_emb = nn.Embedding(num_items, emb_size)
        self.lin1 = nn.Linear(emb_size*2, n_hidden)
        self.lin2 = nn.Linear(n_hidden, 1)
        self.drop1 = nn.Dropout(0.1)
        
    def forward(self, u, v):
        U = self.user_emb(u)
        V = self.item_emb(v)
        x = F.relu(torch.cat([U, V], dim=1))
        x = self.drop1(x)
        x = F.relu(self.lin1(x))
        x = self.lin2(x)
        return x

In [None]:
model = CollabFNet(num_users, num_items, emb_size=100) #.cuda()

In [None]:
train_epocs(model, epochs=15, lr=0.05, wd=1e-6, unsqueeze=True) 

13.434941291809082
16.370405197143555
1.3257286548614502
5.508914470672607
6.2165207862854
4.431434154510498
2.1724636554718018
1.1197346448898315
1.831865906715393
2.8045129776000977
2.4782402515411377
1.460442066192627
0.932476282119751
1.0870381593704224
1.4916064739227295
test loss 1.760 


In [None]:
train_epocs(model, epochs=10, lr=0.01, wd=1e-6, unsqueeze=True)

1.7271547317504883
0.938242495059967
0.888296902179718
1.159741997718811
1.1197892427444458
0.9154775738716125
0.7834805250167847
0.7914189696311951
0.8674263954162598
0.9142893552780151
test loss 0.938 


In [None]:
train_epocs(model, epochs=10, lr=0.001, wd=1e-6, unsqueeze=True)

0.8928335905075073
0.8472162485122681
0.8096950054168701
0.7786527872085571
0.7578306198120117
0.7424415349960327
0.7362892031669617
0.7386234402656555
0.7381541132926941
0.7439978718757629
test loss 0.797 


In [None]:
train_epocs(model, epochs=10, lr=0.001, wd=1e-6, unsqueeze=True)

0.6919353008270264
0.6934647560119629
0.6922585368156433
0.6942275762557983
0.6926798224449158
0.6916202902793884
0.6911264061927795
0.6923496127128601
0.6922929286956787
0.6904215812683105
test loss 0.795 
