<a href="https://colab.research.google.com/github/moushfiq/recommender_system/blob/master/recommendation_system.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
from pathlib import Path
import pandas as pd
import numpy as np

In [5]:
PATH = Path("/content/drive/My Drive/ml-latest-small")
list(PATH.iterdir())

[PosixPath('/content/drive/My Drive/ml-latest-small/links.csv'),
 PosixPath('/content/drive/My Drive/ml-latest-small/README.txt'),
 PosixPath('/content/drive/My Drive/ml-latest-small/movies.csv'),
 PosixPath('/content/drive/My Drive/ml-latest-small/tags.csv'),
 PosixPath('/content/drive/My Drive/ml-latest-small/ratings.csv')]

In [0]:
data = pd.read_csv(PATH/"ratings.csv")

In [7]:
data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [0]:
# split train and validation
np.random.seed(3)
msk = np.random.rand(len(data)) < 0.8
train = data[msk].copy()
val = data[~msk].copy()

In [0]:
def proc_col(col, train_col=None):
    """Encodes a pandas column with continous ids. 
    """
    if train_col is not None:
        uniq = train_col.unique()
    else:
        uniq = col.unique()
    name2idx = {o:i for i,o in enumerate(uniq)}
    return name2idx, np.array([name2idx.get(x, -1) for x in col]), len(uniq)

In [0]:
def encode_data(df, train=None):
    """ Encodes rating data with continous user and movie ids. 
    If train is provided, encodes df with the same encoding as train.
    """
    df = df.copy()
    for col_name in ["userId", "movieId"]:
        train_col = None
        if train is not None:
            train_col = train[col_name]
        _,col,_ = proc_col(df[col_name], train_col)
        df[col_name] = col
        df = df[df[col_name] >= 0]
    return df

In [11]:
# to check my new implementation
LOCAL_PATH = Path("/content/drive/My Drive/train-data")
df_t = pd.read_csv(LOCAL_PATH/"tiny_training2.csv")
df_v = pd.read_csv(LOCAL_PATH/"tiny_val2.csv")
print(df_t)
df_t_e = encode_data(df_t)
df_v_e = encode_data(df_v, df_t)
df_v_e
print(df_t_e)

    userId  movieId  rating
0       11        1       4
1       11       23       5
2        2       23       5
3        2        4       3
4       31        1       4
5       31       23       4
6        4        1       5
7        4        3       2
8       52        1       1
9       52        3       4
10      61        3       5
11       7       23       1
12       7        3       3
    userId  movieId  rating
0        0        0       4
1        0        1       5
2        1        1       5
3        1        2       3
4        2        0       4
5        2        1       4
6        3        0       5
7        3        3       2
8        4        0       1
9        4        3       4
10       5        3       5
11       6        1       1
12       6        3       3


In [0]:
# encoding the train and validation data
df_train = encode_data(train)
df_val = encode_data(val, train)

**Import Torch and Embed layer**

In [0]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [0]:
# an Embedding module containing 10 user or item embedding size 3
# embedding will be initialized at random
embed = nn.Embedding(10, 3)

In [15]:
# given a list of ids we can "look up" the embedding corresponing to each id
a = torch.LongTensor([[1,2,0,4,5,1]])
embed(a)

tensor([[[-0.2171,  0.5320,  0.9569],
         [ 2.0369, -0.0475,  0.1550],
         [ 0.7295, -1.0892,  0.6157],
         [ 0.3676, -0.0295,  1.1777],
         [ 0.0460, -0.7413, -0.7258],
         [-0.2171,  0.5320,  0.9569]]], grad_fn=<EmbeddingBackward>)

**Matrix Factorization Model**


In [0]:
class MF(nn.Module):
    def __init__(self, num_users, num_items, emb_size=100):
        super(MF, self).__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.item_emb = nn.Embedding(num_items, emb_size)
        self.user_emb.weight.data.uniform_(0, 0.05)
        self.item_emb.weight.data.uniform_(0, 0.05)
        
    def forward(self, u, v):
        u = self.user_emb(u)
        v = self.item_emb(v)
        return (u*v).sum(1)

Test MF model

In [17]:
df_t_e

Unnamed: 0,userId,movieId,rating
0,0,0,4
1,0,1,5
2,1,1,5
3,1,2,3
4,2,0,4
5,2,1,4
6,3,0,5
7,3,3,2
8,4,0,1
9,4,3,4


In [0]:
num_users = 7
num_items = 4
emb_size = 3

user_emb = nn.Embedding(num_users, emb_size)
item_emb = nn.Embedding(num_items, emb_size)
users = torch.LongTensor(df_t_e.userId.values)
items = torch.LongTensor(df_t_e.movieId.values)

In [0]:
U = user_emb(users)
V = item_emb(items)

In [20]:
U

tensor([[-1.6305,  0.0084,  0.5827],
        [-1.6305,  0.0084,  0.5827],
        [-0.5928, -0.9193,  1.7617],
        [-0.5928, -0.9193,  1.7617],
        [ 0.4858,  0.3749, -0.5343],
        [ 0.4858,  0.3749, -0.5343],
        [-1.2974, -0.1846, -0.0854],
        [-1.2974, -0.1846, -0.0854],
        [ 1.1381,  0.4540, -0.8327],
        [ 1.1381,  0.4540, -0.8327],
        [ 0.4697, -0.0931,  0.4023],
        [ 0.4336,  1.0022, -0.7400],
        [ 0.4336,  1.0022, -0.7400]], grad_fn=<EmbeddingBackward>)

In [21]:
# element wise multiplication
U*V

tensor([[-6.6118e-01, -1.6809e-02,  8.8889e-01],
        [-2.7001e+00,  1.2819e-05,  8.5163e-02],
        [-9.8176e-01, -1.4082e-03,  2.5749e-01],
        [-1.7797e-01, -8.4583e-01, -6.1206e-01],
        [ 1.9700e-01, -7.5297e-01, -8.1513e-01],
        [ 8.0450e-01,  5.7425e-04, -7.8096e-02],
        [-5.2610e-01,  3.7076e-01, -1.3030e-01],
        [-1.4311e-01, -2.8507e-01, -2.2352e-02],
        [ 4.6152e-01, -9.1201e-01, -1.2703e+00],
        [ 1.2554e-01,  7.0123e-01, -2.1790e-01],
        [ 5.1809e-02, -1.4379e-01,  1.0528e-01],
        [ 7.1807e-01,  1.5353e-03, -1.0816e-01],
        [ 4.7830e-02,  1.5479e+00, -1.9366e-01]], grad_fn=<MulBackward0>)

In [22]:
# dot product per row
(U*V).sum(1)

tensor([ 0.2109, -2.6150, -0.7257, -1.6359, -1.3711,  0.7270, -0.2856, -0.4505,
        -1.7208,  0.6089,  0.0133,  0.6114,  1.4020], grad_fn=<SumBackward1>)

Training Matrix Factorization

In [23]:
num_users = len(df_train.userId.unique())
num_items = len(df_train.movieId.unique())
print(num_users, num_items)

610 8998


In [0]:
model = MF(num_users, num_items, emb_size=100) 

In [0]:
def train_epocs(model, epochs=10, lr=0.01, wd=0.0, unsqueeze=False):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
    model.train()
    for i in range(epochs):
        users = torch.LongTensor(df_train.userId.values) # .cuda()
        items = torch.LongTensor(df_train.movieId.values) #.cuda()
        ratings = torch.FloatTensor(df_train.rating.values) #.cuda()
        if unsqueeze:
            ratings = ratings.unsqueeze(1)
        y_hat = model(users, items)
        loss = F.mse_loss(y_hat, ratings)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        print(loss.item()) 
    test_loss(model, unsqueeze)

In [26]:
# Here is what unsqueeze does
ratings = torch.FloatTensor(df_train.rating.values)
print(ratings.shape)
ratings = ratings.unsqueeze(1) # .cuda()
print(ratings.shape)

torch.Size([80450])
torch.Size([80450, 1])


In [0]:
def test_loss(model, unsqueeze=False):
    model.eval()
    users = torch.LongTensor(df_val.userId.values) #.cuda()
    items = torch.LongTensor(df_val.movieId.values) #.cuda()
    ratings = torch.FloatTensor(df_val.rating.values) #.cuda()
    if unsqueeze:
        ratings = ratings.unsqueeze(1)
    y_hat = model(users, items)
    loss = F.mse_loss(y_hat, ratings)
    print("test loss %.3f " % loss.item())

In [28]:
train_epocs(model, epochs=10, lr=0.1)

12.911651611328125
4.852427005767822
2.6006085872650146
3.0972471237182617
0.849687397480011
1.8230549097061157
2.658811092376709
2.13855242729187
1.0933785438537598
0.9764396548271179
test loss 1.848 


In [29]:
train_epocs(model, epochs=15, lr=0.01)

1.640723466873169
1.0038890838623047
0.7118932008743286
0.6608598828315735
0.725455105304718
0.8034926652908325
0.843034029006958
0.8349044322967529
0.792697548866272
0.7371857762336731
0.6872811317443848
0.6551764607429504
0.6442149877548218
0.6494647264480591
0.6610665917396545
test loss 0.822 


In [0]:
class MF_bias(nn.Module):
    def __init__(self, num_users, num_items, emb_size=100):
        super(MF_bias, self).__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.user_bias = nn.Embedding(num_users, 1)
        self.item_emb = nn.Embedding(num_items, emb_size)
        self.item_bias = nn.Embedding(num_items, 1)
        self.user_emb.weight.data.uniform_(0,0.05)
        self.item_emb.weight.data.uniform_(0,0.05)
        self.user_bias.weight.data.uniform_(-0.01,0.01)
        self.item_bias.weight.data.uniform_(-0.01,0.01)
        
    def forward(self, u, v):
        U = self.user_emb(u)
        V = self.item_emb(v)
        b_u = self.user_bias(u).squeeze()
        b_v = self.item_bias(v).squeeze()
        return (U*V).sum(1) +  b_u  + b_v

In [31]:
train_epocs(model, epochs=10, lr=0.05, wd=1e-5)

0.6691606044769287
2.6905386447906494
0.9286358952522278
1.0641108751296997
1.5824449062347412
0.8749094009399414
0.6421241760253906
0.9863578081130981
1.1793698072433472
0.9905561208724976
test loss 0.859 


In [32]:
train_epocs(model, epochs=10, lr=0.001, wd=1e-5)

0.6787070035934448
0.6584814190864563
0.6407109498977661
0.6254310011863708
0.6126359701156616
0.6022632122039795
0.5941783785820007
0.5881704688072205
0.5839525461196899
0.5811811089515686
test loss 0.765 


**Neural Network Model**

In [0]:
class CollabFNet(nn.Module):
    def __init__(self, num_users, num_items, emb_size=100, n_hidden=10):
        super(CollabFNet, self).__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.item_emb = nn.Embedding(num_items, emb_size)
        self.lin1 = nn.Linear(emb_size*2, n_hidden)
        self.lin2 = nn.Linear(n_hidden, 1)
        self.drop1 = nn.Dropout(0.1)
        
    def forward(self, u, v):
        U = self.user_emb(u)
        V = self.item_emb(v)
        x = F.relu(torch.cat([U, V], dim=1))
        x = self.drop1(x)
        x = F.relu(self.lin1(x))
        x = self.lin2(x)
        return x

In [0]:
model = CollabFNet(num_users, num_items, emb_size=100)

In [45]:
train_epocs(model, epochs=10, lr=0.01, wd=1e-6, unsqueeze=True)

1.2090132236480713
1.965401291847229
1.1685296297073364
1.3018543720245361
1.5351026058197021
1.3853956460952759
1.117145299911499
1.012257695198059
1.1124073266983032
1.119170904159546
test loss 0.953 


In [46]:
train_epocs(model, epochs=10, lr=0.001, wd=1e-6, unsqueeze=True)

0.967490553855896
0.9407780170440674
0.9513012766838074
0.9494496583938599
0.9390056133270264
0.9311290979385376
0.9280374050140381
0.928578794002533
0.9278706908226013
0.9206032752990723
test loss 0.909 


In [47]:
train_epocs(model, epochs=10, lr=0.001, wd=1e-6, unsqueeze=True)

0.9161325097084045
0.9220607876777649
0.9086928963661194
0.9075403809547424
0.9093003273010254
0.9032639861106873
0.8971983790397644
0.8946829438209534
0.8934814929962158
0.8915942311286926
test loss 0.887 


In [48]:
train_epocs(model, epochs=15, lr=0.001, wd=1e-6, unsqueeze=True)

0.8873467445373535
0.8961063027381897
0.8819310069084167
0.8794198632240295
0.8811234831809998
0.8781795501708984
0.8729749917984009
0.8709647059440613
0.8711490035057068
0.869096577167511
0.865562379360199
0.8602393865585327
0.8608137965202332
0.8590076565742493
0.85671067237854
test loss 0.859 


In [49]:
train_epocs(model, epochs=10, lr=0.001, wd=1e-6, unsqueeze=True)

0.8516144752502441
0.8659402132034302
0.8486312627792358
0.8486431837081909
0.8515843749046326
0.8479746580123901
0.8413427472114563
0.838903546333313
0.8398889899253845
0.8379942774772644
test loss 0.845 
