<a href="https://colab.research.google.com/github/moushfiq/recommender_system/blob/master/recommendation_system.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
PATH = Path("/content/drive/My Drive/ml-latest-small")
list(PATH.iterdir())

[PosixPath('/content/drive/My Drive/ml-latest-small/links.csv'),
 PosixPath('/content/drive/My Drive/ml-latest-small/README.txt'),
 PosixPath('/content/drive/My Drive/ml-latest-small/movies.csv'),
 PosixPath('/content/drive/My Drive/ml-latest-small/tags.csv'),
 PosixPath('/content/drive/My Drive/ml-latest-small/ratings.csv')]

In [0]:
data = pd.read_csv(PATH/"ratings.csv")

In [5]:
data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [0]:
# Split the ratings into a training set (~80%) and a validation set (~20%).
# The seed is fixed so the random mask, and therefore the split, is reproducible.
np.random.seed(3)
# One uniform draw per row; rows whose draw is below 0.8 go to the training set.
msk = np.random.rand(len(data)) < 0.8
# .copy() gives each subset its own frame instead of a selection taken from `data`.
train = data[msk].copy()
val = data[~msk].copy()

In [0]:
def proc_col(col, train_col=None):
    """Map the values of a pandas column onto contiguous integer ids.

    The id vocabulary is taken from ``train_col`` when it is supplied and from
    ``col`` itself otherwise; ids are handed out in the order returned by
    ``.unique()``. Values of ``col`` that are absent from the vocabulary are
    encoded as -1.

    Returns a tuple ``(name2idx, encoded, n_unique)``: the value -> id mapping,
    a numpy array with the encoded ``col``, and the vocabulary size.
    """
    vocab_source = col if train_col is None else train_col
    uniq = vocab_source.unique()
    name2idx = dict(zip(uniq, range(len(uniq))))
    encoded = np.array([name2idx.get(value, -1) for value in col])
    return name2idx, encoded, len(uniq)

In [0]:
def encode_data(df, train=None):
    """Encode rating data with contiguous user and movie ids.

    Args:
        df: DataFrame with ``userId`` and ``movieId`` columns. It is copied
            first, so the caller's frame is left untouched.
        train: optional DataFrame with the same two columns. When given, ``df``
            is encoded with the id vocabulary of ``train`` and rows whose user
            or movie does not occur in ``train`` are dropped.

    Returns:
        A new, independent DataFrame whose ``userId`` / ``movieId`` columns
        hold the encoded ids (all >= 0).
    """
    df = df.copy()
    for col_name in ["userId", "movieId"]:
        train_col = train[col_name] if train is not None else None
        _, col, _ = proc_col(df[col_name], train_col)
        df[col_name] = col
        # proc_col marks values missing from the vocabulary with -1; drop them.
        # The explicit .copy() keeps `df` an independent frame, so the column
        # assignment on the next pass (and any later mutation by the caller)
        # does not write into a filtered slice and raise SettingWithCopyWarning.
        df = df[df[col_name] >= 0].copy()
    return df

In [9]:
# Sanity-check encode_data on a tiny hand-made training / validation pair.
LOCAL_PATH = Path("/content/drive/My Drive/train-data")
df_t = pd.read_csv(LOCAL_PATH/"tiny_training2.csv")
df_v = pd.read_csv(LOCAL_PATH/"tiny_val2.csv")
print(df_t)
# The training frame is encoded with a vocabulary built from its own columns.
df_t_e = encode_data(df_t)
# The validation frame reuses the training vocabulary (second argument).
df_v_e = encode_data(df_v, df_t)
# NOTE(review): a bare expression that is not the last line of a cell is not displayed.
df_v_e
print(df_t_e)

    userId  movieId  rating
0       11        1       4
1       11       23       5
2        2       23       5
3        2        4       3
4       31        1       4
5       31       23       4
6        4        1       5
7        4        3       2
8       52        1       1
9       52        3       4
10      61        3       5
11       7       23       1
12       7        3       3
    userId  movieId  rating
0        0        0       4
1        0        1       5
2        1        1       5
3        1        2       3
4        2        0       4
5        2        1       4
6        3        0       5
7        3        3       2
8        4        0       1
9        4        3       4
10       5        3       5
11       6        1       1
12       6        3       3


In [0]:
# encoding the train and validation data
df_train = encode_data(train)
df_val = encode_data(val, train)

**Import Torch and Embed layer**

In [0]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [0]:
# An Embedding module holding 10 embedding vectors (one per user or item), each of size 3.
# The embedding weights are initialized at random.
embed = nn.Embedding(10, 3)

In [13]:
# Given a tensor of ids, we can "look up" the embedding corresponding to each id.
# Id 1 appears twice below, so the first and last rows of the result are identical.
a = torch.LongTensor([[1,2,0,4,5,1]])
embed(a)

tensor([[[-0.7145, -0.2365, -0.3669],
         [ 0.2975, -0.8711, -2.1919],
         [ 0.2703,  0.2981,  1.2537],
         [-0.7475,  0.7028,  0.1622],
         [ 0.5188,  1.1374,  0.2205],
         [-0.7145, -0.2365, -0.3669]]], grad_fn=<EmbeddingBackward>)

**Matrix Factorization Model**


In [0]:
class MF(nn.Module):
    """Plain matrix-factorization recommender.

    Each user and each item owns a learned vector of length ``emb_size``; the
    predicted rating for a (user, item) pair is the dot product of the two.
    """

    def __init__(self, num_users, num_items, emb_size=100):
        super().__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.item_emb = nn.Embedding(num_items, emb_size)
        # Replace the default initialisation with small non-negative values.
        for table in (self.user_emb, self.item_emb):
            table.weight.data.uniform_(0, 0.05)

    def forward(self, u, v):
        """Return one prediction per (u[i], v[i]) pair of user / item ids."""
        user_vecs = self.user_emb(u)
        item_vecs = self.item_emb(v)
        # Row-wise dot product: multiply element-wise, then sum over the embedding axis.
        return (user_vecs * item_vecs).sum(dim=1)

Test MF model

In [16]:
df_t_e

Unnamed: 0,userId,movieId,rating
0,0,0,4
1,0,1,5
2,1,1,5
3,1,2,3
4,2,0,4
5,2,1,4
6,3,0,5
7,3,3,2
8,4,0,1
9,4,3,4


In [0]:
# Sizes for the tiny example: the encoded ids in df_t_e run 0-6 for users and
# 0-3 for movies, so 7 users and 4 items; each gets a 3-dimensional embedding.
num_users = 7
num_items = 4
emb_size = 3

# Randomly initialized embedding tables for users and items.
user_emb = nn.Embedding(num_users, emb_size)
item_emb = nn.Embedding(num_items, emb_size)
# Encoded ids as LongTensors, used as indices into the embedding tables.
users = torch.LongTensor(df_t_e.userId.values)
items = torch.LongTensor(df_t_e.movieId.values)

In [0]:
U = user_emb(users)
V = item_emb(items)

In [19]:
U

tensor([[-1.0718,  1.0890,  1.4075],
        [-1.0718,  1.0890,  1.4075],
        [ 0.8176,  0.9071,  0.9066],
        [ 0.8176,  0.9071,  0.9066],
        [-0.0457, -1.3057,  1.3616],
        [-0.0457, -1.3057,  1.3616],
        [ 1.3335, -0.4709,  0.1041],
        [ 1.3335, -0.4709,  0.1041],
        [-0.4479, -1.2355,  0.0057],
        [-0.4479, -1.2355,  0.0057],
        [-0.3311,  0.3739,  0.6540],
        [-1.2015, -0.0192,  1.0274],
        [-1.2015, -0.0192,  1.0274]], grad_fn=<EmbeddingBackward>)

In [20]:
# element wise multiplication
U*V

tensor([[-0.2199, -1.5846, -3.0574],
        [-0.5897, -0.5155,  1.8885],
        [ 0.4498, -0.4294,  1.2164],
        [-0.3797,  0.1497,  0.5067],
        [-0.0094,  1.9001, -2.9578],
        [-0.0251,  0.6181,  1.8270],
        [ 0.2736,  0.6852, -0.2261],
        [ 1.1015,  0.8178, -0.0710],
        [-0.0919,  1.7979, -0.0123],
        [-0.3700,  2.1456, -0.0039],
        [-0.2735, -0.6493, -0.4461],
        [-0.6611,  0.0091,  1.3786],
        [-0.9924,  0.0333, -0.7009]], grad_fn=<MulBackward0>)

In [21]:
# dot product per row
(U*V).sum(1)

tensor([-4.8620,  0.7833,  1.2368,  0.2767, -1.0671,  2.4200,  0.7328,  1.8482,
         1.6936,  1.7718, -1.3689,  0.7266, -1.6599], grad_fn=<SumBackward1>)

Training Matrix Factorization

In [22]:
num_users = len(df_train.userId.unique())
num_items = len(df_train.movieId.unique())
print(num_users, num_items)

610 8998


In [0]:
model = MF(num_users, num_items, emb_size=100).cuda() 

In [0]:
def train_epocs(model, epochs=10, lr=0.01, wd=0.0, unsqueeze=False):
    """Train ``model`` on the encoded training ratings with full-batch Adam.

    Reads the notebook-level ``df_train`` frame (``userId``, ``movieId`` and
    ``rating`` columns), prints the training MSE after every epoch and
    finishes by calling ``test_loss`` to print the validation MSE.

    Args:
        model: module called as ``model(users, items)`` that returns predictions.
        epochs: number of full-batch optimisation steps.
        lr: Adam learning rate.
        wd: Adam weight decay.
        unsqueeze: when True the ratings are reshaped from (N,) to (N, 1) so
            they match models whose output is a column.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
    # Follow the model's own device instead of assuming CUDA. Adam above has
    # already rejected a model without parameters, so next() cannot be exhausted.
    device = next(model.parameters()).device

    # The training data does not change between epochs: build the tensors and
    # move them to the device once instead of on every iteration.
    users = torch.tensor(df_train.userId.values, dtype=torch.long, device=device)
    items = torch.tensor(df_train.movieId.values, dtype=torch.long, device=device)
    ratings = torch.tensor(df_train.rating.values, dtype=torch.float32, device=device)
    if unsqueeze:
        ratings = ratings.unsqueeze(1)

    model.train()
    for _ in range(epochs):
        y_hat = model(users, items)
        loss = F.mse_loss(y_hat, ratings)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        print(loss.item())
    test_loss(model, unsqueeze)

In [25]:
# Here is what unsqueeze does: unsqueeze(1) inserts a size-1 dimension at
# position 1, turning the flat ratings vector of shape (N,) into a column (N, 1).
ratings = torch.FloatTensor(df_train.rating.values)
print(ratings.shape)
ratings = ratings.unsqueeze(1).cuda()
print(ratings.shape)

torch.Size([80450])
torch.Size([80450, 1])


In [0]:
def test_loss(model, unsqueeze=False):
    """Print the MSE of ``model`` on the encoded validation ratings.

    Reads the notebook-level ``df_val`` frame (``userId``, ``movieId`` and
    ``rating`` columns). The model is switched to eval mode and left in it.

    Args:
        model: module called as ``model(users, items)`` that returns predictions.
        unsqueeze: when True the ratings are reshaped from (N,) to (N, 1) so
            they match models whose output is a column.
    """
    model.eval()
    # Put the data wherever the model lives instead of assuming CUDA; a model
    # without parameters has no device of its own, so fall back to the CPU.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    users = torch.tensor(df_val.userId.values, dtype=torch.long, device=device)
    items = torch.tensor(df_val.movieId.values, dtype=torch.long, device=device)
    ratings = torch.tensor(df_val.rating.values, dtype=torch.float32, device=device)
    if unsqueeze:
        ratings = ratings.unsqueeze(1)

    # Evaluation needs no gradients: skip building the autograd graph.
    with torch.no_grad():
        y_hat = model(users, items)
        loss = F.mse_loss(y_hat, ratings)
    print("test loss %.3f " % loss.item())

In [27]:
train_epocs(model, epochs=10, lr=0.1)

12.911471366882324
4.850771903991699
2.59645938873291
3.0972681045532227
0.8493105173110962
1.8213179111480713
2.6554224491119385
2.1343283653259277
1.0903929471969604
0.9768303036689758
test loss 1.850 


In [28]:
train_epocs(model, epochs=15, lr=0.01)

1.6416702270507812
1.0040184259414673
0.7117136716842651
0.6606287956237793
0.7251780033111572
0.8032897710800171
0.8429989218711853
0.8350509405136108
0.7930140495300293
0.7376703023910522
0.6878988742828369
0.6558050513267517
0.6446864604949951
0.6496678590774536
0.6610077023506165
test loss 0.820 


In [29]:
train_epocs(model, epochs=15, lr=0.01)

0.6689484715461731
0.6315425634384155
0.6387416124343872
0.6141030788421631
0.6055845022201538
0.6140269041061401
0.6115395426750183
0.5966463088989258
0.5848434567451477
0.5832251310348511
0.5843385457992554
0.5793509483337402
0.5686537623405457
0.558416485786438
0.5522000193595886
test loss 0.759 


In [0]:
class MF_bias(nn.Module):
    """Matrix factorization with additive per-user and per-item bias terms.

    The prediction for a (user, item) pair is the dot product of their
    embedding vectors plus the user's bias plus the item's bias.
    """

    def __init__(self, num_users, num_items, emb_size=100):
        super().__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.user_bias = nn.Embedding(num_users, 1)
        self.item_emb = nn.Embedding(num_items, emb_size)
        self.item_bias = nn.Embedding(num_items, 1)
        # Latent factors start as small non-negative values ...
        for factors in (self.user_emb, self.item_emb):
            factors.weight.data.uniform_(0, 0.05)
        # ... and the biases start close to zero.
        for bias in (self.user_bias, self.item_bias):
            bias.weight.data.uniform_(-0.01, 0.01)

    def forward(self, u, v):
        """Return one prediction per (u[i], v[i]) pair of user / item ids."""
        interaction = (self.user_emb(u) * self.item_emb(v)).sum(dim=1)
        # Bias lookups come back with a trailing size-1 axis; squeeze removes it.
        user_offset = self.user_bias(u).squeeze()
        item_offset = self.item_bias(v).squeeze()
        return interaction + user_offset + item_offset

In [0]:
model = MF_bias(num_users, num_items, emb_size=100).cuda()

In [32]:
train_epocs(model, epochs=10, lr=0.05, wd=1e-5)

12.915071487426758
9.15540885925293
4.3873982429504395
1.1570138931274414
2.4682462215423584
3.7454419136047363
2.4498417377471924
1.0781590938568115
0.815197229385376
1.317561149597168
test loss 2.069 


In [33]:
train_epocs(model, epochs=15, lr=0.001, wd=1e-5)

1.8933647871017456
1.8297353982925415
1.7675554752349854
1.706886649131775
1.647789478302002
1.5903232097625732
1.534544587135315
1.48050856590271
1.428265929222107
1.3778650760650635
1.329349160194397
1.2827563285827637
1.2381196022033691
1.1954656839370728
1.1548138856887817
test loss 1.300 


In [34]:
train_epocs(model, epochs=15, lr=0.001, wd=1e-5)

1.1161766052246094
1.0768266916275024
1.0394872426986694
1.004220724105835
0.971075177192688
0.9400787353515625
0.91124027967453
0.884547233581543
0.8599644303321838
0.8374333381652832
0.8168718814849854
0.7981774210929871
0.7812314629554749
0.7659043669700623
0.7520604729652405
test loss 0.925 


**Neural Network Model**

In [0]:
class CollabFNet(nn.Module):
    """Neural collaborative-filtering model.

    User and item embeddings are concatenated and passed through
    ReLU -> dropout -> linear -> ReLU -> linear, producing one value per pair
    as a column of shape (batch, 1).
    """

    def __init__(self, num_users, num_items, emb_size=100, n_hidden=10):
        super().__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.item_emb = nn.Embedding(num_items, emb_size)
        # The first linear layer sees both embeddings side by side.
        self.lin1 = nn.Linear(2 * emb_size, n_hidden)
        self.lin2 = nn.Linear(n_hidden, 1)
        self.drop1 = nn.Dropout(p=0.1)

    def forward(self, u, v):
        """Return a (batch, 1) tensor of predictions for the id pairs (u, v)."""
        pair = torch.cat((self.user_emb(u), self.item_emb(v)), dim=1)
        hidden = self.lin1(self.drop1(F.relu(pair)))
        return self.lin2(F.relu(hidden))

In [0]:
model = CollabFNet(num_users, num_items, emb_size=100).cuda()

In [37]:
train_epocs(model, epochs=15, lr=0.01, wd=1e-6, unsqueeze=True)

10.527831077575684
4.3973388671875
1.4628932476043701
2.090796709060669
3.545438766479492
3.4488577842712402
2.4510934352874756
1.5476089715957642
1.205419659614563
1.373519778251648
1.7441908121109009
2.0090856552124023
2.033430337905884
1.830788493156433
1.5181148052215576
test loss 1.212 


In [38]:
train_epocs(model, epochs=10, lr=0.001, wd=1e-6, unsqueeze=True)

1.2298457622528076
1.1454737186431885
1.0939867496490479
1.0723870992660522
1.0697979927062988
1.0800251960754395
1.088917851448059
1.091925024986267
1.0847903490066528
1.0724018812179565
test loss 1.021 


In [39]:
train_epocs(model, epochs=10, lr=0.001, wd=1e-6, unsqueeze=True)

1.0574482679367065
1.0422446727752686
1.0422558784484863
1.0394492149353027
1.0314496755599976
1.0246169567108154
1.021031141281128
1.0195497274398804
1.013393521308899
1.0078397989273071
test loss 0.976 


In [40]:
train_epocs(model, epochs=15, lr=0.001, wd=1e-6, unsqueeze=True)

1.002827763557434
1.001441240310669
0.9950007796287537
0.9916871786117554
0.987365186214447
0.9804649353027344
0.9781243205070496
0.9741808772087097
0.971028745174408
0.9663987755775452
0.9646079540252686
0.9603044986724854
0.9572901129722595
0.9550058245658875
0.9489405751228333
test loss 0.929 


In [41]:
train_epocs(model, epochs=10, lr=0.001, wd=1e-6, unsqueeze=True)

0.9465813040733337
0.9491854906082153
0.9396327137947083
0.941968560218811
0.9393324255943298
0.9348950982093811
0.9299494028091431
0.928834080696106
0.9300872087478638
0.9258442521095276
test loss 0.906 


In [42]:
train_epocs(model, epochs=10, lr=0.001, wd=1e-6, unsqueeze=True)

0.9211081862449646
0.92958664894104
0.9156913161277771
0.9141028523445129
0.9168750643730164
0.9126095175743103
0.9072227478027344
0.9066547155380249
0.907153308391571
0.9032747149467468
test loss 0.891 


In [43]:
train_epocs(model, epochs=20, lr=0.001, wd=1e-6, unsqueeze=True)

0.9010000228881836
0.9051945805549622
0.8958633542060852
0.8930111527442932
0.8925940990447998
0.8900536894798279
0.8854279518127441
0.8847507238388062
0.8853355646133423
0.8823544979095459
0.8788034915924072
0.8771247267723083
0.8758488893508911
0.8734947443008423
0.8708720207214355
0.8691118359565735
0.866449236869812
0.8654439449310303
0.8622372150421143
0.8604443669319153
test loss 0.856 


In [44]:
train_epocs(model, epochs=15, lr=0.001, wd=1e-6, unsqueeze=True)

0.8564600944519043
0.8691964149475098
0.854751467704773
0.8572797179222107
0.8579795956611633
0.8561875224113464
0.8484776020050049
0.8457061648368835
0.846909761428833
0.8461892604827881
0.8435625433921814
0.8423859477043152
0.840636134147644
0.8368220329284668
0.8351836204528809
test loss 0.837 
