# Movielens Dataset and Collaborative Filtering

Dataset: https://grouplens.org/datasets/movielens/ -> [(ml-latest-small.zip)](http://files.grouplens.org/datasets/movielens/ml-latest-small.zip)

In [1]:
import pandas as pd
import numpy as np

In [103]:
from tensorflow.python.keras.layers import Input, Embedding, Flatten, Dot
from tensorflow.python.keras.layers import Reshape, Add, Concatenate, Dense, Dropout
from tensorflow.python.keras.models import Model
from tensorflow.python.keras.regularizers import l2
from tensorflow.python.keras.optimizers import Adam

In [3]:
path="data/ml-latest-small/"

In [4]:
ratings = pd.read_csv(path+'ratings.csv')

In [5]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [6]:
len(ratings)

100004

In [7]:
n_users = ratings.userId.nunique()
n_users

671

In [8]:
n_movies = ratings.movieId.nunique()
n_movies

9066

In [9]:
g = ratings.groupby('userId')['rating'].count()

In [10]:
g[:10]

userId
1      20
2      76
3      51
4     204
5     100
6      44
7      88
8     116
9      45
10     46
Name: rating, dtype: int64

In [11]:
topUsers = g.sort_values(ascending=False)[:15]

In [12]:
topUsers

userId
547    2391
564    1868
624    1735
15     1700
73     1610
452    1340
468    1291
380    1063
311    1019
30     1011
294     947
509     923
580     922
213     910
212     876
Name: rating, dtype: int64

In [13]:
g = ratings.groupby('movieId')['rating'].count()

In [14]:
topMovies = g.sort_values(ascending=False)[:15]

In [15]:
top_r = ratings.join(topUsers, rsuffix='_r', how='inner', on='userId')

In [16]:
top_r[:25]

Unnamed: 0,userId,movieId,rating,timestamp,rating_r
962,15,1,2.0,997938310,1700
963,15,2,2.0,1134521380,1700
964,15,5,4.5,1093070098,1700
965,15,6,4.0,1040205753,1700
966,15,10,3.0,1093028290,1700
967,15,11,2.5,1093028381,1700
968,15,14,2.5,1166586286,1700
969,15,16,3.5,1093070150,1700
970,15,17,3.0,997939404,1700
971,15,19,1.0,1093028409,1700


In [17]:
top_r = top_r.join(topMovies, rsuffix='_r', how='inner', on='movieId')

In [18]:
top_r[:20]

Unnamed: 0,userId,movieId,rating,timestamp,rating_r,rating_r.1
962,15,1,2.0,997938310,1700,247
5048,30,1,4.0,944943070,1011,247
10214,73,1,5.0,1303464840,1610,247
28390,212,1,3.0,1218405007,876,247
29266,213,1,3.0,1462637445,910,247
40153,294,1,4.0,1047071649,947,247
43329,311,1,3.0,898007830,1019,247
51144,380,1,4.0,1048092869,1063,247
61432,452,1,3.5,1133735252,1340,247
65657,468,1,4.0,1296195523,1291,247


In [19]:
pd.crosstab(top_r.userId, top_r.movieId, top_r.rating, aggfunc=np.sum)

movieId,1,110,260,296,318,356,480,527,589,593,608,1196,1198,1270,2571
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
15,2.0,3.0,5.0,5.0,2.0,1.0,3.0,4.0,4.0,5.0,5.0,5.0,4.0,5.0,5.0
30,4.0,5.0,4.0,5.0,5.0,5.0,4.0,5.0,4.0,4.0,5.0,4.0,5.0,5.0,3.0
73,5.0,4.0,4.5,5.0,5.0,5.0,4.0,5.0,3.0,4.5,4.0,5.0,5.0,5.0,4.5
212,3.0,5.0,4.0,4.0,4.5,4.0,3.0,5.0,3.0,4.0,,,3.0,3.0,5.0
213,3.0,2.5,5.0,,,2.0,5.0,,4.0,2.5,2.0,5.0,3.0,3.0,4.0
294,4.0,3.0,4.0,,3.0,4.0,4.0,4.0,3.0,,,4.0,4.5,4.0,4.5
311,3.0,3.0,4.0,3.0,4.5,5.0,4.5,5.0,4.5,2.0,4.0,3.0,4.5,4.5,4.0
380,4.0,5.0,4.0,5.0,4.0,5.0,4.0,,4.0,5.0,4.0,4.0,,3.0,5.0
452,3.5,4.0,4.0,5.0,5.0,4.0,5.0,4.0,4.0,5.0,5.0,4.0,4.0,4.0,2.0
468,4.0,3.0,3.5,3.5,3.5,3.0,2.5,,,3.0,4.0,3.0,3.5,3.0,3.0


In [20]:
# split train and validation
msk = np.random.rand(len(ratings)) < 0.8
train = ratings[msk]
valid = ratings[~msk]
print(len(train), len(valid))

79952 20052


## Dot Product

In [21]:
n_users, n_movies

(671, 9066)

In [22]:
n_factors = 128

In [23]:
user_in = Input(shape=(1,), dtype='int64', name='user_in')
u = Embedding(n_users, n_factors, input_length=1, embeddings_regularizer=None, embeddings_initializer='glorot_normal')(user_in)
u = Reshape((n_factors,))(u)
movie_in = Input(shape=(1,), dtype='int64', name='movie_in')
m = Embedding(n_movies, n_factors, input_length=1, embeddings_regularizer=None, embeddings_initializer='glorot_normal')(movie_in)
m = Reshape((n_factors,))(m)

In [24]:
x = Dot(axes=(1))([u, m])
x = Flatten()(x)
model = Model([user_in, movie_in], x)
model.compile(Adam(0.000001), loss='mse')

In [25]:
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
user_in (InputLayer)             (None, 1)             0                                            
____________________________________________________________________________________________________
movie_in (InputLayer)            (None, 1)             0                                            
____________________________________________________________________________________________________
embedding_1 (Embedding)          (None, 1, 128)        85888       user_in[0][0]                    
____________________________________________________________________________________________________
embedding_2 (Embedding)          (None, 1, 128)        1160448     movie_in[0][0]                   
___________________________________________________________________________________________

In [26]:
batch_size=64

In [27]:
def fit_model(epochs=1):
    for i in range(epochs):
        model.fit(
            [train.userId, train.movieId], train.rating, batch_size=batch_size,
            validation_data=([valid.userId, valid.movieId], valid.rating))

In [28]:
fit_model()

Train on 79952 samples, validate on 20052 samples
Epoch 1/1


In [29]:
fit_model(10)

Train on 79952 samples, validate on 20052 samples
Epoch 1/1
Train on 79952 samples, validate on 20052 samples
Epoch 1/1
Train on 79952 samples, validate on 20052 samples
Epoch 1/1
Train on 79952 samples, validate on 20052 samples
Epoch 1/1
Train on 79952 samples, validate on 20052 samples
Epoch 1/1
Train on 79952 samples, validate on 20052 samples
Epoch 1/1
Train on 79952 samples, validate on 20052 samples
Epoch 1/1
Train on 79952 samples, validate on 20052 samples
Epoch 1/1
Train on 79952 samples, validate on 20052 samples
Epoch 1/1
Train on 79952 samples, validate on 20052 samples
Epoch 1/1


In [30]:
model.optimizer.lr = 0.001

In [31]:
fit_model(10)

Train on 79952 samples, validate on 20052 samples
Epoch 1/1
Train on 79952 samples, validate on 20052 samples
Epoch 1/1
Train on 79952 samples, validate on 20052 samples
Epoch 1/1
Train on 79952 samples, validate on 20052 samples
Epoch 1/1
Train on 79952 samples, validate on 20052 samples
Epoch 1/1
Train on 79952 samples, validate on 20052 samples
Epoch 1/1
Train on 79952 samples, validate on 20052 samples
Epoch 1/1
Train on 79952 samples, validate on 20052 samples
Epoch 1/1
Train on 79952 samples, validate on 20052 samples
Epoch 1/1
Train on 79952 samples, validate on 20052 samples
Epoch 1/1


## Dot product with bias

In [55]:
def create_bias(inp, n_in):
    x = Embedding(n_in, 1, input_length=1)(inp)
    return Flatten()(x)

In [92]:
user_in = Input(shape=(1,), dtype='int64', name='user_in')
u = Embedding(n_users, n_factors, input_length=1, embeddings_regularizer=l2(1e-8), embeddings_initializer='glorot_normal')(user_in)
u = Reshape((n_factors,))(u)
movie_in = Input(shape=(1,), dtype='int64', name='movie_in')
m = Embedding(n_movies, n_factors, input_length=1, embeddings_regularizer=l2(1e-8), embeddings_initializer='glorot_normal')(movie_in)
m = Reshape((n_factors,))(m)

In [93]:
ub = create_bias(user_in, n_users)
mb = create_bias(movie_in, n_movies)

In [94]:
x = Dot(axes=(1))([u, m])
x = Flatten()(x)
x = Add()([x, ub])
x = Add()([x, mb])
model = Model([user_in, movie_in], x)

In [95]:
model.compile(Adam(0.000001), loss='mse')

In [96]:
fit_model(5)

Train on 79952 samples, validate on 20052 samples
Epoch 1/1
Train on 79952 samples, validate on 20052 samples
Epoch 1/1
Train on 79952 samples, validate on 20052 samples
Epoch 1/1
Train on 79952 samples, validate on 20052 samples
Epoch 1/1
Train on 79952 samples, validate on 20052 samples
Epoch 1/1


In [97]:
model.optimizer.lr = 0.001

In [98]:
fit_model(5)

Train on 79952 samples, validate on 20052 samples
Epoch 1/1
Train on 79952 samples, validate on 20052 samples
Epoch 1/1
Train on 79952 samples, validate on 20052 samples
Epoch 1/1
Train on 79952 samples, validate on 20052 samples
Epoch 1/1
Train on 79952 samples, validate on 20052 samples
Epoch 1/1


## Deep Neural Network Approach

In [107]:
user_in = Input(shape=(1,), dtype='int64', name='user_in')
u = Embedding(n_users, n_factors, input_length=1, embeddings_regularizer=l2(1e-8), embeddings_initializer='glorot_normal')(user_in)
u = Reshape((n_factors,))(u)
movie_in = Input(shape=(1,), dtype='int64', name='movie_in')
m = Embedding(n_movies, n_factors, input_length=1, embeddings_regularizer=l2(1e-8), embeddings_initializer='glorot_normal')(movie_in)
m = Reshape((n_factors,))(m)

In [108]:
x = Concatenate()([u, m])
x = Flatten()(x)
x = Dense(100, activation='relu')(x)
x = Dropout(0.5)(x)
x = Dense(1)(x)
model = Model([user_in, movie_in], x)
model.compile(Adam(0.001), loss='mse')

In [109]:
fit_model(5)

Train on 79952 samples, validate on 20052 samples
Epoch 1/1
Train on 79952 samples, validate on 20052 samples
Epoch 1/1
Train on 79952 samples, validate on 20052 samples
Epoch 1/1
Train on 79952 samples, validate on 20052 samples
Epoch 1/1
Train on 79952 samples, validate on 20052 samples
Epoch 1/1
