In [1]:
import os, sys

# Root of the Django project whose apps (e.g. `movie`) are imported below.
# Set the MYPROJECT environment variable to run the notebook from another checkout;
# the original author's path is kept as the fallback.
MYPROJECT = os.environ.get('MYPROJECT', '/Users/jsaon/Work/Interview/Intowow/MovieRecommendation/')
sys.path.insert(0, MYPROJECT)

# DJANGO_SETTINGS_MODULE must be a dotted module path, not a file name:
# "local_settings.py" would be imported as submodule `py` of a package `local_settings`.
# setdefault() still lets a value already exported in the environment win.
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "local_settings")

import django
django.setup()

In [2]:
# Standard library
from datetime import datetime

# Numerical stack
import numpy as np
import pandas as pd

# Django helpers (the django.setup() call in the first cell has to run before these)
from django.contrib.auth import get_user_model
from django.core.cache import cache
from django.db.models import Avg, Count
from django.utils import timezone

# Project models
from movie.models import Movie, Genre, Rating

# Resolve the active user model through Django rather than importing a concrete class.
User = get_user_model()

In [3]:
# Read the three MovieLens tables (links, movies, ratings) in one pass.
link_df, movie_df, rating_df = [
    pd.read_csv(csv_path)
    for csv_path in (
        './ml-latest-small/links.csv',
        './ml-latest-small/movies.csv',
        './ml-latest-small/ratings.csv',
    )
]

In [4]:
# Attach the imdbId / tmdbId columns from links.csv to each movie row.
# The join key is spelled out instead of relying on "all shared columns".
movie_df = pd.merge(movie_df, link_df, on='movieId')

# fillna(0) turns a missing tmdbId into 0 so the column can be stored as int;
# 0 therefore means "unknown".
movie_df['tmdbId'] = movie_df['tmdbId'].fillna(0).astype(int)

# Release year = the trailing "(digits)" of the title; 0 means the title has none.
# Raw string: "\(" is not a valid escape in a normal string literal.
# "\s*" lets the match succeed when the title has whitespace after the closing parenthesis.
movie_df['pub_year'] = (
    movie_df['title']
    .str.extract(r'\(([0-9]+)\)\s*$', expand=False)
    .fillna(0)
    .astype(int)
)

In [5]:
# Show the last five rows to check the merged ids and the extracted year.
movie_df.tail(n=5)

Unnamed: 0,movieId,title,genres,imdbId,tmdbId,pub_year
9120,162672,Mohenjo Daro (2016),Adventure|Drama|Romance,3859980,402672,2016
9121,163056,Shin Godzilla (2016),Action|Adventure|Fantasy|Sci-Fi,4262980,315011,2016
9122,163949,The Beatles: Eight Days a Week - The Touring Y...,Documentary,2531318,391698,2016
9123,164977,The Gay Desperado (1936),Comedy,27660,137608,1936
9124,164979,"Women of '69, Unboxed",Documentary,3447228,410803,0


# Import data into the database and remap movie ids

In [6]:
# Every distinct genre label; each movie's `genres` value is a '|'-separated list.
genres_set = {
    label
    for labels in movie_df['genres'].str.split('|')
    for label in labels
}

In [7]:
# genre_dict: genre label -> saved Genre row, used when attaching genres to movies.
# sorted(): set iteration order is arbitrary, so sort to create rows in a stable order.
# get_or_create(): re-running this cell reuses existing rows instead of inserting duplicates.
genre_dict = {}
for genre_text in sorted(genres_set):
    genre, _created = Genre.objects.get_or_create(genre_text=genre_text)
    genre_dict[genre_text] = genre

In [8]:
# movie_dict:    MovieLens movieId -> saved Movie instance (used to attach ratings).
# movie_id_dict: MovieLens movieId -> database primary key of that Movie
#                (used to remap the ids in rating_df).
# NOTE: every run inserts new Movie rows; run against an empty table.
movie_dict = {}
movie_id_dict = {}
for row in movie_df.itertuples(index=False):
    movie = Movie(title=row.title, imdb_id=row.imdbId)
    # 0 is the "missing" marker written by fillna(0); leave those fields unset.
    if row.tmdbId > 0:
        movie.tmdb_id = row.tmdbId
    if row.pub_year > 0:
        movie.pub_year = row.pub_year

    # Save first: the row needs a primary key before genres can be attached.
    movie.save()
    # One add() call for all of the movie's genres instead of one call per genre.
    movie.genres.add(*[genre_dict[genre_text] for genre_text in row.genres.split('|')])

    movie_dict[row.movieId] = movie
    # Record the key the database actually assigned instead of assuming
    # "row position + 1", which only holds for an empty table with a fresh id sequence.
    movie_id_dict[row.movieId] = movie.pk

In [9]:
# Reloading yields a frame with the original MovieLens ids even if `rating_df`
# was modified by an earlier run of the cells below.
rating_df = pd.read_csv('./ml-latest-small/ratings.csv')
# Preview only: a bounded head() instead of rendering the whole ratings table.
rating_df.head(10)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
5,1,1263,2.0,1260759151
6,1,1287,2.0,1260759187
7,1,1293,2.0,1260759148
8,1,1339,3.5,1260759125
9,1,1343,2.0,1260759131


In [10]:
# user_dict: MovieLens userId -> saved placeholder User (is_real=False).
# One account is created the first time each userId is seen in the ratings.
user_dict = {}
for ml_user_id in rating_df['userId']:
    if ml_user_id in user_dict:
        continue
    fake_user = User(is_real=False, email='__fake%d' % ml_user_id)
    fake_user.save()
    user_dict[ml_user_id] = fake_user

In [11]:
# Store every MovieLens rating as a Rating row linked to its saved movie and user.
current_tz = timezone.get_current_timezone()  # loop-invariant, looked up once
# itertuples() keeps the integer id columns as ints (iterrows() upcasts the row to float).
for row in rating_df.itertuples(index=False):
    # `timestamp` is Unix epoch seconds. Passing tz= converts the instant straight to an
    # aware datetime. A naive fromtimestamp() uses the OS local zone, and relabelling that
    # with Django's zone via make_aware() shifts the date whenever the two zones differ.
    pub_date = datetime.fromtimestamp(row.timestamp, tz=current_tz)
    Rating(
        score=row.rating,
        pub_date=pub_date,
        movie=movie_dict[row.movieId],
        user=user_dict[row.userId],
    ).save()

In [12]:
# Replace MovieLens ids with database ids, because the learned vectors are later written
# back by indexing with movie.id / user.id:
#   - movies through movie_id_dict,
#   - users through the primary key of their saved User row (previously left unmapped,
#     which silently assumed user pk == MovieLens userId).
# A missing id raises KeyError instead of producing NaN.
# NOTE: the columns are overwritten in place; re-read ratings.csv before re-running this cell.
rating_df['movieId'] = rating_df['movieId'].map(lambda movie_id: movie_id_dict[movie_id])
rating_df['userId'] = rating_df['userId'].map(lambda user_id: user_dict[user_id].pk)

In [13]:
# count() issues a COUNT query instead of loading every row just to measure the queryset.
n_users = User.objects.count()
n_movies = Movie.objects.count()
# NOTE(review): these counts size the embedding tables (n + 1 rows), which are indexed by
# database id; that presumes ids run 1..count without gaps — confirm the tables were empty
# before the import.

# NOTE(review): this value is overwritten with 10 before the model is built.
k_factors = 30

# Pretrain Matrix Factorization

- Prediction
$$\widehat R_{u, i} = x_u^\top y_i + \alpha_u + \beta_i$$

- Loss Function
$$\mathcal{L} = \frac{1}{2}(\sum_{(u, i)\in R}(\widehat R_{u, i} - R_{u, i})^2 + \lambda(\sum_u \|x_u\|^2 + \sum_i \|y_i\|^2))$$

- Gradient
$$\frac{\partial \mathcal{L}}{\partial x_u} = \sum_{i}(\widehat R_{u, i} - R_{u, i})y_i + \lambda x_u$$

$$\frac{\partial \mathcal{L}}{\partial y_i} = \sum_{u}(\widehat R_{u, i} - R_{u, i})x_u + \lambda y_i$$

$$\frac{\partial \mathcal{L}}{\partial \alpha_u} = \sum_{i}(\widehat R_{u, i} - R_{u, i})$$

$$\frac{\partial \mathcal{L}}{\partial \beta_i} = \sum_{u}(\widehat R_{u, i} - R_{u, i})$$

## Model

In [14]:
from keras import backend as K
from keras import regularizers, Model
from keras.layers import Input, Reshape, Lambda, Dot, Add
from keras.layers.embeddings import Embedding
from keras.initializers import RandomNormal, Constant
from keras.callbacks import EarlyStopping

Using TensorFlow backend.


In [15]:
def get_cf_model(n_users, n_movies, k_factors, l2_reg=0.1, bias_init=1.5):
    """Build the biased matrix-factorization model as a Keras graph.

    The predicted score for a (user, movie) pair is the dot product of their
    latent vectors plus a per-user bias and a per-movie bias.

    Args:
        n_users: Number of users. The user tables are allocated
            ``n_users + 1`` rows, so indices ``0..n_users`` are valid.
        n_movies: Number of movies. The movie tables are allocated
            ``n_movies + 1`` rows, so indices ``0..n_movies`` are valid.
        k_factors: Length of each latent vector.
        l2_reg: L2 penalty applied to both latent-factor tables.
        bias_init: Constant every bias entry is initialized to.

    Returns:
        An uncompiled ``Model`` that takes ``[user_id, movie_id]`` (int32,
        shape ``(batch, 1)`` each) and outputs scores of shape ``(batch, 1)``.
    """
    # 1.0 keeps the division floating-point even under Python 2 integer division.
    factor_stddev = np.sqrt(1.0 / k_factors)

    def latent_factors(ids, n_rows, name):
        """Look up the L2-regularized latent vectors for ``ids``."""
        return Embedding(n_rows, k_factors,
                         embeddings_initializer=RandomNormal(stddev=factor_stddev),
                         embeddings_regularizer=regularizers.l2(l2_reg),
                         input_length=1, name=name)(ids)

    def bias_term(ids, n_rows, name):
        """Look up the scalar bias for ``ids``."""
        return Embedding(n_rows, 1,
                         embeddings_initializer=Constant(bias_init),
                         input_length=1, name=name)(ids)

    user_id = Input(shape=(1,), dtype='int32', name='user_id')
    movie_id = Input(shape=(1,), dtype='int32', name='movie_id')

    # Layer names are part of the contract: later cells fetch the trained
    # weights with get_layer('user_weight'), get_layer('movie_bias'), etc.
    weight_u = latent_factors(user_id, n_users + 1, 'user_weight')
    weight_m = latent_factors(movie_id, n_movies + 1, 'movie_weight')
    bias_u = bias_term(user_id, n_users + 1, 'user_bias')
    bias_m = bias_term(movie_id, n_movies + 1, 'movie_bias')

    # Embedding outputs are (batch, 1, k); contracting axis 2 leaves (batch, 1, 1),
    # which lines up with the (batch, 1, 1) bias lookups for the Add layer.
    product = Dot(axes=2, name='dot')([weight_u, weight_m])
    score = Add(name='add')([product, bias_u, bias_m])
    score = Reshape((1,), name='reshape')(score)

    return Model(inputs=[user_id, movie_id], outputs=score)

## Data Preparation and Training

In [16]:
# Latent dimensionality actually used for training (replaces the earlier value of 30).
k_factors = 10

# Build the factorization network, then compile it with squared-error loss and Adam.
cf_model = get_cf_model(n_users=n_users, n_movies=n_movies, k_factors=k_factors)
cf_model.compile(optimizer='adam', loss='mse')

In [17]:
# Print the layer-by-layer architecture and parameter counts as a sanity check.
cf_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
user_id (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
movie_id (InputLayer)           (None, 1)            0                                            
__________________________________________________________________________________________________
user_weight (Embedding)         (None, 1, 10)        6720        user_id[0][0]                    
__________________________________________________________________________________________________
movie_weight (Embedding)        (None, 1, 10)        91260       movie_id[0][0]                   
__________________________________________________________________________________________________
dot (Dot) 

In [18]:
# Keras takes the validation_split samples from the *end* of the arrays, before any
# shuffling. rating_df is still in file order, which appears to be grouped by userId, so an
# unshuffled split would validate only on the last users' ratings. Shuffle once with a fixed
# seed so the held-out 10% is a random, repeatable sample.
shuffled = np.random.RandomState(0).permutation(len(rating_df))

# Inputs follow the model's input order: [user ids, movie ids]; the target is the rating.
x = [rating_df['userId'].values[shuffled], rating_df['movieId'].values[shuffled]]
y = rating_df['rating'].values[shuffled]

# Stop once validation loss has failed to improve for 5 consecutive epochs.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, mode='min')
cf_model.fit(x, y, epochs=20, batch_size=128, validation_split=0.1, callbacks=[early_stopping]);

Train on 90003 samples, validate on 10001 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [20]:
# Pack each movie's trained parameters into one row:
# column 0 holds the bias, columns 1..k_factors hold the latent factors.
movie_bias = cf_model.get_layer('movie_bias').get_weights()[0]
movie_factors = cf_model.get_layer('movie_weight').get_weights()[0]

movie_vec = np.zeros((n_movies + 1, k_factors + 1))
movie_vec[:, :1] = movie_bias
movie_vec[:, 1:] = movie_factors

In [21]:
# Pack each user's trained parameters into one row:
# column 0 holds the bias, columns 1..k_factors hold the latent factors.
user_bias = cf_model.get_layer('user_bias').get_weights()[0]
user_factors = cf_model.get_layer('user_weight').get_weights()[0]

user_vec = np.zeros((n_users + 1, k_factors + 1))
user_vec[:, :1] = user_bias
user_vec[:, 1:] = user_factors

In [22]:
# Persist the learned vectors: the row of *_vec at the object's database id is stored as a
# plain list in its `latent_factor` field, and only that field is written back.
for db_movie in Movie.objects.all():
    db_movie.latent_factor = movie_vec[db_movie.id].tolist()
    db_movie.save(update_fields=['latent_factor'])

for db_user in User.objects.all():
    db_user.latent_factor = user_vec[db_user.id].tolist()
    db_user.save(update_fields=['latent_factor'])