In [1]:
MYPROJECT = '/Users/jsaon/Work/Interview/Intowow/MovieRecommendation/'
import os, sys
sys.path.insert(0, MYPROJECT)
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "local_settings.py")
import django
django.setup()

Loaded latent factors of movies.


In [2]:
import numpy as np
import pandas as pd

from datetime import datetime
from django.utils import timezone
from django.db.models import Avg, Count
from django.contrib.auth import get_user_model
User = get_user_model()
from django.core.cache import cache

from movie.models import Movie, Genre, Rating

In [70]:
link_df = pd.read_csv('./ml-latest/links.csv')
movie_df = pd.read_csv('./ml-latest/movies.csv')
rating_df = pd.read_csv('./ml-latest/ratings.csv')

In [71]:
movie_df = pd.merge(movie_df, link_df)
movie_df['tmdbId'] = movie_df['tmdbId'].fillna(0).astype(int)
movie_df['pub_year'] = movie_df['title'].str.extract('\(([0-9]+)\)$', expand=False).fillna(0).astype(int)

# Data Cleaning

- Drop the movies before 2010
- Drop the movies with less than 10 ratings

In [72]:
movie_df = movie_df[movie_df['pub_year'] >= 2010]
rating_df = rating_df.merge(movie_df, on='movieId', how='inner')[rating_df.columns]
rating_count = rating_df.groupby('movieId').size().to_frame(name='count').reset_index()
movie_df = movie_df.merge(rating_count, how='inner')
movie_df = movie_df[movie_df['count'] >= 10]
rating_df = rating_df.merge(movie_df, on='movieId', how='inner')[rating_df.columns]

In [73]:
print('Number of movies after cleaning: ', len(movie_df))
print('Number of ratings after cleaning: ', len(rating_df))

Number of movies after cleaning:  5127
Number of ratings after cleaning:  1892265


In [81]:
movie_df = movie_df.reset_index().drop('index', axis=1)

# Remap id
- Make the id dense, more suitable to apply embeddings

- Drop the user whose ratings were dropped

In [90]:
movie_id_dict = {}

for idx, row in movie_df.iterrows():
    movie_id_dict[row.movieId] = idx + 1

movie_df['movieId'] = movie_df['movieId'].apply(lambda x: movie_id_dict[x])
rating_df['movieId'] = rating_df['movieId'].apply(lambda x: movie_id_dict[x])

In [102]:
user_id_dict = {userId: i for i, userId in enumerate(rating_df['userId'].unique(), 1)}
rating_df['userId'] = rating_df['userId'].apply(lambda x: user_id_dict[x])

In [105]:
print('Number of users after cleaning: ', len(user_id_dict))

Number of users after cleaning:  66323


In [106]:
movie_df.to_csv('./ml-latest/movies_preprocessed.csv')
rating_df.to_csv('./ml-latest/ratings_preprocessed.csv')

# Database building

In [107]:
genres_set = set([genre for genres in movie_df['genres'].str.split('|') for genre in genres])

In [108]:
genre_dict = {}
for genre_text in genres_set:
    g = Genre(genre_text=genre_text)
    genre_dict[genre_text] = g
    g.save()

In [109]:
movie_dict = {}
for _, row in movie_df.iterrows():
    m = Movie(title=row.title, imdb_id=row.imdbId)
    if row.tmdbId > 0:
        m.tmdb_id = row.tmdbId
    if row.pub_year > 0:
        m.pub_year = row.pub_year
        
    m.save()
    for genre_text in row.genres.split('|'):
        m.genres.add(genre_dict[genre_text])
        
    movie_dict[row.movieId] = m

In [111]:
for _, row in rating_df.iterrows():
    pub_date = timezone.make_aware(datetime.fromtimestamp(row.timestamp), timezone=timezone.get_current_timezone())
    rating = Rating(score=row.rating, pub_date=pub_date)
    rating.movie = movie_dict[row.movieId]
    rating.save()

In [3]:
n_users = 66323
n_movies = 5127

# Matrix Factorization Model

- Prediction
$$\widehat R_{u, i} = x_u y_i + \alpha_u + \beta_i + \mu$$

$$\mu = \frac{1}{N} \sum R_{u,i}$$

- Loss Function
$$\mathcal{L} = \frac{1}{2}(\sum_{(u, i)\in R}(\widehat R_{u, i} - R_{u, i})^2 + \lambda(\sum_u \|x_u\|^2 + \alpha_u^2 + \sum_i \|y_i\|^2 + \beta_i^2))$$

- Gradient
$$\frac{\partial \mathcal{L}}{\partial x_u} = \sum_{i}(\widehat R_{u, i} - R_{u, i})y_i + \lambda x_u$$

$$\frac{\partial \mathcal{L}}{\partial y_i} = \sum_{u}(\widehat R_{u, i} - R_{u, i})x_u + \lambda y_i$$

$$\frac{\partial \mathcal{L}}{\partial \alpha_u} = \sum_{i}\widehat R_{u, i} - R_{u, i}$$

$$\frac{\partial \mathcal{L}}{\partial \beta_i} = \sum_{u}\widehat R_{u, i} - R_{u, i}$$

In [4]:
from keras import backend as K
from keras import regularizers, Model
from keras.models import load_model
from keras.layers import Input, Reshape, Lambda, Dot, Add, Dropout
from keras.layers.embeddings import Embedding
from keras.constraints import max_norm
from keras.initializers import RandomNormal
from keras.callbacks import EarlyStopping, ModelCheckpoint

Using TensorFlow backend.


In [5]:
def get_cf_model(n_users, n_movies, k_factors):
    user_id = Input(shape=(1,), dtype='int32', name='user_id')
    movie_id = Input(shape=(1,), dtype='int32', name='movie_id')
    
    weight_u = Embedding(n_users + 1, k_factors,
                         embeddings_initializer=RandomNormal(stddev=np.sqrt(1 / k_factors)),
                         embeddings_regularizer=regularizers.l2(0.02 / n_users),
                         input_length=1, name='user_weight')(user_id)
    weight_m = Embedding(n_movies + 1, k_factors,
                         embeddings_initializer=RandomNormal(stddev=np.sqrt(1 / k_factors)),
                         embeddings_regularizer=regularizers.l2(0.02 / n_movies),
                         input_length=1, name='movie_weight')(movie_id)
    
    bias_u = Embedding(n_users + 1, 1, embeddings_initializer='zero',
                       embeddings_regularizer=regularizers.l2(0.02 / n_users),
                       input_length=1, name='user_bias')(user_id)
    bias_m = Embedding(n_movies + 1, 1, embeddings_initializer='zero',
                       embeddings_regularizer=regularizers.l2(0.02 / n_movies),
                       input_length=1, name='movie_bias')(movie_id)
    
    score = Dot(axes=2, name='dot')([weight_u, weight_m])
    score = Add(name='add')([score, bias_u, bias_m])
    score = Reshape((1,), name='reshape')(score)
    model = Model(inputs=(user_id, movie_id), outputs=score)
    
    return model

In [6]:
k_factors = 30
cf_model = get_cf_model(n_users, n_movies, k_factors)

def rmse(y_true, y_pred):
    return K.sqrt(K.mean(K.square(y_true - y_pred), axis=-1))

cf_model.compile(loss='mse', optimizer='adam', metrics=[rmse])

In [167]:
cf_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
user_id (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
movie_id (InputLayer)           (None, 1)            0                                            
__________________________________________________________________________________________________
user_weight (Embedding)         (None, 1, 30)        1989720     user_id[0][0]                    
__________________________________________________________________________________________________
movie_weight (Embedding)        (None, 1, 30)        153840      movie_id[0][0]                   
__________________________________________________________________________________________________
dot (Dot) 

In [168]:
x = [rating_df['userId'].values, rating_df['movieId'].values]
y = rating_df['rating'].values
rating_mean = np.mean(y)
rating_mean

3.5533318536251528

In [None]:
checkpoint = ModelCheckpoint('./ml-latest/model_best.hdf5', monitor='val_loss', verbose=1,
                             save_best_only=True, mode='min')
early_stopping = EarlyStopping(monitor='val_loss', patience=5, mode='min')

cf_model.fit(x, y - rating_mean, epochs=20, batch_size=256,
             validation_split=0.05, callbacks=[checkpoint]);

In [7]:
cf_model = load_model('./ml-latest/model_best.hdf5', custom_objects={'rmse': rmse})
movie_vec = np.zeros((n_movies + 1, 1 + 30))

movie_vec[:, 1:] = cf_model.get_layer('movie_weight').get_weights()[0]
movie_vec[:, 0:1] = cf_model.get_layer('movie_bias').get_weights()[0]

user_vec = np.zeros((n_users + 1, 1 + 30))

user_vec[:, 1:] = cf_model.get_layer('user_weight').get_weights()[0]
user_vec[:, 0:1] = cf_model.get_layer('user_bias').get_weights()[0]

In [8]:
np.max(np.dot(movie_vec[:, 1:], user_vec[:, 1:].T))

2.4818730933288471

In [15]:
def get_top_k(title, k):
    m = Movie.objects.get(title=title)
    print(title)
    for genre in m.genres.all():
        print('-', genre)
    print()
    v = movie_vec[m.id, :]
    score = np.dot(movie_vec[:, 1:], v[1:])
    indices = np.argsort(score)[-1:-k-1:-1]
    for movie in Movie.objects.filter(id__in=indices):
        print(movie)
        for genre in movie.genres.all():
            print('    -', genre)

In [16]:
get_top_k('Inside Out (2015)', 10)

Inside Out (2015)
- Adventure
- Animation
- Children
- Comedy
- Drama
- Fantasy

How to Train Your Dragon (2010)
    - Adventure
    - Animation
    - Children
    - Fantasy
    - IMAX
Toy Story 3 (2010)
    - Adventure
    - Animation
    - Children
    - Comedy
    - Fantasy
    - IMAX
Tangled (2010)
    - Animation
    - Children
    - Comedy
    - Fantasy
    - IMAX
    - Musical
    - Romance
Wreck-It Ralph (2012)
    - Animation
    - Comedy
Frozen (2013)
    - Adventure
    - Animation
    - Comedy
    - Fantasy
    - Musical
    - Romance
The Lego Movie (2014)
    - Action
    - Adventure
    - Animation
    - Children
    - Comedy
    - Fantasy
How to Train Your Dragon 2 (2014)
    - Action
    - Adventure
    - Animation
Big Hero 6 (2014)
    - Action
    - Animation
    - Comedy
Inside Out (2015)
    - Adventure
    - Animation
    - Children
    - Comedy
    - Drama
    - Fantasy
Zootopia (2016)
    - Action
    - Adventure
    - Animation
    - Children
    - Comedy


In [199]:
for movie in Movie.objects.all():
    movie.latent_factor = movie_vec[movie.id, :].tolist()
    movie.save(update_fields=['latent_factor'])

In [9]:
np.save('./ml-latest/movie_latent_factors.npy', movie_vec)