In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
import joblib
import tensorflow as tf
import model_keras.recommender_model as rm

# Let pandas show up to 100 characters per cell before truncating text columns.
pd.set_option('display.max_colwidth', 100)

### Loading Data

In [2]:
# Read the three pre-built CSV tables used throughout the notebook:
# raw ratings, per-user genre ratings, and genre-encoded movies.
ratings_df = pd.read_csv('csv_files/ratings_df_big.csv')
user_ratings_by_genre = pd.read_csv('csv_files/user_ratings_by_genre_big.csv')
movies_encoded_by_genre = pd.read_csv('csv_files/movies_encoded_by_genre_big.csv')

In [3]:
# Preview the genre-encoded movie table (one row per movie, one 0/1 column per genre).
movies_encoded_by_genre

Unnamed: 0,movieId,title,num_ratings,avg_movie_rating,year,Action,Adventure,Animation,Children,Comedy,...,Drama,Fantasy,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),15521,3.90,1995,0,1,1,1,1,...,0,1,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),6149,3.29,1995,0,1,0,1,0,...,0,1,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),3103,3.17,1995,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale (1995),610,2.90,1995,0,0,0,0,1,...,1,0,0,0,0,1,0,0,0,0
4,5,Father of the Bride Part II (1995),3129,3.08,1995,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50874,281920,When It Melts,1,3.50,2006,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
50875,282085,Spoonful of Sugar,1,4.00,2006,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
50876,283477,Baby Driver 2,1,2.50,2006,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
50877,283571,Limbo,1,4.00,2006,0,0,0,0,0,...,1,0,0,0,1,0,0,0,0,0


In [333]:
# Disabled exploratory cell: sums each genre column and draws the totals as a bar plot.
# genres = movies_encoded_by_genre.columns[5:]
# genres_count = {}
# for genre in genres:
#     genres_count[genre] = movies_encoded_by_genre[genre].sum()

# plt.figure(figsize=(20,12))
# sns.barplot(genres_count)

### Movie and User training vectors

In [4]:
# Keep only the most-rated movies: the threshold is the 80th percentile of
# `num_ratings`, and movies at or above it are retained.
MIN_NUM_RATINGS = movies_encoded_by_genre['num_ratings'].quantile(0.8)

has_enough_ratings = movies_encoded_by_genre['num_ratings'].ge(MIN_NUM_RATINGS)
movies_filtered = movies_encoded_by_genre.loc[has_enough_ratings]

In [52]:
# One movie-feature row per rating of a retained movie. Columns the network
# must not consume directly are wrapped in brackets.
# NOTE(review): rows are paired positionally with `user_train` later on; this
# presumably relies on both merges preserving the row order of ratings_df - confirm.
movies_train = (
    pd.merge(ratings_df, movies_filtered, on='movieId')
    .drop(columns=['userId', 'title', 'rating'])
    .rename(columns={
        'movieId': '[movieId]',
        'num_ratings': '[num_ratings]',
        'avg_movie_rating': '[avg_movie_rating]',
    })
)
movies_train

Unnamed: 0,[movieId],[num_ratings],[avg_movie_rating],year,Action,Adventure,Animation,Children,Comedy,Crime,...,Drama,Fantasy,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,288,5539,3.27,1994,1,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
1,5673,1686,3.56,2002,0,0,0,0,1,0,...,1,0,0,0,0,1,0,0,0,0
2,5707,154,3.67,1981,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0
3,2488,307,3.79,1960,0,0,0,0,0,0,...,1,0,1,0,0,0,0,1,0,0
4,6239,161,3.45,1959,0,1,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6555747,186,2719,2.87,1995,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
6555748,1961,7259,3.92,1988,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
6555749,296,21758,4.19,1994,0,0,0,0,1,1,...,1,0,0,0,0,0,0,1,0,0
6555750,53322,2147,3.52,2007,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0


In [53]:
# Ratings restricted to the retained movies, joined with each user's genre profile
ratings_filtered_by_movie = ratings_df.loc[ratings_df['movieId'].isin(movies_filtered['movieId'])]
user_ratings_merged = ratings_filtered_by_movie.merge(user_ratings_by_genre, on='userId')

# Target vector for supervised training: the rating given in each row
y_train = user_ratings_merged['rating']

# User feature matrix: drop the movie id and the target, and bracket the
# columns that are not genre features
user_train = (
    user_ratings_merged
    .drop(columns=['movieId', 'rating'])
    .rename(columns={'userId': '[userId]', 'avg_user_rating': '[avg_user_rating]'})
)
user_train

Unnamed: 0,[userId],[avg_user_rating],Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,322585,3.44,0.63,0.29,0.06,0.00,1.19,1.08,0.00,1.79,0.06,0.15,0.00,0.52,0.81,0.37,1.58,0.08,0.00
1,107064,3.25,0.43,0.20,0.11,0.05,1.15,0.46,0.17,2.16,0.20,0.27,0.10,0.36,0.74,0.29,0.69,0.11,0.01
2,212796,3.75,0.00,0.00,0.00,0.00,1.17,0.00,0.00,1.83,0.00,0.58,0.00,1.92,0.67,0.75,1.33,0.00,0.42
3,243610,3.07,0.35,0.33,0.09,0.15,1.17,0.37,0.25,1.60,0.17,0.15,0.09,0.26,0.64,0.15,0.50,0.13,0.05
4,122428,3.86,1.46,1.12,0.14,0.24,0.78,0.25,0.00,1.08,0.47,0.65,0.31,0.34,0.21,2.36,1.15,0.19,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6555747,311207,3.66,1.45,1.37,0.45,0.58,1.58,0.76,0.00,0.84,0.39,0.00,0.00,0.34,0.42,0.89,0.92,0.00,0.11
6555748,274103,2.80,1.01,0.71,0.15,0.24,1.13,0.47,0.00,1.04,0.34,0.28,0.07,0.18,0.37,0.61,0.90,0.06,0.04
6555749,145722,3.83,0.00,0.52,0.39,0.70,2.22,0.61,0.00,2.91,0.35,0.00,0.17,0.00,1.26,0.00,0.65,0.39,0.00
6555750,143534,3.57,0.22,0.57,0.86,1.17,1.95,0.21,0.00,1.47,0.78,0.00,0.64,0.00,1.43,0.12,0.21,0.00,0.00


### Preparing Data

In [54]:
# Keep handles on the unscaled data before the names are rebound to scaled arrays.
# NOTE(review): this cell rebinds movies_train / user_train / y_train, so it is
# not idempotent - re-run the cells that build them before re-running this one.
# NOTE(review): the scalers are fitted on the full data set, i.e. before the
# train/test split performed later, so test rows contribute to the statistics.
movies_train_unscaled, user_train_unscaled, y_train_unscaled = movies_train, user_train, y_train

# Zero-mean / unit-variance scaling of the movie and user features
scalerMovies = StandardScaler()
movies_train = scalerMovies.fit_transform(movies_train_unscaled)

scalerUser = StandardScaler()
user_train = scalerUser.fit_transform(user_train_unscaled)

# Ratings are mapped to the interval [-1, 1]
scalerTarget = MinMaxScaler((-1, 1))
y_train = scalerTarget.fit_transform(y_train_unscaled.to_numpy().reshape(-1, 1))

# Sanity check: undoing the scaling must give back the original values
movies_roundtrip_ok = np.allclose(movies_train_unscaled, scalerMovies.inverse_transform(movies_train))
print(movies_roundtrip_ok)
user_roundtrip_ok = np.allclose(user_train_unscaled, scalerUser.inverse_transform(user_train))
print(user_roundtrip_ok)

True
True


In [57]:
# Write the three fitted scalers to disk under scalers/
# (presumably so they can be reloaded at inference time without refitting - confirm).
joblib.dump(scalerMovies, 'scalers/scalerMovies.gz')
joblib.dump(scalerUser, 'scalers/scalerUser.gz')
joblib.dump(scalerTarget, 'scalers/scalerTarget.gz')

['scalers/scalerTarget.gz']

In [58]:
# Split movie features, user features and targets in ONE call so the three
# arrays are guaranteed to receive the same row permutation; train_test_split
# also raises if their lengths differ instead of silently misaligning rows.
# With the same random_state this yields the same partition as three separate calls.
# NOTE: user_train and y_train are rebound to their training portions here, so
# re-running this cell without re-running the scaling cell would split
# already-split data.
(movie_train, movie_test,
 user_train, user_test,
 y_train, y_test) = train_test_split(
    movies_train, user_train, y_train,
    train_size=0.80, shuffle=True, random_state=1,
)
print(f"movie/item training data shape: {movie_train.shape}")
print(f"movie/item test data shape: {movie_test.shape}")

movie/item training data shape: (5244601, 21)
movie/item test data shape: (1311151, 21)


### Neural Network Training

In [None]:
tf.random.set_seed(1)

# Feed only the non-bracketed feature columns to the network: skip the first two
# user columns ([userId], [avg_user_rating]) and the first three movie columns
# ([movieId], [num_ratings], [avg_movie_rating]).
user_features = user_train[:, 2:]
movie_features = movie_train[:, 3:]

tf_dataset = (
    tf.data.Dataset
    .from_tensor_slices(((user_features, movie_features), y_train))
    .shuffle(10000)
    .batch(1024)
    .prefetch(tf.data.AUTOTUNE)
)

# NOTE(review): get_model receives the full column counts (including the leading
# columns sliced off above) - confirm that get_model accounts for this.
model = rm.get_model(user_train.shape[1], movies_train.shape[1])
model = rm.train_model(model, tf_dataset, epochs=20)


Epoch 1/20
[1m5122/5122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 3ms/step - loss: 0.1586
Epoch 2/20
[1m5122/5122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 3ms/step - loss: 0.1528
Epoch 3/20
[1m5122/5122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 3ms/step - loss: 0.1508
Epoch 4/20
[1m5122/5122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 3ms/step - loss: 0.1496
Epoch 5/20
[1m5122/5122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 3ms/step - loss: 0.1487
Epoch 6/20
[1m5122/5122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 3ms/step - loss: 0.1480
Epoch 7/20
[1m5122/5122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 3ms/step - loss: 0.1475
Epoch 8/20
[1m5122/5122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 3ms/step - loss: 0.1470
Epoch 9/20
[1m5122/5122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 3ms/step - loss: 0.1467
Epoch 10/20
[1m5122/5122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

In [61]:
# Save the trained model to disk in the .keras format.
model.save('model_keras/content_based_recommender.keras')

In [60]:
# Loss on the held-out 20% split, using the same column slices as in training.
model.evaluate([user_test[:,2:], movie_test[:,3:]], y_test)

[1m40974/40974[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 591us/step - loss: 0.1452


0.1451825648546219

### Recommending movies for new user

#### New user vector

In [373]:
# Hand-written profile of a new user.
new_user_id = 5000
new_avg_user_rating = 0.0

# One rating per genre
new_action      = 0.0
new_adventure   = 4.0
new_animation   = 0.0
new_children    = 0.0
new_comedy      = 1.0
new_crime       = 0.0
new_documentary = 0.0
new_drama       = 0.0
new_fantasy     = 0.0
new_horror      = 3.0
new_musical     = 0.0
new_mystery     = 5.0
new_romance     = 0.0
new_scifi       = 2.0
new_thriller    = 0.0
new_war         = 0.0
new_western     = 0.0

# Genre ratings, listed in the same order as the genre columns of the user table
new_genre_ratings = [
    new_action, new_adventure, new_animation, new_children,
    new_comedy, new_crime, new_documentary,
    new_drama, new_fantasy, new_horror,
    new_musical, new_mystery,
    new_romance, new_scifi, new_thriller,
    new_war, new_western,
]

# Layout: [user id, average rating, genre ratings...]
user_vec = np.array([new_user_id, new_avg_user_rating, *new_genre_ratings])

user_vec

array([5.e+03, 0.e+00, 0.e+00, 4.e+00, 0.e+00, 0.e+00, 1.e+00, 0.e+00,
       0.e+00, 0.e+00, 0.e+00, 3.e+00, 0.e+00, 5.e+00, 0.e+00, 2.e+00,
       0.e+00, 0.e+00, 0.e+00])

#### Movies vector based on genre and IMDB's weighted rating formula

In [374]:
def imdb_weighted_rating(v, r, c, m=None):
    '''
    This function calculates the IMDB's weighted rating for a movie:

        WR = r*v/(v+m) + c*m/(v+m)

    The result is a blend of the movie's own average `r` and the global mean `c`;
    the more votes `v` a movie has relative to `m`, the closer WR is to `r`.

    Parameters:
    v : number of votes for the movie (scalar or array-like/Series)
    r : average rating of the movie (scalar or array-like/Series)
    c : mean rating across all the movies being compared
    m : minimum number of votes used as the weighting threshold;
        defaults to the notebook-level MIN_NUM_RATINGS when omitted

    Returns:
    The weighted rating, with the same shape as the inputs.
    '''
    if m is None:
        # Backward-compatible default: the threshold used to filter the movies
        m = MIN_NUM_RATINGS

    return (r*v/(v+m))+(c*m/(v+m))


In [375]:
def gen_movie_vecs(movies_df, year_filter=2000, top_movies=50, user_vector=None, min_genre_rate=3):
    '''
    Build the candidate movie vectors to score for a user.

    For every genre the user rated at least `min_genre_rate`, the `top_movies`
    best movies of that genre (ranked by IMDB weighted rating) released in or
    after `year_filter` are selected; the selections are combined and de-duplicated.

    Parameters:
    movies_df      : movie table with the columns 'movieId', 'title', 'num_ratings',
                     'avg_movie_rating', 'year' followed by one 0/1 column per genre
    year_filter    : earliest release year to keep
    top_movies     : number of movies taken per favorite genre
    user_vector    : user vector laid out as [id, average rating, genre ratings...],
                     with genre ratings in the same order as the genre columns of
                     `movies_df`; defaults to the notebook-level `user_vec`
    min_genre_rate : minimum genre rating for a genre to count as a favorite

    Returns:
    DataFrame with the same columns as `movies_df` minus 'title', with the id and
    statistic columns renamed to their bracketed form. It has no rows when the
    user has no favorite genre.
    '''
    if user_vector is None:
        # Backward-compatible default: the new-user vector defined in the notebook
        user_vector = user_vec

    # Work on the frame that was passed in (not on a notebook global)
    movies_vecs = (
        movies_df
        .drop(columns=['title'])
        .rename(columns={'movieId' : '[movieId]', 'num_ratings':'[num_ratings]', 'avg_movie_rating':'[avg_movie_rating]'})
    )

    # Adding IMDB's weighted rating (the global mean is taken before the year filter)
    c = movies_vecs['[avg_movie_rating]'].mean()
    movies_vecs['imdb_rating'] = imdb_weighted_rating(movies_vecs['[num_ratings]'], movies_vecs['[avg_movie_rating]'], c).round(2)

    # Filtering by movie year and ranking by weighted rating; sort_values returns
    # a new frame, so no in-place mutation of a filtered slice takes place
    movies_vecs = (
        movies_vecs[movies_vecs['year'] >= year_filter]
        .sort_values('imdb_rating', ascending=False)
    )

    # Filtering by favorite genres. Genre columns start at position 4, after
    # [movieId], [num_ratings], [avg_movie_rating] and year.
    genre_offset = 4
    genre_rates = user_vector[2:]
    subsets = [
        movies_vecs[movies_vecs[movies_vecs.columns[genre_offset + i]] == 1].head(top_movies)
        for i, genre_rate in enumerate(genre_rates)
        if genre_rate >= min_genre_rate
    ]

    if not subsets:
        # No favorite genre: return an empty frame that still has the expected columns
        return movies_vecs.iloc[0:0].drop(columns=['imdb_rating'])

    # A single concat avoids growing a frame inside the loop from an empty seed frame
    return pd.concat(subsets, axis=0).drop_duplicates().drop(columns=['imdb_rating'])

#### Predicting ratings on NN

In [376]:
# Candidate movies for the new user, and one copy of the user vector per candidate
movies_vecs = gen_movie_vecs(movies_filtered)
user_vecs = pd.DataFrame(
    np.tile(user_vec, (len(movies_vecs), 1)),
    columns=user_train_unscaled.columns,
)

# Apply the scalers that were fitted on the training data
smovies_vecs = scalerMovies.transform(movies_vecs)
suser_vecs = scalerUser.transform(user_vecs)

# Predict with the same column slices that were used for training
y_p = model.predict([suser_vecs[:, 2:], smovies_vecs[:, 3:]])

# Map the predictions from the scaled target range back to the rating scale
y_pu = scalerTarget.inverse_transform(y_p)


[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step


#### Output

In [377]:
# Titles and average ratings of the candidate movies, in candidate order
title_lookup = movies_encoded_by_genre[['movieId','title', 'avg_movie_rating']]
recommendations = pd.merge(movies_vecs['[movieId]'], title_lookup, left_on='[movieId]', right_on='movieId')

# Attach the predicted ratings positionally (both frames carry a fresh 0..n-1 index)
predicted_ratings = pd.DataFrame(y_pu.round(2), columns=['y_pu'])
recommendations = pd.concat([recommendations, predicted_ratings], axis=1)

# Top 10 movies by predicted rating
(
    recommendations[['title', 'avg_movie_rating', 'y_pu']]
    .sort_values('y_pu', ascending=False)
    .reset_index(drop=True)
    .head(10)
)

Unnamed: 0,title,avg_movie_rating,y_pu
0,Spirited Away (Sen to Chihiro no kamikakushi) (2001),4.22,4.63
1,Memento (2000),4.14,4.54
2,Howl's Moving Castle (Hauru no ugoku shiro) (2004),4.12,4.53
3,"Lord of the Rings: The Fellowship of the Ring, The (2001)",4.1,4.48
4,28 Days Later (2002),3.77,4.42
5,"Lord of the Rings: The Two Towers, The (2002)",4.09,4.41
6,Old Boy (2003),4.09,4.39
7,Get Out (2017),4.01,4.38
8,Don't Hug Me I'm Scared 5 (2015),3.69,4.37
9,The Witch (2015),3.75,4.37
