In this notebook I will explore and compare different algorithms and approaches to recommend anything. I am using the **[netflix movie-dataset](https://www.kaggle.com/netflix-inc/netflix-prize-data/home)** and the **[movies-dataset](https://www.kaggle.com/rounakbanik/the-movies-dataset/home)** for this purpose.

In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import deque
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from keras.layers import Input, Embedding, Reshape, Dot, Concatenate, Dense, Dropout
from keras.models import Model

### Load Movie-Data

In [24]:
# Load data for all movies
movie_titles = pd.read_csv('./data/netflix-prize-data/movie_titles.csv', 
                           on_bad_lines='skip',
                           encoding = 'ISO-8859-1', 
                           header = None, 
                           names = ['Id', 'Year', 'Name']).set_index('Id')

print('Shape Movie-Titles:\t{}'.format(movie_titles.shape))
movie_titles.head()

Shape Movie-Titles:	(17434, 2)


Unnamed: 0_level_0,Year,Name
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2003.0,Dinosaur Planet
2,2004.0,Isle of Man TT 2004 Review
3,1997.0,Character
4,1994.0,Paula Abdul's Get Up & Dance
5,2004.0,The Rise and Fall of ECW


In [22]:
# Load a movie metadata dataset
movie_data = pd.read_csv('./data/the-movies-dataset/movies_metadata.csv', low_memory=False)[['original_title', 'overview', 'vote_count']].set_index('original_title').dropna()
# Remove the long tail of rarly rated moves
movie_data = movie_data[movie_data['vote_count']>10].drop('vote_count', axis=1)

print(movie_data.shape)
movie_data.head(5)

(21604, 1)


Unnamed: 0_level_0,overview
original_title,Unnamed: 1_level_1
Toy Story,"Led by Woody, Andy's toys live happily in his ..."
Jumanji,When siblings Judy and Peter discover an encha...
Grumpier Old Men,A family wedding reignites the ancient feud be...
Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom..."
Father of the Bride Part II,Just when George Banks has recovered from his ...


### Load User-Data And Preprocess Data-Structure

In [4]:
# Load single data-file
df_raw = pd.read_csv('./data/netflix-prize-data/combined_data_1.txt', header=None, names=['User', 'Rating', 'Date'], usecols=[0, 1, 2])


# Find empty rows to slice dataframe for each movie
tmp_movies = df_raw[df_raw['Rating'].isna()]['User'].reset_index()
movie_indices = [[index, int(movie[:-1])] for index, movie in tmp_movies.values]

# Shift the movie_indices by one to get start and endpoints of all movies
shifted_movie_indices = deque(movie_indices)
shifted_movie_indices.rotate(-1)


# Gather all dataframes
user_data = []

# Iterate over all movies
for [df_id_1, movie_id], [df_id_2, next_movie_id] in zip(movie_indices, shifted_movie_indices):
    
    # Check if it is the last movie in the file
    if df_id_1<df_id_2:
        tmp_df = df_raw.loc[df_id_1+1:df_id_2-1].copy()
    else:
        tmp_df = df_raw.loc[df_id_1+1:].copy()
        
    # Create movie_id column
    tmp_df['Movie'] = movie_id
    
    # Append dataframe to list
    user_data.append(tmp_df)

# Combine all the dataframes
df = pd.concat(user_data)
del user_data, df_raw, tmp_movies, tmp_df, shifted_movie_indices, movie_indices, df_id_1, movie_id, df_id_2, next_movie_id
print('Shape User-Ratings:\t{}'.format(df.shape))
df.sample(5)

Shape User-Ratings:	(24053764, 4)


Unnamed: 0,User,Rating,Date,Movie
15719585,1822001,5.0,2004-03-28,3071
7134432,559608,3.0,2003-08-14,1428
9398640,1994339,5.0,2004-09-24,1856
11228991,304409,3.0,2004-05-18,2153
16792608,1973723,2.0,2004-03-18,3256


In [5]:
# Filter sparse movies
min_movie_ratings = 10000
filter_movies = (df['Movie'].value_counts()>min_movie_ratings)
filter_movies = filter_movies[filter_movies].index.tolist()

# Filter sparse users
min_user_ratings = 200
filter_users = (df['User'].value_counts()>min_user_ratings)
filter_users = filter_users[filter_users].index.tolist()

# Actual filtering
df_filterd = df[(df['Movie'].isin(filter_movies)) & (df['User'].isin(filter_users))]
del filter_movies, filter_users, min_movie_ratings, min_user_ratings
print('Shape User-Ratings unfiltered:\t{}'.format(df.shape))
print('Shape User-Ratings filtered:\t{}'.format(df_filterd.shape))

Shape User-Ratings unfiltered:	(24053764, 4)
Shape User-Ratings filtered:	(4178032, 4)


### Create Train and Test Sets

In [17]:
df_filterd.head()

Unnamed: 0,User,Rating,Movie
0,443353,4.0,1719
1,874505,5.0,2782
2,2637187,1.0,658
3,409104,4.0,4341
4,1305838,5.0,2372


In [6]:
# Shuffle DataFrame
df_filterd = df_filterd.drop('Date', axis=1).sample(frac=1).reset_index(drop=True)

# Testingsize
n = 100000

# Split train- & testset
df_train = df_filterd[:-n]
df_test = df_filterd[-n:]

### Transform The User-Ratings To User-Movie-Matrix

In [7]:
# Create a user-movie matrix with empty values
df_p = df_train.pivot_table(index='User', columns='Movie', values='Rating')
print('Shape User-Movie-Matrix:\t{}'.format(df_p.shape))
df_p.sample(3)

Shape User-Movie-Matrix:	(20828, 491)


Movie,8,18,28,30,58,77,83,97,108,111,...,4392,4393,4402,4418,4420,4432,4472,4479,4488,4490
User,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2172303,,,5.0,4.0,,,,,,2.0,...,,2.0,,,,2.0,2.0,,,
1399249,3.0,3.0,,,,3.0,3.0,,,,...,,3.0,3.0,,,3.0,,3.0,3.0,
1661655,,,4.0,,,,,,,,...,4.0,3.0,,3.0,,4.0,,,,


## Recommendation Systems
***
### 1. Cosine User-User Similarity 

In [18]:
# User index for recommendation
user_index = 0

# Number of similar users for recommendation
n_recommendation = 100

# Plot top n recommendations
n_plot = 10


# Fill in missing values
df_p_imputed = df_p.T.fillna(df_p.mean(axis=1)).T

# Compute similarity between all users
similarity = cosine_similarity(df_p_imputed.values)

# Remove self-similarity from similarity-matrix
similarity -= np.eye(similarity.shape[0])


# Sort similar users by index
similar_user_index = np.argsort(similarity[user_index])[::-1]
# Sort similar users by score
similar_user_score = np.sort(similarity[user_index])[::-1]


# Get unrated movies
unrated_movies = df_p.iloc[user_index][df_p.iloc[user_index].isna()].index

# Weight ratings of the top n most similar users with their rating and compute the mean for each movie
mean_movie_recommendations = (df_p_imputed.iloc[similar_user_index[:n_recommendation]].T * similar_user_score[:n_recommendation]).T.mean(axis=0)

# Filter for unrated movies and sort results
best_movie_recommendations = mean_movie_recommendations[unrated_movies].sort_values(ascending=False).to_frame().join(movie_titles)


# Create user-id mapping
user_id_mapping = {id:i for i, id in enumerate(df_p_imputed.index)}

prediction = []
# Iterate over all testset items
for user_id in df_test['User'].unique():
    
    # Sort similar users by index
    similar_user_index = np.argsort(similarity[user_id_mapping[user_id]])[::-1]
    # Sort similar users by score
    similar_user_score = np.sort(similarity[user_id_mapping[user_id]])[::-1]
    
    for movie_id in df_test[df_test['User']==user_id]['Movie'].values:

        # Compute predicted score
        score = (df_p_imputed.iloc[similar_user_index[:n_recommendation]][movie_id] * similar_user_score[:n_recommendation]).values.sum() / similar_user_score[:n_recommendation].sum()
        prediction.append([user_id, movie_id, score])


# Create prediction DataFrame
df_pred = pd.DataFrame(prediction, columns=['User', 'Movie', 'Prediction']).set_index(['User', 'Movie'])
df_pred = df_test.set_index(['User', 'Movie']).join(df_pred)
print(df_pred)    

# Get labels and predictions
y_true = df_pred['Rating'].values
y_pred = df_pred['Prediction'].values

# Compute RMSE
rmse = np.sqrt(mean_squared_error(y_true=y_true, y_pred=y_pred))

               Rating  Prediction
User    Movie                    
1314869 3256      4.0    4.401744
2501068 1861      3.0    4.475654
2518195 175       4.0    4.445764
775416  329       3.0    4.403596
672097  4306      4.0    4.669126
...               ...         ...
1084524 1202      3.0    4.555293
952981  2171      4.0    4.571723
969530  3463      5.0    4.640530
2602249 3526      2.0    4.575167
937784  295       2.0    4.550828

[100000 rows x 2 columns]


In [19]:
rmse

1.3347671072672664

### 2. Cosine TFIDF Movie Description Similarity

In [25]:
# Create tf-idf matrix for text comparison
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movie_data['overview'].dropna())


# Compute cosine similarity between all movie-descriptions
similarity = cosine_similarity(tfidf_matrix)
# Remove self-similarity from matrix
similarity -= np.eye(similarity.shape[0])



In [42]:
def return_similars(movie):
    # Get index of movie to find similar movies
    n_plot = 10
    index = movie_data.reset_index(drop=True)[movie_data.index==movie].index[0]

    # Get indices and scores of similar movies
    similar_movies_index = np.argsort(similarity[index])[::-1][:n_plot]
    similar_movies_score = pd.Series(np.sort(similarity[index])[::-1][:n_plot])

    # Get titles of similar movies
    similar_movie_titles = pd.Series(movie_data.iloc[similar_movies_index].index)
    similarity_df = pd.concat([similar_movie_titles, similar_movies_score], axis = 1)
    print(similarity_df)

    return similarity_df

In [44]:
movie = 'Batman Begins'
similars = return_similars(movie)

                                      original_title         0
0  Batman Unmasked: The Psychology of the Dark Kn...  0.326434
1            Batman: The Dark Knight Returns, Part 1  0.275389
2                                  Batman: Bad Blood  0.243454
3                                   Batman: Year One  0.226230
4                         Batman: Under the Red Hood  0.212147
5                           Batman Beyond: The Movie  0.201747
6                                     Batman Forever  0.200459
7                       Batman: Mask of the Phantasm  0.199494
8                                      Batman & Bill  0.190721
9                                             Batman  0.186656


In [50]:
movie = 'The Godfather'
similars = return_similars(movie)

                     original_title         0
0            The Godfather: Part II  0.450344
1  The Godfather Trilogy: 1972-1990  0.338927
2           The Godfather: Part III  0.167636
3                        Blood Ties  0.156073
4                               黑社會  0.132408
5                          Mobsters  0.122048
6                     Live by Night  0.120963
7                    Bad Turn Worse  0.120485
8                         Miss Bala  0.116257
9                   Family Business  0.115425


In [46]:
movie = 'Toy Story'
similars = return_similars(movie)

           original_title         0
0             Toy Story 3  0.532358
1             Toy Story 2  0.468491
2  The 40 Year Old Virgin  0.280902
3               Small Fry  0.276693
4               The Champ  0.201434
5   Rebel Without a Cause  0.184986
6  For Your Consideration  0.159134
7               Condorman  0.158504
8         Man on the Moon  0.139145
9                  Malice  0.136603


In [47]:
movie = 'Jumanji'
similars = return_similars(movie)

     original_title         0
0      Table No. 21  0.218070
1              Quiz  0.181517
2           Quintet  0.170891
3         Brainscan  0.167036
4      Turkey Shoot  0.166474
5         Beta Test  0.159217
6            DeVour  0.148305
7  Poolhall Junkies  0.145673
8            Pixels  0.144933
9           Standby  0.143149


In [49]:
movie = 'Waiting to Exhale'
similars = return_similars(movie)

           original_title         0
0       The Boy Next Door  0.103118
1                  Bernie  0.096671
2        Robin and Marian  0.094642
3                    Hero  0.094356
4       Little Black Book  0.089334
5        Spring Breakdown  0.088759
6  Weekend at Bernie's II  0.085443
7         Ruthless People  0.085209
8          The Bunny Game  0.083840
9             L'arnacoeur  0.082101


### Matrix Factorisation With Keras And Gradient Descent

In [51]:
# Create user- & movie-id mapping
user_id_mapping = {id:i for i, id in enumerate(df_filterd['User'].unique())}
movie_id_mapping = {id:i for i, id in enumerate(df_filterd['Movie'].unique())}


# Create correctly mapped train- & testset
train_user_data = df_train['User'].map(user_id_mapping)
train_movie_data = df_train['Movie'].map(movie_id_mapping)

test_user_data = df_test['User'].map(user_id_mapping)
test_movie_data = df_test['Movie'].map(movie_id_mapping)


# Get input variable-sizes
users = len(user_id_mapping)
movies = len(movie_id_mapping)
embedding_size = 10


##### Create model
# Set input layers
user_id_input = Input(shape=[1], name='user')
movie_id_input = Input(shape=[1], name='movie')

# Create embedding layers for users and movies
user_embedding = Embedding(output_dim=embedding_size, 
                           input_dim=users,
                           input_length=1, 
                           name='user_embedding')(user_id_input)
movie_embedding = Embedding(output_dim=embedding_size, 
                            input_dim=movies,
                            input_length=1, 
                            name='item_embedding')(movie_id_input)

# Reshape the embedding layers
user_vector = Reshape([embedding_size])(user_embedding)
movie_vector = Reshape([embedding_size])(movie_embedding)

# Compute dot-product of reshaped embedding layers as prediction
y = Dot(1, normalize=False)([user_vector, movie_vector])

# Setup model
model = Model(inputs=[user_id_input, movie_id_input], outputs=y)
model.compile(loss='mse', optimizer='adam')


# Fit model
model.fit([train_user_data, train_movie_data],
          df_train['Rating'],
          batch_size=256, 
          epochs=5,
          validation_split=0.1,
          shuffle=True)

# Test model
y_pred = model.predict([test_user_data, test_movie_data])
y_true = df_test['Rating'].values

#  Compute RMSE
rmse = np.sqrt(mean_squared_error(y_pred=y_pred, y_true=y_true))
print('\n\nTesting Result With Keras Matrix-Factorization: {:.4f} RMSE'.format(rmse))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


Testing Result With Keras Matrix-Factorization: 0.8611 RMSE


### Deep Learning With Keras

In [52]:
# Setup variables
user_embedding_size = 20
movie_embedding_size = 10


##### Create model
# Set input layers
user_id_input = Input(shape=[1], name='user')
movie_id_input = Input(shape=[1], name='movie')

# Create embedding layers for users and movies
user_embedding = Embedding(output_dim=user_embedding_size, 
                           input_dim=users,
                           input_length=1, 
                           name='user_embedding')(user_id_input)
movie_embedding = Embedding(output_dim=movie_embedding_size, 
                            input_dim=movies,
                            input_length=1, 
                            name='item_embedding')(movie_id_input)

# Reshape the embedding layers
user_vector = Reshape([user_embedding_size])(user_embedding)
movie_vector = Reshape([movie_embedding_size])(movie_embedding)

# Concatenate the reshaped embedding layers
concat = Concatenate()([user_vector, movie_vector])

# Combine with dense layers
dense = Dense(256)(concat)
y = Dense(1)(dense)

# Setup model
model = Model(inputs=[user_id_input, movie_id_input], outputs=y)
model.compile(loss='mse', optimizer='adam')


# Fit model
model.fit([train_user_data, train_movie_data],
          df_train['Rating'],
          batch_size=256, 
          epochs=5,
          validation_split=0.1,
          shuffle=True)

# Test model
y_pred = model.predict([test_user_data, test_movie_data])
y_true = df_test['Rating'].values

#  Compute RMSE
rmse = np.sqrt(mean_squared_error(y_pred=y_pred, y_true=y_true))
print('\n\nTesting Result With Keras Deep Learning: {:.4f} RMSE'.format(rmse))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


Testing Result With Keras Deep Learning: 0.9061 RMSE
