# Netflix movie recommendation engine
Kaggle competition link: https://www.kaggle.com/datasets/netflix-inc/netflix-prize-data 

After doing some preprocessing, now we have 2 datasets in sparse matrix format. 
1. train_sparse_matrix
2. test_sparse_matrix

Both the data sets have 'user' as rows and 'movie' as columns and 'rating' as data.<br>

Here, our goal is to create the most relevant features to train our model on, so that it can be as accurate as possible. We will create the 13 features below and prepare model-ready datasets for train & test:
1. Global average
2. Average rating by the user
3. Average rating for the movie
4. Ratings received from top 5 similar users for the movie
5. Ratings given for top 5 similar movies from the user

In [1]:
# imports
from datetime import datetime # To compute time taken wherever necessary
import os
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import sparse
from scipy.sparse import csr_matrix

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Read back the train / test sparse matrices written by the previous notebook.
# Every later cell builds its file paths from transformed_folder.
transformed_folder = 'F:/09 - Machine Learning Case Studies/01 - Netflix Movies Recommendation/Transformed Data'

train_sparse_matrix = sparse.load_npz('{}/train_sparse_matrix.npz'.format(transformed_folder))
print('train_sparse_matrix loaded!')

test_sparse_matrix = sparse.load_npz('{}/test_sparse_matrix.npz'.format(transformed_folder))
print('test_sparse_matrix loaded!')

train_sparse_matrix loaded!
test_sparse_matrix loaded!


### 1. Create samples and do feature engineering on sample data sets
The train dataset contains 405k users & 17k movies while test dataset contains 35k users & 17k movies. If we perform similarity computation & featurization on these, it will take a lot of time considering my local machine. To save time, lets create a sample of these datasets and operate on them. Later we can extend the same operations on full dataset.

#### A function that takes sparse matrix and returns sample sparse matrix 

In [3]:
def get_sample(sparse_matrix, no_users, no_movies, file_name):
    """Draw a random sub-sample of a ratings matrix, save it and return it.

    Row indices of ``sparse_matrix`` are treated as users and column indices
    as movies. Only rows/columns that hold at least one non-zero entry are
    candidates for sampling.

    Parameters
    ----------
    sparse_matrix : scipy sparse matrix
        Ratings matrix to sample from.
    no_users : int
        Number of distinct users to draw (without replacement).
    no_movies : int
        Number of distinct movies to draw (without replacement).
    file_name : str
        File name, appended to the module-level ``transformed_folder``, under
        which the sample is stored with ``sparse.save_npz``.

    Returns
    -------
    scipy.sparse.csc_matrix
        Matrix holding only the ratings whose user AND movie were both drawn.
        Original indices are kept, so its shape is
        ``(largest sampled user + 1, largest sampled movie + 1)``.
    """
    rows, cols, values = sparse.find(sparse_matrix)
    users = np.unique(rows)
    movies = np.unique(cols)

    print("Original matrix stats")
    print("Users: {}\nMovies: {}\nRatings: {}".format(len(users), len(movies), len(values)))

    # Fixed seed: repeated calls on the same input draw the same sample.
    np.random.seed(15)
    # Users are drawn first, then movies; both without replacement.
    sample_users = np.random.choice(users, no_users, replace=False)
    sample_movies = np.random.choice(movies, no_movies, replace=False)

    # A rating survives only when both its user and its movie were sampled.
    keep = np.isin(rows, sample_users) & np.isin(cols, sample_movies)
    kept_ratings = values[keep]

    n_rows = max(sample_users) + 1
    n_cols = max(sample_movies) + 1
    sample_sparse_matrix = sparse.csc_matrix(
        (kept_ratings, (rows[keep], cols[keep])),
        shape=(n_rows, n_cols),
    )

    print("\nSampled matrix stats")
    print("Users: {}\nMovies: {}\nRatings: {}".format(len(sample_users), len(sample_movies), kept_ratings.shape[0]))

    target_path = transformed_folder+'/'+file_name
    print('Saving to {}'.format(target_path))
    sparse.save_npz(target_path, sample_sparse_matrix)
    return sample_sparse_matrix

#### 1.1 Create sample_train_sparse_matrix from train_sparse_matrix

In [4]:
# Build the train sample only when no cached copy is on disk; otherwise load the cached file.
start = datetime.now()
if not os.path.isfile(transformed_folder+'/sample_train_sparse_matrix.npz'):
    print('Creating sample_train_sparse_matrix from train_sparse_matrix...')
    sample_train_sparse_matrix = get_sample(
        sparse_matrix=train_sparse_matrix,
        no_users=25000,
        no_movies=3000,
        file_name="sample_train_sparse_matrix.npz",
    )
else:
    print('Sparse matrix for sample train already exists. Getting it from disk...')
    sample_train_sparse_matrix = sparse.load_npz(transformed_folder+'/sample_train_sparse_matrix.npz')
print('Done!')
print('Time taken: {}'.format(datetime.now() - start))

Sparse matrix for sample train already exists. Getting it from disk...
Done!
Time taken: 0:00:00.217653


#### 1.2 Create sample_test_sparse_matrix from test_sparse_matrix

In [5]:
# Build the test sample only when no cached copy is on disk; otherwise load the cached file.
start = datetime.now()
if os.path.isfile(transformed_folder+'/sample_test_sparse_matrix.npz'):
    print('Sparse matrix for sample test already exists. Getting it from disk...')
    sample_test_sparse_matrix = sparse.load_npz(transformed_folder+'/sample_test_sparse_matrix.npz')
else:
    # The message previously named the *train* sample although this cell builds the test sample.
    print('Creating sample_test_sparse_matrix from test_sparse_matrix...')
    sample_test_sparse_matrix = get_sample(sparse_matrix=test_sparse_matrix, no_users=15000,
                                           no_movies=2000, file_name="sample_test_sparse_matrix.npz")
print('Done!')
print('Time taken: {}'.format(datetime.now() - start))

Sparse matrix for sample test already exists. Getting it from disk...
Done!
Time taken: 0:00:00.022558


In [6]:
# Number of stored ratings in each full matrix next to its sample
print('Ratings in train_sparse_matrix: {}\nRatings in sample_train_sparse_matrix: {}'.format(
    train_sparse_matrix.count_nonzero(), sample_train_sparse_matrix.count_nonzero()))

print('\nRatings in test_sparse_matrix: {}\nRatings in sample_test_sparse_matrix: {}'.format(
    test_sparse_matrix.count_nonzero(), sample_test_sparse_matrix.count_nonzero()))


Ratings in train_sparse_matrix: 80384405
Ratings in sample_train_sparse_matrix: 856986

Ratings in test_sparse_matrix: 20096102
Ratings in sample_test_sparse_matrix: 103997


In [7]:
# Release the full matrices and let the short names point at the samples.
# From this cell on, train_sparse_matrix / test_sparse_matrix are the SAMPLED matrices.
del train_sparse_matrix, test_sparse_matrix
train_sparse_matrix, test_sparse_matrix = sample_train_sparse_matrix, sample_test_sparse_matrix

In [8]:
# After the rebinding, each short name and its sample_* name must report the same count
print('Ratings in train_sparse_matrix: {}\nRatings in sample_train_sparse_matrix: {}'.format(
    train_sparse_matrix.count_nonzero(), sample_train_sparse_matrix.count_nonzero()))

print('\nRatings in test_sparse_matrix: {}\nRatings in sample_test_sparse_matrix: {}'.format(
    test_sparse_matrix.count_nonzero(), sample_test_sparse_matrix.count_nonzero()))

Ratings in train_sparse_matrix: 856986
Ratings in sample_train_sparse_matrix: 856986

Ratings in test_sparse_matrix: 103997
Ratings in sample_test_sparse_matrix: 103997


### 2. Feature Engineering

#### 2.1 Basic features based on statistics
Lets try to build some features that will be useful in modelling. Few such features could be:
1. Average of all ratings given
2. Average rating per user
3. Average rating per movie

We will create a dictionary train_averages which will store all these values. Like:<br>
train_averages = <br>
{<br>
'global_average' : xyz,<br>
'user' : { 'user1': abc, 'user2':pqr},<br>
'movie' : { 'movie1': abc, 'movie2':pqr}<br>
}

In [9]:
# Filled by the next cells with the keys 'global_average', 'user' and 'movie'
train_averages = {}

#### Global average

In [10]:
start = datetime.now()
# Mean over the stored non-zero ratings: total of all ratings divided by their count
train_averages.update(
    global_average=train_sparse_matrix.sum() / train_sparse_matrix.count_nonzero()
)
print(train_averages)
print(50 * "*")
print('Time taken: {}'.format(datetime.now() - start))

{'global_average': 3.5875813607223455}
**************************************************
Time taken: 0:00:00.020541


#### Average rating per user

In [11]:
start = datetime.now()
print('Computing average ratings per user...')

# Per-row (user) rating total and per-row count of non-zero ratings
sum_of_ratings_per_user = train_sparse_matrix.sum(axis=1).A1
no_of_ratings_per_user = (train_sparse_matrix!=0).sum(axis=1).A1

u,m = train_sparse_matrix.shape
# Only row indices with at least one rating get an entry, keyed by the row index
average_ratings = {
    user_idx: sum_of_ratings_per_user[user_idx]/no_of_ratings_per_user[user_idx]
    for user_idx in range(u)
    if no_of_ratings_per_user[user_idx] != 0
}

train_averages['user'] = average_ratings
print('Done!')
print(50 * "*")
print('Time taken: {}'.format(datetime.now() - start))

Computing average ratings per user...
Done!
**************************************************
Time taken: 0:00:00.494665


In [12]:
# Spot-check one entry of the per-user averages
print('Average rating given by user 14531 is:', train_averages['user'][14531])

Average rating given by user 14531 is: 3.7142857142857144


#### Average rating per movie

In [13]:
start = datetime.now()
print('Computing average ratings per movie...')

# Per-column (movie) rating total and per-column count of non-zero ratings
sum_of_ratings_per_movie = train_sparse_matrix.sum(axis=0).A1
no_of_ratings_per_movie = (train_sparse_matrix!=0).sum(axis=0).A1

u,m = train_sparse_matrix.shape
# Only column indices with at least one rating get an entry, keyed by the column index
average_ratings = {
    movie_idx: sum_of_ratings_per_movie[movie_idx]/no_of_ratings_per_movie[movie_idx]
    for movie_idx in range(m)
    if no_of_ratings_per_movie[movie_idx] != 0
}

train_averages['movie'] = average_ratings
print('Done!')
print(50 * "*")
print('Time taken: {}'.format(datetime.now() - start))

Computing average ratings per movie...
Done!
**************************************************
Time taken: 0:00:00.014997


In [14]:
# Spot-check one entry of the per-movie averages
print('Average rating given for movie 40 is:', train_averages['movie'][40])

Average rating given for movie 40 is: 2.6923076923076925


#### 2.2 Compute user-user similarity matrix

We know that there are more than 400k users. If we try to compute similarity using cosine similarity, we will need to do (400k * 400k / 2) = 80 billion computations, which will take days even though the computation only touches the non-zero entries. <br>
If we try to reduce number of dimensions using PCA or SVD, it will take even more time as the matrix will become dense and the multiplication will be done for each feature as there will not be zero cells. <br><br>

One of the ideas is to compute similarity at <b>run time</b>. Here, we will follow the strategy below to compute user-user similarity:<br>
1. We will compute similarity (top N) for given user (run time)
2. Once we compute similarity for any user, we will store this data in our customized data structure so that we can retrieve it whenever we want in future
3. The next time we want the similarity for any user, we first check whether it has already been computed for that particular user. If yes, we get it from our customized data structure. If not, we compute the similarity and store it in our customized data structure.
<br><br>
Our customized data structure will be a <b>dictionary of dictionaries</b><br>
{ 'user1': {'similar_user1': value1, 'similar_user2': value2, 'similar_user3': value3},<br>
  'user2': {'similar_user1': value1, 'similar_user2': value2, 'similar_user3': value3} }
  
<br>
For now, since we have taken sample of users, lets compute similarity using cosine_similarity which wouldn't take much time.<br>
<b>From the result of this, we will pick top 5 similar users and take them as features to train our model.</b>

In [15]:
from sklearn.metrics.pairwise import cosine_similarity

In [16]:
start = datetime.now()
# User-user cosine similarity is cached on disk as a sparse .npz; compute it only when the file is missing
if os.path.isfile(transformed_folder+'/u_u_similarity_sparse.npz'):
    print('Sparse matrix for user-user similarity already exists, getting it from disk... ')
    u_u_similarity_sparse = sparse.load_npz(transformed_folder+'/u_u_similarity_sparse.npz')
    print('Done!')
    print('Time taken; {}'.format(datetime.now() - start))
else:
    print('Creating sparse matrix for user-user similarity...')
    # Rows of train_sparse_matrix are users, so this yields a users x users matrix.
    # dense_output=False asks sklearn to keep the result sparse for sparse input
    # (the previous `Falseb` was an undefined name and raised NameError on this branch).
    u_u_similarity_sparse = cosine_similarity(X=train_sparse_matrix, dense_output=False)
    print('Saving to {}'.format(transformed_folder+'/u_u_similarity_sparse.npz'))
    sparse.save_npz(transformed_folder+'/u_u_similarity_sparse.npz', u_u_similarity_sparse)
    print('Done!')
    print('Time taken: {}'.format(datetime.now() - start))

Sparse matrix for user-user similarity already exists, getting it from disk... 
Done!
Time taken; 0:00:31.186462


In [17]:
# The similarity matrix is square: one row and one column per user index
print('Shape of user-user similarity matrix:', u_u_similarity_sparse.shape)

Shape of user-user similarity matrix: (2649405, 2649405)


In [18]:
# Get unique user IDs: the distinct column indices that hold at least one
# non-zero similarity value (nonzero() returns a (rows, cols) pair; [1] is cols)
user_ids = np.unique(u_u_similarity_sparse.nonzero()[1])

In [19]:
# Count of user indices that appear in the similarity matrix
print('Total number of unique users:', len(user_ids))

Total number of unique users: 24029


#### 2.3 Compute movie-movie similarity matrix

We know that there are around 17k movies in total. If we try to compute cosine similarity, we will need to do (17k * 17k / 2) = 144 million computations, which would not take long. In this case, we can definitely go with cosine similarity to get similar movies for each movie.

<br>
<b>From the result of this, we will pick top 5 similar movies and take them as features to train our model.</b>

In [20]:
start = datetime.now()
# Movie-movie cosine similarity: computed once, then reused from the cached .npz
if not os.path.isfile(transformed_folder+'/m_m_similarity_sparse.npz'):
    print('Creating sparse matrix for movie-movie similarity...')
    # Transposing puts movies on the rows, so the result is movies x movies
    m_m_similarity_sparse = cosine_similarity(X=train_sparse_matrix.T, dense_output=False)
    print('Saving to {}'.format(transformed_folder+'/m_m_similarity_sparse.npz'))
    sparse.save_npz(transformed_folder+'/m_m_similarity_sparse.npz', m_m_similarity_sparse)
    print('Done!')
    print('Time taken: {}'.format(datetime.now() - start))
else:
    print('Sparse matrix for movie-movie similarity already exists, getting it from disk... ')
    m_m_similarity_sparse = sparse.load_npz(transformed_folder+'/m_m_similarity_sparse.npz')
    print('Done!')
    print('Time taken; {}'.format(datetime.now() - start))

Creating sparse matrix for movie-movie similarity...
Saving to F:/09 - Machine Learning Case Studies/01 - Netflix Movies Recommendation/Transformed Data/m_m_similarity_sparse.npz
Done!
Time taken: 0:00:04.083615


In [21]:
# The similarity matrix is square: one row and one column per movie index
print('Shape of movie-movie similarity matrix:', m_m_similarity_sparse.shape)

Shape of movie-movie similarity matrix: (17755, 17755)


In [22]:
# Get unique movie IDs which have at least 1 rating: the distinct column indices
# that hold a non-zero similarity value (nonzero() returns (rows, cols); [1] is cols)
movie_ids = np.unique(m_m_similarity_sparse.nonzero()[1])

In [24]:
# Count of movie indices that appear in the similarity matrix
print('Total number of unique movies:', len(movie_ids))

Total number of unique movies: 2958


### 3. Featurizing/Preparing Datasets

As of now, we have decided to have features to train our model on:
1. Global average
2. Average rating by the user
3. Average rating for the movie
4. Ratings received from top 5 similar users for the movie
5. Ratings given for top 5 similar movies from the user

In total, we have 13 features to train our base model on. Lets featurize these to bring them in a required format.

In [27]:
# One featurized row will be produced per stored rating
print('No of ratings in train matrix is :', train_sparse_matrix.count_nonzero())
print('No of ratings in test matrix is  :', test_sparse_matrix.count_nonzero())

No of ratings in train matrix is : 856986
No of ratings in test matrix is  : 103997


#### 3.1 Featurize/prepare train data

In [28]:
# Get users, movies and ratings from our train sparse matrix.
# sparse.find returns three parallel arrays (row indices, column indices, values)
# for the non-zero entries; rows are users and columns are movies here.
train_users, train_movies, train_ratings = sparse.find(train_sparse_matrix)
print('Operation complete!')

Operation complete!


In [77]:
start = datetime.now()
model_data_folder = 'C:/Users/ParikshitShinge/Downloads/09 - Machine Learning Case Studies/01 - Netflix Movies Recommendation/Model Ready Data'
if os.path.isfile(model_data_folder+'/featurized_train.csv'):
    print('Featurized train already exists.')
else:
    # Report the NUMBER of ratings, not the whole ratings array
    print('Preparing featurized_train.csv file for {} ratings...'.format(len(train_ratings)))

    # The key stored earlier in the notebook is 'global_average' (not 'global')
    global_average = train_averages['global_average']

    # Write to a side file and rename it only after the loop has finished, so that an
    # interrupted run never leaves a truncated CSV that the isfile() guard above
    # would later treat as complete.
    featurized_train_path = model_data_folder+'/featurized_train.csv'
    partial_train_path = featurized_train_path+'.partial'

    with open(partial_train_path, mode='w') as featurized_train_file:
        user_counter = 0

        # One output row per (user, movie, rating) triplet of the train matrix
        for (user, movie, rating) in zip(train_users, train_movies, train_ratings):

            # Ratings given to this movie by the users most similar to `user`.
            # argsort()[::-1] orders users by decreasing similarity; [1:] drops the first
            # entry, which is expected to be the user itself.
            top_sim_users = u_u_similarity_sparse[user].toarray().ravel().argsort()[::-1][1:]
            top_ratings = train_sparse_matrix[top_sim_users, movie].toarray().ravel()
            top_sim_users_ratings = list(top_ratings[top_ratings != 0][:5])
            # Pad up to 5 values with the movie's average rating
            top_sim_users_ratings.extend([train_averages['movie'][movie]] * (5 - len(top_sim_users_ratings)))

            # Ratings given by this user to the movies most similar to `movie`
            top_sim_movies = m_m_similarity_sparse[movie].toarray().ravel().argsort()[::-1][1:]
            top_ratings = train_sparse_matrix[user, top_sim_movies].toarray().ravel()
            top_sim_movies_ratings = list(top_ratings[top_ratings != 0][:5])
            # Pad up to 5 values with the user's average rating
            top_sim_movies_ratings.extend([train_averages['user'][user]] * (5 - len(top_sim_movies_ratings)))

            # Column order must match the names used when the file is read back:
            # user, movie, GAvg, sur1..sur5, smr1..smr5, UAvg, MAvg, rating
            row = [user, movie, global_average]
            row.extend(top_sim_users_ratings)
            row.extend(top_sim_movies_ratings)
            row.append(train_averages['user'][user])
            row.append(train_averages['movie'][movie])
            row.append(rating)

            # Row counter used only for progress output
            user_counter = user_counter + 1

            featurized_train_file.write(','.join(map(str, row)))
            featurized_train_file.write('\n')
            if user_counter%1000 == 0:
                print('Done for {} rows in {}'.format(user_counter, datetime.now()-start ))

    # All rows written: publish the file under its final name
    os.replace(partial_train_path, featurized_train_path)

print('Time taken: {}'.format(datetime.now()-start))

Featurized train already exists.
Time taken: {} 0:00:00.001003


In [79]:
# Column legend for the featurized file (it has no header row):
#   GAvg       : average of all ratings
#   sur1..sur5 : ratings of this movie by the top 5 similar users who rated it
#   smr1..smr5 : ratings by this user of the top 5 similar movies they rated
#   UAvg       : this user's average rating
#   MAvg       : this movie's average rating
#   rating     : rating of this movie by this user
featurized_train = pd.read_csv(
    model_data_folder+'/featurized_train.csv',
    header=None,
    names=[
        'user', 'movie', 'GAvg',
        'sur1', 'sur2', 'sur3', 'sur4', 'sur5',
        'smr1', 'smr2', 'smr3', 'smr4', 'smr5',
        'UAvg', 'MAvg', 'rating',
    ],
)
featurized_train.head()

Unnamed: 0,user,movie,GAvg,sur1,sur2,sur3,sur4,sur5,smr1,smr2,smr3,smr4,smr5,UAvg,MAvg,rating
0,53406,33,3.581679,4.0,5.0,5.0,4.0,1.0,5.0,2.0,5.0,3.0,1.0,3.37037,4.092437,4
1,99540,33,3.581679,5.0,5.0,5.0,4.0,5.0,3.0,4.0,4.0,3.0,5.0,3.555556,4.092437,3
2,99865,33,3.581679,5.0,5.0,4.0,5.0,3.0,5.0,4.0,4.0,5.0,4.0,3.714286,4.092437,5
3,101620,33,3.581679,2.0,3.0,5.0,5.0,4.0,4.0,3.0,3.0,4.0,5.0,3.584416,4.092437,5
4,112974,33,3.581679,5.0,5.0,5.0,5.0,5.0,3.0,5.0,5.0,5.0,3.0,3.75,4.092437,5


#### 3.2 Featurize/prepare test data

Here, we will face the cold-start problem for users & movies that are not present in the train dataset. We will catch these cases using error handling and assign the global average to all the affected features, since we do not have any historical data for them.

In [80]:
# Get users, movies and ratings from our test sparse matrix.
# sparse.find returns three parallel arrays (row indices, column indices, values)
# for the non-zero entries; rows are users and columns are movies here.
test_users, test_movies, test_ratings = sparse.find(test_sparse_matrix)
print('Operation complete!')

Operation complete!


In [81]:
start = datetime.now()

if os.path.isfile(model_data_folder+'/featurized_test.csv'):
    print('Featurized test already exists.')
else:
    # Report the NUMBER of ratings, not the whole ratings array
    print('Preparing featurized_test.csv file for {} ratings...'.format(len(test_ratings)))

    # The key stored earlier in the notebook is 'global_average' (not 'global')
    global_average = train_averages['global_average']

    # Write to a side file and rename it only after the loop has finished, so that an
    # interrupted run never leaves a truncated CSV that the isfile() guard above
    # would later treat as complete.
    featurized_test_path = model_data_folder+'/featurized_test.csv'
    partial_test_path = featurized_test_path+'.partial'

    with open(partial_test_path, mode='w') as featurized_test_file:
        user_counter = 0

        # One output row per (user, movie, rating) triplet of the test matrix.
        # All features are looked up in TRAIN data only.
        for (user, movie, rating) in zip(test_users, test_movies, test_ratings):

            # Start from an empty list on every row so that the cold-start handler never
            # pads (or reuses) values left over from the previous iteration.
            top_sim_users_ratings = []
            try:
                # Ratings given to this movie by the users most similar to `user`
                top_sim_users = u_u_similarity_sparse[user].toarray().ravel().argsort()[::-1][1:]
                top_ratings = train_sparse_matrix[top_sim_users, movie].toarray().ravel()
                top_sim_users_ratings = list(top_ratings[top_ratings != 0][:5])
                # Pad up to 5 values with the movie's average rating
                top_sim_users_ratings.extend([train_averages['movie'][movie]] * (5 - len(top_sim_users_ratings)))
            except (IndexError, KeyError):
                # COLD START PROBLEM: user/movie index unknown to train data -> global average
                top_sim_users_ratings.extend([global_average] * (5 - len(top_sim_users_ratings)))
            except Exception:
                print('Exception for {} user and {} movie'.format(user, movie))
                raise

            top_sim_movies_ratings = []
            try:
                # Ratings given by this user to the movies most similar to `movie`
                top_sim_movies = m_m_similarity_sparse[movie].toarray().ravel().argsort()[::-1][1:]
                top_ratings = train_sparse_matrix[user, top_sim_movies].toarray().ravel()
                top_sim_movies_ratings = list(top_ratings[top_ratings != 0][:5])
                # Pad up to 5 values with the user's average rating
                top_sim_movies_ratings.extend([train_averages['user'][user]] * (5 - len(top_sim_movies_ratings)))
            except (IndexError, KeyError):
                # COLD START PROBLEM: pad the MOVIE-based features (the original padded the
                # user-based list here, leaving this one stale or undefined)
                top_sim_movies_ratings.extend([global_average] * (5 - len(top_sim_movies_ratings)))
            except Exception:
                print('Exception for {} user and {} movie'.format(user, movie))
                raise

            # Users / movies without a train average fall back to the global average
            user_average = train_averages['user'].get(user, global_average)
            movie_average = train_averages['movie'].get(movie, global_average)

            # Column order must match the names used when the file is read back:
            # user, movie, GAvg, sur1..sur5, smr1..smr5, UAvg, MAvg, rating
            row = [user, movie, global_average]
            row.extend(top_sim_users_ratings)
            row.extend(top_sim_movies_ratings)
            row.append(user_average)
            row.append(movie_average)
            row.append(rating)

            # Row counter used only for progress output
            user_counter = user_counter + 1

            featurized_test_file.write(','.join(map(str, row)))
            featurized_test_file.write('\n')
            if user_counter%1000 == 0:
                print('Done for {} rows in {}'.format(user_counter, datetime.now()-start ))

    # All rows written: publish the file under its final name
    os.replace(partial_test_path, featurized_test_path)

print('Time taken: {}'.format(datetime.now()-start))

Featurized test already exists.
Time taken: {} 0:00:00


In [82]:
# Column legend for the featurized file (it has no header row):
#   GAvg       : average of all ratings
#   sur1..sur5 : ratings of this movie by the top 5 similar users who rated it
#   smr1..smr5 : ratings by this user of the top 5 similar movies they rated
#   UAvg       : this user's average rating
#   MAvg       : this movie's average rating
#   rating     : rating of this movie by this user
featurized_test = pd.read_csv(
    model_data_folder+'/featurized_test.csv',
    header=None,
    names=[
        'user', 'movie', 'GAvg',
        'sur1', 'sur2', 'sur3', 'sur4', 'sur5',
        'smr1', 'smr2', 'smr3', 'smr4', 'smr5',
        'UAvg', 'MAvg', 'rating',
    ],
)
featurized_test.head()

Unnamed: 0,user,movie,GAvg,sur1,sur2,sur3,sur4,sur5,smr1,smr2,smr3,smr4,smr5,UAvg,MAvg,rating
0,808635,71,3.581679,3.581679,3.581679,3.581679,3.581679,3.581679,3.581679,3.581679,3.581679,3.581679,3.581679,3.581679,3.581679,5
1,941866,71,3.581679,3.581679,3.581679,3.581679,3.581679,3.581679,3.581679,3.581679,3.581679,3.581679,3.581679,3.581679,3.581679,4
2,1737912,71,3.581679,3.581679,3.581679,3.581679,3.581679,3.581679,3.581679,3.581679,3.581679,3.581679,3.581679,3.581679,3.581679,3
3,1849204,71,3.581679,3.581679,3.581679,3.581679,3.581679,3.581679,3.581679,3.581679,3.581679,3.581679,3.581679,3.581679,3.581679,4
4,28572,111,3.581679,3.581679,3.581679,3.581679,3.581679,3.581679,3.581679,3.581679,3.581679,3.581679,3.581679,3.581679,3.581679,1
