# Movie recommendation using Collaborative Filtering

In [1]:
import numpy as np
import pandas as pd

## Encoding rating data
Working with a small dataset to make sure the functions work

In [2]:
# The first row says that user 11 rated movie 1 with a score of 4
!cat tiny_training2.csv 

userId,movieId,rating
11,1,4
11,23,5
2,23,5
2,4,3
31,1,4
31,23,4
4,1,5
4,3,2
52,1,1
52,3,4
61,3,5
7,23,1
7,3,3


In [3]:
def proc_col(col):
    """Map the distinct values of a pandas column to contiguous integer ids.

    Ids are assigned by enumerating `col.unique()`, starting at 0.

    Returns a 3-tuple:
      - dict mapping each distinct value to its id,
      - numpy array with the id of every entry of `col`, in the same order,
      - the number of distinct values.
    """
    distinct_values = col.unique()
    value_to_id = dict(zip(distinct_values, range(len(distinct_values))))
    encoded = np.array([value_to_id[value] for value in col])
    return value_to_id, encoded, len(distinct_values)

In [12]:
def encode_data(df):
    """
    Encodes rating data with continuous user and movie ids.

    The input frame is not modified: a new DataFrame is returned in which the
    'userId' and 'movieId' columns are replaced by the contiguous ids produced
    by `proc_col`; every other column is carried over as is.

    Arguments:
      df: DataFrame with (at least) 'userId' and 'movieId' columns.

    Returns:
      (encoded_df, num_users, num_movies) where num_users / num_movies are the
      number of distinct user / movie ids found in df.
    """
    _, user_ids, num_users = proc_col(df['userId'])
    _, movie_ids, num_movies = proc_col(df['movieId'])

    # assign() builds a new frame, so the caller keeps its raw ids and no
    # chained-assignment warning is raised when `df` is a slice of another frame.
    encoded = df.assign(userId=user_ids, movieId=movie_ids)

    return encoded, num_users, num_movies

In [13]:
# Load the toy ratings file and replace the raw user/movie ids with contiguous
# indices; num_users / num_movies are the number of distinct ids found.
df = pd.read_csv("tiny_training2.csv")
df, num_users, num_movies = encode_data(df)

In [14]:
# look at the data after encoding: userId and movieId now hold the contiguous
# ids assigned by encode_data
df

Unnamed: 0,userId,movieId,rating
0,0,0,4
1,0,1,5
2,1,1,5
3,1,2,3
4,2,0,4
5,2,1,4
6,3,0,5
7,3,3,2
8,4,0,1
9,4,3,4


## Initializing parameters

In [None]:
def create_embedings(n, K, seed=3):
    """
    Create a numpy random matrix of shape (n, K).

    Entries are uniform draws from [0, 1) scaled by 6/K.

    Arguments:
      n: number of rows (one per user or per movie).
      K: embedding size (number of columns).
      seed: value passed to np.random.seed before sampling (default 3, which
        keeps the results reproducible). Pass None to leave the global numpy
        RNG state untouched and draw from wherever it currently is.

    Returns:
      A numpy array of shape (n, K).

    Note: because the global RNG is reseeded on every call, two calls with the
    same seed and the same K produce matrices whose leading rows are identical.
    Use different seeds (or seed=None) if independent matrices are needed.
    """
    if seed is not None:
        np.random.seed(seed)
    emb = 6*np.random.random((n, K)) / K
    return emb

## Encoding users and movie combinations as a sparse matrix 

In [20]:
from scipy import sparse
def df2matrix(df, nrows, ncols, column_name="rating"):
    """
    Build a sparse (CSC) matrix from a ratings DataFrame.

    Row indices are read from 'userId', column indices from 'movieId' and the
    stored values from `column_name`; the result has shape (nrows, ncols).
    """
    row_idx = df['userId'].values
    col_idx = df['movieId'].values
    entries = df[column_name].values
    coordinates = (row_idx, col_idx)
    return sparse.csc_matrix((entries, coordinates), shape=(nrows, ncols))

In [21]:
# Sparse matrix of the known ratings: one row per user, one column per movie
Y = df2matrix(df, num_users, num_movies)

In [22]:
# Each printed line is a stored entry: (user index, movie index) and its rating
print(Y)

  (0, 0)	4
  (2, 0)	4
  (3, 0)	5
  (4, 0)	1
  (0, 1)	5
  (1, 1)	5
  (2, 1)	4
  (6, 1)	1
  (1, 2)	3
  (3, 3)	2
  (4, 3)	4
  (5, 3)	5
  (6, 3)	3


In [23]:
def sparse_multiply(df, emb_user, emb_movie):
    """
    Evaluates U*V^T only at the (user, movie) pairs listed in df and returns
    the result as a sparse matrix.

    Arguments:
      df: DataFrame with encoded 'userId' and 'movieId' columns (row indices
        into emb_user and emb_movie respectively). It is not modified.
      emb_user: array with one embedding row per user.
      emb_movie: array with one embedding row per movie.

    Returns:
      A scipy CSC matrix of shape (emb_user.shape[0], emb_movie.shape[0]) whose
      stored entries are the dot products of the matching user and movie rows.
    """
    user_idx = df["userId"].values
    movie_idx = df["movieId"].values

    # Row-wise dot product: one prediction per row of df.
    predictions = np.sum(emb_user[user_idx]*emb_movie[movie_idx], axis=1)

    # Build the matrix straight from the computed values instead of writing
    # them into df first, so the caller's DataFrame is left untouched.
    shape = (emb_user.shape[0], emb_movie.shape[0])
    return sparse.csc_matrix((predictions, (user_idx, movie_idx)), shape=shape)

## Calculating the cost function

In [69]:
# calculating cost of the prediction as MSE
def cost(df, emb_user, emb_movie):
    """
    Mean squared error between the ratings in df and the model predictions.

    Only the (user, movie) pairs present in df contribute: the squared
    residuals are summed and divided by the number of rows of df.
    """
    n_users, n_movies = emb_user.shape[0], emb_movie.shape[0]
    predicted = sparse_multiply(df, emb_user, emb_movie)
    observed = df2matrix(df, n_users, n_movies)
    squared_residual = (observed - predicted).power(2)
    return squared_residual.sum() / len(df)

Implementing gradient descent with momentum from scratch to optimize the embedding weights for users and movies.

## Calculating gradient

In [78]:
def gradient(df, Y, emb_user, emb_movie):
    """
    Gradient of the mean squared error with respect to both embeddings.

    Y is the sparse matrix of observed ratings; predictions are evaluated at
    the (user, movie) pairs listed in df.

    Returns (d_emb_user, d_emb_movie), shaped like emb_user and emb_movie.
    """
    residual = Y - sparse_multiply(df, emb_user, emb_movie)
    scale = -2/len(df)

    grad_wrt_user = scale * residual.dot(emb_movie)
    grad_wrt_movie = scale * residual.T.dot(emb_user)
    return grad_wrt_user, grad_wrt_movie

## Using gradient descent with momentum

In [171]:
def gradient_descent(df, emb_user, emb_movie, iterations=100, learning_rate=0.01, df_val=None, print_every=50):
    """ Computes gradient descent with momentum (0.9) for a number of iterations.

    Arguments:
      df: encoded training ratings.
      emb_user, emb_movie: initial embeddings; new arrays are returned, the
        inputs are not updated in place.
      iterations: number of update steps.
      learning_rate: step size applied to the velocity.
      df_val: optional encoded validation ratings; when given, its cost is
        printed alongside the training cost.
      print_every: costs are printed after every `print_every` iterations
        (default 50). Pass 0 or None to disable printing.

    Returns:
      (emb_user, emb_movie) after the last update.
    """
    Y = df2matrix(df, emb_user.shape[0], emb_movie.shape[0])

    vu = vm = None
    for i in range(iterations):
        grad_user, grad_movie = gradient(df, Y, emb_user, emb_movie)
        if vu is None:
            # The velocity starts at the first gradient. Initialising it here
            # avoids evaluating the same gradient twice before the first step
            # and does no work at all when iterations == 0.
            vu, vm = grad_user, grad_movie
        vu = 0.9*vu+0.1*grad_user
        vm = 0.9*vm+0.1*grad_movie

        emb_user = emb_user - learning_rate*vu
        emb_movie = emb_movie - learning_rate*vm

        if print_every and (i + 1) % print_every == 0:
            print('Training cost: '+str(cost(df,emb_user, emb_movie)))
            if df_val is not None:
                print('Validation cost: '+str(cost(df_val,emb_user, emb_movie)))

    return emb_user, emb_movie

## Predicting on new data

In [163]:
# encode validation/test data with the same encoding as the training data
def encode_new_data(df_val, df_train):
    """
    Encodes df_val with the same encoding as df_train.

    Rows of df_val whose user or movie does not appear in df_train are
    dropped, since there is no id to map them to. Neither input is modified.

    Arguments:
      df_val: DataFrame with raw 'userId' and 'movieId' columns.
      df_train: the raw (un-encoded) training DataFrame that defines the
        encoding.

    Returns:
      A new DataFrame containing the remaining rows of df_val, with 'userId'
      and 'movieId' replaced by the ids used for the training data.
    """
    name2idx_u = proc_col(df_train['userId'])[0]
    name2idx_m = proc_col(df_train['movieId'])[0]

    # The mapping keys are exactly the ids seen in training.
    known = (df_val['userId'].isin(list(name2idx_u))
             & df_val['movieId'].isin(list(name2idx_m)))

    # Copy after filtering so the column assignments below write to an
    # independent frame rather than to a slice of df_val.
    df_val2 = df_val[known].copy()

    df_val2['userId'] = df_val2['userId'].map(name2idx_u)
    df_val2['movieId'] = df_val2['movieId'].map(name2idx_m)

    return df_val2

## Recommending movies
We are getting the data for this exercise from http://files.grouplens.org/datasets/movielens/ml-latest-small.zip

In [169]:
# Load the MovieLens ratings (path is relative to the notebook's working directory)
path = "ml-latest-small/"
data = pd.read_csv(path + "ratings.csv")

# Create training and validation sets: a row goes to train when its uniform
# draw is below 0.8, otherwise to validation. The seed keeps the split fixed.
np.random.seed(3)
msk = np.random.rand(len(data)) < 0.8
train = data[msk].copy()
val = data[~msk].copy()

# Encode data. Copies are passed so that `train` keeps its raw ids, which
# encode_new_data needs in order to rebuild the training encoding.
# The second printed number is smaller because encode_new_data drops
# validation rows whose user or movie does not appear in train.
df_train, num_users, num_movies = encode_data(train.copy())
df_val = encode_new_data(val.copy(), train.copy())
print(len(val), len(df_val))

20205 19507


In [172]:
# Train the user and movie embeddings with gradient descent; training and
# validation costs are printed periodically while it runs.
# NOTE: create_embedings reseeds numpy with the same fixed seed on every call,
# so both matrices are filled from the same random sequence.
K = 50 # embedding size
emb_user = create_embedings(num_users, K)
emb_movie = create_embedings(num_movies, K)
emb_user, emb_movie = gradient_descent(df_train, emb_user, emb_movie, iterations=2000, learning_rate=1, df_val=df_val)

Training cost: 9.98836344217
Validation cost: 10.1253120833
Training cost: 7.22565025945
Validation cost: 7.36167540124
Training cost: 5.22643461835
Validation cost: 5.35719756062
Training cost: 4.07667755518
Validation cost: 4.20134257812
Training cost: 3.34174315552
Validation cost: 3.45519954423
Training cost: 2.83263058825
Validation cost: 2.93606678409
Training cost: 2.46393754738
Validation cost: 2.56054267977
Training cost: 2.18758530287
Validation cost: 2.27995700876
Training cost: 1.97441970687
Validation cost: 2.06435350736
Training cost: 1.80592801063
Validation cost: 1.89464507847
Training cost: 1.66992604337
Validation cost: 1.75827074408
Training cost: 1.55815302884
Validation cost: 1.64672176183
Training cost: 1.46485885392
Validation cost: 1.55408138155
Training cost: 1.3859424916
Validation cost: 1.47613410534
Training cost: 1.3184119916
Validation cost: 1.40980777706
Training cost: 1.26003836732
Validation cost: 1.35281624887
Training cost: 1.20912909979
Validation co

In [173]:
# report the final mean squared error on the training and validation sets
train_mse = cost(df_train, emb_user, emb_movie)
val_mse = cost(df_val, emb_user, emb_movie)
print(train_mse, val_mse)

0.766573002646 0.910427187685
