In [18]:
import pandas as pd
import numpy as np

def fit(movies_path, reviews_path, latent_features = 15, learning_rate = 0.001, iters = 5):
    """
    Factorize the user-by-movie rating matrix with stochastic gradient descent.

    The reviews CSV is pivoted into a user-by-movie matrix (one row per
    user_id, one column per movie_id; when a user rated a movie more than
    once the highest rating is kept).  Two randomly initialised factor
    matrices are then adjusted, one observed rating at a time, so that
    ``user_mat @ movie_mat`` approximates the observed ratings.

    Parameters
    ----------
    movies_path : str
        Path to the movies CSV.  The file is read, but its contents are not
        used by the factorization.
    reviews_path : str
        Path to the reviews CSV.  Must contain ``user_id``, ``movie_id`` and
        ``rating`` columns.
    latent_features : int
        Number of latent features to learn per user and per movie.
    learning_rate : float
        Step size of each gradient update.
    iters : int
        Number of full passes over the observed ratings.

    Returns
    -------
    user_mat : numpy.ndarray, shape (n_users, latent_features)
    movie_mat : numpy.ndarray, shape (latent_features, n_movies)

    Notes
    -----
    * Nothing is written to disk; the learned matrices are only returned.
    * A per-iteration mean squared error table is printed to stdout.
    * Only strictly positive ratings are treated as observed; missing (NaN)
      and non-positive entries are skipped, and the printed mean is taken
      over exactly the entries that were used.
    * Initialisation draws from numpy's global random state, so seed it
      (``np.random.seed``) beforehand for reproducible results.
    """

    # Read in the data.  The movies table is not needed below; it is still
    # read so that an invalid movies_path keeps failing here as it did before.
    pd.read_csv(movies_path)
    reviews = pd.read_csv(reviews_path)

    # Create user-by-item matrix (max rating per user/movie pair, NaN where unrated)
    train_data_df = reviews.groupby(['user_id', 'movie_id'])['rating'].max().unstack()
    train_data_np = np.array(train_data_df)

    # Set up useful values to be used through the rest of the function
    n_users, n_movies = train_data_np.shape

    # Locate the observed ratings once instead of scanning every cell on every
    # iteration.  argwhere yields indices in row-major order, i.e. the same
    # visiting order as nested loops over users then movies.  NaN > 0 is False,
    # so missing ratings are excluded; errstate silences the NaN-comparison
    # warning some numpy versions emit.
    with np.errstate(invalid='ignore'):
        observed = np.argwhere(train_data_np > 0).tolist()
    num_ratings = len(observed)

    # initialize the user and movie matrices with random values
    user_mat = np.random.rand(n_users, latent_features)
    movie_mat = np.random.rand(latent_features, n_movies)

    # keep track of iteration and MSE
    print("Optimizaiton Statistics")
    print("Iterations | Mean Squared Error ")

    # for each iteration
    for iteration in range(iters):

        # reset the sum of squared errors for this pass
        sse_accum = 0

        # For each observed user-movie pair
        for i, j in observed:

            # compute the error as the actual minus the dot product of the user and movie latent features
            diff = train_data_np[i, j] - np.dot(user_mat[i, :], movie_mat[:, j])

            # Keep track of the sum of squared errors for the matrix
            sse_accum += diff**2

            # update the values in each matrix in the direction of the gradient.
            # The user row is updated first and the movie column then uses the
            # already-updated user row, matching a per-feature sequential update.
            user_mat[i, :] += learning_rate * (2*diff*movie_mat[:, j])
            movie_mat[:, j] += learning_rate * (2*diff*user_mat[i, :])

        # print results; with no observed ratings there is no error to average
        mse = sse_accum / num_ratings if num_ratings else float('nan')
        print("%d \t\t %f" % (iteration+1, mse))

    return user_mat, movie_mat

In [19]:
# Train for two passes; the returned (user_mat, movie_mat) pair is the cell's output
fit(
    'movies_clean.csv',
    'train_data.csv',
    latent_features = 15,
    learning_rate = 0.001,
    iters = 2,
)

Optimizaiton Statistics
Iterations | Mean Squared Error 
1 		 14.242965
2 		 10.917766


(array([[0.38091877, 0.66318563, 0.34483564, ..., 0.21072541, 0.08967444,
         0.13108305],
        [0.6839765 , 0.23386924, 0.65417004, ..., 0.12132513, 0.02744722,
         0.27525867],
        [0.2205001 , 0.95745904, 0.56990932, ..., 0.26459268, 0.64142633,
         0.19139394],
        ...,
        [0.16630826, 0.21709987, 0.65121239, ..., 0.35416181, 0.19992449,
         0.0320086 ],
        [0.27611047, 0.17203368, 0.21686036, ..., 0.68470673, 0.29908348,
         0.57858831],
        [0.56165498, 0.91313142, 0.58034851, ..., 0.73235052, 0.25482396,
         0.89422438]]),
 array([[0.89212487, 0.58005218, 0.76776821, ..., 0.43139757, 0.770534  ,
         0.2030816 ],
        [0.73659805, 0.56473347, 0.67906327, ..., 0.79942911, 0.02690398,
         0.1824379 ],
        [0.77233877, 0.71233165, 0.54843072, ..., 0.10303335, 0.18032889,
         0.74970433],
        ...,
        [0.31652091, 0.82639409, 0.40062314, ..., 0.34190206, 0.32291712,
         0.49455586],
        [0.3

In [9]:
# Load the training reviews
reviews = pd.read_csv('train_data.csv')

# Build the user-by-movie rating matrix: one row per user_id, one column per
# movie_id, holding the highest rating for each pair and NaN where unrated
train_user_item = reviews.loc[:, ['user_id', 'movie_id', 'rating', 'timestamp']]
train_data_df = (
    train_user_item
    .groupby(['user_id', 'movie_id'])['rating']
    .max()
    .unstack()
)

# Independent numpy copy of the pivoted frame's values
train_data_np = train_data_df.to_numpy(copy=True)




array([[nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       ...,
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan]])