In [62]:
import pandas as pd
import numpy as np

# Load the raw data first: the user-by-item matrix below is derived from
# `reviews`, so reading the CSVs afterwards only works with leftover kernel
# state and fails with a NameError after Restart & Run All.
movies = pd.read_csv('movies_clean.csv')
reviews = pd.read_csv('train_data.csv')

# Create user-by-item matrix: one row per user_id, one column per movie_id,
# NaN where a user has not rated a movie. If a user rated the same movie more
# than once, the highest rating is kept.
train_user_item = reviews[['user_id', 'movie_id', 'rating', 'timestamp']]
train_data_df = train_user_item.groupby(['user_id', 'movie_id'])['rating'].max().unstack()
train_data_np = np.array(train_data_df)

def fit(movies=movies, reviews=reviews, latent_features = 15, learning_rate = 0.001, iters = 5):
    """
    Fit a FunkSVD-style matrix factorization to the ratings in `reviews`.

    The user-by-item matrix is built from the `reviews` argument (one row per
    user_id, one column per movie_id, max rating per pair, NaN where unrated),
    so the function trains on the data it is given rather than on a
    module-level array.

    Parameters
    ----------
    movies : DataFrame
        Accepted for interface compatibility; not used during fitting.
    reviews : DataFrame
        Must contain 'user_id', 'movie_id' and 'rating' columns.
    latent_features : int
        Number of latent features per user / movie.
    learning_rate : float
        Step size of each stochastic gradient update.
    iters : int
        Number of full passes over the observed ratings.

    Returns
    -------
    user_mat : ndarray, shape (n_users, latent_features)
    movie_mat : ndarray, shape (latent_features, n_movies)
        Rows of user_mat / columns of movie_mat follow the sorted user_id /
        movie_id order produced by groupby().unstack().

    Side effects: prints the mean squared error after every iteration.
    """

    # Read in the data: build the user-by-item matrix from the reviews passed in
    user_item_df = reviews.groupby(['user_id', 'movie_id'])['rating'].max().unstack()
    ratings_np = np.array(user_item_df, dtype=float)

    # Set up useful values to be used through the rest of the function
    n_users, n_movies = ratings_np.shape

    # Only cells holding a rating take part in training. NaN > 0 is False, so
    # unrated cells drop out; argwhere yields (row, col) pairs in row-major
    # order, i.e. the same visiting order as a nested user/movie loop.
    with np.errstate(invalid='ignore'):
        rated_pairs = np.argwhere(ratings_np > 0)
    num_ratings = len(rated_pairs)

    # initialize the user and movie matrices with random values
    user_mat = np.random.rand(n_users, latent_features)
    movie_mat = np.random.rand(latent_features, n_movies)

    # keep track of iteration and MSE
    print("Optimizaiton Statistics")
    print("Iterations | Mean Squared Error ")

    for iteration in range(iters):

        # reset the sum of squared errors for this pass
        sse_accum = 0

        for i, j in rated_pairs:

            # error = actual rating minus the dot product of the latent features
            diff = ratings_np[i, j] - np.dot(user_mat[i, :], movie_mat[:, j])
            sse_accum += diff**2

            # Take both gradient steps from the same point: keep the user
            # vector as it was before its own update so the movie update does
            # not see the already-modified values.
            user_vec = user_mat[i, :].copy()
            user_mat[i, :] += learning_rate * (2*diff*movie_mat[:, j])
            movie_mat[:, j] += learning_rate * (2*diff*user_vec)

        # print results (MSE is undefined when there is nothing to train on)
        mse = sse_accum / num_ratings if num_ratings else float('nan')
        print("%d \t\t %f" % (iteration+1, mse))

    return user_mat, movie_mat 

In [63]:
# Short two-pass training run; the factor matrices are kept at notebook level
# so later cells can use them for predictions.
user_mat, movie_mat = fit(
    movies,
    reviews,
    latent_features=15,
    learning_rate=0.001,
    iters=2,
)

Optimizaiton Statistics
Iterations | Mean Squared Error 
1 		 14.065378
2 		 10.806885


In [64]:
# Preview the first rows of the movies table to check its columns
movies.head()

Unnamed: 0.1,Unnamed: 0,movie_id,movie,genre,date,1800's,1900's,2000's,History,News,...,Fantasy,Romance,Game-Show,Action,Documentary,Animation,Comedy,Short,Western,Thriller
0,0,8,Edison Kinetoscopic Record of a Sneeze (1894),Documentary|Short,1894,1,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
1,1,10,La sortie des usines Lumière (1895),Documentary|Short,1895,1,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
2,2,12,The Arrival of a Train (1896),Documentary|Short,1896,1,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
3,3,25,The Oxford and Cambridge University Boat Race ...,,1895,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,91,Le manoir du diable (1896),Short|Horror,1896,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [67]:
# Look up the title for movie_id 8. Use positional .iloc[0] on the filtered
# column: a plain [0] is a label lookup and only succeeds when the matching
# row happens to carry index label 0.
movie_name = movies.loc[movies.movie_id == 8, 'movie'].iloc[0]
movie_name

'Edison Kinetoscopic Record of a Sneeze (1894)'

In [59]:
def predict_rating(user_id, movie_id):
    """
    Predict the rating a user would give a movie from the fitted factors.

    Relies on the notebook-level `train_data_df` (user-by-item frame used for
    training) to translate ids into matrix positions, and on `user_mat` /
    `movie_mat` as returned by `fit`.

    Parameters
    ----------
    user_id : id of the user, as it appears in train_data_df.index
    movie_id : id of the movie, as it appears in train_data_df.columns

    Returns
    -------
    The dot product of the user's and the movie's latent feature vectors, or
    None (after printing a short message) when either id was not part of the
    training data and therefore has no latent features.
    """

    # Ids in the same order as the rows / columns of the training matrix
    user_ids_series = np.array(train_data_df.index)
    movie_ids_series = np.array(train_data_df.columns)

    # Positions of the requested ids; empty when the id was never seen
    user_hits = np.where(user_ids_series == user_id)[0]
    movie_hits = np.where(movie_ids_series == movie_id)[0]

    if user_hits.size == 0:
        print("No prediction: user_id {} is not in the training data.".format(user_id))
        return None

    if movie_hits.size == 0:
        print("No prediction: movie_id {} is not in the training data.".format(movie_id))
        return None

    # Take dot product of that row and column in U and V to make prediction
    return np.dot(user_mat[user_hits[0], :], movie_mat[:, movie_hits[0]])
    
        

In [61]:
# Try a prediction for user 2 on movie 14142; predict_rating returns None
# when the user is not part of the training data.
pred = predict_rating(2, 14142)
print(pred)

omg
None
