Below are all modules used for this notebook.<br>
Run the below box regardless of whether you are running one box or all!

In [4]:
import pandas as pd
from multiprocessing import Pool,Process
import time
import random
import math
import numpy as np

# The below suppresses all warnings in the notebook
# Only leave this uncommented for display purposes
import warnings
warnings.filterwarnings("ignore")

Below are the parameters which a user may want to change when running this program.

In [5]:
ourCustID = 2630337 # ID of customer we want to make sure we have recommendations for (make sure not to exclude)
train_frac = 0.50 # Percentage of data to use for training
# Anyone who rated less movies than this will be excluded from consideration as a user to compare to
min_movies_rated = 10
# Min number of people rating a movie for it to be included
min_raters_per_movie = 10
# Fraction of customers to include for consideration in SVM
cust_frac = 0.01
# How many iterations to skip between displaying current error
it_disp_rate = 10
# Min difference allowed between two iterations before stopping the loop for performing SVD
perc_change_min_lim = 0.01
# Maximum number of iterations to go through for performing SVD
iterations = 300
# Number of features to include in the model
K = 20
# Learning rate for SVD implementation
alpha = 0.002
# Regularization parameter for SVD implementation
beta = 0.01

This box creates the dataframe with all relevant information in the columns.

In [6]:
global movie_data_file, a, b

# Load movie dataset
movie_data_file = pd.read_csv('movie_titles.csv', encoding="ISO-8859-1", header=None, names=['Movie_Id', 'Year', 'title'])

# Read IMDB  dataset for specific field names
read_cols = ['genres', 'keywords', 'title', 'overview', 'popularity', 'vote_average', 'vote_count']
IMDB_movie_list = pd.read_csv('tmdb_5000_movies.csv', skipinitialspace=True, usecols=read_cols)

# Find union of two data frames and save this to a third list
# Column_name should be common in both dataframes
# 'how' represents type of intersection. In this case it is inner(INNER JOIN)
output = pd.merge(IMDB_movie_list, movie_data_file, how="inner",on="title")

final_output = output.drop_duplicates('title')  # Remove duplicate rows

# Initialize dataframe
combined_cd = pd.DataFrame(columns=['title', 'Cust_Id', 'Rating', 'Release-Date'])
combined_cd = combined_cd.fillna(0)  # Fill missing values with zeros

a = 'combined_data_'
b = '.txt'

# Create dataframe with requested columns
def rename_by_movie(filenumber):
    df = pd.read_csv(a + str(filenumber) + b, header=None, names=['Cust_Id', 'Rating'],
                     usecols=[0, 1], dtype={'Rating': 'float'})
    movie_index_indices = (df.loc[df['Rating'].isnull()]).index.values.tolist() # Indices with NaN
    if len(movie_index_indices) <= 1: # Works if there is only one movie in the file
        num1 = movie_index_indices[0]
        movie_id = (df.loc[num1, :]['Cust_Id']).strip(":") # Strip off as this is always in 1st column
        df.loc[num1:, 'Movie_ID'] = int(movie_id)
    else:
        i = 0
        while (i + 1) < len(movie_index_indices): # Cycle through all movies in file
            num1 = movie_index_indices[i] # Index for current movie
            num2 = movie_index_indices[i + 1] # Following index for next movie
            movie_id = (df.loc[num1, :]['Cust_Id']).strip(":") # Current movie ID
            df.loc[num1:num2 - 1, 'Movie_ID'] = int(movie_id) # All indices for current movie ID
            i += 1

        movie_id = (df.loc[num2, :]['Cust_Id']).strip(":") # Accounts for last movie in current file
        df.loc[num2:, 'Movie_ID'] = int( movie_id )

    df = df[pd.notnull(df['Rating'])] # Remove rows with only movie ID given
    df['Cust_Id'] = pd.to_numeric(df['Cust_Id'])  # Change customer ID to int data type
    time.sleep(1) # Pause for one second upon completion (used for creating approximate progress bar)
    return df

print('Beginning to format and combine the data...')
# Beginning combining all 4 pieces using parallel processing
if __name__ == '__main__':
    # Define the dataset to use for parallel processing
    dataset = [1, 2, 3, 4]
    agents = 4 # Number of cores to use
    chunksize = 2

    pool = Pool(processes=agents)  # Start 4 worker processes
    
    # Combine all four pieces into single result
    df_comb = pd.concat(pool.map(rename_by_movie, dataset, chunksize))

print('File with all user data has been created')

Beginning to format and combine the data...
File with all user data has been created


Run below to create the testing and training datasets.

In [7]:
print('Filtering out users below minimum threshold of movies rated...')
# Below Section Removes users who have reated below a certain number of movies
movie_count = pd.DataFrame()
movie_count=df_comb.groupby(['Cust_Id']).size().reset_index(name='No_of_ratings')[['Cust_Id', 'No_of_ratings']]
# Now remove users who rated below minimum threshold of movies
mask = movie_count['No_of_ratings'] < min_movies_rated
movie_count = movie_count.loc[~mask] # Remove users below that threshold from list

# Now get list of unique users remaining
unique_users=movie_count.loc[:,'Cust_Id'].unique().tolist()
del movie_count # Delete for memory management

#Keep only those users in df_comb
mask = df_comb['Cust_Id'].isin(unique_users) # Find locations of remaining users in main dataframe
df_comb = df_comb.loc[mask] # Keep only those users in the dataframe

# Save the dataframe for the user we want to make sure is included
df_chosen_cust = df_comb.loc[df_comb['Cust_Id'] == ourCustID]

print('Removing excluded fraction of customers from being inlcuded in training or testing...')
# Minimize number of customers from dataset to be included in training or testing dataframes
unique_users=df_comb.loc[:,'Cust_Id'].unique().tolist()
num_cust_to_sample = int(round(len(unique_users) * cust_frac))
cust_IDs_to_Keep = random.sample(unique_users, num_cust_to_sample)
mask = df_comb['Cust_Id'].isin(cust_IDs_to_Keep)
df_comb = df_comb.loc[mask]  # No use looking at movies not relating to users left in dataset

# Make sure our chosen customer has not been excluded
incl_users = df_comb.loc[:,'Cust_Id'].unique().tolist()
if ourCustID not in incl_users:
    df_comb.append(df_chosen_cust)

print('Creating training and testing dataframes...')
# Now take desired fraction of data for use in training set
df_train=df_comb.sample(frac=train_frac)

df_test=df_comb.drop(df_train.index)
del df_comb # Delete for memory management

# Create a dictionary containing correlation between row number and Customer ID
unique_users=df_train.loc[:,'Cust_Id'].tolist()
user_dict=[(row_index, user_id) for row_index, user_id in enumerate(unique_users)]

print('Filtering out movies with less users who rated them than the minimum threshold...')
# Below Section Removes movies below a certain threshold of users who have rated them
user_count = pd.DataFrame()
user_count=df_train.groupby(['Movie_ID']).size().reset_index(name='No_of_raters')[['Movie_ID', 'No_of_raters']]

# Now remove movies with less than the minimum of threshold of people who rated them
mask = user_count['No_of_raters'] < min_raters_per_movie
user_count = user_count.loc[~mask]

# Get list of unique movies remaining
unique_movies=user_count.loc[:,'Movie_ID'].unique().tolist()
del user_count # Delete for memory management

# Keep only those movies in df_comb
mask = df_train['Movie_ID'].isin(unique_movies)
df_train = df_train.loc[mask]

# Create a dictionary containing correlation between column number and Movie ID
unique_movies=df_train.loc[:,'Movie_ID'].tolist()
movie_dict = [(col_index, movie_id) for col_index, movie_id in enumerate(unique_movies)]

# Format dataframe with row indices as Users and column indices as Movies
print('Pivoting the training dataframe...')
df_train=df_train.pivot(index = 'Cust_Id', columns ='Movie_ID', values = 'Rating').fillna(0)

print('Converting pivoted dataframe to an array...')
# convert to numpy array
train_array = df_train.values
del df_train # Delete for memory management
print('Files with numpy array of users and movies and both dictionaries have created')

Filtering out users below minimum threshold of movies rated...
Removing excluded fraction of customers from being inlcuded in training or testing...
Creating training and testing dataframes...
Filtering out movies with less users who rated them than the minimum threshold...
Pivoting the training dataframe...
Converting pivoted dataframe to an array...
Files with numpy array of users and movies and both dictionaries have created


This box creates predictions for all users.

In [8]:
# Create class for performing the matrix factorization
class Matrix_Factoring():

    # Initializing the user-movie rating matrix,no of hidden features,bias alpha and beta
    def __init__(self, R, K, alpha, beta, iterations, perc_change_min_lim):
        self.R = R
        self.num_unique_users, self.num_unique_movies = self.R.shape
        self.K = K
        self.alpha = alpha
        self.beta = beta
        self.iterations = iterations
        
        # List of training samples (Start with none and build)
        self.dicts_row = dict()
        self.dicts_column = dict()
        
        # Intialize the P and Q matrix (Only done once) (Over k to normalize for below summations)
        self.P = np.random.normal(scale=1. / self.K, size=(self.num_unique_users, self.K))
        self.Q = np.random.normal(scale=1. / self.K, size=(self.num_unique_movies, self.K))
        
        # Initialize the associated bias terms (Start with bias for users and movies set to zero)
        self.b_P = np.zeros(self.num_unique_users)
        self.c_Q = np.zeros(self.num_unique_movies)
        mask = (self.R != 0)  # Find entries in the true rating matrix where we have actual ratings
        self.mu = np.mean(self.R[mask])  # Overall average rating across all users
        self.perc_change_min_lim = perc_change_min_lim

    # Main definition in this class (iteratively trains weighting for all data points)
    def train(self):
        temp = []
        temp1 = []
        t0 = time.time()
        for i in range(self.num_unique_users):
            temp = self.R[i,:].tolist()  # Holds list of all ratings for user i
            temp1 = [(num, entryvalue) for num, entryvalue in enumerate(temp) if entryvalue > 0]
            self.dicts_row[i] = temp1  # Store dict for user i

        training_process = []
        count = 0
        perc_change = 100  # Initialize so following loop occurs
        while perc_change > self.perc_change_min_lim: # This is the iterative gradient descent
            t1 = time.time()
            # Make list of keys and list of values and zip them together, will preserve order
            a = list(self.dicts_row.keys())  # List of all row indices corresponding to users
            b = list(self.dicts_row.values())  # List of all (movie,ratings) combinations corresponding to all rated movies for each user
            # Below is a list within a list
            combined = list(zip(a, b))  # Creates list, one for each user, holding (movie_index,rating) for each rated movie
            random.shuffle(combined)  # Randomizing order of the users to be considered

            # Creates new dictionary, using the reordered users
            self.dicts_row = {x[0]: x[1] for x in combined}

            # Update weights for P,Q,b_P,c_Q using stochastic gradient descent (for all users and all movies)
            self.sgd()

            # Calculate overall error for the predicted matrix
            error = self.mean_squared_error()
            training_process.append((count, error))  # Holds list of iteration, error

            if count > 0:  # Only makes sense after at least one iteration has occured
                perc_change = abs(100 * (training_process[count-1][1] - training_process[count][1]) / training_process[count][1])
            if (count-1) > self.iterations:  # Artificially end loop if we reach set number of iterations
                perc_change = 0

            # Display progress at provided frequency
            if (count + 1) % it_disp_rate == 0:
                t2 = time.time()
                time_elapsed = (t2-t0)/60
                est_time_per_iteration = time_elapsed / float(count+1)
                est_time_remaining = est_time_per_iteration * (self.iterations - count)
                print("At iteration: %d of max %d the error is %.2f" % (count + 1, self.iterations, error))
                print('\t' + 'It has been ' + str(round(time_elapsed, 2)) + ' minutes since the function started')
                print('\t' + 'The percent difference stop condition is set to ' + str(self.perc_change_min_lim) + ' and currently it is at ' + str(round(perc_change,2)))
                print('\t' + 'Approximately ' + str(round(est_time_remaining, 2)) + ' minutes remain until iteration max limit is reached')

            count += 1

        print('Training is complete...')
        return training_process  # So we can plot error progress for chosen k

    # Predicts rating specific to user i and movie j
    def get_rating(self, row, column):
        prediction = self.mu + self.b_P[row] + self.c_Q[column] + self.P[row, :].dot(self.Q[column, :].T)
        return prediction #Predicted rating for given user and movie

    # Updates weights for P,Q,b_P,c_Q by stochastic gradient descent,mu stays the same
    def sgd(self):
        for key, value in self.dicts_row.items():
            row = int(key)  # Represents current customer we are looking at
            list_of_movies=np.asarray(value)  # List of rated (movie_index, rating) for that customer
            
            for l in range(len(value)):  # Go through every rated movie for that customer
                column = int(list_of_movies[l, 0])  # Holds movie index
                rating = list_of_movies[l, 1]  # Holds rating associated with that movie
                predicted_rating = self.get_rating(row, column)  # Holds predicted rating for that movie
                e = rating-predicted_rating  # Difference between actual rating and predicted rating
                
                # Update all matrices for this particular user, movie combination
                self.b_P[row] += self.alpha * (e-self.beta * self.b_P[row])
                self.c_Q[column] += self.alpha * (e-self.beta * self.c_Q[column])
                self.P[row, :] += self.alpha*(e * self.Q[column, :] - self.beta * self.P[row, :])
                self.Q[column, :] += self.alpha * (e * self.P[row, :] - self.beta * self.Q[column, :])

    # Calculates overall error for predicted matrix
    def mean_squared_error(self):
        l_row, l_column = self.R.nonzero()  # List of row and column index for non zero elements in R.
        predicted = self.full_matrix()  # Holds full prediction matrix with ratings for all customers and movies
        error = 0  # Initialize for iteration
        
        for row, column in zip(l_row, l_column):  # Go through each user,movie combination in full matrix
            error += pow(self.R[row, column]-predicted[row, column], 2)
        return np.sqrt(error)  # Holds mean squared error

    # Creates and returns the full prediction matrix
    def full_matrix(self):
        return self.mu + self.b_P.reshape(-1, 1) + self.c_Q[np.newaxis:, ] + self.P.dot(self.Q.T)

mf = Matrix_Factoring(train_array, K, alpha, beta, iterations, perc_change_min_lim)  # Call and initialize class
print('Beginning training iterations...')
training_errors = mf.train()  # Begin training using the function in the class

print('Creating full prediction matrix...')
prediction_matrix = mf.full_matrix()  # Save the prediction matrix after going through all iterations

user_bias = mf.b_P
movie_bias = mf.c_Q
user_feature_mat = mf.P
movie_feature_mat = mf.Q

print('All requested ratings have been generated')

Beginning training iterations...
At iteration: 10 of max 300 the error is 635.69
	It has been 3.84 minutes since the function started
	The percent difference stop condition is set to 0.01 and currently it is at 0.22
	Approximately 111.71 minutes remain until iteration max limit is reached
At iteration: 20 of max 300 the error is 627.06
	It has been 7.6 minutes since the function started
	The percent difference stop condition is set to 0.01 and currently it is at 0.11
	Approximately 106.83 minutes remain until iteration max limit is reached
At iteration: 30 of max 300 the error is 616.92
	It has been 11.13 minutes since the function started
	The percent difference stop condition is set to 0.01 and currently it is at 0.24
	Approximately 100.53 minutes remain until iteration max limit is reached
At iteration: 40 of max 300 the error is 595.90
	It has been 14.61 minutes since the function started
	The percent difference stop condition is set to 0.01 and currently it is at 0.4
	Approximatel

Next portion, not written yet, will use test set to see how general accurate the predictions are.