Below are all modules used for this notebook.<br>
Run the below box regardless of whether you are running one box or all!

In [1]:
import pandas as pd
from multiprocessing import Pool,Process
import time
import random
import math

Below are the parameters which a user may want to change when running this program.

In [2]:
ourCustID = 2630337 # ID of customer we want recommendations for
train_frac = 0.10 # Percentage of data to use for training
# Anyone who rated less movies than this will be excluded from consideration as a user to compare to
min_movies_rated = 100
# Threshold for minimum number of users another user must share with our target user in order to be
# considered for calculation of the Pearson Correlation
union_min = 100
# Exclude movies from rating predictions if less than this number of users rated it
min_users_who_rated = 100
# If more users rated a movie in union than the below number the code will randomly sample a subset of users
# To get the number of ratings for that movie down to this number
# This is done purely to make the calculation more tractable
max_ratings_allowed = 300 

This box creates the dataframe with all relevant information in the columns.

In [3]:
global movie_data_file, a, b

# Load movie dataset
movie_data_file = pd.read_csv('movie_titles.csv', encoding="ISO-8859-1", header=None, names=['Movie_Id', 'Year', 'title'])

# Read IMDB  dataset for specific field names
read_cols = ['genres', 'keywords', 'title', 'overview', 'popularity', 'vote_average', 'vote_count']
IMDB_movie_list = pd.read_csv('tmdb_5000_movies.csv', skipinitialspace=True, usecols=read_cols)

# Find union of two data frames and save this to a third list
# Column_name should be common in both dataframes
# 'how' represents type of intersection. In this case it is inner(INNER JOIN)
output = pd.merge(IMDB_movie_list, movie_data_file, how="inner",on="title")

final_output = output.drop_duplicates('title')  # Remove duplicate rows

# Initialize dataframe
combined_cd = pd.DataFrame(columns=['title', 'Cust_Id', 'Rating', 'Release-Date'])
combined_cd = combined_cd.fillna(0)  # Fill missing values with zeros

a = 'combined_data_'
b = '.txt'

# Create dataframe with requested columns
def rename_by_movie(filenumber):
    df = pd.read_csv(a + str(filenumber) + b, header=None, names=['Cust_Id', 'Rating'],
                     usecols=[0, 1], dtype={'Rating': 'float'})
    movie_index_indices = (df.loc[df['Rating'].isnull()]).index.values.tolist() # Indices with NaN
    if len(movie_index_indices) <= 1: # Works if there is only one movie in the file
        num1 = movie_index_indices[0]
        movie_id = (df.loc[num1, :]['Cust_Id']).strip(":") # Strip off as this is always in 1st column
        df.loc[num1:, 'Movie_ID'] = int(movie_id)
    else:
        i = 0
        while (i + 1) < len(movie_index_indices): # Cycle through all movies in file
            num1 = movie_index_indices[i] # Index for current movie
            num2 = movie_index_indices[i + 1] # Following index for next movie
            movie_id = (df.loc[num1, :]['Cust_Id']).strip(":") # Current movie ID
            df.loc[num1:num2 - 1, 'Movie_ID'] = int(movie_id) # All indices for current movie ID
            i += 1

        movie_id = (df.loc[num2, :]['Cust_Id']).strip(":") # Accounts for last movie in current file
        df.loc[num2:, 'Movie_ID'] = int( movie_id )

    df = df[pd.notnull(df['Rating'])] # Remove rows with only movie ID given
    df['Cust_Id'] = pd.to_numeric(df['Cust_Id'])  # Change customer ID to int data type
    time.sleep(1) # Pause for one second upon completion (used for creating approximate progress bar)
    return df

print('Beginning to format and combine the data...')
# Beginning combining all 4 pieces using parallel processing
if __name__ == '__main__':
    # Define the dataset to use for parallel processing
    dataset = [1, 2, 3, 4]
    agents = 4 # Number of cores to use
    chunksize = 2

    pool = Pool(processes=agents)  # Start 4 worker processes
    
    # Combine all four pieces into single result
    df_comb = pd.concat(pool.map(rename_by_movie, dataset, chunksize))

print('File with all user data has been created')

Beginning to format and combine the data...
File with all user data has been created


Run below to create the training dataset.

In [4]:
# Save dataframe for our user before selecting random fraction of other users for training
df_chosen_user = df_comb.loc[df_comb['Cust_Id'] == ourCustID]

print('Finding number of movies rated for each user...')
# This is a list of all unique users in our dataset
unique_users=df_comb.loc[:,'Cust_Id'].unique().tolist()
tot_num_of_users = len(unique_users)
movie_count = pd.DataFrame()

movie_count=df_comb.groupby(['Cust_Id']).size().reset_index(name='No_of_ratings')[['Cust_Id', 'No_of_ratings']]

# Remove the target user from this list
mask = movie_count['Cust_Id'].isin([ourCustID])
movie_count = movie_count.loc[~mask]

# Now remove users who rated below minimum threshold of movies
mask = movie_count['No_of_ratings'] < min_movies_rated
movie_count = movie_count.loc[~mask]

# Randomly choose users to sample
unique_users=movie_count.loc[:,'Cust_Id'].unique().tolist()
num_cust_to_sample = int(round(len(unique_users) * train_frac))
cust_ID_to_Train = random.sample(unique_users, num_cust_to_sample)

mask = df_comb['Cust_Id'].isin(cust_ID_to_Train)
df_train = df_comb.loc[mask] # Training dataset

mask = ~df_comb['Cust_Id'].isin(cust_ID_to_Train)
df_test = df_comb.loc[mask] # Testing dataset

del df_comb # Memory management to get full dataframe out of memory before continuing
print('Testing and training datasets have been created')

Finding number of movies rated for each user...


Save information for all users matching the condition for being compared with our target user.

In [5]:
disp_rate = 500 # How many users to skip before displaying progress

# Get a list of all users in training dataset
all_user_IDs=df_train.loc[:,'Cust_Id'].unique().tolist()

# Get list of unique movies our chosen user has rated
chosen_user_movies = df_chosen_user['Movie_ID'].tolist()

# Find mean rating for our chosen user
chosen_user_mean = df_chosen_user.groupby(['Cust_Id']).mean()['Rating'].values[0]

user_sims = pd.DataFrame(columns=['Cust_Id', 'Mean Rating', 'Similarity', 'Num_Movies_In_Union'])
user_sims.loc[1,'Cust_Id'] = ourCustID
user_sims.loc[1,'Mean Rating'] = chosen_user_mean
user_sims.loc[1,'Similarity'] = 'nan'
user_sims.loc[1,'Num_Movies_In_Union'] = 'nan'

print('Starting going through all users and calculating similarity score...')
if 't1' in locals(): # If t1 still in memory from previous run of this box delete before continuing
    del t1
if 't2' in locals(): # If t2 still in memory from previous run of this box delete before continuing
    del t2
t0 = time.time()
count = 0
time_elapsed = 0
k = 2 # Start after putting chosen user in first position
for i in range(len(all_user_IDs)):
    if (i % disp_rate) == 0: # Only display ever set number of users checked
        t3 = time.time() # Stores time at which this statement was triggered
        
        if 't2' in locals(): # Only makes sense if we have gone through at least once
            time_elapsed += round((t3 - t2) / 60, 2) #Total time since main loop started
            users_processed = i # User index we are processing at this time
            est_time_per_user = time_elapsed / float(i)
            users_remaining = (len(all_user_IDs) - users_processed)
            est_time_remaining = est_time_per_user * users_remaining
            print ('User ' + str(i) + ' of ' + str(len(all_user_IDs)) + ' being examined (' + str(
                round(float(i) * 100 / len(all_user_IDs), 2)) + '%)')
            print('\t' + str(round((t3-t2) / 60, 2)) + ' minutes taken to process previous ' + str(disp_rate) + ' users')
            print('\t' + 'The code has run for ' + str(round((t3-t0) / 60, 2))  + ' minutes and ' + str(round((count / float(i))*100,2)) + '% of users have been included')
            print('\t' + 'Approximately ' + str(round(est_time_remaining,2)) + ' minutes remaining until code completes')
        
        t2 = time.time() # Reset time for next iteration of users
    
    # For this current user see how many movies they have in common with our target user
    curr_user_df = df_train.loc[df_train['Cust_Id'] == all_user_IDs[i]]
    curr_user_movie_list = curr_user_df['Movie_ID'].tolist()
    list_item_intersection = [x for x in curr_user_movie_list if x in chosen_user_movies]
    
    # If current user does not meet condition skip to next user
    if len(list_item_intersection) < union_min:
        continue
    count += 1
    
    # If current user does meet condition then continue and calculate the Pearson Correlation Coefficient
    curr_user_mean = curr_user_df.groupby(['Cust_Id']).mean()['Rating'].values[0]
    sum = 0
    sum_sq1 = 0
    sum_sq2 = 0
    # Now go through each movie in intersection to calculate the pearson correlation coefficient
    for j in range(len(list_item_intersection)):

        # Find rating for our user and current user for the current movie
        curr_user_rat = curr_user_df.loc[curr_user_df['Movie_ID'] == list_item_intersection[j]]['Rating'].values[0]
        our_user_rat = df_chosen_user.loc[df_chosen_user['Movie_ID'] == list_item_intersection[j]]['Rating'].values[0]
        sum += (curr_user_rat - curr_user_mean) * (our_user_rat - chosen_user_mean)
        sum_sq1 += (curr_user_rat - curr_user_mean) ** 2
        sum_sq2 += (our_user_rat - chosen_user_mean) ** 2
        
    sim_Pearson = sum / (math.sqrt(sum_sq1) * math.sqrt(sum_sq2)) # Calculate Pearson Coefficient
    user_sims.loc[k, 'Cust_Id'] = all_user_IDs[i]
    user_sims.loc[k, 'Mean Rating'] = curr_user_mean
    user_sims.loc[k, 'Similarity'] = sim_Pearson
    user_sims.loc[k, 'Num_Movies_In_Union'] = len(list_item_intersection)
    k += 1
    
t1 = time.time()
print('It took ' + str((t1-t0)/60) + ' minutes to run code')
print('In total ' + str(count) + ' users satisfied the requirement')

# Only keep those chosen ID's in main training dataframe
all_user_IDs=user_sims.loc[:,'Cust_Id'].unique().tolist()
mask = df_train['Cust_Id'].isin(all_user_IDs)
df_users = df_train.loc[mask]

print('Files with data for selected users have been created')

Starting going through all users and calculating similarity score...
User 500 of 23612 being examined (2.12%)
	4.11 minutes taken to process previous 500 users
	The code has run for 4.11 minutes and 81.4% of users have been included
	Approximately 189.98 minutes remaining until code completes
User 1000 of 23612 being examined (4.24%)
	3.81 minutes taken to process previous 500 users
	The code has run for 7.92 minutes and 82.7% of users have been included
	Approximately 179.09 minutes remaining until code completes
User 1500 of 23612 being examined (6.35%)
	3.76 minutes taken to process previous 500 users
	The code has run for 11.68 minutes and 82.87% of users have been included
	Approximately 172.18 minutes remaining until code completes
User 2000 of 23612 being examined (8.47%)
	5.69 minutes taken to process previous 500 users
	The code has run for 17.37 minutes and 84.85% of users have been included
	Approximately 187.7 minutes remaining until code completes
User 2500 of 23612 being 



User 4500 of 23612 being examined (19.06%)
	4.32 minutes taken to process previous 500 users
	The code has run for 39.73 minutes and 87.11% of users have been included
	Approximately 168.78 minutes remaining until code completes
User 5000 of 23612 being examined (21.18%)
	3.95 minutes taken to process previous 500 users
	The code has run for 43.68 minutes and 87.62% of users have been included
	Approximately 162.63 minutes remaining until code completes
User 5500 of 23612 being examined (23.29%)
	3.93 minutes taken to process previous 500 users
	The code has run for 47.61 minutes and 88.09% of users have been included
	Approximately 156.82 minutes remaining until code completes
User 6000 of 23612 being examined (25.41%)
	4.06 minutes taken to process previous 500 users
	The code has run for 51.67 minutes and 88.28% of users have been included
	Approximately 151.7 minutes remaining until code completes
User 6500 of 23612 being examined (27.53%)
	4.28 minutes taken to process previous 50

User 22500 of 23612 being examined (95.29%)
	0.99 minutes taken to process previous 500 users
	The code has run for 128.93 minutes and 78.35% of users have been included
	Approximately 6.37 minutes remaining until code completes
User 23000 of 23612 being examined (97.41%)
	0.75 minutes taken to process previous 500 users
	The code has run for 129.68 minutes and 77.49% of users have been included
	Approximately 3.45 minutes remaining until code completes
User 23500 of 23612 being examined (99.53%)
	0.59 minutes taken to process previous 500 users
	The code has run for 130.27 minutes and 76.46% of users have been included
	Approximately 0.62 minutes remaining until code completes
It took 130.3784529685974 minutes to run code
In total 17999 users satisfied the requirement
Files with data for selected users have been created


This box creates the movie recommendations for our target user.

In [6]:
#List of movies seen by our user
list_movies_chosen_rated = df_chosen_user.loc[:,'Movie_ID'].unique().tolist()

#Calculate all movies in remaining training dataset
list_all_movies=df_users.loc[:,'Movie_ID'].unique().tolist()

print('Beginning making predictions for movies in union the chosen user has not seen...')
pred_movie_ratings=pd.DataFrame(columns=['Movie_ID', 'Predicted_Rating'])
j=0
count = 1
time_elapsed = 0
if 't1' in locals(): # If t1 still in memory from previous run of this box delete before continuing
    del t1
if 't2' in locals(): # If t2 still in memory from previous run of this box delete before continuing
    del t2
t0 = time.time()
for item in list_all_movies:
    if (count % disp_rate) == 0:
        t2 = time.time() # Stores time at which this statement was triggered
        if 't1' in locals(): # Only makes sense if we have gone through at least once
            time_elapsed += round((t2 - t1) / 60, 2) # Total time since main loop started
            est_time_per_movie = time_elapsed / count
            movies_remaining = len(list_all_movies) - count
            est_time_remaining = est_time_per_movie * movies_remaining
            
            print ('Movie index ' + str(count) + ' of ' + str(int(len(list_all_movies))) + ' is being examined (' + str(
                round((count) * 100 / len(list_all_movies), 2)) + '%)')
            print('\t' + str(round((t2-t1)/60, 2)) + ' minutes taken to process the previous ' + str(disp_rate) + ' movies')
            print('\t' + 'The code has run for ' + str(round((t2-t0) / 60, 2)) + ' minutes and ' + str(pred_movie_ratings.shape[0]) + ' movie predictions have been made')
            print('\t' + 'Approximately ' + str(round(est_time_remaining,2)) + ' minutes remaining until code completes')
        
        t1 = time.time() # Reset time for next iteration of users

    list_users_per_movie=df_users.loc[df_users['Movie_ID'] == item]['Cust_Id'].unique().tolist()
    count += 1
    
    #Skips movie from consideration if less than the desired number users rated it
    if len(list_users_per_movie) < min_users_who_rated: 
        continue
    # If too many users rated it we randomly sample users to get down to set number of users for calculation
    # This is done purely for tractability of calculation
    if len(list_users_per_movie) > max_ratings_allowed:
        # Randomly sample to get number of users to consider down to set number of allowed user ratings
        list_users_per_movie = random.sample(list_users_per_movie, max_ratings_allowed)
    
    sum0=0
    sum1=0
    for user in list_users_per_movie:
        # Apply prediction formula to generate new rating for the movie
        sim=user_sims.loc[user_sims['Cust_Id']==user][['Similarity']].values[0]
        avg_rating_curr_user = user_sims.loc[user_sims['Cust_Id']==user][['Mean Rating']].values[0]
        curr_user_rated_movies = df_users.loc[df_users['Cust_Id'] == user]
        curr_user_rating = curr_user_rated_movies.loc[curr_user_rated_movies['Movie_ID'] == item][['Rating']].values[0]
        
        sum0+=sim*(curr_user_rating - avg_rating_curr_user)
        sum1+=sim
    
    # Create and store the predicted rating for the current movie
    pred_rating = (sum0/sum1)+ user_sims.loc[user_sims['Cust_Id']==ourCustID][['Mean Rating']].values[0]
    pred_movie_ratings.loc[j,'Movie_ID'] = item
    pred_movie_ratings.loc[j,'Predicted_Rating']=pred_rating
    j+=1

print('File with predicted movie ratings have been created')

Beginning making predictions for movies in union the chosen user has not seen...
Movie index 1000 of 17761 is being examined (5.63%)
	12.34 minutes taken to process the previous 500 movies
	The code has run for 25.86 minutes and 357 movie predictions have been made
	Approximately 206.83 minutes remaining until code completes
Movie index 1500 of 17761 is being examined (8.45%)
	12.13 minutes taken to process the previous 500 movies
	The code has run for 37.99 minutes and 525 movie predictions have been made
	Approximately 265.27 minutes remaining until code completes
Movie index 2000 of 17761 is being examined (11.26%)
	13.36 minutes taken to process the previous 500 movies
	The code has run for 51.34 minutes and 694 movie predictions have been made
	Approximately 298.12 minutes remaining until code completes
Movie index 2500 of 17761 is being examined (14.08%)
	13.84 minutes taken to process the previous 500 movies
	The code has run for 65.18 minutes and 877 movie predictions have been

Movie index 17500 of 17761 is being examined (98.53%)
	14.78 minutes taken to process the previous 500 movies
	The code has run for 460.43 minutes and 6351 movie predictions have been made
	Approximately 6.67 minutes remaining until code completes
File with predicted movie ratings have been created


Next portion, not written yet, will compare all movie recommendations made by similar users with the actual ratings which our user has given.