In [7]:
# Import
import numpy as np
import pandas as pd
import scipy as sp # <-- The sister of NumPy, used in our code for numerical efficiency.
from scipy.sparse import csr_matrix, hstack
import matplotlib.pyplot as plt
import seaborn as sns

# Entity featurization and similarity computation
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

# Libraries used during sorting procedures.
import operator # <-- Convenient item retrieval during iteration
import heapq # <-- Efficient sorting of large lists

# Silence every warning for the rest of the session to keep cell output short.
# NOTE(review): this blanket filter also hides deprecation / future-behaviour
# warnings raised by later cells — consider narrowing it to specific categories.
import warnings
warnings.filterwarnings('ignore')

In [None]:
#Reading csv from s3 bucket
#Imports
import boto3
from io import StringIO

In [None]:
# S3 location of the input data: the bucket name and the object keys of the
# test and train CSV files. Adjust these to point at your own bucket/objects.
bucket_name = 'unsupervised_data'
test_key = 'edsa-movie-recommendation-predict/test.csv'
train_key = 'edsa-movie-recommendation-predict/train.csv'

In [None]:
# Create the S3 client used to fetch the CSV objects
s3 = boto3.client('s3')

In [None]:
# Fetch the test CSV object from S3 and parse its body into a pandas DataFrame
test_obj = s3.get_object(Bucket=bucket_name, Key=test_key)
test = pd.read_csv(test_obj['Body'])

In [None]:
# Fetch the train CSV object from S3 and parse its body into a pandas DataFrame.
# The body has to come from `train_obj`; parsing `test_obj['Body']` here would
# put the test data (or an already-consumed stream) into `train`.
train_obj = s3.get_object(Bucket=bucket_name, Key=train_key)
train = pd.read_csv(train_obj['Body'])

In [None]:
# Preview the first rows of the test DataFrame
test.head()

In [None]:
# Preview the first rows of the train DataFrame
train.head()

In [8]:
#test = pd.read_csv('test.csv')
#train = pd.read_csv('train.csv')

In [9]:
# Remove the 'timestamp' column from the training data
train = train.drop(columns=['timestamp'])

In [10]:
# Convert data types
#train['userId'] = train['userId'].astype('int32')
#train['movieId'] = train['movieId'].astype('int32')
#train['rating'] = train['rating'].astype('float32')

In [11]:
# Build the utility matrix: one row per userId, one column per movieId, with
# the rating as the cell value (NaN where a user has not rated a movie)
util_matrix = pd.pivot_table(
    train,
    values='rating',
    index=['userId'],
    columns=['movieId'],
)
util_matrix.shape

(671, 9066)

In [12]:
# Normalize each row (a given user's ratings) of the utility matrix
util_matrix_norm = util_matrix.apply(lambda x: (x-np.mean(x))/(np.max(x)-np.min(x)), axis=1)
# Fill Nan values with 0's, transpose matrix, and drop users with no ratings
util_matrix_norm.fillna(0, inplace=True)
util_matrix_norm = util_matrix_norm.T
util_matrix_norm = util_matrix_norm.loc[:, (util_matrix_norm != 0).any(axis=0)]
# Save the utility matrix in scipy's sparse matrix format
util_matrix_sparse = sp.sparse.csr_matrix(util_matrix_norm.values)

In [13]:
# User-to-user cosine similarity; the sparse matrix is transposed back so that
# each row passed to cosine_similarity is one user
user_similarity = cosine_similarity(util_matrix_sparse.T)

# Wrap the result in a DataFrame labelled by userId on both axes for easy lookup
user_sim_df = pd.DataFrame(
    user_similarity,
    index=util_matrix_norm.columns,
    columns=util_matrix_norm.columns,
)

# Review a small portion of the constructed similarity matrix
user_sim_df.iloc[:5]

userId,1,2,3,4,5,6,7,8,9,10,...,662,663,664,665,666,667,668,669,670,671
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.0,0.0,0.00362,-0.002274,0.0,-0.070321,0.0,0.042632,0.0,...,0.0,0.0,0.018643,0.001031,0.0,0.0,0.0,0.044095,0.0,-0.013096
2,0.0,1.0,-0.001852,-0.004854,0.012639,0.0,0.042691,0.021066,0.011109,-0.007989,...,-0.018248,-0.021546,0.018902,-0.058952,0.028515,-0.106828,-0.007999,-0.041628,-0.090233,0.056258
3,0.0,-0.001852,1.0,0.018594,-0.025903,-0.0632,0.0549,0.026488,-0.036187,0.038021,...,0.044297,0.019581,0.070702,0.030669,0.143705,0.096713,0.027451,0.089297,-0.009815,0.062276
4,0.00362,-0.004854,0.018594,1.0,0.010801,0.019224,0.057519,0.05543,-0.010442,0.005126,...,0.011978,0.006569,0.027687,0.092092,0.021334,0.040833,0.018428,0.028642,0.019848,0.032749
5,-0.002274,0.012639,-0.025903,0.010801,1.0,-0.005843,-0.015075,-0.038886,0.013708,0.0305,...,0.046134,0.001903,0.00162,0.036819,-0.038269,-0.019537,-0.071721,0.00376,-0.029455,-0.036814


In [22]:
def collab_generate_rating_estimate(movie_id, user, k=20, threshold=0.0):
    """Estimate the rating `user` would give `movie_id` from similar users.

    The `k` users most similar to `user` (according to `user_sim_df`) are
    collected. Those who have rated the movie and whose similarity is at least
    `threshold` contribute to a similarity-weighted average of their ratings.

    Parameters
    ----------
    movie_id : label
        Column label of the movie in `util_matrix`.
    user : label
        Row label of the reference user in `util_matrix`; must also be a
        column of `user_sim_df` for an estimate to be produced.
    k : int, default 20
        Number of most-similar users to consider.
    threshold : float, default 0.0
        Minimum similarity a neighbour needs in order to contribute.

    Returns
    -------
    float
        The weighted average rating. Falls back to the movie's mean rating
        over all users when no neighbour contributes or when the contributing
        similarities sum to zero. Returns NaN when the movie or the user is
        unknown.
    """
    if movie_id not in util_matrix.columns:
        # Unknown movie: nothing to base an estimate on
        return np.nan
    # A user can exist in util_matrix yet be absent from the similarity matrix
    # (users are filtered out before it is built), so both are checked.
    if user not in util_matrix.index or user not in user_sim_df.columns:
        return np.nan

    # The k most similar users. The reference user is removed by label rather
    # than by position, so ties at similarity 1.0 cannot keep them in the list.
    neighbours = (
        user_sim_df[user]
        .drop(labels=user, errors='ignore')
        .sort_values(ascending=False)
        .head(k)
    )

    movie_ratings = util_matrix[movie_id]
    neighbour_ratings = movie_ratings.reindex(neighbours.index)

    # Keep neighbours who rated the movie and are similar enough
    usable = neighbour_ratings.notna() & (neighbours >= threshold)
    weights = neighbours[usable]
    weight_total = weights.sum()

    # With no usable neighbour, or only zero-similarity ones, the weighted
    # average is undefined (0/0); use the movie's mean rating instead.
    if weight_total == 0:
        return float(movie_ratings.mean())

    return float((neighbour_ratings[usable] * weights).sum() / weight_total)

In [1]:
#id = 31
#actual_rating = train[(train['userId'] == 31) & (train['movieId'] == id)]['rating'].values[0]
#pred_rating = collab_generate_rating_estimate(movie_id = id, user = 31)
#print (f"Title - {title}")
#print ("---")
#print (f"Actual rating: \t\t {actual_rating}")
#print (f"Predicted rating: \t {pred_rating}")

In [20]:
# Build the submission key "<userId>_<movieId>" for every test row.
# Vectorised string concatenation is used instead of a row-wise apply: it
# formats each column from its own dtype (a row-wise apply upcasts the row to
# a common dtype, which can turn integer ids into "1.0_2011.0" when a float
# column is present) and avoids a Python-level loop over every row.
test['Id'] = test['userId'].astype(str) + '_' + test['movieId'].astype(str)
test.head()

Unnamed: 0,userId,movieId,Id
0,1,2011,1_2011
1,1,4144,1_4144
2,1,5767,1_5767
3,1,6711,1_6711
4,1,7318,1_7318


In [23]:
# Predict a rating for every (userId, movieId) pair in the test set
test['rating'] = [
    collab_generate_rating_estimate(movie_id=movie, user=reviewer)
    for reviewer, movie in zip(test['userId'], test['movieId'])
]
test.head()

Unnamed: 0,userId,movieId,Id,rating
0,1,2011,1_2011,3.62499
1,1,4144,1_4144,4.0
2,1,5767,1_5767,
3,1,6711,1_6711,4.5
4,1,7318,1_7318,3.041667


In [25]:
# Count the test rows that have no predicted rating
print(test['rating'].isna().sum())

4983063


In [26]:
# Impute missing predictions with the mean of the available predicted ratings.
# The filled column is assigned back explicitly: calling fillna(inplace=True)
# on `test['rating']` acts on an intermediate object and, under pandas
# Copy-on-Write, leaves `test` unchanged.
mean_rating = test['rating'].mean()
test['rating'] = test['rating'].fillna(mean_rating)

In [27]:
# Write the submission file: only the 'Id' and 'rating' columns, no index
test.to_csv('submission1.csv', columns=['Id', 'rating'], index=False)

In [None]:
# Create Pickle file

import pickle

# Save the user_sim_df DataFrame to the pickle file 'user_sim_df.pkl'
with open('user_sim_df.pkl', 'wb') as f:
    pickle.dump(user_sim_df, f)

# To load the user_sim_df DataFrame back (only unpickle files you trust:
# pickle.load can execute arbitrary code embedded in the file)
#with open('user_sim_df.pkl', 'rb') as f:
#    loaded_user_sim_df = pickle.load(f)