In [1]:
import pandas as pd
import numpy as np
training_df = pd.read_csv('C:/Users/nafla/OneDrive/Documents/system development/Netflix/training_data.csv')
training_df.head()

Unnamed: 0,MovieID,CustomerID,Rating,Date,YearOfRelease,RatingYear,MovieAge
0,1,1488844,3,2005-09-06,2003,2005,2
1,1,822109,5,2005-05-13,2003,2005,2
2,1,885013,4,2005-10-19,2003,2005,2
3,1,30878,4,2005-12-26,2003,2005,2
4,1,823519,3,2004-05-03,2003,2004,1


In [2]:
training_df['CustomerID'] = training_df['CustomerID'].astype(str)
training_df['MovieID'] = training_df['MovieID'].astype(str)
training_df['Rating'] = pd.to_numeric(training_df['Rating'], errors='coerce')  # Converts to float, makes non-numeric as NaN


In [3]:
# Calculate quantiles for user activity and item popularity
user_activity_quantiles = training_df['CustomerID'].value_counts().quantile([0.25, 0.5, 0.75])
item_popularity_quantiles = training_df['MovieID'].value_counts().quantile([0.25, 0.5, 0.75])
print(user_activity_quantiles)
print(item_popularity_quantiles)

0.25     8.0
0.50    24.0
0.75    64.0
Name: CustomerID, dtype: float64
0.25     192.0
0.50     552.5
0.75    2539.0
Name: MovieID, dtype: float64


In [4]:
# Assign each user and item to a bin based on the quantiles
training_df['UserActivityBin'] = pd.qcut(training_df.groupby('CustomerID')['Rating'].transform('size'), 
                                q=[0, .25, .5, .75, 1], labels=['low', 'medium', 'medium-high', 'high'])

training_df['ItemPopularityBin'] = pd.qcut(training_df.groupby('MovieID')['Rating'].transform('size'), 
                                  q=[0, .25, .5, .75, 1], labels=['low', 'medium', 'medium-high', 'high'])

# Combine these with Rating to create a stratification key
training_df['Strata'] = training_df['UserActivityBin'].astype(str) + "_" + training_df['ItemPopularityBin'].astype(str) + "_" + training_df['Rating'].astype(str)

# Perform stratified sampling
# we use groupby and  frac to specify a fraction of each strata and in case number of rows is less that 10 it takes all rows
strat_sample_df = training_df.groupby('Strata').apply(lambda x: x.sample(frac=0.001 if len(x) > 10 else len(x)/len(x))).reset_index(drop=True)


In [5]:
num_sampled_rows = len(strat_sample_df)
print(f"Number of rows in the sampled DataFrame: {num_sampled_rows}")

Number of rows in the sampled DataFrame: 24052


In [6]:
from sklearn.model_selection import train_test_split

# Split the dataset into training and testing sets
training_data, testing_data = train_test_split(strat_sample_df, test_size=0.2, random_state=42)

# further split the training data into training and validation sets
training_data, validation_data = train_test_split(training_data, test_size=0.25, random_state=42)  # 0.25 x 0.8 = 0.2


In [7]:
# Creating the user-item matrix using pivot_table
user_item_matrix = training_data.pivot_table(index='CustomerID', columns='MovieID', values='Rating', fill_value=0)


In [8]:
# Convert the pivot table to a numpy array if you need a dense matrix format
dense_matrix = user_item_matrix.to_numpy()

In [9]:
from sklearn.preprocessing import normalize

# Normalize the matrix so that each user vector has unit norm
user_item_matrix_normalized = normalize(dense_matrix, axis=1, norm='l2')

In [10]:
def get_dense_ratings(user1_idx, user2_idx, user_item_matrix):
    """
    Extracts dense rating vectors for common items rated by both users.
    """
    user1_ratings = user_item_matrix[user1_idx, :]
    user2_ratings = user_item_matrix[user2_idx, :]
    
    common_item_indices = np.where((user1_ratings > 0) & (user2_ratings > 0))[0]
    
    return user1_ratings[common_item_indices], user2_ratings[common_item_indices]


In [11]:
from scipy.spatial.distance import cosine

# def cosine_similarity(user1_common_ratings, user2_common_ratings):
#     """
#     Computes the cosine similarity between two users' ratings for common items.
    
#     Args:
#     - user1_common_ratings, user2_common_ratings: Dense rating vectors for common items.
    
#     Returns:
#     - The computed similarity score.
#     """
#     # Prevent division by zero
#     if len(user1_common_ratings) == 0 or len(user2_common_ratings) == 0:
#         return 0
#     # Compute the cosine distance and convert it to similarity
#     similarity = 1 - cosine(user1_common_ratings, user2_common_ratings)
#     return similarity


In [12]:
from sklearn.metrics.pairwise import cosine_similarity
# Calculate the cosine similarity matrix
user_similarity_matrix = cosine_similarity(dense_matrix)

In [13]:
import numpy as np

def predict_rating(user_idx, item_idx, k, cosine_similarity_matrix, user_item_matrix):
    # Get the similarities for the target user
    user_similarities = cosine_similarity_matrix[user_idx]
    
    # Get indices of the top k most similar users, excluding the user itself
    similar_users_indices = np.argsort(user_similarities)[-k-1:-1][::-1]
    
    # Calculate the predicted rating using the similarities and ratings of the top k similar users
    top_k_similarities = user_similarities[similar_users_indices]
    top_k_users_ratings = user_item_matrix[similar_users_indices, item_idx]
    
    # Check for NaN values in top_k_similarities
    if np.isnan(top_k_similarities).any():
        print(f"NaN found in similarities for user index {user_idx} and item index {item_idx}.")
        # Handle the NaN values, for example, by setting them to 0 or ignoring them
        top_k_similarities = np.nan_to_num(top_k_similarities)
    
    # Predict rating 
    rated = top_k_users_ratings > 0
    if rated.any() and np.sum(top_k_similarities[rated]) > 0:
        predicted_rating = np.dot(top_k_users_ratings[rated], top_k_similarities[rated]) / np.sum(top_k_similarities[rated])
    else:
        predicted_rating = 0  # Or use another fallback such as global or user's average rating
    
    return predicted_rating


In [14]:
# import numpy as np

# def predict_rating_vectorized(user_idx, item_idx, k, cosine_similarity_matrix, user_item_matrix):
#     # Get the similarities for the target user
#     user_similarities = cosine_similarity_matrix[user_idx]
    
#     # Get indices of the top k most similar users, excluding the user itself
#     similar_users_indices = np.argsort(user_similarities)[-k-1:-1][::-1]
    
#     # Calculate the predicted rating using the similarities and ratings of the top k similar users
#     top_k_similarities = user_similarities[similar_users_indices]
#     top_k_users_ratings = user_item_matrix[similar_users_indices, item_idx]
    
#     # Check for NaN values in top_k_similarities
#     if np.isnan(top_k_similarities).any():
#         print(f"NaN found in similarities for user index {user_idx} and item index {item_idx}.")
#         # Handle the NaN values, for example, by setting them to 0 or ignoring them
#         top_k_similarities = np.nan_to_num(top_k_similarities)
    
#     # Continue with your logic...
#     rated = top_k_users_ratings > 0
#     if rated.any() and np.sum(top_k_similarities[rated]) > 0:
#         predicted_rating = np.dot(top_k_users_ratings[rated], top_k_similarities[rated]) / np.sum(top_k_similarities[rated])
#     else:
#         predicted_rating = 0  # Or use another fallback such as global or user's average rating
    
#     return predicted_rating


In [15]:
from sklearn.metrics import mean_squared_error

def evaluate_rmse_in_batches(testing_data, cosine_similarity_matrix, user_item_matrix, k, user_ids, item_ids, batch_size=1000):
    """
    Evaluates RMSE of the KNN model on the test data in batches.
    
    Args:
    - testing_data: DataFrame containing the test set.
    - train_matrix: User-item matrix for the training set, can be dense or sparse.
    - k: Number of nearest neighbors to consider.
    - user_ids: Array of user IDs from the original dataset, used to map users to matrix indices.
    - item_ids: Array of item IDs from the original dataset, used to map items to matrix indices.
    - batch_size: Number of user-item pairs to evaluate per batch, for efficiency.
    
    Returns:
    - The RMSE for the test set.
    """
    # Initialize list to store actual and predicted ratings
    actual_ratings = []
    predicted_ratings = []
    
    # Mapping dictionaries for user and item IDs to their indices
    user_indices = {user_id: idx for idx, user_id in enumerate(user_ids)}
    item_indices = {item_id: idx for idx, item_id in enumerate(item_ids)}
    
    # Process in batches for efficiency
    for start_idx in range(0, testing_data.shape[0], batch_size):
        end_idx = min(start_idx + batch_size, testing_data.shape[0])
        batch_data = testing_data.iloc[start_idx:end_idx]
        
        for _, row in batch_data.iterrows():
            user_id, item_id, actual_rating = row['CustomerID'], row['MovieID'], row['Rating']
            if user_id in user_indices and item_id in item_indices:
                user_idx = user_indices[user_id]
                item_idx = item_indices[item_id]
                # Predict rating using the optimized function for dense matrices
                predicted_rating = predict_rating(user_idx, item_idx, k, cosine_similarity_matrix, user_item_matrix)
                predicted_ratings.append(predicted_rating)
                actual_ratings.append(actual_rating)
    
    # Calculate and return RMSE for the batch
    rmse = np.sqrt(mean_squared_error(actual_ratings, predicted_ratings))
    return rmse


In [16]:
# k_values = [20, 25, 30, 35, 40, 45, 50, 100, 200]

# best_k = None
# lowest_rmse = float('inf')

# # Ensure the `cosine_similarity_matrix` and `user_item_matrix` are correctly prepared
# # `user_ids` and `item_ids` should correctly map users and items to their indices in these matrices

# # Iterate over each value of k
# for k in k_values:
#     # Calculate the RMSE for this k using the validation data
#     # Assuming the evaluate_rmse_in_batches function signature is updated to accept the cosine similarity matrix
#     current_rmse = evaluate_rmse_in_batches2(validation_data, cosine_similarity_matrix, k, user_item_matrix_normalized, user_ids, item_ids, batch_size=1000)
    
#     print(f'RMSE for k={k} on validation set: {current_rmse}')
    
#     # Update best_k if the current model performs better on the validation set
#     if current_rmse < lowest_rmse:
#         best_k = k
#         lowest_rmse = current_rmse

# print(f'Best k: {best_k} with RMSE: {lowest_rmse} on validation set')


In [21]:
# Set the number of neighbors
k = 100

# Defining user and item ids in training data
user_ids = training_data['CustomerID'].unique()
item_ids = training_data['MovieID'].unique()

# Evaluate the model on the test set using the batch processing function
rmse = evaluate_rmse_in_batches(validation_data,  user_similarity_matrix, user_item_matrix_normalized, k, user_ids, item_ids, batch_size=1000)

print(f"Test RMSE: {rmse}")

Test RMSE: 3.717630861285077
