In [22]:
import pandas as pd
import numpy as np
training_df = pd.read_csv('C:/Users/nafla/OneDrive/Documents/system development/Netflix/training_data.csv')
training_df.head()

Unnamed: 0,MovieID,CustomerID,Rating,Date,YearOfRelease,RatingYear,MovieAge
0,1,1488844,3,2005-09-06,2003,2005,2
1,1,822109,5,2005-05-13,2003,2005,2
2,1,885013,4,2005-10-19,2003,2005,2
3,1,30878,4,2005-12-26,2003,2005,2
4,1,823519,3,2004-05-03,2003,2004,1


In [23]:
# # Convert CustomerID and MovieID to categorical types for memory efficiency
# training_df['CustomerID'] = training_df['CustomerID'].astype("category")
# training_df['MovieID'] = training_df['MovieID'].astype("category")
# training_df['Rating'] = training_df['Rating'].astype('float16')


In [24]:
training_df['CustomerID'] = training_df['CustomerID'].astype(str)
training_df['MovieID'] = training_df['MovieID'].astype(str)
training_df['Rating'] = pd.to_numeric(training_df['Rating'], errors='coerce')  # Converts to float, makes non-numeric as NaN


In [25]:
# Calculate quantiles for user activity and item popularity
user_activity_quantiles = training_df['CustomerID'].value_counts().quantile([0.25, 0.5, 0.75])
item_popularity_quantiles = training_df['MovieID'].value_counts().quantile([0.25, 0.5, 0.75])
print(user_activity_quantiles)
print(item_popularity_quantiles)

0.25     8.0
0.50    24.0
0.75    64.0
Name: CustomerID, dtype: float64
0.25     192.0
0.50     552.5
0.75    2539.0
Name: MovieID, dtype: float64


In [26]:
# Assign each user and item to a bin based on the quantiles
training_df['UserActivityBin'] = pd.qcut(training_df.groupby('CustomerID')['Rating'].transform('size'), 
                                q=[0, .25, .5, .75, 1], labels=['low', 'medium', 'medium-high', 'high'])

training_df['ItemPopularityBin'] = pd.qcut(training_df.groupby('MovieID')['Rating'].transform('size'), 
                                  q=[0, .25, .5, .75, 1], labels=['low', 'medium', 'medium-high', 'high'])

# Combine these with Rating to create a stratification key
training_df['Strata'] = training_df['UserActivityBin'].astype(str) + "_" + training_df['ItemPopularityBin'].astype(str) + "_" + training_df['Rating'].astype(str)

# Perform stratified sampling
# we use groupby and  frac to specify a fraction of each strata and in case number of rows is less that 10 it takes all rows
strat_sample_df = training_df.groupby('Strata').apply(lambda x: x.sample(frac=0.001 if len(x) > 10 else len(x)/len(x))).reset_index(drop=True)


In [27]:
num_sampled_rows = len(strat_sample_df)
print(f"Number of rows in the sampled DataFrame: {num_sampled_rows}")

Number of rows in the sampled DataFrame: 24052


In [28]:
from sklearn.model_selection import train_test_split

# Split the dataset into training and testing sets
training_data, testing_data = train_test_split(strat_sample_df, test_size=0.2, random_state=42)

# further split the training data into training and validation sets
training_data, validation_data = train_test_split(training_data, test_size=0.25, random_state=42)  # 0.25 x 0.8 = 0.2


In [29]:
# Create user and item indxs
num_users = training_data['CustomerID'].nunique()
num_items = training_data['MovieID'].nunique()
user_to_index = {user: idx for idx, user in enumerate(training_data['CustomerID'].unique())}
item_to_index = {item: idx for idx, item in enumerate(training_data['MovieID'].unique())}

# Defining user and item ids in training data
user_ids = training_data['CustomerID'].unique()
item_ids = training_data['MovieID'].unique()

# Initialize a dense matrix
train_matrix = np.zeros((num_users, num_items))

# Not rated items are implicitly handled by the initial setup of the dense matrix using np.zeros
# Populate the matrix with ratings
for _, row in training_data.iterrows():
    user_idx = user_to_index[row['CustomerID']]
    item_idx = item_to_index[row['MovieID']]
    train_matrix[user_idx, item_idx] = row['Rating']

# Rated Items: Have their actual rating values as specified in training_data.
# Not Rated Items: Are represented by a 0, indicating either the absence of a rating or implicitly treating unrated items as having a rating of 0.

In [31]:
def get_dense_ratings(user1_idx, user2_idx, user_item_matrix):
    """
    Extracts dense rating vectors for common items rated by both users.
    """
    user1_ratings = user_item_matrix[user1_idx, :]
    user2_ratings = user_item_matrix[user2_idx, :]
    
    common_item_indices = np.where((user1_ratings > 0) & (user2_ratings > 0))[0]
    
    return user1_ratings[common_item_indices], user2_ratings[common_item_indices]


In [32]:
from scipy.spatial.distance import cityblock

def manhattan_similarity(user1_common_ratings, user2_common_ratings):
    """
    Computes the inverse Manhattan distance between two users' ratings as similarity.
    
    Args:
    - user1_common_ratings, user2_common_ratings: Dense rating vectors for common items.
    
    Returns:
    - The computed similarity score.
    """
    if len(user1_common_ratings) == 0:  # Return 0 if there are no common ratings
        return 0
    # Calculate the Manhattan distance
    distance = cityblock(user1_common_ratings, user2_common_ratings)
    # Convert distance to similarity; add 1 to avoid division by zero
    similarity = 1 / (1 + distance)
    return similarity


In [36]:
def predict_rating(user_idx, item_idx, k, user_item_matrix, user_ids):
    """
    Predicts the rating for a specific user and item using KNN based on Manhattan similarity.

    Args:
    - user_idx: Index of the target user in the user_ids array.
    - item_idx: Index of the target item.
    - k: Number of nearest neighbors to consider.
    - user_item_matrix: Sparse user-item matrix from the training set.
    - user_ids: Array of user IDs corresponding to the rows in train_user_item_matrix.
    
    Returns:
    - The predicted rating.
    """
    num_users = user_item_matrix.shape[0]
    similarities = np.zeros(num_users)
    
    # Compute similarity between the target user and all other users in the training set
    for other_user_idx in range(num_users):
        if other_user_idx != user_idx:
            user1_common_ratings, user2_common_ratings = get_dense_ratings(user_idx, other_user_idx, user_item_matrix)
            similarity = manhattan_similarity(user1_common_ratings, user2_common_ratings)
            similarities[other_user_idx] = similarity
            # Use cosine similarity
            # similarity = cosine_similarity (user1_common_ratings, user2_common_ratings)
            # similarities[other_user_idx] = similarity
    
    # Sort users by similarity and select the top k similar users
    top_k_users_indices = np.argsort(similarities)[-k:]
    
    # Calculate the weighted average of ratings from these top k similar users
    top_k_similarities = similarities[top_k_users_indices]
    # Ensure to extract ratings for the target item from top k users
    top_k_ratings = user_item_matrix[top_k_users_indices, item_idx]
    
    # Compute the predicted rating as a weighted average
    if np.sum(top_k_similarities) > 0:
        predicted_rating = np.dot(top_k_ratings, top_k_similarities) / np.sum(top_k_similarities)
    else:
        # Fallback to the average rating for the item if no similarities are found
        # Directly work with dense matrix, ensuring to consider only rated items
        rated_item_indices = user_item_matrix[:, item_idx] > 0
        if rated_item_indices.any():
            predicted_rating = np.mean(user_item_matrix[rated_item_indices, item_idx])
        else:
            predicted_rating = 0  # Use a sensible default, like global average rating, if preferable
    
    return predicted_rating

In [42]:
from sklearn.metrics import mean_squared_error

def evaluate_rmse_in_batches(testing_data, training_matrix, k, user_ids, item_ids, batch_size=1000):
    """
    Evaluates RMSE of the KNN model on the test data in batches.
    
    Args:
    - testing_data: DataFrame containing the test set.
    - train_matrix: User-item matrix for the training set, can be dense or sparse.
    - k: Number of nearest neighbors to consider.
    - user_ids: Array of user IDs from the original dataset, used to map users to matrix indices.
    - item_ids: Array of item IDs from the original dataset, used to map items to matrix indices.
    - batch_size: Number of user-item pairs to evaluate per batch, for efficiency.
    
    Returns:
    - The RMSE for the test set.
    """
    # Initialize list to store actual and predicted ratings
    actual_ratings = []
    predicted_ratings = []
    
    # Mapping dictionaries for user and item IDs to their indices
    user_indices = {user_id: idx for idx, user_id in enumerate(user_ids)}
    item_indices = {item_id: idx for idx, item_id in enumerate(item_ids)}
    
    # Process in batches for efficiency
    for start_idx in range(0, testing_data.shape[0], batch_size):
        end_idx = min(start_idx + batch_size, testing_data.shape[0])
        batch_data = testing_data.iloc[start_idx:end_idx]
        
        for _, row in batch_data.iterrows():
            user_id, item_id, actual_rating = row['CustomerID'], row['MovieID'], row['Rating']
            if user_id in user_indices and item_id in item_indices:
                user_idx = user_indices[user_id]
                item_idx = item_indices[item_id]
                # Predict rating using the optimized function for dense matrices
                predicted_rating = predict_rating(user_idx, item_idx, k, training_matrix, user_ids)
                predicted_ratings.append(predicted_rating)
                actual_ratings.append(actual_rating)
    
    # Calculate and return RMSE for the batch
    rmse = np.sqrt(mean_squared_error(actual_ratings, predicted_ratings))
    return rmse


In [44]:
k_values = [20, 25, 30, 35, 40, 45, 50, 100, 200]

best_k = None
lowest_rmse = float('inf')

# Iterate over each value of k
for k in k_values:
    # Use your existing function to calculate the RMSE for this k
    current_rmse = evaluate_rmse_in_batches(validation_data, train_matrix, k, user_ids, item_ids, batch_size=1000)
    
    print(f'RMSE for k={k}: {current_rmse}')
    
    # Update best_k if the current model performs better
    if current_rmse < lowest_rmse:
        best_k = k
        lowest_rmse = current_rmse

print(f'Best k: {best_k} with RMSE: {lowest_rmse}')


RMSE for k=20: 3.624659741935849
RMSE for k=25: 3.624659741935849
RMSE for k=30: 3.624659741935849
RMSE for k=35: 3.624659741935849
RMSE for k=40: 3.624659741935849
RMSE for k=45: 3.624659741935849
RMSE for k=50: 3.624659741935849
RMSE for k=100: 3.624579019979229
RMSE for k=200: 3.624579019979229
Best k: 100 with RMSE: 3.624579019979229


In [43]:
# Set the number of neighbors
k = 200

# Evaluate the model on the test set using the batch processing function
rmse = evaluate_rmse_in_batches(validation_data,  train_matrix, k, user_ids, item_ids, batch_size=1000)

print(f"Test RMSE: {rmse}")

Test RMSE: 3.624579019979229
