In [1]:
import pandas as pd
import numpy as np
training_df = pd.read_csv('C:/Users/nafla/OneDrive/Documents/system development/Netflix/training_data.csv')
training_df.head()

Unnamed: 0,MovieID,CustomerID,Rating,Date,YearOfRelease,RatingYear,MovieAge
0,1,1488844,3,2005-09-06,2003,2005,2
1,1,822109,5,2005-05-13,2003,2005,2
2,1,885013,4,2005-10-19,2003,2005,2
3,1,30878,4,2005-12-26,2003,2005,2
4,1,823519,3,2004-05-03,2003,2004,1


In [2]:
# Calculate quantiles for user activity and item popularity
user_activity_quantiles = training_df['CustomerID'].value_counts().quantile([0.25, 0.5, 0.75])
item_popularity_quantiles = training_df['MovieID'].value_counts().quantile([0.25, 0.5, 0.75])
print(user_activity_quantiles)
print(item_popularity_quantiles)

0.25     8.0
0.50    24.0
0.75    64.0
Name: CustomerID, dtype: float64
0.25     192.0
0.50     552.5
0.75    2539.0
Name: MovieID, dtype: float64


In [3]:
training_df['CustomerID'] = training_df['CustomerID'].astype(str)
training_df['MovieID'] = training_df['MovieID'].astype(str)
training_df['Rating'] = pd.to_numeric(training_df['Rating'], errors='coerce')  # Converts to float, makes non-numeric as NaN

In [4]:
# Check data types
print("Before conversion:")
print(training_df.dtypes)

Before conversion:
MovieID          object
CustomerID       object
Rating            int64
Date             object
YearOfRelease     int64
RatingYear        int64
MovieAge          int64
dtype: object


# Stratified Sampling Method 

To create a representative sample of our dataset, we employ a stratified sampling method that accounts for three key dimensions: Rating Distribution, User Activity, and Item Popularity. This approach ensures our sample maintains the diversity and characteristics of the entire dataset, facilitating more reliable model training and evaluation.

- User Activity is quantified by the number of ratings a user has provided.
- Item Popularity reflects the number of ratings an item has received.

Finally, We combine User Activity, Item Popularity, and Rating into a composite stratification key for each record. This multi-dimensional key ensures our sampling process considers the distribution across all three axes.

In [5]:
# Assign each user and item to a bin based on the quantiles
training_df['UserActivityBin'] = pd.qcut(training_df.groupby('CustomerID')['Rating'].transform('size'), 
                                q=[0, .25, .5, .75, 1], labels=['low', 'medium', 'medium-high', 'high'])

training_df['ItemPopularityBin'] = pd.qcut(training_df.groupby('MovieID')['Rating'].transform('size'), 
                                  q=[0, .25, .5, .75, 1], labels=['low', 'medium', 'medium-high', 'high'])

# Combine these with Rating to create a stratification key
training_df['Strata'] = training_df['UserActivityBin'].astype(str) + "_" + training_df['ItemPopularityBin'].astype(str) + "_" + training_df['Rating'].astype(str)

# Perform stratified sampling
# we use groupby and  frac to specify a fraction of each strata and in case number of rows is less that 10 it takes all rows
strat_sample_df = training_df.groupby('Strata').apply(lambda x: x.sample(frac=0.005 if len(x) > 10 else len(x)/len(x))).reset_index(drop=True)


In [6]:
num_sampled_rows = len(strat_sample_df)
print(f"Number of rows in the sampled DataFrame: {num_sampled_rows}")

Number of rows in the sampled DataFrame: 120269


# Splitting dataset to training, test, validation

In [7]:
from sklearn.model_selection import train_test_split

# Split the remaining data into training, testing, and validation sets
training_data, testing_data = train_test_split(strat_sample_df, test_size=0.2, random_state=42)


In [8]:
# Calculate the size of each split
training_size = training_data.shape[0]  # Number of rows in the training data
# validation_size = validation_data.shape[0]  # Number of rows in the validation data
testing_size = testing_data.shape[0]  # Number of rows in the testing data

# Print the sizes
print(f"Training Data Size: {training_size}")
# print(f"Validation Data Size: {validation_size}")
print(f"Testing Data Size: {testing_size}")

Training Data Size: 96215
Testing Data Size: 24054


In [20]:
# Assuming final_training_data, validation_data, and testing_data are your data splits

# Count unique MovieIDs in the final training data
unique_movies_training = training_data['CustomerID'].nunique()

# Count unique MovieIDs in the validation data
# unique_movies_validation = validation_data['MovieID'].nunique()

# Count unique MovieIDs in the testing data
unique_movies_testing = testing_data['MovieID'].nunique()

# Print the counts
print(f"Unique CustomerIDs in Training Data: {unique_movies_training}")
# print(f"Unique MovieIDs in Validation Data: {unique_movies_validation}")
print(f"Unique MovieIDs in Testing Data: {unique_movies_testing}")


Unique CustomerIDs in Training Data: 74762
Unique MovieIDs in Testing Data: 2329


# Creating User - Item matrix

In [10]:
# Creating customer-movie matrix
import pandas as pd
from scipy.sparse import csr_matrix


# Map user IDs and movie IDs to integer indices for CSR matrix
user_ids = training_data['CustomerID'].unique()
movie_ids = training_data['MovieID'].unique()

user_id_to_index = {user_id: index for index, user_id in enumerate(user_ids)}
movie_id_to_index = {movie_id: index for index, movie_id in enumerate(movie_ids)}

# Prepare row, column, and data arrays for csr_matrix
rows = training_data['CustomerID'].map(user_id_to_index)
cols = training_data['MovieID'].map(movie_id_to_index)
data = training_data['Rating']

# Create CSR matrix
ratings_csr_matrix = csr_matrix((data, (rows, cols)), shape=(len(user_ids), len(movie_ids)))


print(ratings_csr_matrix)

  (0, 0)	4
  (1, 1)	5
  (1, 128)	5
  (2, 2)	4
  (3, 3)	4
  (3, 26)	4
  (4, 4)	2
  (4, 179)	4
  (5, 5)	4
  (6, 6)	3
  (6, 458)	3
  (7, 7)	3
  (8, 8)	5
  (9, 9)	3
  (9, 1586)	4
  (10, 10)	4
  (10, 2638)	3
  (11, 11)	4
  (12, 12)	3
  (12, 35)	4
  (13, 13)	2
  (13, 48)	4
  (13, 660)	3
  (13, 1740)	2
  (13, 1960)	2
  :	:
  (74737, 1578)	1
  (74738, 14)	3
  (74739, 71)	5
  (74740, 37)	4
  (74741, 321)	5
  (74742, 174)	4
  (74743, 42)	3
  (74744, 27)	3
  (74745, 301)	3
  (74746, 154)	3
  (74747, 824)	3
  (74748, 180)	5
  (74749, 247)	3
  (74750, 203)	1
  (74751, 579)	4
  (74752, 75)	4
  (74753, 161)	5
  (74754, 198)	5
  (74755, 1174)	4
  (74756, 197)	5
  (74757, 114)	4
  (74758, 186)	4
  (74759, 1969)	5
  (74760, 37)	3
  (74761, 264)	2


# Define similarity function for each given user

In [11]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix

cosine_similarity_matrix_csr = cosine_similarity(ratings_csr_matrix, dense_output=False)
    



In [12]:
print(cosine_similarity_matrix_csr)

  (0, 72861)	1.0
  (0, 72688)	1.0
  (0, 71762)	1.0
  (0, 71152)	1.0
  (0, 70407)	1.0
  (0, 70130)	1.0
  (0, 69721)	1.0
  (0, 69673)	1.0
  (0, 68551)	0.7071067811865475
  (0, 67891)	1.0
  (0, 67832)	1.0
  (0, 67747)	1.0
  (0, 67572)	1.0
  (0, 66904)	1.0
  (0, 65637)	1.0
  (0, 65604)	1.0
  (0, 64342)	1.0
  (0, 64185)	1.0
  (0, 64131)	1.0
  (0, 64054)	0.8320502943378437
  (0, 64028)	0.9486832980505138
  (0, 63894)	1.0
  (0, 62700)	1.0
  (0, 60965)	1.0
  (0, 60786)	1.0
  :	:
  (74761, 8934)	0.6
  (74761, 8642)	0.5144957554275265
  (74761, 8517)	0.3651483716701107
  (74761, 8387)	1.0
  (74761, 8020)	1.0
  (74761, 7527)	0.39056673294247163
  (74761, 7518)	0.4242640687119285
  (74761, 6612)	1.0
  (74761, 6271)	0.8574929257125441
  (74761, 6102)	0.7071067811865475
  (74761, 5291)	0.5298129428260175
  (74761, 4849)	1.0
  (74761, 4708)	0.5883484054145521
  (74761, 4486)	1.0
  (74761, 3638)	0.31622776601683794
  (74761, 3620)	0.5298129428260175
  (74761, 3414)	0.4472135954999579
  (74761, 2301)	1

In [13]:
user_item_matrix_df = training_data.pivot_table(index='CustomerID', columns='MovieID', values='Rating')


In [14]:

user_user_corr_matrix = user_item_matrix_df.corr()  # Use Pearson correlation


In [16]:
# Convert to sparse matrix format if needed
pearson_similarity_matrix = csr_matrix(user_user_corr_matrix.fillna(0).values)

print(pearson_similarity_matrix)

  (0, 0)	1.0
  (3, 3)	1.0
  (4, 4)	1.0
  (5, 5)	1.0
  (6, 6)	1.0
  (7, 7)	1.0
  (9, 9)	1.0
  (11, 11)	1.0
  (12, 12)	1.0
  (13, 13)	1.0
  (15, 15)	1.0
  (16, 16)	1.0
  (17, 17)	1.0
  (18, 18)	1.0
  (20, 20)	1.0
  (21, 21)	1.0
  (22, 22)	1.0
  (24, 24)	1.0
  (25, 25)	1.0
  (26, 26)	1.0
  (27, 27)	1.0
  (27, 804)	1.0
  (28, 28)	1.0
  (29, 29)	1.0
  (30, 30)	1.0
  :	:
  (3471, 3471)	1.0
  (3472, 3472)	1.0
  (3473, 3473)	1.0
  (3474, 541)	1.0
  (3474, 1189)	0.8660254037844387
  (3474, 2085)	1.0
  (3474, 2239)	1.0
  (3474, 2688)	1.0
  (3474, 3014)	-1.0
  (3474, 3025)	0.3333333333333334
  (3474, 3122)	0.5000000000000001
  (3474, 3200)	1.0
  (3474, 3433)	1.0
  (3474, 3474)	1.0
  (3475, 3475)	1.0
  (3476, 3476)	1.0
  (3477, 3477)	1.0
  (3478, 3478)	1.0
  (3479, 3479)	1.0
  (3480, 3480)	1.0
  (3481, 3481)	1.0
  (3482, 3482)	1.0
  (3484, 1987)	1.0
  (3484, 3484)	1.0
  (3485, 3485)	1.0


In [18]:
print(pearson_similarity_matrix.shape)

(3486, 3486)


# Predict ratings using similarities

In [44]:
def predict_rating_with_similarity_matrix(csr_user_item_matrix, similarity_matrix, user_index, movie_index, k):
    """
    Predict the rating for a given movie by a target user, based on the ratings of top-k similar users.
    This function uses a pre-calculated similarity matrix.
    
    Parameters:
    - csr_user_item_matrix: CSR matrix representing the user-item matrix.
    - similarity_matrix: CSR matrix representing the similarity scores between users.
    - user_index: The index of the user for whom the rating is being predicted.
    - movie_index: The index of the movie for which the rating is being predicted.
    - k: Number of top similar users to consider for prediction.
    
    Returns:
    - Predicted rating for the movie by the target user.
    """
    num_users = similarity_matrix.shape[0]  # Number of users in the similarity matrix
    
    # Check if user_index exists in the similarity matrix
    if user_index >= 0 and user_index < num_users:
        # Extract similarity scores for the target user
        user_similarities = similarity_matrix.getrow(user_index).toarray().flatten()
    else:
        # If user_index does not exist, consider the similarity scores as zeros
        user_similarities = np.zeros(num_users)
    # # Extract similarity scores for the target user and sort them to find top k similar users
    # user_similarities = similarity_matrix.getrow(user_index).toarray().flatten()
    top_k_users_indices = np.argsort(user_similarities)[-k:]
#     # Step 1: Extract all user indices who have rated the movie
#     users_who_rated_movie_indices = np.flatnonzero(csr_user_item_matrix[:, movie_index].toarray().flatten() > 0)

# # Step 2: Extract similarity scores for the target user
#     user_similarities = similarity_matrix.getrow(user_index).toarray().flatten()

# # Filter similarities for users who have rated the movie
#     filtered_similarities = user_similarities[users_who_rated_movie_indices]

# # Check if there are enough users who rated the movie
#     if len(filtered_similarities) >= k:
#         # Step 3: Find indices of top k similar users from the filtered list
#         # Since we are working with filtered indices, we need to sort them based on similarity and then map back to the original user indices
#         top_k_filtered_indices = np.argsort(filtered_similarities)[-k:]

#         # Map back to original indices in the user_item_matrix
#         top_k_users_indices = users_who_rated_movie_indices[top_k_filtered_indices]
#     else:
#         # If there are fewer than k users, use all available users
#         top_k_users_indices = users_who_rated_movie_indices

    # Get ratings of the movie from all users
    movie_ratings = csr_user_item_matrix.getcol(movie_index).toarray().flatten()

    # Calculate weighted average of ratings
    top_k_similarities = user_similarities[top_k_users_indices]
    top_k_ratings = movie_ratings[top_k_users_indices]
    
    weighted_sum = np.dot(top_k_similarities, top_k_ratings)
    similarity_sum = np.sum(top_k_similarities)

    if similarity_sum > 0:
        predicted_rating = weighted_sum / similarity_sum

    else:
        # Use the overall average rating of the movie by all users as the default rating
        overall_average_rating = csr_user_item_matrix[:, movie_index].mean()
        predicted_rating = overall_average_rating if np.isfinite(overall_average_rating) else 5
  # Default rating this is the problem 
    
    return predicted_rating


In [22]:

from sklearn.metrics import mean_squared_error
from math import sqrt
def evaluate_predictions_csr(validation_data, csr_user_item_matrix, similarity_matrix, k, user_id_to_index, movie_id_to_index):
    """
    Evaluate the recommendation system by predicting ratings for each user-movie pair in the validation set
    using a CSR matrix and pre-computed similarity matrix, and comparing the predictions to the actual ratings using RMSE.

    Parameters:
    - validation_data: DataFrame containing 'UserIndex', 'MovieIndex', and 'Rating'.
    - csr_user_item_matrix: CSR matrix representing the user-item matrix from the training set.
    - similarity_matrix: Pre-computed similarity matrix as a CSR matrix.
    - k: The number of top similar users to consider when making predictions.
    - user_id_to_index: Dictionary mapping user IDs to indices.
    - movie_id_to_index: Dictionary mapping movie IDs to indices.
    
    Returns:
    - rmse: The root mean square error of the predicted ratings against the actual ratings.
    """
    actual_ratings = []
    predicted_ratings = []

    for _, row in validation_data.iterrows():
        user_index = row['UserIndex']
        movie_index = row['MovieIndex']
        actual_rating = row['Rating']
        
        # Check if the movie index is valid
        if movie_index in csr_user_item_matrix.indices:
            predicted_rating = predict_rating_with_similarity_matrix(csr_user_item_matrix, similarity_matrix, user_index, movie_index, k)
            actual_ratings.append(actual_rating)
            predicted_ratings.append(predicted_rating)
    
    # Calculate RMSE between actual and predicted ratings
    actual_ratings = np.array(actual_ratings)
    predicted_ratings = np.array(predicted_ratings)
    valid_mask = ~np.isnan(predicted_ratings)
    rmse = sqrt(mean_squared_error(actual_ratings[valid_mask], predicted_ratings[valid_mask]))

    return rmse


In [23]:
def map_ids_to_indices(validation_data, user_id_to_index, movie_id_to_index):
    """
    Map user IDs and movie IDs to their respective indices.

    Parameters:
    - validation_data: DataFrame containing 'CustomerID', 'MovieID', and other columns.
    - user_id_to_index: Dictionary mapping user IDs to indices.
    - movie_id_to_index: Dictionary mapping movie IDs to indices.

    Returns:
    - DataFrame with added columns 'UserIndex' and 'MovieIndex' for the respective indices.
    """

    # Copy to avoid modifying the original DataFrame
    modified_data = validation_data.copy()
    
    # Map 'CustomerID' to 'UserIndex'
    modified_data['UserIndex'] = modified_data['CustomerID'].map(user_id_to_index)
    
    # Map 'MovieID' to 'MovieIndex'
    modified_data['MovieIndex'] = modified_data['MovieID'].map(movie_id_to_index)
    
    # Optional: drop rows where either UserIndex or MovieIndex is NaN (i.e., ID wasn't found)
    modified_data.dropna(subset=['UserIndex', 'MovieIndex'], inplace=True)
    
    # Convert indices to integers (they might be floats due to NaN handling)
    modified_data['UserIndex'] = modified_data['UserIndex'].astype(int)
    modified_data['MovieIndex'] = modified_data['MovieIndex'].astype(int)
    
    return modified_data


In [24]:
mapped_data = map_ids_to_indices(training_data, user_id_to_index, movie_id_to_index)


In [25]:
train_data, validation_data = train_test_split(mapped_data, test_size=0.2, random_state=42)

In [26]:
validation_data.head()

Unnamed: 0,MovieID,CustomerID,Rating,Date,YearOfRelease,RatingYear,MovieAge,UserActivityBin,ItemPopularityBin,Strata,UserIndex,MovieIndex
76827,985,939486,3,2002-11-04,1999,2002,3,medium-high,medium-high,medium-high_medium-high_3,54219,197
52655,2342,570275,5,2004-11-22,2004,2004,0,low,medium-high,low_medium-high_5,18542,103
94920,4123,2501691,4,2001-12-18,1998,2001,3,medium,high,medium_high_4,6317,310
47822,1975,1574990,2,2005-10-24,2000,2005,5,low,medium-high,low_medium-high_2,1163,211
67318,12,569566,2,2003-09-04,1947,2003,56,medium-high,low,medium-high_low_2,14719,1714


In [45]:
# Usage example
similarity_matrix = pearson_similarity_matrix
k = 10  # Example value for k
rmse = evaluate_predictions_csr(validation_data, ratings_csr_matrix, similarity_matrix, k, user_id_to_index, movie_id_to_index)
print(f"RMSE: {rmse}")




KeyboardInterrupt: 

In [None]:
# Define a range of k values to test
k_values = [ 5, 30, 70, 150, 200, 300, 500]

# Initialize a dictionary to store the RMSE for each k value
k_rmse_results = {}

# Loop over each k value
for k in k_values:
    # Evaluate the recommender system using the current k value
    rmse = evaluate_predictions_csr(validation_data, ratings_csr_matrix, similarity_matrix, k, user_id_to_index, movie_id_to_index)

    # Store the RMSE in the dictionary
    k_rmse_results[k] = rmse
    
    # Print the result for the current k
    print(f"RMSE for k={k}: {rmse}")

# Identify the k value with the lowest RMSE
optimal_k = min(k_rmse_results, key=k_rmse_results.get)
optimal_rmse = k_rmse_results[optimal_k]

print(f"\nOptimal k value: {optimal_k} with RMSE: {optimal_rmse}")



RMSE for k=5: 1.517477072883225
RMSE for k=30: 1.5173909880664258
RMSE for k=70: 1.5173909880664258
RMSE for k=150: 1.5173909880664258
RMSE for k=200: 1.5173909880664258
RMSE for k=300: 1.5173909880664258
RMSE for k=500: 1.5173909880664258

Optimal k value: 30 with RMSE: 1.5173909880664258


In [None]:

# Define a dictionary to hold your similarity methods for easy access
similarity_methods = {
    'pearson': calculate_pearson_similarity,
    'cosine': calculate_cosine_similarity,
    'manhattan': calculate_manhattan_similarity  
}

# Define the range of k values you want to test
k_values = range(5, 300, 20)

# Placeholder for storing grid search results
grid_search_results = []

# Perform grid search
for k in k_values:
    for method_name, method_function in similarity_methods.items():
        # Evaluate the recommender system's performance for each combination of k and similarity method
        rmse = evaluate_predictions(validation_data, user_item_matrix, k, method_function)
        
        # Store the results
        grid_search_results.append({'method': method_name, 'k': k, 'rmse': rmse})
        
        # Optionally print the results for each iteration
        print(f"Evaluated {method_name} method with k={k}: RMSE = {rmse}")

# Find the best performing combination of k and similarity method based on RMSE
best_configuration = min(grid_search_results, key=lambda x: x['rmse'])

# Output the best combination found
print(f"Best Configuration: Method = {best_configuration['method']}, k = {best_configuration['k']}, RMSE = {best_configuration['rmse']}")


In [None]:
import numpy as np
import pandas as pd
from scipy.stats import pearsonr
from scipy.sparse import lil_matrix

def calculate_pearson_similarity_matrix_sparse(user_item_matrix):
    """
    Calculate the Pearson correlation coefficient between all pairs of users' ratings in the user-item matrix.
    Only non-zero similarities are stored in a sparse matrix, making it more memory-efficient.
    
    Parameters:
    - user_item_matrix: DataFrame with users as rows, movies as columns, and ratings as values.
    
    Returns:
    - similarity_matrix_sparse: A sparse matrix representing the user-user similarity matrix using Pearson correlation coefficients.
    """
    # Replace 0 with NaN to ignore unrated items in the correlation computation
    user_item_matrix_replaced = user_item_matrix.replace(0, np.nan)
    
    # Initialize a sparse matrix for storing similarities
    n_users = user_item_matrix_replaced.shape[0]
    similarity_matrix_sparse = lil_matrix((n_users, n_users))
    
    user_indices = {user_id: index for index, user_id in enumerate(user_item_matrix_replaced.index)}
    
    # Loop through each pair of users to calculate similarity
    for user1 in user_item_matrix_replaced.index:
        for user2 in user_item_matrix_replaced.index:
            if user1 != user2:
                user1_ratings = user_item_matrix_replaced.loc[user1]
                user2_ratings = user_item_matrix_replaced.loc[user2]
                
                # Find common movies rated by both users
                common_movies = user1_ratings.dropna().index.intersection(user2_ratings.dropna().index)
                
                if len(common_movies) >= 2:
                    # Calculate Pearson correlation for common rated movies
                    correlation, _ = pearsonr(user1_ratings.loc[common_movies], user2_ratings.loc[common_movies])
                    
                    # Store the correlation if it's valid and non-zero
                    if np.isfinite(correlation) and correlation != 0:
                        similarity_matrix_sparse[user_indices[user1], user_indices[user2]] = correlation
                # No need to explicitly set values for pairs with fewer than 2 common ratings or for NaN correlations
            else:
                # Set self-similarity to 1
                similarity_matrix_sparse[user_indices[user1], user_indices[user2]] = 1
                
    return similarity_matrix_sparse


In [None]:
similarity_matrix = calculate_pearson_similarity_matrix_sparse (user_item_matrix)
similarity_matrix.head()