In [1]:
import pandas as pd
import numpy as np
training_df = pd.read_csv('C:/Users/nafla/OneDrive/Documents/system development/Netflix/training_data.csv')
training_df.head()

Unnamed: 0,MovieID,CustomerID,Rating,Date,YearOfRelease,RatingYear,MovieAge
0,1,1488844,3,2005-09-06,2003,2005,2
1,1,822109,5,2005-05-13,2003,2005,2
2,1,885013,4,2005-10-19,2003,2005,2
3,1,30878,4,2005-12-26,2003,2005,2
4,1,823519,3,2004-05-03,2003,2004,1


In [2]:
# Calculate quantiles for user activity and item popularity
user_activity_quantiles = training_df['CustomerID'].value_counts().quantile([0.25, 0.5, 0.75])
item_popularity_quantiles = training_df['MovieID'].value_counts().quantile([0.25, 0.5, 0.75])
print(user_activity_quantiles)
print(item_popularity_quantiles)

0.25     8.0
0.50    24.0
0.75    64.0
Name: CustomerID, dtype: float64
0.25     192.0
0.50     552.5
0.75    2539.0
Name: MovieID, dtype: float64


In [3]:
training_df['CustomerID'] = training_df['CustomerID'].astype(str)
training_df['MovieID'] = training_df['MovieID'].astype(str)
training_df['Rating'] = pd.to_numeric(training_df['Rating'], errors='coerce')  # Converts to float, makes non-numeric as NaN

In [4]:
# Check data types
print("Before conversion:")
print(training_df.dtypes)

Before conversion:
MovieID          object
CustomerID       object
Rating            int64
Date             object
YearOfRelease     int64
RatingYear        int64
MovieAge          int64
dtype: object


# Stratified Sampling Method 

To create a representative sample of our dataset, we employ a stratified sampling method that accounts for three key dimensions: Rating Distribution, User Activity, and Item Popularity. This approach ensures our sample maintains the diversity and characteristics of the entire dataset, facilitating more reliable model training and evaluation.

- User Activity is quantified by the number of ratings a user has provided.
- Item Popularity reflects the number of ratings an item has received.

Finally, We combine User Activity, Item Popularity, and Rating into a composite stratification key for each record. This multi-dimensional key ensures our sampling process considers the distribution across all three axes.

In [5]:
# Assign each user and item to a bin based on the quantiles
training_df['UserActivityBin'] = pd.qcut(training_df.groupby('CustomerID')['Rating'].transform('size'), 
                                q=[0, .25, .5, .75, 1], labels=['low', 'medium', 'medium-high', 'high'])

training_df['ItemPopularityBin'] = pd.qcut(training_df.groupby('MovieID')['Rating'].transform('size'), 
                                  q=[0, .25, .5, .75, 1], labels=['low', 'medium', 'medium-high', 'high'])

# Combine these with Rating to create a stratification key
training_df['Strata'] = training_df['UserActivityBin'].astype(str) + "_" + training_df['ItemPopularityBin'].astype(str) + "_" + training_df['Rating'].astype(str)

# Perform stratified sampling
# we use groupby and  frac to specify a fraction of each strata and in case number of rows is less that 10 it takes all rows
strat_sample_df = training_df.groupby('Strata').apply(lambda x: x.sample(frac=0.005 if len(x) > 10 else len(x)/len(x))).reset_index(drop=True)


In [6]:
num_sampled_rows = len(strat_sample_df)
print(f"Number of rows in the sampled DataFrame: {num_sampled_rows}")

Number of rows in the sampled DataFrame: 120269


# Splitting dataset to training, test, validation

In [7]:
from sklearn.model_selection import train_test_split

# Split the remaining data into training, testing, and validation sets
train, testing_data = train_test_split(strat_sample_df, test_size=0.2, random_state=42)
training_data , validation_data = train_test_split(train, test_size=0.2, random_state=42)

In [8]:
# Calculate the size of each split
training_size = training_data.shape[0]  # Number of rows in the training data
validation_size = validation_data.shape[0]  # Number of rows in the validation data
testing_size = testing_data.shape[0]  # Number of rows in the testing data

# Print the sizes
print(f"Training Data Size: {training_size}")
print(f"Validation Data Size: {validation_size}")
print(f"Testing Data Size: {testing_size}")

Training Data Size: 76972
Validation Data Size: 19243
Testing Data Size: 24054


In [9]:
# Assuming final_training_data, validation_data, and testing_data are your data splits

# Count unique MovieIDs in the final training data
unique_movies_training = training_data['MovieID'].nunique()

# Count unique MovieIDs in the validation data
unique_movies_validation = validation_data['MovieID'].nunique()

# Count unique MovieIDs in the testing data
unique_movies_testing = testing_data['MovieID'].nunique()

# Print the counts
print(f"Unique MovieIDs in Training Data: {unique_movies_training}")
print(f"Unique MovieIDs in Validation Data: {unique_movies_validation}")
print(f"Unique MovieIDs in Testing Data: {unique_movies_testing}")


Unique MovieIDs in Training Data: 3311
Unique MovieIDs in Validation Data: 2124
Unique MovieIDs in Testing Data: 2310


# Creating User - Item matrix

In [13]:
# Creating customer-movie matrix
import pandas as pd
from scipy.sparse import csr_matrix


# Map user IDs and movie IDs to integer indices for CSR matrix
user_ids = training_data['CustomerID'].unique()
movie_ids = training_data['MovieID'].unique()

user_id_to_index = {user_id: index for index, user_id in enumerate(user_ids)}
movie_id_to_index = {movie_id: index for index, movie_id in enumerate(movie_ids)}

# Prepare row, column, and data arrays for csr_matrix
rows = training_data['CustomerID'].map(user_id_to_index)
cols = training_data['MovieID'].map(movie_id_to_index)
data = training_data['Rating']

# Create CSR matrix
ratings_csr_matrix = csr_matrix((data, (rows, cols)), shape=(len(user_ids), len(movie_ids)))


print(ratings_csr_matrix)

  (0, 0)	5
  (1, 1)	4
  (1, 130)	5
  (1, 477)	4
  (1, 779)	5
  (2, 2)	2
  (3, 3)	2
  (4, 4)	3
  (4, 110)	4
  (5, 5)	4
  (5, 699)	4
  (6, 6)	5
  (7, 7)	5
  (8, 8)	3
  (9, 9)	4
  (10, 10)	5
  (10, 175)	3
  (11, 11)	4
  (11, 770)	4
  (12, 12)	1
  (13, 13)	4
  (14, 14)	4
  (15, 15)	3
  (15, 387)	3
  (15, 646)	5
  :	:
  (62381, 232)	3
  (62382, 336)	3
  (62383, 225)	5
  (62384, 96)	5
  (62385, 41)	5
  (62386, 118)	5
  (62387, 348)	5
  (62388, 828)	3
  (62389, 702)	4
  (62390, 9)	5
  (62391, 118)	3
  (62392, 278)	3
  (62393, 434)	5
  (62394, 2062)	3
  (62395, 171)	4
  (62396, 151)	2
  (62397, 725)	3
  (62398, 1368)	1
  (62399, 1082)	3
  (62400, 433)	4
  (62401, 65)	3
  (62402, 142)	5
  (62403, 191)	3
  (62404, 89)	4
  (62405, 29)	5


# Define similarity function for each given user

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np

def calculate_cosine_similarity(user_item_matrix, target_user_ratings):
    """
    Calculate cosine similarity scores between a target user's ratings and all other users.
    
    Parameters:
    - user_item_matrix: DataFrame with users as rows, movies as columns, and ratings as values.
    - target_user_ratings: Series containing the target user's ratings, indexed by MovieID.
    
    Returns:
    A Series with user IDs as the index and the cosine similarity scores as the values.
    """
    # Ensure target_user_ratings is a DataFrame row for compatibility with cosine_similarity
    target_user_df = pd.DataFrame(target_user_ratings).T.fillna(0)
    
    # Align user_item_matrix with target_user_df to match columns (MovieIDs)
    aligned_user_item_matrix = user_item_matrix.reindex(columns=target_user_df.columns, fill_value=0)
    
    # Calculate cosine similarities
    similarities = cosine_similarity(aligned_user_item_matrix, target_user_df)
    
    # Flatten the similarities array and create a Series with user IDs as index
    similarities_series = pd.Series(similarities.flatten(), index=aligned_user_item_matrix.index)
    
    return similarities_series



In [14]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix

def calculate_cosine_similarity_matrix_csr(user_item_matrix_csr):
    """
    Calculate cosine similarity scores between all pairs of users using a CSR matrix.
    
    Parameters:
    - user_item_matrix_csr: CSR matrix with users as rows, movies as columns, and ratings as values.
    
    Returns:
    A CSR matrix representing the user-user similarity matrix with cosine similarity scores.
    """
    # Calculate cosine similarities between all users
    similarity_matrix_csr = cosine_similarity(user_item_matrix_csr, dense_output=False)
    
    return similarity_matrix_csr


In [None]:

from scipy.stats import pearsonr

# defining function to calculate pearson correlation for pair of users
def calculate_pearson_similarity(user_item_matrix, target_user_ratings):
    """
    Calculate the Pearson correlation coefficient between a target user's ratings
    and all other users' ratings in the user-item matrix.
    
    Parameters:
    - user_item_matrix: DataFrame with users as rows, movies as columns, and ratings as values.
    - target_user_ratings: Series containing the target user's ratings, indexed by MovieID.
    
    Returns:
    - A Series with user IDs as the index and the Pearson correlation coefficients as the values.
    """
    similarities = {}
    
    user_item_matrix_replaced = user_item_matrix.replace(0, np.nan)

    # Loop through each user in the user-item matrix
    for user_id, user_ratings in user_item_matrix_replaced.iterrows():
        common_movies = user_ratings.dropna().index.intersection(target_user_ratings.dropna().index)
        if len(common_movies) >= 2:
            correlation, _ = pearsonr(user_ratings.loc[common_movies], target_user_ratings.loc[common_movies])
            if np.isnan(correlation):
                # Handle NaN correlation explicitly
                similarities[user_id] = 0
            elif np.isfinite(correlation):
                similarities[user_id] = correlation
        else:
            similarities[user_id] = 0
    # Convert the similarities dictionary to a pandas Series
    similarity_series = pd.Series(similarities, name='Similarity').sort_values(ascending=False)
    
    return similarity_series


In [None]:
# defining function to calculate manhattan distance for ratings of each pair of users
def calculate_manhattan_similarity(user_item_matrix, target_user_ratings):
    """
    Calculate user similarities using Manhattan distance, comparing target user's ratings
    with those in the user_item_matrix.
    
    Parameters:
    - user_item_matrix: DataFrame where rows are users, columns are items, and values are ratings (training data).
    - target_user_ratings: Series or dict containing the target user's movie ratings.
    
    Returns:
    A Series with user IDs as the index and the similarity scores as the values.
    """
    similarities = {}

    for user_id in user_item_matrix.index:
        user_ratings = user_item_matrix.loc[user_id]
        
        # Calculate distance only for movies both have rated
        common_movies = user_ratings.index.intersection(target_user_ratings.index)
        if not common_movies.empty:
            distance = np.nansum(np.abs(user_ratings[common_movies] - target_user_ratings[common_movies]))
            similarity = 1 / (1 + distance) if distance != 0 else 0
        else:
            similarity = 0  # No common movies means no similarity
        
        similarities[user_id] = similarity

    similarity_series = pd.Series(similarities, name="Similarity").sort_values(ascending=False)
    return similarity_series




# Predict ratings using similarities

In [None]:
# defining a function to predict rating for each pair of user and movie based on their similarity
def predict_rating(user_item_matrix, target_user_ratings, movie_id, k, similarity_method):

    """
    Predict the rating for a given movie by a target user, based on the ratings of top-k similar users.
    
    Parameters:
    - user_item_matrix: DataFrame with users as rows, movies as columns, and ratings as values.
    - target_user_id: The ID of the user for whom the rating is being predicted.
    - movie_id: The ID of the movie for which the rating is being predicted.
    - k: Number of top similar users to consider for prediction.
    - similarity_method: Function to calculate similarity scores between users.
    
    Returns:
    - Predicted rating for the movie by the target user.
    """

    
    # Calculate similarity scores between the target user and all others
    similarities = similarity_method(user_item_matrix, target_user_ratings)
    
   # Filter users who have rated the movie
    users_who_rated_movie = user_item_matrix.index[user_item_matrix[movie_id].notnull()]
    users_with_similarity_scores = similarities.index.intersection(users_who_rated_movie)

    # Retain similarity scores for users who have rated the movie
    similarities_filtered = similarities.loc[users_with_similarity_scores]
    
    # Filter top-k similar users from those who have rated the movie
    top_k_users = similarities_filtered.nlargest(k).index
    
    # Retrieve ratings for the movie from these top-k similar users
    top_k_ratings = user_item_matrix.loc[top_k_users, movie_id]
    
 # Calculate weighted average rating
    if not top_k_ratings.isnull().all():
        weighted_ratings = top_k_ratings * similarities.loc[top_k_users]
        predicted_rating = weighted_ratings.sum() / similarities.loc[top_k_users].sum()
    else:
        # Use the average rating for the movie if available
        if user_item_matrix[movie_id].notnull().any():
            predicted_rating = user_item_matrix[movie_id].mean()
        else:
            # Default rating if the movie has not been rated by anyone
            predicted_rating = 2.5
            
    return predicted_rating




In [35]:
def predict_rating_with_similarity_matrix(user_item_matrix, similarity_matrix,user_index, movie_index, k):
    """
    Predict the rating for a given movie by a target user, based on the ratings of top-k similar users.
    This function uses a pre-calculated similarity matrix.
    
    Parameters:
    - user_item_matrix: DataFrame with users as rows, movies as columns, and ratings as values.
    - similarity_matrix: DataFrame representing the similarity scores between users.
    - user_id: The ID of the user for whom the rating is being predicted.
    - movie_id: The ID of the movie for which the rating is being predicted.
    - k: Number of top similar users to consider for prediction.
    
    Returns:
    - Predicted rating for the movie by the target user.
    """
    
    # Check if the movie is rated by anyone
    if movie_index not in movie_id_to_index:
        return 2.5  # Default rating if the movie has not been rated by anyone
    
    # Extract similarity scores for the target user
    user_similarities = similarity_matrix[user_index, :]

    # Extract ratings for the target movie
    movie_ratings = user_item_matrix[:, movie_index].toarray().flatten()

    # Filter users who have rated the movie
    users_who_rated_movie = user_item_matrix.loc[user_item_matrix[movie_index].notnull()].index
    
    # Keep similarities only for users who have rated the movie
    similarities_filtered = user_similarities[users_who_rated_movie]
    
    # Find indices of top-k similar users
    top_k_indices = np.argsort(similarities_filtered)[-k:]
    

        # Compute the weighted average rating
    weighted_sum = np.dot(user_similarities[top_k_indices], movie_ratings[top_k_indices])
    similarity_sum = user_similarities[top_k_indices].sum()

    if similarity_sum > 0:
        predicted_rating = weighted_sum / similarity_sum
    else:
        predicted_rating = np.nan  # or 2.5, if you prefer a default rating
    
    return predicted_rating

    # # Compute the weighted average rating
    # weighted_sum = 0
    # similarity_sum = 0
    # for similar_user_id in top_k_indices:
    #     # Skip if it's the same user
    #     if similar_user_id == user_id:
    #         continue
    #     user_similarity_score = similarities_filtered[similar_user_id]
    #     user_rating = user_item_matrix.loc[similar_user_id, movie_id]
    #     weighted_sum += user_similarity_score * user_rating
    #     similarity_sum += user_similarity_score

    # # If there were any similar users who rated the movie, calculate the weighted average
    # if similarity_sum > 0:
    #     predicted_rating = weighted_sum / similarity_sum
    # else:
    #     # Default to the average rating for the movie
    #     predicted_rating = user_item_matrix[movie_id].mean()

    # return predicted_rating


In [None]:
from sklearn.metrics import mean_squared_error
from math import sqrt

# defining a function to evaluate accuracy of prediced ratings for each pair of user and movie
def evaluate_predictions(validation_data, user_item_matrix, k, similarity_method):
    """
    Evaluate the recommendation system by predicting ratings for each user-movie pair in the validation set
    and comparing the predictions to the actual ratings using RMSE.
    
    Parameters:
    - validation_data: DataFrame containing 'CustomerID', 'MovieID', and 'Rating'.
    - user_item_matrix: DataFrame representing the user-item matrix from the training set.
    - k: The number of top similar users to consider when making predictions.
    - similarity_method: The function to calculate similarity scores between users.
    
    Returns:
    - rmse: The root mean square error of the predicted ratings against the actual ratings.
    """
    actual_ratings = []
    predicted_ratings = []

    # Your existing logic to populate actual_ratings and predicted_ratings


    user_ratings_map = validation_data.groupby('CustomerID').apply(lambda x: x.set_index('MovieID')['Rating'])

    # Iterate over each row in the validation data
    # Inside evaluate_predictions, before calling predict_rating:
    for _, row in validation_data.iterrows():
        user_id = row['CustomerID']
        movie_id = row['MovieID']
        actual_rating = row['Rating']
    
    # Prepare target_user_ratings as a Series
    # This assumes you have a way to extract all ratings for user_id from validation_data
    # Here's a placeholder for how you might do this, replace with your actual logic
        target_user_ratings = user_ratings_map.get(user_id, pd.Series(dtype='float64'))

    # Now call predict_rating with target_user_ratings instead of target_user_id
    if movie_id in user_item_matrix.columns and not target_user_ratings.empty:  # Check if movie exists in training data
        predicted_rating = predict_rating(user_item_matrix, target_user_ratings, movie_id, k, similarity_method)
        actual_ratings.append(actual_rating)
        predicted_ratings.append(predicted_rating)
        
    
    # Calculate RMSE between actual and predicted ratings
    rmse = sqrt(mean_squared_error(actual_ratings, predicted_ratings))

    return rmse




In [32]:
def evaluate_predictions_csr(validation_data, csr_user_item_matrix, similarity_matrix, k, user_id_to_index, movie_id_to_index):
    """
    Evaluate the recommendation system by predicting ratings for each user-movie pair in the validation set
    using a CSR matrix and pre-computed similarity matrix, and comparing the predictions to the actual ratings using RMSE.

    Parameters:
    - validation_data: DataFrame containing 'UserIndex', 'MovieIndex', and 'Rating'.
    - csr_user_item_matrix: CSR matrix representing the user-item matrix from the training set.
    - similarity_matrix: Pre-computed similarity matrix as a CSR matrix.
    - k: The number of top similar users to consider when making predictions.
    - user_id_to_index: Dictionary mapping user IDs to indices.
    - movie_id_to_index: Dictionary mapping movie IDs to indices.
    
    Returns:
    - rmse: The root mean square error of the predicted ratings against the actual ratings.
    """
    actual_ratings = []
    predicted_ratings = []

    for _, row in validation_data.iterrows():
        user_index = row['UserIndex']
        movie_index = row['MovieIndex']
        actual_rating = row['Rating']
        
        # Check if the movie index is valid
        if movie_index in csr_user_item_matrix.indices:
            predicted_rating = predict_rating_with_similarity_matrix(csr_user_item_matrix, similarity_matrix, user_index, movie_index, k)
            actual_ratings.append(actual_rating)
            predicted_ratings.append(predicted_rating)
    
    # Calculate RMSE between actual and predicted ratings
    actual_ratings = np.array(actual_ratings)
    predicted_ratings = np.array(predicted_ratings)
    valid_mask = ~np.isnan(predicted_ratings)
    rmse = sqrt(mean_squared_error(actual_ratings[valid_mask], predicted_ratings[valid_mask]))

    return rmse


In [29]:
def map_ids_to_indices(validation_data, user_id_to_index, movie_id_to_index):
    """
    Map user IDs and movie IDs to their respective indices.

    Parameters:
    - validation_data: DataFrame containing 'CustomerID', 'MovieID', and other columns.
    - user_id_to_index: Dictionary mapping user IDs to indices.
    - movie_id_to_index: Dictionary mapping movie IDs to indices.

    Returns:
    - DataFrame with added columns 'UserIndex' and 'MovieIndex' for the respective indices.
    """

    # Copy to avoid modifying the original DataFrame
    modified_data = validation_data.copy()
    
    # Map 'CustomerID' to 'UserIndex'
    modified_data['UserIndex'] = modified_data['CustomerID'].map(user_id_to_index)
    
    # Map 'MovieID' to 'MovieIndex'
    modified_data['MovieIndex'] = modified_data['MovieID'].map(movie_id_to_index)
    
    # Optional: drop rows where either UserIndex or MovieIndex is NaN (i.e., ID wasn't found)
    modified_data.dropna(subset=['UserIndex', 'MovieIndex'], inplace=True)
    
    # Convert indices to integers (they might be floats due to NaN handling)
    modified_data['UserIndex'] = modified_data['UserIndex'].astype(int)
    modified_data['MovieIndex'] = modified_data['MovieIndex'].astype(int)
    
    return modified_data


In [30]:
mapped_validation_data = map_ids_to_indices(validation_data, user_id_to_index, movie_id_to_index)


In [33]:
mapped_validation_data.head()

Unnamed: 0,MovieID,CustomerID,Rating,Date,YearOfRelease,RatingYear,MovieAge,UserActivityBin,ItemPopularityBin,Strata,UserIndex,MovieIndex
76827,1719,769555,3,2005-06-13,2004,2005,1,medium-high,medium-high,medium-high_medium-high_3,3228,88
23683,3890,682697,3,2004-12-10,2004,2004,0,high,medium,high_medium_3,49282,828
78709,312,617941,4,2002-06-14,2000,2002,2,medium-high,medium-high,medium-high_medium-high_4,39082,56
15611,1843,2625306,2,2004-04-12,1994,2004,10,high,medium-high,high_medium-high_2,20580,31
13050,4201,1295669,4,2002-10-05,1996,2002,6,high,low,high_low_4,4352,1436


In [36]:
# Usage example
similarity_matrix = calculate_cosine_similarity_matrix_csr(ratings_csr_matrix)
k = 10  # Example value for k
rmse = evaluate_predictions_csr(mapped_validation_data, ratings_csr_matrix, similarity_matrix, k, user_id_to_index, movie_id_to_index)
print(f"RMSE: {rmse}")




RMSE: 1.4887757342686776


In [38]:
# Define a range of k values to test
k_values = [ 5, 30, 70, 150, 200, 300, 500]

# Initialize a dictionary to store the RMSE for each k value
k_rmse_results = {}

# Loop over each k value
for k in k_values:
    # Evaluate the recommender system using the current k value
    rmse = evaluate_predictions_csr(mapped_validation_data, ratings_csr_matrix, similarity_matrix, k, user_id_to_index, movie_id_to_index)

    # Store the RMSE in the dictionary
    k_rmse_results[k] = rmse
    
    # Print the result for the current k
    print(f"RMSE for k={k}: {rmse}")

# Identify the k value with the lowest RMSE
optimal_k = min(k_rmse_results, key=k_rmse_results.get)
optimal_rmse = k_rmse_results[optimal_k]

print(f"\nOptimal k value: {optimal_k} with RMSE: {optimal_rmse}")



RMSE for k=5: 1.4887757342686776
RMSE for k=30: 1.4887757342686776
RMSE for k=70: 1.4887757342686776
RMSE for k=150: 1.4887757342686776
RMSE for k=200: 1.4887757342686776
RMSE for k=300: 1.4887757342686776
RMSE for k=500: 1.4887757342686776

Optimal k value: 5 with RMSE: 1.4887757342686776


In [None]:

# Define a dictionary to hold your similarity methods for easy access
similarity_methods = {
    'pearson': calculate_pearson_similarity,
    'cosine': calculate_cosine_similarity,
    'manhattan': calculate_manhattan_similarity  
}

# Define the range of k values you want to test
k_values = range(5, 300, 20)

# Placeholder for storing grid search results
grid_search_results = []

# Perform grid search
for k in k_values:
    for method_name, method_function in similarity_methods.items():
        # Evaluate the recommender system's performance for each combination of k and similarity method
        rmse = evaluate_predictions(validation_data, user_item_matrix, k, method_function)
        
        # Store the results
        grid_search_results.append({'method': method_name, 'k': k, 'rmse': rmse})
        
        # Optionally print the results for each iteration
        print(f"Evaluated {method_name} method with k={k}: RMSE = {rmse}")

# Find the best performing combination of k and similarity method based on RMSE
best_configuration = min(grid_search_results, key=lambda x: x['rmse'])

# Output the best combination found
print(f"Best Configuration: Method = {best_configuration['method']}, k = {best_configuration['k']}, RMSE = {best_configuration['rmse']}")


In [None]:
import numpy as np
import pandas as pd
from scipy.stats import pearsonr
from scipy.sparse import lil_matrix

def calculate_pearson_similarity_matrix_sparse(user_item_matrix):
    """
    Calculate the Pearson correlation coefficient between all pairs of users' ratings in the user-item matrix.
    Only non-zero similarities are stored in a sparse matrix, making it more memory-efficient.
    
    Parameters:
    - user_item_matrix: DataFrame with users as rows, movies as columns, and ratings as values.
    
    Returns:
    - similarity_matrix_sparse: A sparse matrix representing the user-user similarity matrix using Pearson correlation coefficients.
    """
    # Replace 0 with NaN to ignore unrated items in the correlation computation
    user_item_matrix_replaced = user_item_matrix.replace(0, np.nan)
    
    # Initialize a sparse matrix for storing similarities
    n_users = user_item_matrix_replaced.shape[0]
    similarity_matrix_sparse = lil_matrix((n_users, n_users))
    
    user_indices = {user_id: index for index, user_id in enumerate(user_item_matrix_replaced.index)}
    
    # Loop through each pair of users to calculate similarity
    for user1 in user_item_matrix_replaced.index:
        for user2 in user_item_matrix_replaced.index:
            if user1 != user2:
                user1_ratings = user_item_matrix_replaced.loc[user1]
                user2_ratings = user_item_matrix_replaced.loc[user2]
                
                # Find common movies rated by both users
                common_movies = user1_ratings.dropna().index.intersection(user2_ratings.dropna().index)
                
                if len(common_movies) >= 2:
                    # Calculate Pearson correlation for common rated movies
                    correlation, _ = pearsonr(user1_ratings.loc[common_movies], user2_ratings.loc[common_movies])
                    
                    # Store the correlation if it's valid and non-zero
                    if np.isfinite(correlation) and correlation != 0:
                        similarity_matrix_sparse[user_indices[user1], user_indices[user2]] = correlation
                # No need to explicitly set values for pairs with fewer than 2 common ratings or for NaN correlations
            else:
                # Set self-similarity to 1
                similarity_matrix_sparse[user_indices[user1], user_indices[user2]] = 1
                
    return similarity_matrix_sparse


In [None]:
similarity_matrix = calculate_pearson_similarity_matrix_sparse (user_item_matrix)
similarity_matrix.head()