In [1]:
import pandas as pd
import numpy as np
training_df = pd.read_csv('C:/Users/nafla/OneDrive/Documents/system development/training_data.csv')
training_df.head()

Unnamed: 0,MovieID,CustomerID,Rating,Date,YearOfRelease,MovieTitle,RatingYear,MovieAge,user_activity,AverageMovieAgeRated,user_average_rating,average_rating_per_movie,number_of_ratings_per_movie,scaled_movie_age
0,1,1488844,3,2005-09-06,2003,Dinosaur Planet,2005,2,1.473012,1.640503,3.253308,3.910534,1.010541,1.215054
1,1,822109,5,2005-05-13,2003,Dinosaur Planet,2005,2,1.031355,1.405855,4.083333,3.910534,1.010541,1.215054
2,1,885013,4,2005-10-19,2003,Dinosaur Planet,2005,2,1.077044,1.400853,3.873563,3.910534,1.010541,1.215054
3,1,30878,4,2005-12-26,2003,Dinosaur Planet,2005,2,1.275924,1.525706,3.634304,3.910534,1.010541,1.215054
4,1,823519,3,2004-05-03,2003,Dinosaur Planet,2004,1,1.139754,1.326786,3.917197,3.910534,1.010541,1.172043


In [2]:
# Calculate quantiles for user activity and item popularity
user_activity_quantiles = training_df['CustomerID'].value_counts().quantile([0.25, 0.5, 0.75])
item_popularity_quantiles = training_df['MovieID'].value_counts().quantile([0.25, 0.5, 0.75])
print(user_activity_quantiles)
print(item_popularity_quantiles)

0.25     8.0
0.50    24.0
0.75    64.0
Name: count, dtype: float64
0.25     192.0
0.50     552.5
0.75    2539.0
Name: count, dtype: float64


In [3]:
training_df['CustomerID'] = training_df['CustomerID'].astype(str)
training_df['MovieID'] = training_df['MovieID'].astype(str)
training_df['Rating'] = pd.to_numeric(training_df['Rating'], errors='coerce')  # Converts to float, makes non-numeric as NaN

In [4]:
# Check data types
print(training_df.dtypes)

MovieID                         object
CustomerID                      object
Rating                           int64
Date                            object
YearOfRelease                    int64
MovieTitle                      object
RatingYear                       int64
MovieAge                         int64
user_activity                  float64
AverageMovieAgeRated           float64
user_average_rating            float64
average_rating_per_movie       float64
number_of_ratings_per_movie    float64
scaled_movie_age               float64
dtype: object


In [5]:
# List of your columns to be rounded and converted
columns_to_round_and_convert = ['user_activity', 'AverageMovieAgeRated', 'user_average_rating']

# Apply rounding and conversion to all specified columns
for column in columns_to_round_and_convert:
    training_df[column] = training_df[column].round().astype(int)

# Display the DataFrame to verify the changes
print(training_df.head())


  MovieID CustomerID  Rating        Date  YearOfRelease       MovieTitle  \
0       1    1488844       3  2005-09-06           2003  Dinosaur Planet   
1       1     822109       5  2005-05-13           2003  Dinosaur Planet   
2       1     885013       4  2005-10-19           2003  Dinosaur Planet   
3       1      30878       4  2005-12-26           2003  Dinosaur Planet   
4       1     823519       3  2004-05-03           2003  Dinosaur Planet   

   RatingYear  MovieAge  user_activity  AverageMovieAgeRated  \
0        2005         2              1                     2   
1        2005         2              1                     1   
2        2005         2              1                     1   
3        2005         2              1                     2   
4        2004         1              1                     1   

   user_average_rating  average_rating_per_movie  number_of_ratings_per_movie  \
0                    3                  3.910534                     1.010541

In [6]:
# Assuming your DataFrame is named df
# List the names of the columns you want to drop
columns_to_drop = ['average_rating_per_movie', 'number_of_ratings_per_movie', 'MovieAge']

# Drop the specified columns from the DataFrame
training_df = training_df.drop(columns=columns_to_drop)

# Display the DataFrame to verify the changes
print(training_df.head())


  MovieID CustomerID  Rating        Date  YearOfRelease       MovieTitle  \
0       1    1488844       3  2005-09-06           2003  Dinosaur Planet   
1       1     822109       5  2005-05-13           2003  Dinosaur Planet   
2       1     885013       4  2005-10-19           2003  Dinosaur Planet   
3       1      30878       4  2005-12-26           2003  Dinosaur Planet   
4       1     823519       3  2004-05-03           2003  Dinosaur Planet   

   RatingYear  user_activity  AverageMovieAgeRated  user_average_rating  \
0        2005              1                     2                    3   
1        2005              1                     1                    4   
2        2005              1                     1                    4   
3        2005              1                     2                    4   
4        2004              1                     1                    4   

   scaled_movie_age  
0          1.215054  
1          1.215054  
2          1.215054  
3   

In [7]:
# Check data types
print(training_df.dtypes)

MovieID                  object
CustomerID               object
Rating                    int64
Date                     object
YearOfRelease             int64
MovieTitle               object
RatingYear                int64
user_activity             int32
AverageMovieAgeRated      int32
user_average_rating       int32
scaled_movie_age        float64
dtype: object


In [8]:
# Assuming your DataFrame is named df and the column you're interested in is 'column_name'

# Get the minimum value in the column
min_value = training_df['user_average_rating'].min()

# Get the maximum value in the column
max_value = training_df['user_average_rating'].max()

# Display the minimum and maximum values
print(f"Minimum value in column 'column_name': {min_value}")
print(f"Maximum value in column 'column_name': {max_value}")


Minimum value in column 'column_name': 1
Maximum value in column 'column_name': 5


# Stratified Sampling Method 

To create a representative sample of our dataset, we employ a stratified sampling method that accounts for three key dimensions: Rating Distribution, User Activity, and Item Popularity. This approach ensures our sample maintains the diversity and characteristics of the entire dataset, facilitating more reliable model training and evaluation.

- User Activity is quantified by the number of ratings a user has provided.
- Item Popularity reflects the number of ratings an item has received.

Finally, We combine User Activity, Item Popularity, and Rating into a composite stratification key for each record. This multi-dimensional key ensures our sampling process considers the distribution across all three axes.

In [9]:
# # Assign each user and item to a bin based on the quantiles
# training_df['UserActivityBin'] = pd.qcut(training_df.groupby('CustomerID')['Rating'].transform('size'), 
#                                 q=[0, .25, .5, .75, 1], labels=['low', 'medium', 'medium-high', 'high'])

# # training_df['ItemPopularityBin'] = pd.qcut(training_df.groupby('MovieID')['Rating'].transform('size'), 
# #                                   q=[0, .25, .5, .75, 1], labels=['low', 'medium', 'medium-high', 'high'])

# # Combine these with Rating to create a stratification key
# # training_df['Strata'] = training_df['UserActivityBin'].astype(str) + "_" + training_df['ItemPopularityBin'].astype(str) + "_" + training_df['Rating'].astype(str)
# training_df['Strata'] = training_df['UserActivityBin'].astype(str) + training_df['Rating'].astype(str)

# # Perform stratified sampling
# # we use groupby and  frac to specify a fraction of each strata and in case number of rows is less that 10 it takes all rows
# strat_sample_df = training_df.groupby('Strata').apply(lambda x: x.sample(frac=0.005 if len(x) > 10 else len(x)/len(x))).reset_index(drop=True)


In [10]:
import pandas as pd

# Assuming 'training_df' is your DataFrame

# Step 1: Filter users with more than 10 ratings
user_filtered_df = training_df.groupby('CustomerID').filter(lambda x: len(x) > 10)

# Step 2: Filter movies with more than 10 ratings
movie_filtered_df = user_filtered_df.groupby('MovieID').filter(lambda x: len(x) > 10)

# Step 3: Perform random sampling
# Replace 'fraction' with the fraction of data you want to sample. For example, 0.005 for 0.5%
fraction = 0.005
strat_sample_df = movie_filtered_df.sample(frac=fraction, random_state=42)  # Ensure reproducibility with random_state

# Display the shapes of the original, user-filtered, movie-filtered, and sampled DataFrames
print("Original DataFrame shape:", training_df.shape)
print("User-Filtered DataFrame shape:", user_filtered_df.shape)
print("Movie-Filtered DataFrame shape:", movie_filtered_df.shape)
print("Sampled DataFrame shape:", strat_sample_df.shape)

# 'strat_sample_df' now contains the randomly sampled data from both the users and movies with more than 10 ratings.


Original DataFrame shape: (24053575, 11)
User-Filtered DataFrame shape: (23343305, 11)
Movie-Filtered DataFrame shape: (23343305, 11)
Sampled DataFrame shape: (116717, 11)


In [11]:
num_sampled_rows = len(strat_sample_df)
print(f"Number of rows in the sampled DataFrame: {num_sampled_rows}")

Number of rows in the sampled DataFrame: 116717


# Splitting dataset to training, test, validation

In [12]:
from sklearn.model_selection import train_test_split

# Assuming strat_sample_df is your entire dataset
user_ids = strat_sample_df['CustomerID'].unique()
movie_ids = strat_sample_df['MovieID'].unique()

# Create mappings based on the entire dataset
user_id_to_index = {user_id: index for index, user_id in enumerate(user_ids)}
movie_id_to_index = {movie_id: index for index, movie_id in enumerate(movie_ids)}

# Now, split your dataset
training_data, testing_data = train_test_split(strat_sample_df, test_size=0.2, random_state=42)


In [13]:
def map_ids_to_indices(df, user_id_to_index, movie_id_to_index):
    """
    Map user IDs and movie IDs to their respective indices.

    Parameters:
    - df: DataFrame containing 'CustomerID', 'MovieID', and other columns.
    - user_id_to_index: Dictionary mapping user IDs to indices.
    - movie_id_to_index: Dictionary mapping movie IDs to indices.

    Returns:
    - DataFrame with added columns 'UserIndex' and 'MovieIndex' for the respective indices.
    """

    # Copy to avoid modifying the original DataFrame
    modified_data = df.copy()
    
    # Map 'CustomerID' to 'UserIndex'
    modified_data['UserIndex'] = modified_data['CustomerID'].map(user_id_to_index)
    
    # Map 'MovieID' to 'MovieIndex'
    modified_data['MovieIndex'] = modified_data['MovieID'].map(movie_id_to_index)
    
    # Optional: drop rows where either UserIndex or MovieIndex is NaN (i.e., ID wasn't found)
    modified_data.dropna(subset=['UserIndex', 'MovieIndex'], inplace=True)
    
    # Convert indices to integers (they might be floats due to NaN handling)
    modified_data['UserIndex'] = modified_data['UserIndex'].astype(int)
    modified_data['MovieIndex'] = modified_data['MovieIndex'].astype(int)
    
    return modified_data

In [14]:
mapped_training_data = map_ids_to_indices(training_data,user_id_to_index, movie_id_to_index)

In [15]:
# Calculate the size of each split
training_size = training_data.shape[0]  # Number of rows in the training data
training_size_mapp = mapped_training_data.shape[0]  # Number of rows in the training data
# validation_size = validation_data.shape[0]  # Number of rows in the validation data
testing_size = testing_data.shape[0]  # Number of rows in the testing data

# Print the sizes
print(f"Training Data Size: {training_size}")
print(f"Training Data Size: {training_size_mapp}")
# print(f"Validation Data Size: {validation_size}")
print(f"Testing Data Size: {testing_size}")

Training Data Size: 93373
Training Data Size: 93373
Testing Data Size: 23344


In [16]:
# Assuming final_training_data, validation_data, and testing_data are your data splits

# Count unique MovieIDs in the final training data
unique_users_training = training_data['CustomerID'].nunique()

# Count unique MovieIDs in the validation data
# unique_movies_validation = validation_data['MovieID'].nunique()

# Count unique MovieIDs in the testing data
unique_movies_testing = testing_data['CustomerID'].nunique()

# Print the counts
print(f"Unique CustomerIDs in Training Data: {unique_users_training}")
# print(f"Unique MovieIDs in Validation Data: {unique_movies_validation}")
print(f"Unique MovieIDs in Testing Data: {unique_movies_testing}")


Unique CustomerIDs in Training Data: 71886
Unique MovieIDs in Testing Data: 21626


# Creating User - Item matrix

In [17]:
# Creating customer-movie matrix
import pandas as pd
from scipy.sparse import csr_matrix

# user_ids = training_data['CustomerID'].unique()
# movie_ids = training_data['MovieID'].unique()

# user_id_to_index = {user_id: index for index, user_id in enumerate(user_ids)}
# movie_id_to_index = {movie_id: index for index, movie_id in enumerate(movie_ids)}


# Extract rows, columns, and data for CSR matrix
# rows = training_data['UserIndex'].values
# cols = training_data['MovieIndex'].values
# data = training_data['Rating'].values

# # Calculate the shape of the matrix
# num_users = len(user_id_to_index)
# num_movies = len(movie_id_to_index)

# # Create the CSR matrix
# ratings_csr_matrix = csr_matrix((data, (rows, cols)), shape=(num_users, num_movies))

# print(ratings_csr_matrix)

# # Extract the rows (user indices), columns (movie indices), and data (ratings) for the CSR matrix
# rows = mapped_training_data['UserIndex'].values
# cols = mapped_training_data['MovieIndex'].values
# data = mapped_training_data['Rating'].values

# # Determine the shape of the CSR matrix
# # The shape is (max_user_index + 1, max_movie_index + 1) because indices start from 0
# num_users = mapped_training_data['UserIndex'].max() + 1
# num_movies = mapped_training_data['MovieIndex'].max() + 1

# # Create the CSR matrix
# ratings_csr_matrix = csr_matrix((data, (rows, cols)), shape=(num_users, num_movies))

# print(ratings_csr_matrix)

In [18]:
# Assuming 'mapped_training_data' is your training dataset that contains the features
user_activity_values = mapped_training_data['user_activity'].values
AverageMovieAgeRated_values = mapped_training_data['AverageMovieAgeRated'].values
user_avg_rating_values = mapped_training_data['user_average_rating'].values


In [19]:
from scipy.sparse import csr_matrix, hstack

# Extract user indexes and movie indexes
user_indexes = mapped_training_data['UserIndex'].values
movie_indexes = mapped_training_data['MovieIndex'].values
ratings = mapped_training_data['Rating'].values

# Create the base user-item ratings CSR matrix
num_users = user_indexes.max() + 1
num_movies = movie_indexes.max() + 1
ratings_csr_matrix = csr_matrix((ratings, (user_indexes, movie_indexes)), shape=(num_users, num_movies))

# Create CSR matrices for features
user_activity_matrix = csr_matrix((user_activity_values, (user_indexes, np.zeros_like(user_indexes))), shape=(num_users, 1))
avg_movie_age_matrix = csr_matrix((AverageMovieAgeRated_values, (user_indexes, np.zeros_like(user_indexes))), shape=(num_users, 1))
user_avg_rating_matrix = csr_matrix((user_avg_rating_values, (user_indexes, np.zeros_like(user_indexes))), shape=(num_users, 1))


In [20]:
# Horizontally stack the features matrices with the ratings CSR matrix
full_csr_matrix = hstack([ratings_csr_matrix, user_activity_matrix])


In [21]:
print (full_csr_matrix)

  (0, 240)	3
  (0, 356)	1
  (0, 903)	5
  (0, 3614)	3
  (1, 1)	4
  (1, 3614)	1
  (2, 2)	3
  (2, 246)	4
  (2, 1388)	2
  (2, 3614)	6
  (4, 4)	4
  (4, 3614)	1
  (5, 5)	5
  (5, 3614)	1
  (6, 6)	5
  (6, 7)	2
  (6, 14)	5
  (6, 40)	4
  (6, 692)	5
  (6, 3614)	5
  (7, 7)	3
  (7, 64)	3
  (7, 3614)	2
  (8, 8)	5
  (8, 3614)	1
  :	:
  (85063, 3614)	1
  (85064, 144)	4
  (85064, 3614)	1
  (85066, 479)	5
  (85066, 3614)	1
  (85067, 34)	4
  (85067, 3614)	1
  (85068, 219)	4
  (85068, 3614)	1
  (85070, 17)	4
  (85070, 3614)	1
  (85071, 802)	5
  (85071, 3614)	1
  (85073, 156)	5
  (85073, 3614)	1
  (85075, 83)	3
  (85075, 3614)	1
  (85078, 19)	3
  (85078, 3614)	1
  (85079, 131)	5
  (85079, 3614)	1
  (85080, 196)	4
  (85080, 3614)	1
  (85081, 623)	1
  (85081, 3614)	1


# Define similarity function for each given user

In [22]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix

cosine_similarity_matrix_csr = cosine_similarity(ratings_csr_matrix, dense_output=False)
    



In [23]:
print(cosine_similarity_matrix_csr)

  (0, 82286)	0.8451542547285166
  (0, 79940)	0.8451542547285166
  (0, 68527)	0.8451542547285166
  (0, 66537)	0.4610694459770735
  (0, 55180)	0.8451542547285166
  (0, 51297)	0.8451542547285166
  (0, 44975)	0.629940788348712
  (0, 44222)	0.8451542547285166
  (0, 40900)	0.8451542547285166
  (0, 40221)	0.8451542547285166
  (0, 39531)	0.7247137945655604
  (0, 30346)	0.6444240777830839
  (0, 29624)	0.8451542547285166
  (0, 26182)	0.8451542547285166
  (0, 19897)	0.8451542547285166
  (0, 17360)	0.8017837257372731
  (0, 16537)	0.8451542547285166
  (0, 15146)	0.8451542547285166
  (0, 13565)	0.8451542547285166
  (0, 10008)	0.4161251892882395
  (0, 4155)	0.7247137945655604
  (0, 3970)	0.8451542547285166
  (0, 3598)	0.8451542547285166
  (0, 2775)	0.8451542547285166
  (0, 2563)	0.5279636773484547
  :	:
  (85081, 51054)	1.0
  (85081, 47513)	0.7071067811865475
  (85081, 47276)	1.0
  (85081, 44291)	0.5144957554275265
  (85081, 43971)	1.0
  (85081, 42291)	0.3333333333333333
  (85081, 41546)	1.0
  (85081

In [24]:
print(cosine_similarity_matrix_csr.shape)

(85082, 85082)


# Predict ratings using similarities

In [25]:
def predict_rating_with_similarity_matrix(csr_user_item_matrix, similarity_matrix, user_index, movie_index, k):
    """
    Predict the rating for a given movie by a target user, based on the ratings of top-k similar users.
    This function uses a pre-calculated similarity matrix.
    
    Parameters:
    - csr_user_item_matrix: CSR matrix representing the user-item matrix.
    - similarity_matrix: CSR matrix representing the similarity scores between users.
    - user_index: The index of the user for whom the rating is being predicted.
    - movie_index: The index of the movie for which the rating is being predicted.
    - k: Number of top similar users to consider for prediction.
    
    Returns:
    - Predicted rating for the movie by the target user.
    """
    # Step 1: Identify users who have rated the movie
    movie_rated_indices = csr_user_item_matrix[:, movie_index].nonzero()[0]
    
    # Check if user_index exists in the similarity matrix
    if user_index < 0 or user_index >= similarity_matrix.shape[0]:
        # If user_index does not exist in similarity matrix, return default prediction
        overall_average_rating = csr_user_item_matrix[:, movie_index].data.mean()
        return overall_average_rating if np.isfinite(overall_average_rating) else 3.0  # Assuming 3.0 as a neutral rating
    
    # Step 2: Extract similarity scores for the target user with all other users
    user_similarities = similarity_matrix.getrow(user_index).toarray().flatten()
    
    # Step 3: Filter the similarities for users who have rated the movie
    filtered_similarities = user_similarities[movie_rated_indices]
    
    # Step 4: Get indices of top k similar users among those who have rated the movie
    top_k_indices = np.argsort(filtered_similarities)[-k:]
    top_k_users_indices = movie_rated_indices[top_k_indices]
    top_k_similarities = filtered_similarities[top_k_indices]

    # Retrieve ratings for the movie from these top-k similar users
    top_k_ratings = csr_user_item_matrix[top_k_users_indices, movie_index].toarray().flatten()

    # Calculate the weighted average rating
    weighted_sum = np.dot(top_k_similarities, top_k_ratings)
    similarity_sum = np.sum(top_k_similarities)
    
    if similarity_sum > 0:
        predicted_rating = weighted_sum / similarity_sum
    else:
        # Use the overall average rating of the movie by all users as the default rating
        overall_average_rating = csr_user_item_matrix[:, movie_index].data.mean()
        predicted_rating = overall_average_rating if np.isfinite(overall_average_rating) else 3.0  # Assuming 3.0 as a neutral rating

    return predicted_rating


In [26]:

from sklearn.metrics import mean_squared_error
from math import sqrt
def evaluate_predictions_csr(validation_data, csr_user_item_matrix, similarity_matrix, k):
    """
    Evaluate the recommendation system by predicting ratings for each user-movie pair in the validation set
    using a CSR matrix and pre-computed similarity matrix, and comparing the predictions to the actual ratings using RMSE.

    Parameters:
    - validation_data: DataFrame containing 'UserIndex', 'MovieIndex', and 'Rating'.
    - csr_user_item_matrix: CSR matrix representing the user-item matrix from the training set.
    - similarity_matrix: Pre-computed similarity matrix as a CSR matrix.
    - k: The number of top similar users to consider when making predictions.
    
    Returns:
    - rmse: The root mean square error of the predicted ratings against the actual ratings.
    """
    actual_ratings = []
    predicted_ratings = []

    for _, row in validation_data.iterrows():
        user_index = row['UserIndex']
        movie_index = row['MovieIndex']
        actual_rating = row['Rating']
        
        # Check if the movie index is valid
        if movie_index in csr_user_item_matrix.indices:
            predicted_rating = predict_rating_with_similarity_matrix(csr_user_item_matrix, similarity_matrix, user_index, movie_index, k)
            actual_ratings.append(actual_rating)
            predicted_ratings.append(predicted_rating)
    
    # Calculate RMSE between actual and predicted ratings
    actual_ratings = np.array(actual_ratings)
    predicted_ratings = np.array(predicted_ratings)
    valid_mask = ~np.isnan(predicted_ratings)
    rmse = sqrt(mean_squared_error(actual_ratings[valid_mask], predicted_ratings[valid_mask]))

    return rmse


In [27]:
from sklearn.model_selection import KFold
import numpy as np

# Assuming k_values to test and your similarity matrix is already defined
k_values = [5,  50,  200]
similarity_matrix = cosine_similarity_matrix_csr
# Setup KFold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Initialize results storage
results = []

for k in k_values:
    fold_rmses = []  # Store RMSEs for each fold

    for train_indices, test_indices in kf.split(ratings_csr_matrix):
        # Splitting your data: ratings_csr_matrix doesn't change, so you just map validation set indices
        validation_data_fold = mapped_training_data.iloc[test_indices]

        # Evaluate predictions on this fold's test set
        rmse = evaluate_predictions_csr(validation_data_fold, ratings_csr_matrix, similarity_matrix, k)
        fold_rmses.append(rmse)

    # Calculate average RMSE for this k over all folds
    avg_rmse = np.mean(fold_rmses)
    results.append((k, avg_rmse))
    print(f"k={k}, Average RMSE={avg_rmse}")

# Find the best k value based on average RMSE
best_k, best_rmse = min(results, key=lambda x: x[1])
print(f"Best k: {best_k} with RMSE: {best_rmse}")


k=5, Average RMSE=0.9508280757776708
k=50, Average RMSE=0.964437779735535
k=200, Average RMSE=0.9678785252100521
Best k: 5 with RMSE: 0.9508280757776708


In [28]:
mapped_testing_data = map_ids_to_indices(testing_data, user_id_to_index, movie_id_to_index)

In [29]:
# Usage example
similarity_matrix = cosine_similarity_matrix_csr
k = 5  # Example value for k
rmse = evaluate_predictions_csr(mapped_testing_data, ratings_csr_matrix, similarity_matrix, k)
print(f"RMSE: {rmse}")

RMSE: 1.0653943639743557


In [30]:
import numpy as np

def recommend_movies4(df, csr_user_item_matrix, similarity_matrix, user_ids, k, n):
    """
    Recommend top n movies for specified user(s) based on predicted ratings.
    Assumes 'UserIndex' and 'MovieIndex' are available in 'df'.
    """
    if not isinstance(user_ids, list):
        user_ids = [user_ids]

    recommendations = {}
    
    for user_id in user_ids:
        try:
            user_index = df[df['CustomerID'] == user_id]['UserIndex'].iloc[0]  # Assuming first matching UserIndex is representative
        except IndexError:
            print(f"User ID {user_id} not found.")
            continue

        unrated_movies_indices = np.setdiff1d(np.arange(csr_user_item_matrix.shape[1]),
                                               csr_user_item_matrix.getrow(user_index).nonzero()[1])
        
        predicted_ratings = []
        for movie_index in unrated_movies_indices:
            predicted_rating = predict_rating_with_similarity_matrix(csr_user_item_matrix, similarity_matrix, user_index, movie_index, k)
            movie_id = {v: k for k, v in movie_id_to_index.items()}[movie_index]  # Reverse lookup to get MovieID from MovieIndex
            predicted_ratings.append((movie_id, predicted_rating))

        top_n_recommendations = sorted(predicted_ratings, key=lambda x: x[1], reverse=True)[:n]
        recommendations[user_id] = top_n_recommendations

    return recommendations


In [31]:
main_mapped_data = map_ids_to_indices(strat_sample_df, user_id_to_index, movie_id_to_index)

In [32]:
# Extract the rows (user indices), columns (movie indices), and data (ratings) for the CSR matrix
rows = main_mapped_data['UserIndex'].values
cols = main_mapped_data['MovieIndex'].values
data = main_mapped_data['Rating'].values

# Determine the shape of the CSR matrix
# The shape is (max_user_index + 1, max_movie_index + 1) because indices start from 0
num_users = main_mapped_data['UserIndex'].max() + 1
num_movies = main_mapped_data['MovieIndex'].max() + 1

# Create the CSR matrix
Main_csr_matrix = csr_matrix((data, (rows, cols)), shape=(num_users, num_movies))

In [33]:
main_similarity_matrix = cosine_similarity(Main_csr_matrix, dense_output=False)

In [34]:
print(main_similarity_matrix.shape)

(85082, 85082)


In [35]:
main_mapped_data.head()

Unnamed: 0,MovieID,CustomerID,Rating,Date,YearOfRelease,MovieTitle,RatingYear,user_activity,AverageMovieAgeRated,user_average_rating,scaled_movie_age,UserIndex,MovieIndex
8464880,1693,1851346,1,2002-12-15,1998,Sphere,2002,1,2,3,1.301075,0,0
6311316,1220,1710563,4,2004-12-03,2004,Man on Fire,2004,1,2,3,1.129032,1,1
17205447,3316,17864,3,2004-01-07,2002,Bartleby,2004,2,1,3,1.215054,2,2
22300142,4227,1673744,4,2000-02-26,1997,The Full Monty,2000,1,2,4,1.258065,3,3
6146126,1202,1321440,4,2004-06-25,1983,National Lampoon's Vacation,2004,1,2,4,2.032258,4,4


In [50]:
# Assuming you have 'strat_sample_df', 'Main_csr_matrix', and 'main_similarity_matrix' prepared, along with 'user_id_to_index' and 'movie_id_to_index' mappings:

user_ids = ['1851346']  # Single user example
# user_ids = ['12345', '67890']  # Multiple users example
k = 5  # Number of similar users to consider
n = 5  # Number of recommendations to generate

# Generate recommendations
recommendations = recommend_movies4(main_mapped_data, Main_csr_matrix, main_similarity_matrix, user_ids, k, n)

# Display the recommendations
for user_id in user_ids:
    print(f"Recommendations for User ID {user_id}:")
    if user_id in recommendations:
        for movie_id, predicted_rating in recommendations[user_id]:
            print(f"\tMovie ID: {movie_id}, Predicted Rating: {predicted_rating}")
    else:
        print("\tNo recommendations available.")


Recommendations for User ID 1851346:
	Movie ID: 30, Predicted Rating: 5.000000000000001
	Movie ID: 3122, Predicted Rating: 5.000000000000001
	Movie ID: 1495, Predicted Rating: 5.000000000000001
	Movie ID: 2780, Predicted Rating: 5.0
	Movie ID: 2342, Predicted Rating: 5.0


## Classification

In [42]:
from collections import defaultdict

def predict_rating_with_classification(csr_user_item_matrix, similarity_matrix, user_index, movie_index, k):
    """
    Predict the rating for a given movie by a target user, based on the ratings of top-k similar users using classification (voting) logic.
    This function uses a pre-calculated similarity matrix.

    Parameters:
    - csr_user_item_matrix: CSR matrix representing the user-item matrix.
    - similarity_matrix: CSR matrix representing the similarity scores between users.
    - user_index: The index of the user for whom the rating is being predicted.
    - movie_index: The index of the movie for which the rating is being predicted.
    - k: Number of top similar users to consider for prediction.

    Returns:
    - Predicted rating for the movie by the target user.
    """
    # Step 1: Identify users who have rated the movie
    movie_rated_indices = csr_user_item_matrix[:, movie_index].nonzero()[0]
    
    # Check if user_index exists in the similarity matrix
    if user_index < 0 or user_index >= similarity_matrix.shape[0]:
        # If user_index does not exist in similarity matrix, return default prediction
        non_zero_ratings = csr_user_item_matrix[:, movie_index][csr_user_item_matrix[movie_index] != 0]
        predicted_rating = non_zero_ratings.mean() if len(non_zero_ratings) > 0 else np.nan # calculate average just concidering non zero ratings 
  
    # Step 2: Extract similarity scores for the target user with all other users
    user_similarities = similarity_matrix.getrow(user_index).toarray().flatten()
    
    # Step 3: Filter the similarities for users who have rated the movie
    filtered_similarities = user_similarities[movie_rated_indices]
    
    # Step 4: Get indices of top k similar users among those who have rated the movie
    top_k_indices = np.argsort(filtered_similarities)[-k:]
    top_k_users_indices = movie_rated_indices[top_k_indices]
    top_k_similarities = filtered_similarities[top_k_indices]

    # Retrieve ratings for the movie from these top-k similar users
    top_k_ratings = csr_user_item_matrix[top_k_users_indices, movie_index].toarray().flatten()
    
    # Create a dictionary to hold the count of votes for each rating
    rating_votes = defaultdict(int)
    
   # Calculate weights based on similarities and count votes for each rating
    for similarity, rating in zip(top_k_similarities, top_k_ratings):
        if rating in [1, 2, 3, 4, 5]:
            rating_votes[rating] += similarity

    # Find the rating with the highest sum of similarity weights
    predicted_rating = max(rating_votes, key=rating_votes.get, default=np.nan)
    
    # Use the overall average rating of the movie by all users as the default rating
    if np.isnan(predicted_rating):
        non_zero_ratings = csr_user_item_matrix[:, movie_id][csr_user_item_matrix[movie_id] != 0]
        predicted_rating = non_zero_ratings.mean() if len(non_zero_ratings) > 0 else np.nan # calculate average just concidering non zero ratings 

    return predicted_rating


In [47]:
user_index = 1  # Replace with actual user ID
movie_index = 100  # Replace with actual movie ID
K = 125  # Number of neighbors
predicted_rating = predict_rating_with_classification (Main_csr_matrix, main_similarity_matrix, user_index, movie_index, k)
print(f"Predicted rating for User ID {user_id} and Movie ID {movie_id} is: {predicted_rating} ")

Predicted rating for User ID 1 and Movie ID 1000 is: 3 


In [48]:
import numpy as np

def recommend_movies_classification(df, csr_user_item_matrix, similarity_matrix, user_ids, k, n):
    """
    Recommend top n movies for specified user(s) based on predicted ratings.
    Assumes 'UserIndex' and 'MovieIndex' are available in 'df'.
    """
    if not isinstance(user_ids, list):
        user_ids = [user_ids]

    recommendations = {}
    
    for user_id in user_ids:
        try:
            user_index = df[df['CustomerID'] == user_id]['UserIndex'].iloc[0]  # Assuming first matching UserIndex is representative
        except IndexError:
            print(f"User ID {user_id} not found.")
            continue

        unrated_movies_indices = np.setdiff1d(np.arange(csr_user_item_matrix.shape[1]),
                                               csr_user_item_matrix.getrow(user_index).nonzero()[1])
        
        predicted_ratings = []
        for movie_index in unrated_movies_indices:
            predicted_rating = predict_rating_with_classification(csr_user_item_matrix, similarity_matrix, user_index, movie_index, k)
            movie_id = {v: k for k, v in movie_id_to_index.items()}[movie_index]  # Reverse lookup to get MovieID from MovieIndex
            predicted_ratings.append((movie_id, predicted_rating))

        top_n_recommendations = sorted(predicted_ratings, key=lambda x: x[1], reverse=True)[:n]
        recommendations[user_id] = top_n_recommendations

    return recommendations


In [49]:
user_ids = ['1851346']  # Single user example
# user_ids = ['12345', '67890']  # Multiple users example
k = 5  # Number of similar users to consider
n = 5  # Number of recommendations to generate

# Generate recommendations
recommendations = recommend_movies_classification(main_mapped_data, Main_csr_matrix, main_similarity_matrix, user_ids, k, n)

# Display the recommendations
for user_id in user_ids:
    print(f"Recommendations for User ID {user_id}:")
    if user_id in recommendations:
        for movie_id, predicted_rating in recommendations[user_id]:
            print(f"\tMovie ID: {movie_id}, Predicted Rating: {predicted_rating}")
    else:
        print("\tNo recommendations available.")

Recommendations for User ID 1851346:
	Movie ID: 2780, Predicted Rating: 5
	Movie ID: 2342, Predicted Rating: 5
	Movie ID: 3925, Predicted Rating: 5
	Movie ID: 2782, Predicted Rating: 5
	Movie ID: 1759, Predicted Rating: 5
