In [34]:
import pandas as pd
import numpy as np
training_df = pd.read_csv("C:/Users/nafla/Downloads/movielens.csv")
training_df.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,num_genres,(no genres listed),Action,Adventure,Animation,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,1,4.0,2008-11-03 17:52:19,Toy Story (1995),5,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,1,5.0,1996-06-26 19:06:11,Toy Story (1995),5,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
2,7,1,4.0,2000-11-18 03:27:04,Toy Story (1995),5,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
3,10,1,3.0,2015-05-03 15:19:54,Toy Story (1995),5,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
4,12,1,5.0,1997-05-01 15:32:18,Toy Story (1995),5,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0


In [35]:
training_df.rename(columns={'userId': 'CustomerID', 'movieId': 'MovieID', 'rating': 'Rating'}, inplace=True)

In [36]:
training_df['CustomerID'] = training_df['CustomerID'].astype(str)
training_df['MovieID'] = training_df['MovieID'].astype(str)
training_df['Rating'] = pd.to_numeric(training_df['Rating'], errors='coerce')  # Converts to float, makes non-numeric as NaN

In [37]:
# Calculate quantiles for user activity and item popularity
user_activity_quantiles = training_df['CustomerID'].value_counts().quantile([0.25, 0.5, 0.75])
item_popularity_quantiles = training_df['MovieID'].value_counts().quantile([0.25, 0.5, 0.75])
print(user_activity_quantiles)
print(item_popularity_quantiles)

0.25    15.0
0.50    32.0
0.75    95.0
Name: CustomerID, dtype: float64
0.25    1.0
0.50    2.0
0.75    8.0
Name: MovieID, dtype: float64


In [38]:
num_sampled_rows = len(training_df)
print(f"Number of rows in the sampled DataFrame: {num_sampled_rows}")

Number of rows in the sampled DataFrame: 125351


# Splitting dataset to training, test, validation

In [39]:
from sklearn.model_selection import train_test_split

# Split the remaining data into training, testing, and validation sets
train, testing_data = train_test_split(training_df, test_size=0.2, random_state=42)
training_data , validation_data = train_test_split(train, test_size=0.2, random_state=42)



In [40]:
# Calculate the size of each split
training_size = training_data.shape[0]  # Number of rows in the training data
validation_size = validation_data.shape[0]  # Number of rows in the validation data
testing_size = testing_data.shape[0]  # Number of rows in the testing data

# Print the sizes
print(f"Training Data Size: {training_size}")
print(f"Validation Data Size: {validation_size}")
print(f"Testing Data Size: {testing_size}")

Training Data Size: 80224
Validation Data Size: 20056
Testing Data Size: 25071


In [41]:
# Assuming final_training_data, validation_data, and testing_data are your data splits

# Count unique MovieIDs in the final training data
unique_movies_training = training_data['MovieID'].nunique()

# Count unique MovieIDs in the validation data
unique_movies_validation = validation_data['MovieID'].nunique()

# Count unique MovieIDs in the testing data
unique_movies_testing = testing_data['MovieID'].nunique()

# Print the counts
print(f"Unique MovieIDs in Training Data: {unique_movies_training}")
print(f"Unique MovieIDs in Validation Data: {unique_movies_validation}")
print(f"Unique MovieIDs in Testing Data: {unique_movies_testing}")


Unique MovieIDs in Training Data: 9253
Unique MovieIDs in Validation Data: 5041
Unique MovieIDs in Testing Data: 5537


# Creating User - Item matrix

In [42]:
# Creating customer-movie matrix
user_item_matrix = training_data.pivot_table(index='CustomerID', columns='MovieID', values='Rating').fillna(0)

In [43]:
user_item_matrix.head()

MovieID,1,10,100,1000,100008,100044,100058,100083,100087,100106,...,99764,99809,99843,999,99910,99912,99917,99957,99964,99986
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1000,3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1001,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Define similarity functions 

In [44]:
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np

def calculate_cosine_similarity(user_item_matrix, target_user_ratings):
    """
    Calculate cosine similarity scores between a target user's ratings and all other users.
    
    Parameters:
    - user_item_matrix: DataFrame with users as rows, movies as columns, and ratings as values.
    - target_user_ratings: Series containing the target user's ratings, indexed by MovieID.
    
    Returns:
    A Series with user IDs as the index and the cosine similarity scores as the values.
    """
    # Ensure target_user_ratings is a DataFrame row for compatibility with cosine_similarity
    target_user_df = pd.DataFrame(target_user_ratings).T.fillna(0)
    
    # Align user_item_matrix with target_user_df to match columns (MovieIDs)
    aligned_user_item_matrix = user_item_matrix.reindex(columns=target_user_df.columns, fill_value=0)
    
    # Calculate cosine similarities
    similarities = cosine_similarity(aligned_user_item_matrix, target_user_df)
    
    # Flatten the similarities array and create a Series with user IDs as index
    similarities_series = pd.Series(similarities.flatten(), index=aligned_user_item_matrix.index)
    
    return similarities_series



In [83]:

from scipy.stats import pearsonr

# defining function to calculate pearson correlation for pair of users
def calculate_pearson_similarity(user_item_matrix, target_user_ratings):
    """
    Calculate the Pearson correlation coefficient between a target user's ratings
    and all other users' ratings in the user-item matrix.
    
    Parameters:
    - user_item_matrix: DataFrame with users as rows, movies as columns, and ratings as values.
    - target_user_ratings: Series containing the target user's ratings, indexed by MovieID.
    
    Returns:
    - A Series with user IDs as the index and the Pearson correlation coefficients as the values.
    """
    similarities = {}
    
    user_item_matrix_replaced = user_item_matrix.replace(0, np.nan)

    # Loop through each user in the user-item matrix
    for user_id, user_ratings in user_item_matrix_replaced.iterrows():
        # Find common movies that both the target user and the current user have rated
        common_movies = user_ratings.dropna().index.intersection(target_user_ratings.dropna().index)
        
        if len(common_movies) < 2:
            # Assign a default similarity score of 0 for pairs with fewer than 2 common ratings assuming they don't have any similarity
            similarities[user_id] = 0 
            continue  # Skip further calculations for this pair
        correlation, _ = pearsonr(user_ratings.loc[common_movies], target_user_ratings.loc[common_movies])
        if np.isfinite(correlation):
            similarities[user_id] = correlation

    # Convert the similarities dictionary to a pandas Series
    similarity_series = pd.Series(similarities, name='Similarity').sort_values(ascending=False)
    
    return similarity_series


In [47]:
# defining function to calculate manhattan distance for ratings of each pair of users
def calculate_manhattan_similarity(user_item_matrix, target_user_ratings):
    """
    Calculate user similarities using Manhattan distance, comparing target user's ratings
    with those in the user_item_matrix.
    
    Parameters:
    - user_item_matrix: DataFrame where rows are users, columns are items, and values are ratings (training data).
    - target_user_ratings: Series or dict containing the target user's movie ratings.
    
    Returns:
    A Series with user IDs as the index and the similarity scores as the values.
    """
    similarities = {}

    for user_id in user_item_matrix.index:
        user_ratings = user_item_matrix.loc[user_id]
        
        # Calculate distance only for movies both have rated
        common_movies = user_ratings.index.intersection(target_user_ratings.index)
        if not common_movies.empty:
            distance = np.nansum(np.abs(user_ratings[common_movies] - target_user_ratings[common_movies]))
            similarity = 1 / (1 + distance) if distance != 0 else 0
        else:
            similarity = 0  # No common movies means no similarity
        
        similarities[user_id] = similarity

    similarity_series = pd.Series(similarities, name="Similarity").sort_values(ascending=False)
    return similarity_series




# Predict ratings using similarities

In [101]:
# defining a function to predict rating for each pair of user and movie based on their similarity
def predict_rating(user_item_matrix, target_user_ratings, movie_id, k, similarity_method):

    """
    Predict the rating for a given movie by a target user, based on the ratings of top-k similar users.
    
    Parameters:
    - user_item_matrix: DataFrame with users as rows, movies as columns, and ratings as values.
    - target_user_id: The ID of the user for whom the rating is being predicted.
    - movie_id: The ID of the movie for which the rating is being predicted.
    - k: Number of top similar users to consider for prediction.
    - similarity_method: Function to calculate similarity scores between users.
    
    Returns:
    - Predicted rating for the movie by the target user.
    """

    
    # Calculate similarity scores between the target user and all others
    similarities = similarity_method(user_item_matrix, target_user_ratings)
    
   # Filter users who have rated the movie
    users_who_rated_movie = user_item_matrix.index[user_item_matrix[movie_id].notnull()]
    users_with_similarity_scores = similarities.index.intersection(users_who_rated_movie)

    # Retain similarity scores for users who have rated the movie
    similarities_filtered = similarities.loc[users_with_similarity_scores]
    
    # Filter top-k similar users from those who have rated the movie
    top_k_users = similarities_filtered.nlargest(k).index
    
    # Retrieve ratings for the movie from these top-k similar users
    top_k_ratings = user_item_matrix.loc[top_k_users, movie_id]
    
 # Calculate weighted average rating
    if not top_k_ratings.isnull().all():
        weighted_ratings = top_k_ratings * similarities.loc[top_k_users]
        predicted_rating = weighted_ratings.sum() / similarities.loc[top_k_users].sum()
    else:
        # Use the average rating for the movie if available
        if user_item_matrix[movie_id].notnull().any():
            predicted_rating = user_item_matrix[movie_id].mean()
        else:
            # Default rating if the movie has not been rated by anyone
            predicted_rating = 2.5
            
    return predicted_rating




In [61]:
from sklearn.metrics import mean_squared_error
from math import sqrt

# defining a function to evaluate accuracy of prediced ratings for each pair of user and movie
def evaluate_predictions(validation_data, user_item_matrix, k, similarity_method):
    """
    Evaluate the recommendation system by predicting ratings for each user-movie pair in the validation set
    and comparing the predictions to the actual ratings using RMSE.
    
    Parameters:
    - validation_data: DataFrame containing 'CustomerID', 'MovieID', and 'Rating'.
    - user_item_matrix: DataFrame representing the user-item matrix from the training set.
    - k: The number of top similar users to consider when making predictions.
    - similarity_method: The function to calculate similarity scores between users.
    
    Returns:
    - rmse: The root mean square error of the predicted ratings against the actual ratings.
    - movie_evaluated_count: The number of unique movies for which predictions were made.
    """
    actual_ratings = []
    predicted_ratings = []

    user_ratings_map = validation_data.groupby('CustomerID').apply(lambda x: x.set_index('MovieID')['Rating'])

    # Iterate over each row in the validation data
    # Inside evaluate_predictions, before calling predict_rating:
    for _, row in validation_data.iterrows():
        user_id = row['CustomerID']
        movie_id = row['MovieID']
        actual_rating = row['Rating']
    
    # Prepare target_user_ratings as a Series
    # This assumes you have a way to extract all ratings for user_id from validation_data
    # Here's a placeholder for how you might do this, replace with your actual logic
        target_user_ratings = user_ratings_map.get(user_id, pd.Series(dtype='float64'))

    # Now call predict_rating with target_user_ratings instead of target_user_id
    if movie_id in user_item_matrix.columns and not target_user_ratings.empty:  # Check if movie exists in training data
        predicted_rating = predict_rating(user_item_matrix, target_user_ratings, movie_id, k, similarity_method)
        actual_ratings.append(actual_rating)
        predicted_ratings.append(predicted_rating)
        
    
    # Calculate RMSE between actual and predicted ratings
    rmse = sqrt(mean_squared_error(actual_ratings, predicted_ratings))
    
    return rmse




In [93]:

# Define a dictionary to hold your similarity methods for easy access
similarity_methods = {
    'pearson': calculate_pearson_similarity,
    'cosine': calculate_cosine_similarity,
    'manhattan': calculate_manhattan_similarity  
}

# Define the range of k values you want to test
k_values = range(5, 300, 20)

# Placeholder for storing grid search results
grid_search_results = []

# Perform grid search
for k in k_values:
    for method_name, method_function in similarity_methods.items():
        # Evaluate the recommender system's performance for each combination of k and similarity method
        rmse = evaluate_predictions(validation_data, user_item_matrix, k, method_function)
        
        # Store the results
        grid_search_results.append({'method': method_name, 'k': k, 'rmse': rmse})
        
        # Optionally print the results for each iteration
        print(f"Evaluated {method_name} method with k={k}: RMSE = {rmse}")

# Find the best performing combination of k and similarity method based on RMSE
best_configuration = min(grid_search_results, key=lambda x: x['rmse'])

# Output the best combination found
print(f"Best Configuration: Method = {best_configuration['method']}, k = {best_configuration['k']}, RMSE = {best_configuration['rmse']}")




Evaluated pearson method with k=5: RMSE = 1.0
Evaluated cosine method with k=5: RMSE = 0.266099347709436
Evaluated manhattan method with k=5: RMSE = 0.31744765154224996




Evaluated pearson method with k=25: RMSE = 0.88
Evaluated cosine method with k=25: RMSE = 0.2544680613344197
Evaluated manhattan method with k=25: RMSE = 0.12164465837838923




Evaluated pearson method with k=45: RMSE = 0.6545487200724738
Evaluated cosine method with k=45: RMSE = 0.32094258461903447
Evaluated manhattan method with k=45: RMSE = 0.25736731649934164




Evaluated pearson method with k=65: RMSE = 0.552106540411931
Evaluated cosine method with k=65: RMSE = 0.37231580317879254
Evaluated manhattan method with k=65: RMSE = 0.3468774527642984




Evaluated pearson method with k=85: RMSE = 0.5453080590675472
Evaluated cosine method with k=85: RMSE = 0.3237758367520186
Evaluated manhattan method with k=85: RMSE = 0.40117386998915083




Evaluated pearson method with k=105: RMSE = 0.533404753725965
Evaluated cosine method with k=105: RMSE = 0.3684959649591414
Evaluated manhattan method with k=105: RMSE = 0.440401559041222




Evaluated pearson method with k=125: RMSE = 0.5314169890829787
Evaluated cosine method with k=125: RMSE = 0.4418894199833283
Evaluated manhattan method with k=125: RMSE = 0.4673793960297603




Evaluated pearson method with k=145: RMSE = 0.5314169890829785
Evaluated cosine method with k=145: RMSE = 0.4760509381998136
Evaluated manhattan method with k=145: RMSE = 0.4839459392009612




Evaluated pearson method with k=165: RMSE = 0.5314169890829787
Evaluated cosine method with k=165: RMSE = 0.50921925757295
Evaluated manhattan method with k=165: RMSE = 0.5298544943566895




Evaluated pearson method with k=185: RMSE = 0.5314169890829787
Evaluated cosine method with k=185: RMSE = 0.5491536480733257
Evaluated manhattan method with k=185: RMSE = 0.5760755542844112




Evaluated pearson method with k=205: RMSE = 0.5314169890829786
Evaluated cosine method with k=205: RMSE = 0.5753234027541527
Evaluated manhattan method with k=205: RMSE = 0.613823953169499




Evaluated pearson method with k=225: RMSE = 0.5314169890829787
Evaluated cosine method with k=225: RMSE = 0.6040526089176543
Evaluated manhattan method with k=225: RMSE = 0.6450854972192552




Evaluated pearson method with k=245: RMSE = 0.5314169890829787
Evaluated cosine method with k=245: RMSE = 0.6185319004984662
Evaluated manhattan method with k=245: RMSE = 0.6715049703881482




Evaluated pearson method with k=265: RMSE = 0.5314169890829787
Evaluated cosine method with k=265: RMSE = 0.6387312402118386
Evaluated manhattan method with k=265: RMSE = 0.6700150690837794




Evaluated pearson method with k=285: RMSE = 0.5314169890829785
Evaluated cosine method with k=285: RMSE = 0.639106138861109
Evaluated manhattan method with k=285: RMSE = 0.6911994795260131
Best Configuration: Method = manhattan, k = 25, RMSE = 0.12164465837838923


In [100]:
# Testing the model on unseen data
rmse_test = evaluate_predictions(testing_data, user_item_matrix, 25, calculate_manhattan_similarity)
print(f"RMSE on Testing Data: {rmse_test}")


RMSE on Testing Data: 2.3407007996073323


In [145]:
def recommend_movies_for_user(user_id, user_item_matrix, dataframe, N=10):
    user_id = str(user_id)  # Ensure user_id is treated as a string
    
    similarity_method = calculate_manhattan_similarity
    k = 25

    # Assuming the predict_rating function needs user ratings as input, not just the user ID
    target_user_ratings = user_item_matrix.loc[user_id]

    # Find all movies in the matrix to consider for rating predictions
    all_movies = user_item_matrix.columns

    predicted_ratings = {}
    for movie_id in all_movies:
        if pd.isna(target_user_ratings[movie_id]) or target_user_ratings[movie_id] == 0:
            # Predict the rating for movies that the user has not rated
            predicted_rating = predict_rating(user_item_matrix, target_user_ratings, movie_id, k, similarity_method)
            predicted_ratings[movie_id] = predicted_rating

    # Sort predicted ratings in descending order and select the top N
    top_n_movie_ids = sorted(predicted_ratings, key=predicted_ratings.get, reverse=True)[:N]

    # Fetch movie titles for the top N movies
    recommendations = [(movie_id, dataframe.loc[dataframe['MovieID'] == str(movie_id), 'title'].iloc[0]) for movie_id in top_n_movie_ids]

    return recommendations


In [154]:
# Creating customer-movie matrix
main_user_item_matrix = training_df.pivot_table(index='CustomerID', columns='MovieID', values='Rating').fillna(0)
main_user_item_matrix.index = main_user_item_matrix.index.astype(str)

In [125]:
main_user_item_matrix.head()

MovieID,1,10,100,1000,100008,100044,100046,100058,100083,100087,...,99809,99843,999,99906,99910,99912,99917,99957,99964,99986
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1000,3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1001,5.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [157]:
# Example usage
user_id = 100  # Example user ID
recommendations = recommend_movies_for_user (user_id, main_user_item_matrix, training_df, N=2)
print(f"Top 10 Recommendations for User {user_id}:\n", recommendations)


Top 10 Recommendations for User 100:
 [('1617', 'L.A. Confidential (1997)'), ('318', 'Shawshank Redemption, The (1994)')]


In [158]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import manhattan_distances

def create_similarity_matrix_manhattan(user_item_matrix):
    """
    Create a user-user similarity matrix using Manhattan distance.

    Parameters:
    - user_item_matrix: DataFrame representing the user-item matrix 
                        (users as rows, items as columns).

    Returns:
    - DataFrame representing the user-user similarity matrix.
    """
    # Replace NaN values with 0s for the distance calculation
    user_item_matrix_filled = user_item_matrix.fillna(0)
    
    # Calculate the Manhattan distances between users
    distances = manhattan_distances(user_item_matrix_filled, user_item_matrix_filled)
    
    # Convert distances to similarities. The +1 to avoid division by zero for identical users.
    similarities = 1 / (1 + distances)
    
    # Create a DataFrame for the similarity matrix
    similarity_matrix = pd.DataFrame(similarities, index=user_item_matrix.index, columns=user_item_matrix.index)
    
    return similarity_matrix



In [159]:
# Assuming `user_item_matrix` is your user-item DataFrame
similarity_matrix_manhattan = create_similarity_matrix_manhattan(main_user_item_matrix)

# Show the similarity matrix
print(similarity_matrix_manhattan.head())

CustomerID         1        10       100      1000      1001      1002  \
CustomerID                                                               
1           1.000000  0.001483  0.003205  0.002445  0.002010  0.003317   
10          0.001483  1.000000  0.001619  0.001606  0.001284  0.001631   
100         0.003205  0.001619  1.000000  0.003436  0.002789  0.009132   
1000        0.002445  0.001606  0.003436  1.000000  0.002153  0.003350   
1001        0.002010  0.001284  0.002789  0.002153  1.000000  0.003086   

CustomerID      1003      1004      1005      1006  ...       990       991  \
CustomerID                                          ...                       
1           0.002328  0.003160  0.003868  0.003868  ...  0.003263  0.002548   
10          0.001376  0.001610  0.001779  0.001754  ...  0.001629  0.001435   
100         0.003350  0.007663  0.013793  0.013793  ...  0.007782  0.004619   
1000        0.002307  0.003190  0.003992  0.003914  ...  0.003284  0.002721   
1001   

In [160]:
def recommend_movies(user_id, user_item_matrix, similarity_matrix, df, N=10):
    """
    Recommend top N movies for a given user using a pre-calculated similarity matrix.

    Parameters:
    - user_id: The ID of the user for whom to generate recommendations.
    - user_item_matrix: DataFrame representing the user-item matrix (users as rows, movies as columns).
    - similarity_matrix: DataFrame representing the pre-calculated similarities between users.
    - movie_titles: DataFrame or Series mapping MovieIDs to movie titles.
    - N: Number of movies to recommend.

    Returns:
    - A list of tuples with (MovieID, Movie Title, Predicted Rating) for the top N recommended movies.
    """
    # Ensure user_id is the correct type
    user_id = str(user_id)
    
    # Get the top 25 most similar users to the target user
    top_25_users = similarity_matrix.loc[user_id].sort_values(ascending=False).head(25).index
    
    # Predict ratings for movies the user hasn't seen
    predicted_ratings = {}
    for movie_id in user_item_matrix.columns:
        # Skip if the user has already rated this movie
        if not pd.isna(user_item_matrix.at[user_id, movie_id]) and user_item_matrix.at[user_id, movie_id] != 0:
            continue
        
        # Calculate the weighted average of ratings from the top 25 similar users
        total_weight = 0
        weighted_sum = 0
        for similar_user in top_25_users:
            # Check if the similar user has rated the movie
            if pd.isna(user_item_matrix.at[similar_user, movie_id]) or user_item_matrix.at[similar_user, movie_id] == 0:
                continue
            similarity_score = similarity_matrix.at[user_id, similar_user]
            rating = user_item_matrix.at[similar_user, movie_id]
            weighted_sum += similarity_score * rating
            total_weight += similarity_score
        
        # Predict the rating if there were any weights, otherwise default to 0
        predicted_rating = weighted_sum / total_weight if total_weight > 0 else 0
        predicted_ratings[movie_id] = predicted_rating
    
    # Sort the predicted ratings and select the top N
    top_n_recommendations = sorted(predicted_ratings.items(), key=lambda x: x[1], reverse=True)[:N]
    
    # Fetch the titles for the recommended movies
    recommendations = [(movie_id, df[movie_id], rating) for movie_id, rating in top_n_recommendations]
    
    return recommendations


In [170]:
# Step 3: Extract movie titles
movie_titles = training_df[['MovieID', 'title']].drop_duplicates().set_index('MovieID')['title']

# Step 4: Use the recommend function for a specific user
user_id = '100'  # replace 'some_user_id' with an actual CustomerID from your dataset

recommendations = recommend_movies(user_id, main_user_item_matrix, similarity_matrix_manhattan, movie_titles, N=20)

# Print the recommendations
for movie_id, title, rating in recommendations:
    print(f"{title} (MovieID: {movie_id}) - Predicted Rating: {rating:.2f}")

Perks of Being a Wallflower, The (2012) (MovieID: 96821) - Predicted Rating: 4.50
L.A. Confidential (1997) (MovieID: 1617) - Predicted Rating: 4.00
Die Hard: With a Vengeance (1995) (MovieID: 165) - Predicted Rating: 4.00
Dr. Dolittle (1998) (MovieID: 1911) - Predicted Rating: 4.00
Bug's Life, A (1998) (MovieID: 2355) - Predicted Rating: 4.00
Tank Girl (1995) (MovieID: 327) - Predicted Rating: 4.00
Carlito's Way (1993) (MovieID: 431) - Predicted Rating: 4.00
American Pie 2 (2001) (MovieID: 4718) - Predicted Rating: 4.00
Sabrina (1954) (MovieID: 915) - Predicted Rating: 3.50
Lethal Weapon 2 (1989) (MovieID: 2001) - Predicted Rating: 3.00
Dirty Dancing (1987) (MovieID: 1088) - Predicted Rating: 2.50
Dovlatov (2018) (MovieID: 184989) - Predicted Rating: 2.50
Shawshank Redemption, The (1994) (MovieID: 318) - Predicted Rating: 2.49
Star Wars: Episode IV - A New Hope (1977) (MovieID: 260) - Predicted Rating: 2.01
Deep Blue Sea (1999) (MovieID: 2722) - Predicted Rating: 2.00
Lara Croft: Tomb 