In [34]:
import pandas as pd
import numpy as np
training_df = pd.read_csv("C:/Users/nafla/Downloads/movielens.csv")
training_df.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,num_genres,(no genres listed),Action,Adventure,Animation,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,1,4.0,2008-11-03 17:52:19,Toy Story (1995),5,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,1,5.0,1996-06-26 19:06:11,Toy Story (1995),5,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
2,7,1,4.0,2000-11-18 03:27:04,Toy Story (1995),5,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
3,10,1,3.0,2015-05-03 15:19:54,Toy Story (1995),5,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
4,12,1,5.0,1997-05-01 15:32:18,Toy Story (1995),5,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0


In [35]:
training_df.rename(columns={'userId': 'CustomerID', 'movieId': 'MovieID', 'rating': 'Rating'}, inplace=True)

In [36]:
training_df['CustomerID'] = training_df['CustomerID'].astype(str)
training_df['MovieID'] = training_df['MovieID'].astype(str)
training_df['Rating'] = pd.to_numeric(training_df['Rating'], errors='coerce')  # Converts to float, makes non-numeric as NaN

In [37]:
# Calculate quantiles for user activity and item popularity
user_activity_quantiles = training_df['CustomerID'].value_counts().quantile([0.25, 0.5, 0.75])
item_popularity_quantiles = training_df['MovieID'].value_counts().quantile([0.25, 0.5, 0.75])
print(user_activity_quantiles)
print(item_popularity_quantiles)

0.25    15.0
0.50    32.0
0.75    95.0
Name: CustomerID, dtype: float64
0.25    1.0
0.50    2.0
0.75    8.0
Name: MovieID, dtype: float64


In [38]:
num_sampled_rows = len(training_df)
print(f"Number of rows in the sampled DataFrame: {num_sampled_rows}")

Number of rows in the sampled DataFrame: 125351


# Splitting dataset to training, test, validation

In [39]:
from sklearn.model_selection import train_test_split



# Split the remaining data into training, testing, and validation sets
train, testing_data = train_test_split(training_df, test_size=0.2, random_state=42)
training_data , validation_data = train_test_split(train, test_size=0.2, random_state=42)



In [40]:
# Calculate the size of each split
training_size = training_data.shape[0]  # Number of rows in the training data
validation_size = validation_data.shape[0]  # Number of rows in the validation data
testing_size = testing_data.shape[0]  # Number of rows in the testing data

# Print the sizes
print(f"Training Data Size: {training_size}")
print(f"Validation Data Size: {validation_size}")
print(f"Testing Data Size: {testing_size}")

Training Data Size: 80224
Validation Data Size: 20056
Testing Data Size: 25071


In [41]:
# Assuming final_training_data, validation_data, and testing_data are your data splits

# Count unique MovieIDs in the final training data
unique_movies_training = training_data['MovieID'].nunique()

# Count unique MovieIDs in the validation data
unique_movies_validation = validation_data['MovieID'].nunique()

# Count unique MovieIDs in the testing data
unique_movies_testing = testing_data['MovieID'].nunique()

# Print the counts
print(f"Unique MovieIDs in Training Data: {unique_movies_training}")
print(f"Unique MovieIDs in Validation Data: {unique_movies_validation}")
print(f"Unique MovieIDs in Testing Data: {unique_movies_testing}")


Unique MovieIDs in Training Data: 9253
Unique MovieIDs in Validation Data: 5041
Unique MovieIDs in Testing Data: 5537


# Creating User - Item matrix

In [42]:
# Creating customer-movie matrix
user_item_matrix = training_data.pivot_table(index='CustomerID', columns='MovieID', values='Rating').fillna(0)

In [43]:
user_item_matrix.head()

MovieID,1,10,100,1000,100008,100044,100058,100083,100087,100106,...,99764,99809,99843,999,99910,99912,99917,99957,99964,99986
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
100,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1000,3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1001,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Define similarity function for each given user

In [44]:
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np

def calculate_cosine_similarity(user_item_matrix, target_user_ratings):
    """
    Calculate cosine similarity scores between a target user's ratings and all other users.
    
    Parameters:
    - user_item_matrix: DataFrame with users as rows, movies as columns, and ratings as values.
    - target_user_ratings: Series containing the target user's ratings, indexed by MovieID.
    
    Returns:
    A Series with user IDs as the index and the cosine similarity scores as the values.
    """
    # Ensure target_user_ratings is a DataFrame row for compatibility with cosine_similarity
    target_user_df = pd.DataFrame(target_user_ratings).T.fillna(0)
    
    # Align user_item_matrix with target_user_df to match columns (MovieIDs)
    aligned_user_item_matrix = user_item_matrix.reindex(columns=target_user_df.columns, fill_value=0)
    
    # Calculate cosine similarities
    similarities = cosine_similarity(aligned_user_item_matrix, target_user_df)
    
    # Flatten the similarities array and create a Series with user IDs as index
    similarities_series = pd.Series(similarities.flatten(), index=aligned_user_item_matrix.index)
    
    return similarities_series



In [45]:
import numpy as np
from scipy.stats import pearsonr

def calculate_pearson_similarity(user_item_matrix, target_user_id):
    # Ensure target_user_id is in the index to avoid key errors
    if target_user_id not in user_item_matrix.index:
        return "Target user ID not found in the user-item matrix."
    
    target_ratings = user_item_matrix.loc[target_user_id].values
    similarities = {}

    for user_id in user_item_matrix.index:
        if user_id == target_user_id:
            continue  # Skip similarity with the user themselves

        user_ratings = user_item_matrix.loc[user_id].values
        
        # Mask to select only common non-NaN ratings between the target user and the current user
        common_mask = ~np.isnan(target_ratings) & ~np.isnan(user_ratings)
        
        if np.sum(common_mask) < 2:  # Ensure there are at least two common ratings
            continue  # Skip this pair if not enough common ratings
        
        # Calculate Pearson correlation coefficient for common non-NaN ratings
        correlation, _ = pearsonr(target_ratings[common_mask], user_ratings[common_mask])
        
        # Store the correlation if it's a valid number
        if np.isfinite(correlation):
            similarities[user_id] = correlation
            
    return similarities




In [46]:
# Usage example:
# Assuming user_item_matrix is your DataFrame with 'CustomerID' as index
# and 'MovieID' as columns, and 'Rating' as values
target_user_id = '1007' # replace <specific_user_id> with an actual CustomerID
similarities = calculate_pearson_similarity(user_item_matrix, target_user_id)
print(similarities)

{'1': 0.026823024826183258, '10': 0.14656168456178567, '100': -0.003822105122114111, '1000': 0.08415905015609008, '1001': 0.07176290983881588, '1002': -0.0025997675890872428, '1003': 0.10172503617068897, '1004': -0.002750547116935017, '1005': 0.0834514100424833, '1006': -0.0010859033321657537, '1008': -0.0035531666425854254, '1009': 0.0395196118568979, '101': 0.012417851169330987, '1010': 0.09965337943081952, '1011': 0.04992725091142948, '1012': 0.07771710939668736, '1013': 0.049631102000288994, '1014': 0.026224085936373334, '1015': 0.17812769674942208, '1016': 0.06465190274216928, '1017': -0.002816319629863912, '1018': 0.09950066635371345, '1019': 0.13715601865320604, '102': 0.062008312623628666, '1020': 0.0905119365431614, '1021': -0.001085903332165754, '1022': 0.04538925164996234, '1023': -0.0034947538485176773, '1024': 0.0947006168763517, '1025': 0.022307117972269787, '1026': 0.06281662107155249, '1027': 0.02897653788074713, '1028': 0.09468983348730503, '1029': -0.00449946226504515

In [47]:
def calculate_manhattan_similarity(user_item_matrix, target_user_ratings):
    """
    Calculate user similarities using Manhattan distance, comparing target user's ratings
    with those in the user_item_matrix.
    
    Parameters:
    - user_item_matrix: DataFrame where rows are users, columns are items, and values are ratings (training data).
    - target_user_ratings: Series or dict containing the target user's movie ratings.
    
    Returns:
    A Series with user IDs as the index and the similarity scores as the values.
    """
    similarities = {}

    for user_id in user_item_matrix.index:
        user_ratings = user_item_matrix.loc[user_id]
        
        # Calculate distance only for movies both have rated
        common_movies = user_ratings.index.intersection(target_user_ratings.index)
        if not common_movies.empty:
            distance = np.nansum(np.abs(user_ratings[common_movies] - target_user_ratings[common_movies]))
            similarity = 1 / (1 + distance) if distance != 0 else 0
        else:
            similarity = 0  # No common movies means no similarity
        
        similarities[user_id] = similarity

    similarity_series = pd.Series(similarities, name="Similarity").sort_values(ascending=False)
    return similarity_series




# Predict ratings using similarities

In [48]:
import numpy as np
import pandas as pd

def predict_rating(user_item_matrix, target_user_ratings, movie_id, k, similarity_method):

    """
    Predict the rating for a given movie by a target user, based on the ratings of top-k similar users.
    
    Parameters:
    - user_item_matrix: DataFrame with users as rows, movies as columns, and ratings as values.
    - target_user_id: The ID of the user for whom the rating is being predicted.
    - movie_id: The ID of the movie for which the rating is being predicted.
    - k: Number of top similar users to consider for prediction.
    - similarity_method: Function to calculate similarity scores between users.
    
    Returns:
    - Predicted rating for the movie by the target user.
    """

    
    # Calculate similarity scores between the target user and all others
    similarities = similarity_method(user_item_matrix, target_user_ratings)
    
   # Filter users who have rated the movie
    users_who_rated_movie = user_item_matrix.index[user_item_matrix[movie_id].notnull()]

    # Retain similarity scores for users who have rated the movie
    similarities_filtered = similarities.loc[users_who_rated_movie]
    
    # Filter top-k similar users from those who have rated the movie
    top_k_users = similarities_filtered.nlargest(k).index
    
    # Retrieve ratings for the movie from these top-k similar users
    top_k_ratings = user_item_matrix.loc[top_k_users, movie_id]
    
 # Calculate weighted average rating
    if not top_k_ratings.isnull().all():
        weighted_ratings = top_k_ratings * similarities.loc[top_k_users]
        predicted_rating = weighted_ratings.sum() / similarities.loc[top_k_users].sum()
    else:
        # Use the average rating for the movie if available
        if user_item_matrix[movie_id].notnull().any():
            predicted_rating = user_item_matrix[movie_id].mean()
        else:
            # Default rating if the movie has not been rated by anyone
            predicted_rating = 5
            
    return predicted_rating




In [49]:
validation_data.head()

Unnamed: 0,CustomerID,MovieID,Rating,timestamp,title,num_genres,(no genres listed),Action,Adventure,Animation,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
30100,487,1354,4.5,2013-11-13 00:39:26,Breaking the Waves (1996),2,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
13084,5,318,4.0,2002-08-15 05:28:00,"Shawshank Redemption, The (1994)",2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
99128,1184,1237,4.0,2015-10-29 06:17:09,"Seventh Seal, The (Sjunde inseglet, Det) (1957)",1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7734,207,2,5.0,1996-10-01 06:01:09,Jumanji (1995),3,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
51632,1184,7361,4.0,2015-10-29 06:40:10,Eternal Sunshine of the Spotless Mind (2004),3,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0


In [50]:
from sklearn.metrics import mean_squared_error
from math import sqrt

def evaluate_predictions(validation_data, user_item_matrix, k, similarity_method):
    """
    Evaluate the recommendation system by predicting ratings for each user-movie pair in the validation set
    and comparing the predictions to the actual ratings using RMSE.
    
    Parameters:
    - validation_data: DataFrame containing 'CustomerID', 'MovieID', and 'Rating'.
    - user_item_matrix: DataFrame representing the user-item matrix from the training set.
    - k: The number of top similar users to consider when making predictions.
    - similarity_method: The function to calculate similarity scores between users.
    
    Returns:
    - rmse: The root mean square error of the predicted ratings against the actual ratings.
    - movie_evaluated_count: The number of unique movies for which predictions were made.
    """
    actual_ratings = []
    predicted_ratings = []

    
    # Iterate over each row in the validation data
    # Inside evaluate_predictions, before calling predict_rating:
    for _, row in validation_data.iterrows():
        user_id = row['CustomerID']
        movie_id = row['MovieID']
        actual_rating = row['Rating']
    
    # Prepare target_user_ratings as a Series
    # This assumes you have a way to extract all ratings for user_id from validation_data
    # Here's a placeholder for how you might do this, replace with your actual logic
        target_user_ratings = validation_data[validation_data['CustomerID'] == user_id].set_index('MovieID')['Rating']
    
    # Now call predict_rating with target_user_ratings instead of target_user_id
    if movie_id in user_item_matrix.columns:  # Check if movie exists in training data
        predicted_rating = predict_rating(user_item_matrix, target_user_ratings, movie_id, k, similarity_method)
        actual_ratings.append(actual_rating)
        predicted_ratings.append(predicted_rating)
        
    
    # Calculate RMSE between actual and predicted ratings
    rmse = sqrt(mean_squared_error(actual_ratings, predicted_ratings))
    
    return rmse




In [51]:
# Usage example
k = 20  # Example value for k
rmse= evaluate_predictions(validation_data, user_item_matrix, k, calculate_cosine_similarity)
print(f"RMSE: {rmse}")


RMSE: 0.31270082854028936


In [52]:
# Define a range of k values to test
k_values = [5, 15,30, 70, 150, 200]

# Initialize a dictionary to store the RMSE for each k value
k_rmse_results = {}

# Loop over each k value
for k in k_values:
    # Evaluate the recommender system using the current k value
    rmse  = evaluate_predictions(validation_data, user_item_matrix, k, calculate_manhattan_similarity)
    
    # Store the RMSE in the dictionary
    k_rmse_results[k] = rmse
    
    # Print the result for the current k
    print(f"RMSE for k={k}: {rmse}")

# Identify the k value with the lowest RMSE
optimal_k = min(k_rmse_results, key=k_rmse_results.get)
optimal_rmse = k_rmse_results[optimal_k]

print(f"\nOptimal k value: {optimal_k} with RMSE: {optimal_rmse}")



RMSE for k=5: 0.31744765154224996
RMSE for k=15: 0.19818023628060177
RMSE for k=30: 0.13375837218055675
RMSE for k=70: 0.3762006118926805
RMSE for k=150: 0.4874362157404766
RMSE for k=200: 0.6050314486476456

Optimal k value: 30 with RMSE: 0.13375837218055675
