In [1]:
import pandas as pd
import numpy as np
training_df = pd.read_csv('C:/Users/nafla/OneDrive/Documents/system development/Netflix/training_data.csv')
training_df.head()

Unnamed: 0,MovieID,CustomerID,Rating,Date,YearOfRelease,RatingYear,MovieAge
0,1,1488844,3,2005-09-06,2003,2005,2
1,1,822109,5,2005-05-13,2003,2005,2
2,1,885013,4,2005-10-19,2003,2005,2
3,1,30878,4,2005-12-26,2003,2005,2
4,1,823519,3,2004-05-03,2003,2004,1


In [2]:
# Calculate quantiles for user activity and item popularity
user_activity_quantiles = training_df['CustomerID'].value_counts().quantile([0.25, 0.5, 0.75])
item_popularity_quantiles = training_df['MovieID'].value_counts().quantile([0.25, 0.5, 0.75])
print(user_activity_quantiles)
print(item_popularity_quantiles)

0.25     8.0
0.50    24.0
0.75    64.0
Name: CustomerID, dtype: float64
0.25     192.0
0.50     552.5
0.75    2539.0
Name: MovieID, dtype: float64


In [3]:
training_df['CustomerID'] = training_df['CustomerID'].astype(str)
training_df['MovieID'] = training_df['MovieID'].astype(str)
training_df['Rating'] = pd.to_numeric(training_df['Rating'], errors='coerce')  # Converts to float, makes non-numeric as NaN

# Stratified Sampling Method 

To create a representative sample of our dataset, we employ a stratified sampling method that accounts for three key dimensions: Rating Distribution, User Activity, and Item Popularity. This approach ensures our sample maintains the diversity and characteristics of the entire dataset, facilitating more reliable model training and evaluation.

- User Activity is quantified by the number of ratings a user has provided.
- Item Popularity reflects the number of ratings an item has received.

Finally, We combine User Activity, Item Popularity, and Rating into a composite stratification key for each record. This multi-dimensional key ensures our sampling process considers the distribution across all three axes.

In [4]:
# Assign each user and item to a bin based on the quantiles
training_df['UserActivityBin'] = pd.qcut(training_df.groupby('CustomerID')['Rating'].transform('size'), 
                                q=[0, .25, .5, .75, 1], labels=['low', 'medium', 'medium-high', 'high'])

training_df['ItemPopularityBin'] = pd.qcut(training_df.groupby('MovieID')['Rating'].transform('size'), 
                                  q=[0, .25, .5, .75, 1], labels=['low', 'medium', 'medium-high', 'high'])

# Combine these with Rating to create a stratification key
training_df['Strata'] = training_df['UserActivityBin'].astype(str) + "_" + training_df['ItemPopularityBin'].astype(str) + "_" + training_df['Rating'].astype(str)

# Perform stratified sampling
# we use groupby and  frac to specify a fraction of each strata and in case number of rows is less that 10 it takes all rows
strat_sample_df = training_df.groupby('Strata').apply(lambda x: x.sample(frac=0.0001 if len(x) > 10 else len(x)/len(x))).reset_index(drop=True)


In [5]:
num_sampled_rows = len(strat_sample_df)
print(f"Number of rows in the sampled DataFrame: {num_sampled_rows}")

Number of rows in the sampled DataFrame: 2405


# Splitting dataset to training, test, validation

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Assuming strat_sample_df is your dataframe and it has a 'CustomerID' column

# Step 1: Group by 'CustomerID'
groups = strat_sample_df.groupby('CustomerID')

# Step 2: Split the grouped data into lists of groups for each dataset
# Convert the GroupBy object to a list of (name, group) tuples
groups = list(groups)

# Shuffle the groups to randomize before splitting (optional, but recommended)
import random
random.seed(42)  # Ensure reproducibility
random.shuffle(groups)

# Calculate split sizes
total_groups = len(groups)
training_size = int(total_groups * 0.6)
validation_size = int(total_groups * 0.2)
# Testing size is implied to be the rest

# Split the groups
training_groups = groups[:training_size]
validation_groups = groups[training_size:(training_size + validation_size)]
testing_groups = groups[(training_size + validation_size):]

# Step 3: Concatenate the records within each split to form the final datasets
training_data = pd.concat([group for _, group in training_groups])
validation_data = pd.concat([group for _, group in validation_groups])
testing_data = pd.concat([group for _, group in testing_groups])

# Now you have training_data, validation_data, and testing_data with no CustomerID overlap


In [7]:
# Calculate the size of each split
training_size = training_data.shape[0]  # Number of rows in the training data
validation_size = validation_data.shape[0]  # Number of rows in the validation data
testing_size = testing_data.shape[0]  # Number of rows in the testing data

# Print the sizes
print(f"Training Data Size: {training_size}")
print(f"Validation Data Size: {validation_size}")
print(f"Testing Data Size: {testing_size}")

Training Data Size: 1439
Validation Data Size: 485
Testing Data Size: 481


In [8]:
# Creating customer-movie matrix
user_item_matrix = training_data.pivot_table(index='CustomerID', columns='MovieID', values='Rating')

In [13]:
user_item_matrix.head()

MovieID,10,1020,1026,1027,1035,104,1043,1046,1060,1061,...,962,963,964,97,98,980,983,985,989,990
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10007,,,,,,,,,,,...,,,,,,,,,,
1004258,,,,,,,,,,,...,,,,,,,,,,
1005657,,,,,,,,,,,...,,,,,,,,,,
1006457,,,,,,,,,,,...,,,,,,,,,,
100864,,,,,,,,,,,...,,,,,,,,,,


In [28]:
import numpy as np

def custom_cosine_similarity(target_vector, other_vector):
    # Mask for common non-NaN ratings
    common_mask = ~np.isnan(target_vector) & ~np.isnan(other_vector)
    if not np.any(common_mask):
        return 0  # No common ratings
    
    # Extracting common ratings
    target_common = target_vector[common_mask]
    other_common = other_vector[common_mask]
    
    # Compute dot product and norms of common ratings
    dot_product = np.dot(target_common, other_common)
    target_norm = np.linalg.norm(target_common)
    other_norm = np.linalg.norm(other_common)
    
    # Calculate cosine similarity
    if target_norm == 0 or other_norm == 0:
        return 0  # Avoid division by zero
    similarity = dot_product / (target_norm * other_norm)
    return similarity

def calculate_all_similarities(user_item_matrix, user_id):
    # Ensure we're working with raw values for manual handling
    matrix_values = user_item_matrix.values
    user_index = user_item_matrix.index.get_loc(user_id)
    target_user_ratings = matrix_values[user_index, :]
    
    similarities = []
    for i, other_user_ratings in enumerate(matrix_values):
        if i == user_index:
            continue  # Skip comparing user to themselves
        similarity = custom_cosine_similarity(target_user_ratings, other_user_ratings)
        if similarity != 0:  # Collect non-zero similarities
            similarities.append(similarity)
    
    return similarities if similarities else 0



In [54]:
user_id = '1005657'
similarities = calculate_all_similarities(user_item_matrix, user_id)

# Print or process the similarities as needed
print(similarities)

[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]


In [48]:
import numpy as np
from scipy.stats import pearsonr

def calculate_pearson_similarity(user_item_matrix, target_user_id):
    # Ensure target_user_id is in the index to avoid key errors
    if target_user_id not in user_item_matrix.index:
        return "Target user ID not found in the user-item matrix."
    
    target_ratings = user_item_matrix.loc[target_user_id].values
    similarities = {}

    for user_id in user_item_matrix.index:
        if user_id == target_user_id:
            continue  # Skip similarity with the user themselves

        user_ratings = user_item_matrix.loc[user_id].values
        
        # Mask to select only common non-NaN ratings between the target user and the current user
        common_mask = ~np.isnan(target_ratings) & ~np.isnan(user_ratings)
        
        if np.sum(common_mask) < 2:  # Ensure there are at least two common ratings
            continue  # Skip this pair if not enough common ratings
        
        # Calculate Pearson correlation coefficient for common non-NaN ratings
        correlation, _ = pearsonr(target_ratings[common_mask], user_ratings[common_mask])
        
        # Store the correlation if it's a valid number
        if np.isfinite(correlation):
            similarities[user_id] = correlation
            
    return similarities




In [57]:
# Usage example:
# Assuming user_item_matrix is your DataFrame with 'CustomerID' as index
# and 'MovieID' as columns, and 'Rating' as values
target_user_id = '100864' # replace <specific_user_id> with an actual CustomerID
similarities = calculate_pearson_similarity(user_item_matrix, target_user_id)
print(similarities)

{}


In [58]:
import numpy as np
import pandas as pd

def calculate_manhattan_similarity(user_item_matrix, target_user_id):
    """
    Calculate user similarities using Manhattan distance.
    
    Parameters:
    - user_item_matrix: A DataFrame where rows are users, columns are items, and values are ratings.
    - target_user_id: The user ID for which to calculate similarities.
    
    Returns:
    A Series with user IDs as the index and the similarity scores as the values.
    """
    # Ensure target_user_id is a string, matching the user IDs in your user_item_matrix index
    target_user_id = str(target_user_id)
    
    # Check if the target_user_id exists in the user_item_matrix
    if target_user_id not in user_item_matrix.index:
        return "Target user ID not found in the user-item matrix."
    
    # Retrieve the target user's ratings
    target_user_ratings = user_item_matrix.loc[target_user_id]
    
    # Initialize a dictionary to store the similarity scores
    similarities = {}
    
    # Iterate over all users in the matrix to compute Manhattan distance to the target user
    for user_id in user_item_matrix.index:
        if user_id == target_user_id:
            continue  # Skip the target user
        
        # Compute the Manhattan distance
        user_ratings = user_item_matrix.loc[user_id]
        distance = np.nansum(np.abs(target_user_ratings - user_ratings))
        
        # Convert distance to similarity, avoiding division by zero
        similarity = 1 / (1 + distance) if distance != 0 else 0
        similarities[user_id] = similarity
    
    # Convert the similarities dictionary to a pandas Series for easy handling and return
    similarity_series = pd.Series(similarities, name="Similarity")
    
    return similarity_series.sort_values(ascending=False)




In [60]:
# Example usage:
target_user_id = '100864'  # Replace with the actual CustomerID as a string
similarities = calculate_manhattan_similarity(user_item_matrix, target_user_id)
print(similarities)

2519568    0.333333
10007      0.000000
257913     0.000000
2587797    0.000000
2587481    0.000000
             ...   
1792108    0.000000
1791361    0.000000
1791124    0.000000
1790119    0.000000
99865      0.000000
Name: Similarity, Length: 1428, dtype: float64


In [61]:
import numpy as np
import pandas as pd

def predict_rating(user_item_matrix, target_user_id, movie_id, k, similarity_method):
    """
    Predict the rating for a given movie by a target user, based on the ratings of top-k similar users.
    
    Parameters:
    - user_item_matrix: DataFrame with users as rows, movies as columns, and ratings as values.
    - target_user_id: The ID of the user for whom the rating is being predicted.
    - movie_id: The ID of the movie for which the rating is being predicted.
    - k: Number of top similar users to consider for prediction.
    - similarity_method: Function to calculate similarity scores between users.
    
    Returns:
    - Predicted rating for the movie by the target user.
    """
    # Calculate similarity scores between the target user and all others
    similarities = similarity_method(user_item_matrix, target_user_id)
    
    # Filter top-k similar users
    top_k_users = similarities.nlargest(k+1).drop(target_user_id, errors='ignore').index
    
    # Check if the movie has been rated by top-k users
    top_k_ratings = user_item_matrix.loc[top_k_users, movie_id]
    top_k_similarities = similarities[top_k_users]
    
    # Calculate weighted average rating
    if not top_k_ratings.isnull().all():
        weighted_ratings = top_k_ratings * top_k_similarities
        predicted_rating = weighted_ratings.sum() / top_k_similarities[top_k_ratings.notnull()].sum()
    else:
        # Use the average rating for the movie if available
        if user_item_matrix[movie_id].notnull().any():
            predicted_rating = user_item_matrix[movie_id].mean()
        else:
            # Default rating if the movie has not been rated by anyone
            predicted_rating = 3
            
    return predicted_rating




In [63]:
# Example usage:
# Ensure you've defined a similarity method like calculate_manhattan_similarity, calculate_pearson_similarity, etc.
target_user_id = '100864'  # replace with actual user ID as a string
movie_id = '1020'  # replace with actual movie ID
k = 5  # Number of top similar users to consider
predicted_rating = predict_rating(user_item_matrix, target_user_id, movie_id, k, calculate_manhattan_similarity)  # Replace similarity method as needed
print(f"Predicted Rating: {predicted_rating}")

Predicted Rating: 3.0


In [68]:
from sklearn.metrics import mean_squared_error
from math import sqrt

def evaluate_recommender(user_item_matrix, validation_data, k, similarity_method):
    """
    Evaluate the recommender system by calculating RMSE on the validation dataset.
    
    Parameters:
    - user_item_matrix: DataFrame with users as rows, movies as columns, and ratings as values (training data).
    - validation_data: DataFrame with columns ['CustomerID', 'MovieID', 'Rating'] for validation.
    - k: Number of top similar users to consider for predicting ratings.
    - similarity_method: Function to calculate similarity scores between users.
    
    Returns:
    - rmse: The root mean square error of the predicted ratings against the actual ratings in the validation dataset.
    """
    actual_ratings = []
    predicted_ratings = []
    
    for index, row in validation_data.iterrows():
        user_id, movie_id, actual_rating = row['CustomerID'], row['MovieID'], row['Rating']
        predicted_rating = predict_rating(user_item_matrix, user_id, movie_id, k, similarity_method)
        
        actual_ratings.append(actual_rating)
        predicted_ratings.append(predicted_rating)
    
    # Calculate RMSE
    rmse = sqrt(mean_squared_error(actual_ratings, predicted_ratings))
    return rmse




In [69]:
# Example usage:.
k = 5  # Number of top similar users to consider
rmse = evaluate_recommender(user_item_matrix, validation_data, k, calculate_manhattan_similarity)  # Replace similarity method as needed
print(f"RMSE: {rmse}")

AttributeError: 'str' object has no attribute 'nlargest'