In [6]:
import pandas as pd
import numpy as np
# Read the prepared ratings CSV into a DataFrame and preview its first rows.
training_data_path = 'C:/Users/nafla/OneDrive/Documents/system development/Netflix/training_data.csv'
training_df = pd.read_csv(training_data_path)
training_df.head()

Unnamed: 0,MovieID,CustomerID,Rating,Date,YearOfRelease,RatingYear,MovieAge
0,1,1488844,3,2005-09-06,2003,2005,2
1,1,822109,5,2005-05-13,2003,2005,2
2,1,885013,4,2005-10-19,2003,2005,2
3,1,30878,4,2005-12-26,2003,2005,2
4,1,823519,3,2004-05-03,2003,2004,1


In [7]:
# Downcast the ID and rating columns to narrower dtypes to cut memory use.
# Columns are cast one at a time so the whole frame is never copied at once.
compact_dtypes = {'CustomerID': 'int32', 'MovieID': 'int16', 'Rating': 'float32'}
for column_name, target_dtype in compact_dtypes.items():
    training_df[column_name] = training_df[column_name].astype(target_dtype)

In [8]:
# Stratified sampling to maintain the distribution of ratings:
# draw the same fraction of rows from every 'Rating' group.

# Define the fraction of each group to sample
frac = 0.1

# GroupBy.sample draws within each group and keeps every column, including
# 'Rating', which later cells read. Routing the draw through
# groupby(...).apply(...) instead triggers pandas' deprecation of apply
# operating on the grouping column, after which 'Rating' would no longer be
# part of the result.
# NOTE: one seeded generator is shared across the groups here, so the exact
# rows drawn differ from reseeding with 42 inside each group; the sample is
# still reproducible and still takes `frac` of each rating level.
strat_sample_df = training_df.groupby('Rating').sample(frac=frac, random_state=42)



SciPy Sparse Matrix: Convert the sampled DataFrame to a sparse matrix using SciPy, which is much more memory-efficient than a dense array for sparse rating data.

In [10]:
from scipy.sparse import csr_matrix

# Build the user x item ratings matrix in CSR form.
# Rows are addressed by raw CustomerID and columns by raw MovieID; no shape is
# passed, so SciPy infers it from the largest IDs present in the sample.
rating_values = strat_sample_df['Rating']
customer_rows = strat_sample_df['CustomerID']
movie_cols = strat_sample_df['MovieID']
user_item_matrix_sparse = csr_matrix((rating_values, (customer_rows, movie_cols)))



Calculate User Similarities
We use a Manhattan similarity, built on SciPy's `cityblock` distance, to calculate similarities between all pairs of users. Because the Manhattan distance cannot handle missing values, missing ratings are filled with 0s (implying an "unknown" rather than a "bad" rating).

In [13]:
from scipy.spatial.distance import cityblock

def manhattan_similarity(user1_ratings, user2_ratings):
    """Score how alike two users are from the Manhattan distance of their ratings.

    Only labels present in the index of both inputs are compared.

    Parameters
    ----------
    user1_ratings, user2_ratings
        Label-indexed rating vectors (the code relies on ``.index`` and on
        selection by an index of labels, as a pandas Series provides).

    Returns
    -------
    ``0`` when the two inputs share no labels; otherwise
    ``1 / (1 + distance)``, which is 1 for identical shared ratings and
    shrinks towards 0 as the distance grows.
    """
    shared_items = user1_ratings.index.intersection(user2_ratings.index)
    if shared_items.empty:
        return 0

    distance = cityblock(user1_ratings[shared_items], user2_ratings[shared_items])
    # Inverse distance turns "small distance" into "high similarity".
    return 1 / (1 + distance)


In [14]:
# find the k nearest neighbors for a given user who have rated a specific item:

def find_k_nearest_neighbors(user_id, item_id, k, user_item_matrix, similarity_matrix):
    """Find the users most similar to ``user_id`` among those who rated ``item_id``.

    Parameters
    ----------
    user_id
        Row index of the target user in both matrices.
    item_id
        Column index of the item in ``user_item_matrix``.
    k
        Maximum number of neighbors to return. A value of 0 or less yields
        no neighbors.
    user_item_matrix
        Sparse user x item matrix supporting ``[:, item_id].toarray()``;
        an entry greater than 0 counts as "rated".
    similarity_matrix
        User x user similarities, indexed as ``[user_id, array_of_users]``.
        Assumed to be a dense NumPy array -- TODO confirm against the caller.

    Returns
    -------
    (neighbor_ids, neighbor_similarities)
        Two aligned arrays holding at most ``k`` entries, ordered from the
        least to the most similar of the selected neighbors.

    Notes
    -----
    The target user is not filtered out: if ``user_id`` has rated the item,
    it can appear among its own neighbors.
    """
    # Indices of users who have rated the item
    item_ratings = user_item_matrix[:, item_id].toarray().ravel()
    users_who_rated_item = np.where(item_ratings > 0)[0]

    # Similarities between the target user and users who rated the item
    similarities = similarity_matrix[user_id, users_who_rated_item]

    # A slice of [-0:] selects everything, so a non-positive k has to be
    # handled explicitly to really mean "no neighbors".
    if k <= 0:
        return users_who_rated_item[:0], similarities[:0]

    # Positions of the k largest similarities (argsort is ascending)
    k_nearest_neighbors_indices = np.argsort(similarities)[-k:]

    # Return the actual user IDs and their similarities
    return users_who_rated_item[k_nearest_neighbors_indices], similarities[k_nearest_neighbors_indices]


In [15]:
# Predict the rating for the target item by taking the weighted average of the ratings from these neighbors

def predict_rating(user_id, item_id, k, user_item_matrix, similarity_matrix):
    """Predict the rating ``user_id`` would give ``item_id``.

    The prediction is the similarity-weighted average of the ratings given to
    the item by the neighbors returned from ``find_k_nearest_neighbors``.

    Parameters
    ----------
    user_id
        Row index of the target user.
    item_id
        Column index of the item in ``user_item_matrix``.
    k
        Number of neighbors to use.
    user_item_matrix
        Sparse user x item ratings matrix; entries greater than 0 are ratings.
    similarity_matrix
        User x user similarity matrix, passed through to the neighbor search.

    Returns
    -------
    The weighted average when the neighbor similarities sum to more than 0.
    Otherwise the mean of all ratings greater than 0 for the item, or NaN
    when the item has no such rating.
    """
    neighbors_indices, similarities = find_k_nearest_neighbors(
        user_id, item_id, k, user_item_matrix, similarity_matrix
    )

    similarity_total = similarities.sum()
    if similarity_total > 0:
        # Ratings from neighbors for the item, aligned with `similarities`
        neighbor_ratings = user_item_matrix[neighbors_indices, item_id].toarray().ravel()
        return np.dot(neighbor_ratings, similarities) / similarity_total

    # Fallback to mean item rating if no similarities. The item column is
    # densified once and reused for both the mask and the values.
    item_ratings = user_item_matrix[:, item_id].toarray().ravel()
    observed_ratings = item_ratings[item_ratings > 0]
    if observed_ratings.size == 0:
        # No rating to average: return NaN directly rather than letting
        # np.mean warn about an empty selection.
        return np.nan
    return np.mean(observed_ratings)
