 Import Necessary Packages

In [1]:
import numpy as np
import pandas as pd

Set dataset directory

In [2]:
dataset_Diretory="./Datasets/ml-100k"
dataset_with_credit="./Datasets"

# 1.Data Preparing

 Get the Genres from the dataset

In [3]:
# Genres
genre_df = pd.read_csv(f'{dataset_Diretory}/u.genre', sep='|', encoding='latin-1')
genre_columns = ["unknown"] + list(genre_df[genre_df.columns[0]].values)
genre_df.head()

Unnamed: 0,unknown,0
0,Action,1
1,Adventure,2
2,Animation,3
3,Children's,4
4,Comedy,5


### Loading the Movies dataset

In [4]:
# Movie
movie_columns = ['movie_id', 'title', 'release_date', 'video_release_date', 'imdb_url']
movies_df = pd.read_csv(f'{dataset_with_credit}/movies_with_credits.csv', sep=',',
                     encoding='latin-1')
movie_columns = ['movie_id', 'title', 'release_date', 'video_release_date', 'imdb_url']
# movies_df = pd.read_csv(f'{dataset_Diretory}/u.item', sep='|', names=movie_columns+genre_columns,
#                      encoding='latin-1')

movies_df.head()

Unnamed: 0,movie_id,title,release_date,video_release_date,imdb_url,unknown,Action,Adventure,Animation,Children's,...,Jared Harris,Jonathan Rhys Meyers,Julie T. Wallace,Thomas Schlamme,Werner Herzog,Vittorio Mezzogiorno,Stefan Glowacz,Mathilda May,Al Waxman,Gunilla Karlzen
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Loading ratings dataset

In [5]:
#Ratings
ratings_columns = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings_df = pd.read_csv(f'{dataset_Diretory}/u.data', sep='\t', names=ratings_columns)
ratings_df.drop( "unix_timestamp", inplace = True, axis = 1 ) 
ratings_df.head(2)

Unnamed: 0,user_id,movie_id,rating
0,196,242,3
1,186,302,3


### Remove test data from ratings dataframe


In [6]:
# Group by user_id and take 2 random samples per user
test_ratings=ratings_df.groupby('user_id', group_keys=False).apply(lambda x: x.sample(n=2, random_state=42))

# Drop those sampled rows from the original dataframe
remaining_df = ratings_df.drop(test_ratings.index)

# Copy the dataframe into ratings_df
ratings_df=remaining_df.copy()

  test_ratings=ratings_df.groupby('user_id', group_keys=False).apply(lambda x: x.sample(n=2, random_state=42))


# 2.content-based approach

### Making user profiles 
At this stage we are making the profiles to understand user's taste in movies.
In order to do that we are multiplying the movie features with the rating user has provided and 
summing up all the features for each user to get a score.

In [7]:
# Step 1: Merge the ratings and movies dataframes on 'movie_id'
merged_df = pd.merge(ratings_df, movies_df, on='movie_id')

# Step 2: Identify the genre & artists columns (columns from index 5 onwards in movies_df)
genre_and_artists_cols = movies_df.columns[5:]

# Multiply each genre column by the 'rating' to get weighted genres
merged_df[genre_and_artists_cols] = merged_df[genre_and_artists_cols].mul(merged_df['rating'], axis=0)

# Step 3: Group by 'user_id' and sum the genre columns
user_genre_scores = merged_df.groupby('user_id')[genre_and_artists_cols].sum().reset_index()

# Resulting dataset
user_genre_scores.head()


Unnamed: 0,user_id,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,...,Jared Harris,Jonathan Rhys Meyers,Julie T. Wallace,Thomas Schlamme,Werner Herzog,Vittorio Mezzogiorno,Stefan Glowacz,Mathilda May,Al Waxman,Gunilla Karlzen
0,1,4,246,123,40,55,316,82,24,417,...,0,0,0,0,0,0,0,0,0,0
1,2,0,38,13,4,12,61,29,0,126,...,0,0,0,0,0,0,0,0,0,0
2,3,0,39,14,0,0,26,30,5,57,...,0,0,0,0,0,0,0,0,0,0
3,4,0,26,9,0,0,20,15,5,27,...,0,0,0,0,0,0,0,0,0,0
4,5,4,172,107,53,71,243,35,0,72,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# normalize data 
from os import system
import warnings


def normalize_rows_vectorized(df: pd.DataFrame) -> pd.DataFrame:
    df_normalized = df.copy()
    numeric_cols = df_normalized.columns[1:]
    df_normalized[numeric_cols] = df_normalized[numeric_cols].apply(pd.to_numeric, errors='coerce').astype(float)

    # Normalize columns 1–19
    part1 = df_normalized.iloc[:, 1:20]
    max1 = part1.max(axis=1).replace(0, np.nan)
    df_normalized.iloc[:, 1:20] = part1.div(max1, axis=0)

    # Normalize columns 20–end
    part2 = df_normalized.iloc[:, 20:]
    if not part2.empty:
        max2 = part2.max(axis=1).replace(0, np.nan)
        df_normalized.iloc[:, 20:] = part2.div(max2, axis=0)

    return df_normalized

user_genre_scores = normalize_rows_vectorized(user_genre_scores)

In [9]:
user_genre_scores.to_csv("user_genre_scores",index=False)

In [10]:
def get_unrated_movies(user_id, movies_df, ratings_df):
    
    # Get all unique movie IDs from the movies dataset
    all_movies = movies_df['movie_id'].unique()
    
    # Get movies rated by the specific user
    user_rated = ratings_df.loc[ratings_df['user_id'] == user_id, 'movie_id'].unique()
    
    # Find movies not rated by the user using set difference
    unrated_movies = np.setdiff1d(all_movies, user_rated)
    
    return unrated_movies

# Example: Get movies not rated by user_id = 1
unrated_movies = get_unrated_movies(user_id=1, movies_df=movies_df, ratings_df=ratings_df)
print("Rated Movies:",len(movies_df)-len(unrated_movies))
print("Unrated Movies:", len(unrated_movies))

Rated Movies: 270
Unrated Movies: 1412


In [11]:
mod_mov=movies_df.drop(["video_release_date","release_date","imdb_url","release_date","movie_id","title"],axis=1,inplace=False)
mov_matrix=mod_mov.values

mod_user=user_genre_scores.drop(["user_id"],axis=1,inplace=False)
user_matrix=mod_user.values

score = np.dot(mov_matrix[5], user_matrix[1])
score

nan

In [12]:
type(mov_matrix)

numpy.ndarray

### Testing to come up with top 10 recommendation for a specific user

In [13]:
def recommend_top_n_content(user_id, mov_matrix, user_matrix, movies_df, top_n=10):
    """
    Recommend top-N movies for a given user based on dot product scores.
    
    Parameters:
        user_id (int): The target user index.
        mov_matrix (np.ndarray): Movie feature matrix (e.g., item profiles).
        user_matrix (np.ndarray): User-item rating matrix.
        movies_df (DataFrame): Original movies DataFrame with metadata.
        top_n (int): Number of recommendations to return.
    
    Returns:
        DataFrame: Top-N recommended movies with prediction scores.
    """
    user_id = user_id - 1
    # Compute scores for all movies for the target user
    scores = np.dot(mov_matrix, user_matrix[user_id])
    
    # Get indices of top-N scores
    top_indices = np.argsort(scores)[-top_n:][::-1]
    top_scores = scores[top_indices]
    
    # Get corresponding movie IDs from movies_df
    top_movie_ids = movies_df.iloc[top_indices]['movie_id'].values

    # Return as list of (movie_id, prediction_score)
    return list(zip(top_movie_ids, top_scores))

In [14]:
recommend_top_n_content(
    user_id=1,
    mov_matrix=mov_matrix,         # NumPy array
    user_matrix=user_matrix,       # NumPy array
    movies_df=movies_df,           # DataFrame with movie metadata
    top_n=10
)




[(1682, nan),
 (578, nan),
 (554, nan),
 (555, nan),
 (556, nan),
 (557, nan),
 (558, nan),
 (559, nan),
 (560, nan),
 (561, nan)]

# 3.Collaborative Filtering

## 3.1 Item-Item Approach

### Make items_profile:

A matrix where each row represents a movie in terms of users ratings.

In [15]:
user_item_matrix = ratings_df.pivot(index='movie_id', columns='user_id', values='rating')
print("User-Item Matrix:")
print(user_item_matrix)

User-Item Matrix:
user_id   1    2    3    4    5    6    7    8    9    10   ...  934  935  \
movie_id                                                    ...             
1         5.0  4.0  NaN  NaN  4.0  4.0  NaN  NaN  NaN  4.0  ...  2.0  3.0   
2         3.0  NaN  NaN  NaN  3.0  NaN  NaN  NaN  NaN  NaN  ...  4.0  NaN   
3         4.0  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  ...  NaN  NaN   
4         3.0  NaN  NaN  NaN  NaN  NaN  5.0  NaN  NaN  4.0  ...  5.0  NaN   
5         3.0  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  ...  NaN  NaN   
...       ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...   
1678      NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  ...  NaN  NaN   
1679      NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  ...  NaN  NaN   
1680      NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  ...  NaN  NaN   
1681      NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  ...  NaN  NaN   
1682      NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN

In [16]:
#to call a movie use .loc
user_item_matrix.loc[1]

#to call a user use 
user_item_matrix[1]

movie_id
1       5.0
2       3.0
3       4.0
4       3.0
5       3.0
       ... 
1678    NaN
1679    NaN
1680    NaN
1681    NaN
1682    NaN
Name: 1, Length: 1681, dtype: float64

### Similarity Matrix:

To calculate the similarity between two movies, we will use cosine similarity.

In [17]:

def cosine_similarity(vec1, vec2):
    # Handle NaNs by treating them as zeros or ignoring them
    mask = ~np.isnan(vec1) & ~np.isnan(vec2)
    if np.sum(mask) == 0:
        return 0  # no overlap in ratings
    
    v1 = vec1[mask]
    v2 = vec2[mask]
    numerator = np.dot(v1, v2)
    denominator = np.linalg.norm(v1) * np.linalg.norm(v2)
    if denominator == 0:
        return 0
    return numerator / denominator


In [18]:
cosine_similarity(user_item_matrix.loc[1].values, user_item_matrix.loc[2].values)

0.9488806385308644

We want to calculate the similarity  between each movie and every other movie, the similarity Matrix.

In [19]:

# Your user_item_matrix: rows = movie_id, columns = user_id
# Make sure your index is movie_id and columns are user_id
movies = user_item_matrix.index.tolist()

# Initialize empty DataFrame to store similarities
similarity_matrix = pd.DataFrame(index=movies, columns=movies, dtype=float)

for i, movie_i in enumerate(movies):
    vec_i = user_item_matrix.loc[movie_i].values
    for j, movie_j in enumerate(movies):
        if j < i:
            # Similarity matrix is symmetric, copy value
            similarity_matrix.at[movie_i, movie_j] = similarity_matrix.at[movie_j, movie_i]
        else:
            vec_j = user_item_matrix.loc[movie_j].values
            sim = cosine_similarity(vec_i, vec_j)
            similarity_matrix.at[movie_i, movie_j] = sim

print(similarity_matrix)


          1         2         3         4         5         6         7     \
1     1.000000  0.948881  0.915048  0.942102  0.960450  0.955119  0.950152   
2     0.948881  1.000000  0.911985  0.939195  0.942688  0.955090  0.943322   
3     0.915048  0.911985  1.000000  0.898737  0.942472  0.968364  0.920467   
4     0.942102  0.939195  0.898737  1.000000  0.891994  0.919037  0.947734   
5     0.960450  0.942688  0.942472  0.891994  1.000000  0.996241  0.935982   
...        ...       ...       ...       ...       ...       ...       ...   
1678  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
1679  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
1680  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
1681  1.000000  1.000000  0.000000  1.000000  0.000000  0.000000  1.000000   
1682  1.000000  1.000000  1.000000  1.000000  1.000000  0.000000  1.000000   

          8         9         10    ...  1673  1674  1675  1676

### Predict ratings:

We’ll infer a missing rating of a user for an item by taking the normalized weighted sum of all the other ratings that user has given to different items. However, we will use only the top k most similar movies. The principle behind this is that the rating should be influenced primarily by the closest, most similar items.

In [20]:
def predict_rating(user_id, movie_id, user_item_matrix, similarity_matrix, k=5): 
    if movie_id not in user_item_matrix.index:
        return np.nan

    sims = similarity_matrix.loc[movie_id]
    user_ratings = user_item_matrix[user_id]

    # Drop target movie rating if exists
    user_ratings = user_ratings.drop(movie_id, errors='ignore')

    # Filter non-NaN pairs
    mask = ~np.isnan(sims) & ~np.isnan(user_ratings)
    sims = sims[mask]
    user_ratings = user_ratings[mask]

    # Select top-k most similar movies
    top_k = sims.abs().sort_values(ascending=False).head(k)
    user_ratings = user_ratings.loc[top_k.index]
    sims = sims.loc[top_k.index]

    numerator = np.sum(sims * user_ratings)
    denominator = np.sum(np.abs(sims))

    if denominator == 0:
        # fallback: user mean or global mean (choose what fits your data)
        return user_ratings.mean()

    return numerator / denominator


In [21]:
predict_rating(user_id=4, movie_id=1, user_item_matrix=user_item_matrix, similarity_matrix=similarity_matrix, k=10)

4.102745634508049

### Recommend the top 10 movies

In [22]:
def recommend_N ( user_id, user_item_matrix, similarity_matrix, N=10, k=5):
    if user_id not in user_item_matrix.columns:
        return []

    # Get all movies rated by the user
    rated_movies = user_item_matrix[user_id].dropna().index.tolist()
    
    # Initialize a list to store predictions
    predictions = []

    for movie_id in user_item_matrix.index:
        if movie_id not in rated_movies:
            predicted_rating = predict_rating(user_id, movie_id, user_item_matrix, similarity_matrix, k)
            if not np.isnan(predicted_rating):
                predictions.append((movie_id, predicted_rating))

    # Sort predictions by rating in descending order and take top N
    predictions.sort(key=lambda x: x[1], reverse=True)
    
    return  predictions[:N]

In [23]:
recommend_N( user_id=4, user_item_matrix=user_item_matrix, similarity_matrix=similarity_matrix, N=10, k=5)


[(21, 5.000000000000001),
 (49, 5.000000000000001),
 (57, 5.000000000000001),
 (84, 5.000000000000001),
 (101, 5.000000000000001),
 (106, 5.000000000000001),
 (110, 5.000000000000001),
 (169, 5.000000000000001),
 (184, 5.000000000000001),
 (198, 5.000000000000001)]

## 3.1 Item-Item Approach

It is the transpose of the items_profile matrix, where each row represents a user based on their reactions.

In [24]:
user_profile = user_item_matrix.T

Similarity Matrix:

In [25]:
# user_item_matrix: rows = user_id, columns = user_id

users = user_profile.index.tolist()

# Initialize empty DataFrame to store similarities
similarity_matrix_users = pd.DataFrame(index=users, columns=users, dtype=float)

for i, user_i in enumerate(users):
    vec_i = user_profile.loc[user_i].values
    for j, user_j in enumerate(users):
        if j < i:
            # Similarity matrix is symmetric, copy value
            similarity_matrix_users.at[user_i, user_j] = similarity_matrix_users.at[user_j, user_i]
        else:
            vec_j = user_profile.loc[user_j].values
            sim = cosine_similarity(vec_i, vec_j)
            similarity_matrix_users.at[user_i, user_j] = sim

print(similarity_matrix_users)

          1         2         3         4         5         6         7    \
1    1.000000  0.957477  0.857075  0.898146  0.931616  0.953331  0.942811   
2    0.957477  1.000000  0.935601  0.946544  0.979660  0.956301  0.963967   
3    0.857075  0.935601  1.000000  0.919528  1.000000  0.890713  0.879543   
4    0.898146  0.946544  0.919528  1.000000  1.000000  0.931108  0.855344   
5    0.931616  0.979660  1.000000  1.000000  1.000000  0.933706  0.905738   
..        ...       ...       ...       ...       ...       ...       ...   
939  0.944993  0.974901  0.993884  1.000000  0.930136  0.922565  0.957014   
940  0.944952  0.944038  0.867865  0.961247  0.918769  0.949294  0.940097   
941  0.972207  0.940225  0.973223  0.998538  0.975643  0.945658  0.937400   
942  0.918030  0.948493  0.897758  0.977106  0.941836  0.960481  0.983202   
943  0.929704  0.973154  0.989949  1.000000  0.908721  0.939325  0.930511   

          8         9         10   ...       934       935       936  \
1  

Predict rating : 

In [26]:
def predict_rating_user(user_id, movie_id, user_profile, similarity_matrix_users, k=5):
    """
    Predict the rating of a user for a movie using user-based CF.
    """
    if movie_id not in user_profile.columns:
        return np.nan  # movie unknown

    # Similarities of target user to all other users
    sims = similarity_matrix_users.loc[user_id]

    # Ratings of other users for the target movie
    movie_ratings = user_profile[movie_id]

    # Filter out users who haven't rated the movie
    mask = movie_ratings.notna()
    sims = sims[mask]
    movie_ratings = movie_ratings[mask]

    # Select top-k most similar users
    top_k = sims.abs().sort_values(ascending=False).head(k)
    movie_ratings = movie_ratings.loc[top_k.index]
    sims = sims.loc[top_k.index]

    # Weighted average
    numerator = np.sum(sims * movie_ratings)
    denominator = np.sum(np.abs(sims))

    if denominator == 0:
        # Fallback: return global mean rating of movie (or user mean)
        return movie_ratings.mean()

    return numerator / denominator


In [27]:
predict_rating_user(4, 1, user_profile, similarity_matrix_users, k=5)

3.2

### Recommend the top 10 movies

In [28]:
def recommend_top_n_movies_user(user_id, user_profile, similarity_matrix_users, n=10, k=5):
    """
    Recommend top-N movies to a user based on predicted ratings.
    
    Parameters:
    - user_id: target user
    - user_profile: DataFrame with users as rows, movies as columns
    - similarity_matrix_users: user-user similarity matrix
    - n: number of movies to recommend
    - k: number of neighbors to use in prediction
    
    Returns:
    - List of (movie_id, predicted_rating), sorted by predicted_rating descending
    """
    # Movies the user has already rated
    user_ratings = user_profile.loc[user_id]
    rated_movies = user_ratings[user_ratings.notna()].index.tolist()
    
    # Movies the user hasn't rated
    unrated_movies = [movie for movie in user_profile.columns if movie not in rated_movies]
    
    predictions = []
    for movie_id in unrated_movies:
        pred_rating = predict_rating_user(user_id, movie_id, user_profile, similarity_matrix_users, k)
        predictions.append((movie_id, pred_rating))
    
    # Sort movies by predicted rating descending
    predictions.sort(key=lambda x: x[1], reverse=True)
    
    # Return top N
    return predictions[:n]


In [29]:
recommend_top_n_movies_user(4, user_profile, similarity_matrix_users, n=10, k=5)

[(814, 5.000000000000001),
 (1293, 5.000000000000001),
 (1653, 5.000000000000001),
 (408, 5.0),
 (498, 5.0),
 (1122, 5.0),
 (1201, 5.0),
 (1467, 5.0),
 (1500, 5.0),
 (1599, 5.0)]

# 4.Evaluating models


In [30]:
test_set = test_ratings.pivot(index='movie_id', columns='user_id', values='rating')


## Approach 1 : 

The first approach: We evaluate whether our system is able to assign a high rating to a movie (i.e., above its average rating) and a low rating to a movie (i.e., below its average rating).


### 1.content based approach

In [42]:
def evaluate_content_based(test_ratings, ratings_df, movies_df, mov_matrix, user_matrix, verbose=False):
    """
    Evaluate the content-based approach on the test set.

    Parameters:
        test_ratings (DataFrame): Test ratings with columns ['user_id', 'movie_id', 'rating']
        ratings_df (DataFrame): Full ratings dataset to compute user averages
        movies_df (DataFrame): Movies dataframe to get movie names
        mov_matrix (ndarray): Matrix of movie features (num_movies x num_features)
        user_matrix (ndarray): Matrix of user profiles (num_users x num_features)
        verbose (bool): If True, print detailed output for each example

    Returns:
        float: Average accuracy (%)
    """

    total_test_cases = len(test_ratings)
    hit = 0

    print(f"Total test cases: {total_test_cases}")

    for index, row in test_ratings.iterrows():
        user_id = row['user_id']
        movie_id = row['movie_id']
        user_rating_for_movie = row['rating']

        # Get movie name (optional)
        movie = movies_df[movies_df['movie_id'] == movie_id].iloc[0]
        movie_name = movie['title']

        # Compute user average rating
        user_average_rating = round(ratings_df[ratings_df['user_id'] == user_id]['rating'].mean(), 3)

        # Compute predicted score for the movie
        score_for_movie = np.dot(mov_matrix[movie_id - 1], user_matrix[user_id - 1])

        # Compute average score for the user (across all movies)
        all_scores_for_user = np.dot(mov_matrix, user_matrix[user_id - 1])
        average_score_for_user = np.mean(all_scores_for_user)

        # Classify actual rating vs average
        user_has_rated = "Below Average"
        if user_rating_for_movie > (user_average_rating - user_average_rating * 0.10):
            user_has_rated = "Above Average"

        # Classify predicted score vs average
        score_has_been_given = "Below Average"
        if score_for_movie > (average_score_for_user - average_score_for_user * 0.10):
            score_has_been_given = "Above Average"

        # Compare
        if user_has_rated == score_has_been_given:
            hit += 1

        # Optional print
        if verbose:
            print(f"user id: {user_id}, movie_name: {movie_name}")
            print(f"User rating: {user_rating_for_movie}, User avg: {user_average_rating}")
            print(f"Predicted score: {score_for_movie}, Avg score for user: {average_score_for_user}")
            print(f"user_rated: {user_has_rated}, score_given: {score_has_been_given}\n")

    # Compute accuracy
    average_accuracy = round((hit / total_test_cases) * 100, 2)
    print(f"Average accuracy is: {average_accuracy} % ")

    return average_accuracy


In [43]:
evaluate_content_based(test_ratings, ratings_df, movies_df, mov_matrix, user_matrix, verbose=False)

Total test cases: 1886
Average accuracy is: 50.74 % 


50.74

### 2. Item_item approach

In [46]:
def evaluate_item_based(test_set, user_item_matrix, similarity_matrix, k=5):
    hits = 0
    
    for user_id in test_set.columns.unique():
        rated_movies = test_set.index[test_set[user_id].notna()].tolist()
        user_average_rating = remaining_df[remaining_df['user_id'] == user_id]['rating'].mean()
        for movie_id in rated_movies:
            predicted_rating = predict_rating(user_id, movie_id, user_item_matrix, similarity_matrix, k)
            actual_rating = test_set.at[movie_id, user_id]
            if not np.isnan(predicted_rating) and not np.isnan(actual_rating):
                # Check if the predicted rating is above or below the user's average rating
                user_has_rated = "Above Average" if actual_rating > (user_average_rating - user_average_rating * 0.10) else "Below Average"
                score_has_been_given = "Above Average" if predicted_rating > (user_average_rating - user_average_rating * 0.10) else "Below Average"
                
                if user_has_rated == score_has_been_given:
                    hits += 1

    accuracy = (hits / (2*len(test_set.columns.unique()))) * 100 if len(test_set.columns.unique()) > 0 else 0
    return accuracy

In [47]:
evaluate_item_based(test_set, user_item_matrix, similarity_matrix, k=5)

61.87698833510075

## 3- User_user approach

In [48]:
def evaluate_user_based(test_set, user_profile, similarity_matrix_users, k=5):
    hits = 0
    
    for user_id in test_set.columns.unique():
        rated_movies = test_set.index[test_set[user_id].notna()].tolist()
        user_average_rating = user_profile.loc[user_id].mean()
        for movie_id in rated_movies:
            predicted_rating = predict_rating_user(user_id, movie_id, user_profile, similarity_matrix_users, k)
            actual_rating = test_set.at[movie_id, user_id]
            if not np.isnan(predicted_rating) and not np.isnan(actual_rating):
                # Check if the predicted rating is above or below the user's average rating
                user_has_rated = "Above Average" if actual_rating > (user_average_rating - user_average_rating * 0.10) else "Below Average"
                score_has_been_given = "Above Average" if predicted_rating > (user_average_rating - user_average_rating * 0.10) else "Below Average"
                
                if user_has_rated == score_has_been_given:
                    hits += 1

    accuracy = (hits / (2*len(test_set.columns.unique()))) * 100 if len(test_set.columns.unique()) > 0 else 0
    return accuracy

In [49]:
evaluate_user_based(test_set, user_profile, similarity_matrix_users, k=5)

64.95227995758218

### Second Approach

The second approach Evaluate whether the recommender system can suggest a movie that the user has interacted with, regardless of the rating (whether good or bad). This can be considered a good approach, because we assume that if a user has rated a movie, it means they were initially interested in it.

In [36]:
""""
def evaluate_recommendations(test_set):
    hits = 0

    for user_id in test_set.columns.unique():
        recommendations = recommend_N(user_id, user_item_matrix, similarity_matrix)

        # Convert recommendations to a set of movie IDs
        recommended_movie_ids = {movie_id for movie_id, _ in recommendations}
        
        # Get the actual ratings from the test set
        actual_ratings = test_ratings[test_ratings['user_id'] == user_id]['movie_id'].values
        
        # Calculate hits (recommended movies that were actually rated)
        hit = len(set(recommended_movie_ids) & set(actual_ratings))

        hits += hit
    hits = hits / len(test_set.columns.unique()) if len(test_set.columns.unique()) > 0 else 0    
    return hits

"""

'"\ndef evaluate_recommendations(test_set):\n    hits = 0\n\n    for user_id in test_set.columns.unique():\n        recommendations = recommend_N(user_id, user_item_matrix, similarity_matrix)\n\n        # Convert recommendations to a set of movie IDs\n        recommended_movie_ids = {movie_id for movie_id, _ in recommendations}\n        \n        # Get the actual ratings from the test set\n        actual_ratings = test_ratings[test_ratings[\'user_id\'] == user_id][\'movie_id\'].values\n        \n        # Calculate hits (recommended movies that were actually rated)\n        hit = len(set(recommended_movie_ids) & set(actual_ratings))\n\n        hits += hit\n    hits = hits / len(test_set.columns.unique()) if len(test_set.columns.unique()) > 0 else 0    \n    return hits\n\n'

It's taking more than 119 minutes to run this program, so let's use parallelism.
The code below performs the same task as above, but it distributes the workload across multiple CPU cores to compute recommendations for all users.
It splits the user IDs into batches, processes them in parallel, and then aggregates the results.
This is made possible thanks to the joblib library.

In [37]:
pip install joblib





[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [38]:
from joblib import Parallel, delayed
import numpy as np


def recommend_all(test_ratings,recommend_function, user_ids, **kwargs):
    """
    Recommend movies for all users in parallel, using any recommender function.

    Parameters:
        recommend_function (function): A recommender function that takes at least a user_id as first argument.
        user_ids (list): List of user IDs to generate recommendations for.
        **kwargs: Any additional keyword arguments required by the recommender function.

    Returns:
        dict: Dictionary {user_id: list of recommendations}
    """
    def wrapped_function(user_id):
        return user_id, recommend_function(user_id, **kwargs)

    # Run in parallel across user_ids
    recommendations_all_users = Parallel(n_jobs=-1)(
        delayed(wrapped_function)(user_id) for user_id in user_ids
    )

    # Convert result to a dictionary
    recommendation_dict = dict(recommendations_all_users)

    """
    Computes the hit rate of recommendations by checking how many recommended items
    were actually rated by the user in the test set.
    """
    hits = 0
    total_users = 0
    
    # Ensure test_ratings is a DataFrame with columns: user_id, movie_id
    if not isinstance(test_ratings, pd.DataFrame):
        raise ValueError("test_ratings must be a DataFrame with 'user_id' and 'movie_id' columns.")
    
    for user_id, recommendations in recommendation_dict.items():
        # Get recommended movie IDs
        recommended_movie_ids = {movie_id for movie_id, _ in recommendations}

        # Get movies the user has rated in the test set
        actual_rated = test_ratings[test_ratings['user_id'] == user_id]['movie_id'].values

        # Compute intersection (hits)
        hit = len(set(recommended_movie_ids) & set(actual_rated))
        hits += hit
        total_users += 1

    # Average hits per user
    return (hits / total_users) if total_users > 0 else 0
    



### 1-content based approach

In [39]:
#recommend_all(test_ratings, recommend_top_n_content, user_ids=test_set.columns.unique(), mov_matrix=mov_matrix, user_matrix=user_matrix, movies_df=movies_df, top_n=10)

### 2- item_item approach 

In [40]:
#recommend_all(test_ratings=test_ratings, recommend_function=recommend_N, user_ids=test_set.columns.unique(), user_item_matrix=user_item_matrix, similarity_matrix=similarity_matrix, N=10, k=5)

### 3-  (User_user) approach

In [41]:
#recommend_all(test_ratings=test_ratings, recommend_function=recommend_top_n_movies_user, user_ids=test_set.columns.unique(), user_profile=user_profile, similarity_matrix_users=similarity_matrix_users, n=10, k=5)