## Author
Michele Russo [michele.russo@ugent.be]

In [106]:
import pandas as pd
import numpy as np
from numpy import dot
from numpy.linalg import norm
import math
from datetime import datetime
from collections import Counter

### Helper functions used for the analysis task

In [107]:
def genres_counter(matrix1, matrix2):
    """Count how many rows of ``matrix1`` fall under each genre.

    ``matrix1`` is left-joined with ``matrix2`` on ``movieId``; every joined
    row adds one to each genre named in its pipe-separated ``genres`` string.

    Returns:
        dict mapping genre name to its total count over the joined rows.
    """
    joined = matrix1.merge(matrix2, on='movieId', how='left')

    # One indicator column per genre, one row per joined row.
    indicator = joined["genres"].str.get_dummies(sep='|')
    totals = indicator.sum(axis=0)

    return {genre: count for genre, count in zip(indicator.columns, totals)}


In [108]:
def movie_metrics(movie_encode):
    """Summarise how many genres each movie has.

    Args:
        movie_encode: one-hot genre table whose first column is skipped
            (the movie id) and whose remaining columns are genre indicators.

    Returns:
        Tuple ``(average, minimum, maximum)`` of the per-movie genre count.
    """
    # Row-wise sum over every column except the first gives genres per movie.
    genres_per_movie = movie_encode.iloc[:, 1:].sum(axis=1)

    return (
        genres_per_movie.mean(),
        genres_per_movie.min(),
        genres_per_movie.max(),
    )


In [109]:
def take_user_movie(ratings, user_id, movie_id):
    """Look up the rating and timestamp a user gave to a movie.

    Returns:
        Tuple ``(rating, timestamp)`` of pandas Series restricted to the rows
        of ``ratings`` matching both ``user_id`` and ``movie_id`` (the Series
        are empty when there is no such row).
    """
    is_match = (ratings["userId"] == user_id) & (ratings["movieId"] == movie_id)
    matches = ratings[is_match]
    return matches["rating"], matches["timestamp"]


In [110]:
def get_movie_data(movies, movie_id):
    """Return the rows of ``movies`` whose ``movieId`` equals ``movie_id``."""
    return movies.loc[movies["movieId"] == movie_id]


In [111]:
def get_user_history(ratings, movies, user_id):
    """Return one user's ratings joined with the movie metadata.

    The ``timestamp`` column is dropped from the user's ratings, which are
    then left-joined with ``movies`` on ``movieId``.
    """
    history = ratings.loc[ratings["userId"] == user_id].drop(columns=['timestamp'])
    return history.merge(movies, on='movieId', how='left')


In [112]:
def analyze_ratings(ratings):
    """Compute summary statistics over the ``rating`` column of ``ratings``.

    A rating counts as positive when strictly above zero, neutral when equal
    to zero and negative when strictly below zero.

    Returns:
        dict with keys ``average_rating``, ``median_rating``,
        ``share_positive_ratings``, ``share_neutral_ratings`` and
        ``share_negative_ratings``; the shares are percentages of all rows.
        For an empty input the shares are NaN instead of raising
        ``ZeroDivisionError``.
    """
    scores = ratings['rating']
    total_ratings = len(ratings)

    def share(mask):
        # Percentage of rows selected by ``mask``; undefined (NaN) without rows.
        if total_ratings == 0:
            return float('nan')
        return (int(mask.sum()) / total_ratings) * 100

    return {
        "average_rating": scores.mean(),
        "median_rating": scores.median(),
        "share_positive_ratings": share(scores > 0.0),
        "share_neutral_ratings": share(scores == 0.0),
        "share_negative_ratings": share(scores < 0.0),
    }


In [113]:
def top_N_score(score_matrix, userid, n, movies, ratings):
    """Return the ``n`` best-scored movies the user has not rated yet.

    Args:
        score_matrix: DataFrame indexed by user id with one column per movie id.
        userid: row label of the user in ``score_matrix``.
        n: number of recommendations to keep.
        movies: DataFrame with ``movieId``, ``title`` and ``genres`` columns.
        ratings: DataFrame with ``userId`` and ``movieId`` columns; only used
            to find which movies the user already rated.

    Returns:
        List of ``(movieId, title, genres, score)`` tuples ordered by
        descending score, ties broken by ascending movie id.

    Raises:
        KeyError: if an unrated candidate movie is missing from ``movies``.
    """
    # A set makes the "already rated?" test O(1) per candidate.
    rated_movies = set(ratings.loc[ratings["userId"] == userid, "movieId"])
    user_scores = score_matrix.loc[userid]

    # Build the metadata lookup once instead of scanning ``movies`` for every
    # candidate; keep="first" matches taking the first matching row.
    metadata = movies.drop_duplicates(subset="movieId", keep="first").set_index("movieId")
    titles = metadata["title"].to_dict()
    genres = metadata["genres"].to_dict()

    recommendations = [
        (int(movie_id), titles[movie_id], genres[movie_id], float(score))
        for movie_id, score in user_scores.items()
        if movie_id not in rated_movies
    ]

    # Highest score first; the movie id makes the order deterministic on ties.
    recommendations.sort(key=lambda rec: (-rec[3], rec[0]))
    return recommendations[:n]



### Programming Task

In [114]:
def load_data(table):
    """Read the CSV source ``table`` with ``pd.read_csv`` and return the DataFrame."""
    return pd.read_csv(table)


In [115]:
def rescaling(matrix, offset=3):
    """Return a copy of ``matrix`` with ``offset`` subtracted from every rating.

    The input DataFrame is left untouched, so calling this repeatedly on the
    same ratings (e.g. when a notebook cell is re-run) always gives the same
    result instead of shifting the ratings again.

    Args:
        matrix: DataFrame with a ``rating`` column.
        offset: amount subtracted from each rating (default 3).

    Returns:
        A new DataFrame with the shifted ``rating`` column.
    """
    rescaled = matrix.copy()
    rescaled['rating'] = rescaled['rating'] - offset
    return rescaled


In [116]:
def count_genres(matrix):
    """Compute per-row genre counts and their inverse-square-root factors.

    Args:
        matrix: DataFrame of genre indicators, one row per movie.

    Returns:
        Tuple ``(normalization_factor, row_sum)`` where ``row_sum`` is the sum
        of each row and ``normalization_factor`` is ``row_sum ** -0.5``.
        Rows whose sum is not positive get a factor of ``0.0`` so that an
        all-zero row stays all-zero after scaling, rather than raising or
        producing an infinite factor.
    """
    # Sum the indicator columns of every row.
    row_sum = matrix.sum(axis=1)

    # 1 / sqrt(count); guarded because 0 ** -0.5 is undefined.
    normalization_factor = row_sum.apply(
        lambda total: total ** -0.5 if total > 0 else 0.0
    )

    return normalization_factor, row_sum


In [117]:
def user_profiles(matrix, movies, normalization=0):
    """Build one genre-preference vector per user from their ratings.

    Each rating is spread over the genres of the rated movie (optionally
    scaled by the factor from ``count_genres`` when ``normalization`` is
    truthy) and the weighted indicators are summed per user.

    Args:
        matrix: ratings DataFrame with ``userId``, ``movieId`` and ``rating``.
        movies: DataFrame with ``movieId`` and pipe-separated ``genres``.
        normalization: when truthy, scale each movie's genre indicators by
            the ``count_genres`` normalization factor before weighting.

    Returns:
        DataFrame with a ``userId`` column followed by one column per genre
        found in ``movies``, in alphabetical order. Genres that no rated
        movie carries are present with weight 0 instead of raising
        ``KeyError``.
    """
    # Every genre named in the catalogue, in the final column order.
    all_genres = sorted(set('|'.join(movies['genres']).split('|')))

    rated = pd.merge(
        matrix[['userId', 'movieId', 'rating']],
        movies[['movieId', 'genres']],
        on="movieId",
        how="inner",
    )
    genre_weights = rated['genres'].str.get_dummies(sep='|')

    if normalization:
        normalization_factor, _ = count_genres(genre_weights)
        genre_weights = genre_weights.mul(normalization_factor, axis=0)

    # Weight every genre indicator by the rating given to that movie.
    weighted = genre_weights.mul(rated["rating"], axis=0)

    # Rated movies may not cover every catalogue genre; add the missing
    # genres as zero columns so the profile always has the full genre set.
    weighted = weighted.reindex(columns=all_genres, fill_value=0.0)
    weighted.insert(0, 'userId', rated['userId'])

    # Sum the weighted genre vectors of each user.
    return weighted.groupby("userId").sum().reset_index()


In [118]:
def movie_hot_encode(matrix):
    """One-hot encode the pipe-separated ``genres`` column of ``matrix``.

    Returns:
        DataFrame whose first column is ``movieId`` and whose remaining
        columns are the genre indicators in alphabetical order. A row with a
        missing ``genres`` value is encoded as all zeros.
    """
    # One indicator column per genre token.
    genres = matrix['genres'].str.get_dummies(sep='|')

    # Fix the column order so it lines up with other alphabetically sorted
    # genre tables in this notebook.
    genres = genres[sorted(genres.columns)]

    return pd.concat([matrix['movieId'], genres], axis=1)

In [119]:
def movie_normalization_matrix(movie, all_genres):
    """Build the normalized genre matrix for the movies in ``movie``.

    Args:
        movie: DataFrame with ``movieId`` and pipe-separated ``genres``.
        all_genres: every genre that must appear as a column.

    Returns:
        DataFrame with ``movieId`` followed by one column per genre in
        alphabetical order; each row holds the movie's genre indicators
        scaled by the factor returned by ``count_genres``.
    """
    genre_columns = sorted(all_genres)

    # One-hot encode, then force the full genre set (absent genres become 0).
    indicator = (
        movie['genres']
        .str.get_dummies(sep='|')
        .reindex(columns=genre_columns, fill_value=0)
    )

    # Scale every row by its normalization factor.
    normalization_factor, _ = count_genres(indicator)
    scaled = indicator.mul(normalization_factor, axis=0)

    return pd.concat([movie["movieId"], scaled], axis=1)


In [120]:
def score(matrix1, movie, normalization=0):
    """Score every movie for every user as a genre-vector dot product.

    Args:
        matrix1: user profiles, a ``userId`` column plus one column per genre.
        movie: DataFrame with ``movieId`` and pipe-separated ``genres``.
        normalization: when truthy, scale each movie's genre indicators by
            the factor returned by ``count_genres`` first.

    Returns:
        DataFrame indexed by ``userId`` with one column per ``movieId``.
    """
    # Work on a copy so the caller's profile table keeps its userId column.
    profiles = matrix1.copy().set_index('userId')

    all_genres = list(set('|'.join(movie['genres']).split('|')))
    genre_matrix = movie["genres"].str.get_dummies(sep='|')

    if normalization:
        normalization_factor, _ = count_genres(genre_matrix)
        genre_matrix = genre_matrix.mul(normalization_factor, axis=0)

    # Restrict/extend the indicator table to the genres found in ``movie``.
    genre_matrix = pd.DataFrame(genre_matrix, columns=all_genres)

    movie_vectors = pd.concat([movie["movieId"], genre_matrix], axis=1).set_index('movieId')

    # (users x genres) . (genres x movies) -> (users x movies)
    return profiles.dot(movie_vectors.T)


In [121]:
def idf(matrix):
    """Return, per genre column, the reciprocal of how many rows use it.

    The first column of ``matrix`` is skipped (the id); for each remaining
    column the number of non-zero entries is counted and ``1 / count`` is
    returned as a Series indexed by column name.
    """
    # True wherever the entry is non-zero.
    appears = matrix.iloc[:, 1:].ne(0.0)

    # Number of rows in which each genre appears.
    document_frequency = appears.sum()

    return 1 / document_frequency


In [122]:
def rescaling_idf(matrix, idf_rescaling):
    """Multiply every genre column of ``matrix`` by its IDF weight.

    Args:
        matrix: DataFrame with ``userId`` as first column, genres after it.
        idf_rescaling: per-genre weights, aligned on column names.

    Returns:
        DataFrame with ``userId`` followed by the rescaled genre columns.
    """
    weighted = matrix.iloc[:, 1:].mul(idf_rescaling, axis='columns')
    return pd.concat([matrix["userId"], weighted], axis=1)


In [123]:
def transform_to_matrix(data):
    """Turn ``(movieId, title, genres, score)`` tuples into a DataFrame.

    The ``score`` column is discarded; the result has the columns
    ``movieId``, ``title`` and ``genres``.
    """
    table = pd.DataFrame(data, columns=['movieId', 'title', 'genres', 'score'])
    return table.drop(columns=['score'])


In [124]:
def cosine_similarity(tuple1, tuple2):
    """Cosine similarity of two vectors, ignoring their first element.

    The first entry of each input is skipped (it carries an id, not a
    feature). Returns ``0`` when either remaining vector has zero length.
    """
    vec_a, vec_b = tuple1[1:], tuple2[1:]

    norm_a = math.sqrt(sum(x * x for x in vec_a))
    norm_b = math.sqrt(sum(y * y for y in vec_b))

    # A zero vector has no direction; report no similarity.
    if norm_a == 0 or norm_b == 0:
        return 0

    inner = sum(x * y for x, y in zip(vec_a, vec_b))
    return inner / (norm_a * norm_b)

def max_cosine_similarity(item, diversified_items):
    """Largest cosine similarity between ``item`` and any selected item.

    The result never drops below ``0.0``, which is also what is returned
    when ``diversified_items`` is empty.
    """
    # Seeding the candidates with 0.0 reproduces the 0.0 floor.
    return max(
        [0.0, *(cosine_similarity(item, other) for other in diversified_items)]
    )


In [125]:
def matrix_to_set(matrix):
    """Convert the rows of ``matrix`` into a list of tuples, in row order.

    Despite the name, the result is a list, so duplicate rows are kept.
    """
    return [tuple(row) for _, row in matrix.iterrows()]


In [126]:
def ranking(movie_encode, user_vector, num_recommendations, coef=0.5):
    """Greedily pick relevant but mutually dissimilar movies.

    In every round the candidate with the highest combined score
    ``coef * relevance - (1 - coef) * max_similarity_to_selected`` is moved
    to the selection; for the first pick only the relevance term applies.

    Args:
        movie_encode: DataFrame whose rows are ``movieId`` followed by the
            movie's genre vector.
        user_vector: the user's row, an id followed by genre weights in the
            same order as the movie vectors.
        num_recommendations: number of movies to select. If fewer candidates
            are available, all of them are returned rather than failing.
        coef: trade-off between relevance (1.0) and diversity (0.0).

    Returns:
        List of ``(movieId, combined_score)`` tuples sorted by descending
        combined score, ties broken by ascending movie id.
    """
    query = tuple(user_vector)

    # Relevance to the user does not change between rounds: compute it once.
    candidates = [
        (item, cosine_similarity(item, query))
        for item in matrix_to_set(movie_encode)
    ]

    selected_items = []
    ranked = []
    while candidates and len(selected_items) < num_recommendations:
        best_position = None
        best_score = float('-inf')

        # Scan the remaining rows of the matrix for the best combined score.
        for position, (item, relevance) in enumerate(candidates):
            combined = coef * relevance
            if selected_items:
                # Penalise closeness to anything already selected.
                combined -= (1 - coef) * max_cosine_similarity(item, selected_items)
            if combined > best_score:
                best_position, best_score = position, combined

        if best_position is None:
            # No candidate had a comparable score (e.g. all NaN): stop here.
            break

        best_item, _ = candidates.pop(best_position)
        selected_items.append(best_item)
        ranked.append((int(best_item[0]), float(best_score)))

    ranked.sort(key=lambda entry: (-entry[1], entry[0]))
    return ranked

## 4 Analysis Task

### 4.1 Exploratory Data Analysis

In [135]:
ratings = load_data("ratings.csv")
movies = load_data("movies.csv")

# Print number of genres
all_genres = list(set('|'.join(movies['genres']).split('|')))
print("Number of movie genres:", len(all_genres))

# Count how often each genre occurs across all ratings
counter = genres_counter(ratings, movies)
# Get the top 5 elements
top_5 = dict(sorted(counter.items(), key=lambda x: x[1], reverse=True)[:5])
# Get the least 5 elements
least_5 = dict(sorted(counter.items(), key=lambda x: x[1])[:5])
print("Five least common genres:", least_5)
print("Five most common genres:", top_5)

# How many genres does a movie have on average? What is the minimum and maximum number of genres a movie has?

# Create the one-hot encode for the movie genre in matrix movies
movie_encode = movie_hot_encode(movies)
# Take the average, minimum, and maximum.
# Named min_genres/max_genres so the builtins min() and max() are not shadowed
# for the rest of the notebook.
avg, min_genres, max_genres = movie_metrics(movie_encode)
print("Average number of genres per movie:", avg)
print("Minimum number of genres per movie:", min_genres)
print("Maximum number of genres per movie:", max_genres)

# Take rating and timestamp for user id 289 and movie id 1125
rating, time_stamp = take_user_movie(ratings, 289, 1125)
print("Rating for user id 289 and movie id 1125:", rating)
# The timestamp is interpreted as seconds since the Unix epoch
date = pd.to_datetime(time_stamp, unit='s')
print("Timestamp for user id 289 and movie id 1125:", date)

# Take metadata for movie id 1125
metadata=get_movie_data(movies, 1125)
print(metadata)

Number of movie genres: 20
Five least common genres: {'(no genres listed)': 18, 'Film-Noir': 1140, 'Documentary': 1564, 'Western': 1912, 'IMAX': 3156}
Five most common genres: {'Drama': 44752, 'Comedy': 38026, 'Action': 27056, 'Thriller': 25240, 'Adventure': 22017}
Average number of genres per movie: 2.2290410958904108
Minimum number of genres per movie: 1
Maximum number of genres per movie: 10
Rating for user id 289 and movie id 1125: 39624    3.0
Name: rating, dtype: float64
Timestamp for user id 289 and movie id 1125: 39624   2012-02-11 04:44:30
Name: timestamp, dtype: datetime64[ns]
     movieId                                   title        genres
902     1125  Return of the Pink Panther, The (1975)  Comedy|Crime


### 4.2 Basic Recommender

In [136]:
# Rescale the ratings matrix.
# A copy is passed so the raw `ratings` frame keeps its original scale and
# re-running this cell does not shift the ratings a second time.
ratings_rescaled = rescaling(ratings.copy())
# Retrieve the rating history for user 526
user_history = get_user_history(ratings_rescaled, movies, 526)
print("User rating history for user 526:")
print(user_history)

# Analyze the ratings history
results = analyze_ratings(user_history)
print("Analysis of user 526's ratings history:")
print(results)

# Generate user profile based on rescaled ratings
user_profile = user_profiles(ratings_rescaled, movies)
# Print user profile for user 526
print("User profile for user 526:")
print(user_profile[user_profile["userId"] == 526])

# Score movies based on user profile
score_matrix = score(user_profile, movies)

# Get top N scored movies for user 526
# (`ratings` is only used to exclude movies the user has already rated)
top_scores = top_N_score(score_matrix, 526, 5, movies, ratings)
print("Top 5 movie recommendations for user 526:")
print(top_scores)


User rating history for user 526:
    userId  movieId  rating  \
0      526        1     1.0   
1      526     1653     1.0   
2      526     4226     1.0   
3      526     4370     1.5   
4      526     4720     1.0   
5      526     4973     0.5   
6      526     4995     2.0   
7      526     5445     2.0   
8      526     5502     1.5   
9      526     5679     1.0   
10     526     6333     1.0   
11     526     6373     1.0   
12     526     7153     0.5   
13     526     8368     1.0   
14     526     8644     0.5   
15     526     8961     1.0   
16     526    39427     1.5   
17     526    47099     0.5   
18     526    50872     1.5   
19     526    60069     1.5   
20     526    79132     1.0   
21     526    85414     0.0   
22     526    88744     1.0   
23     526    91658     1.0   
24     526   109487     2.0   
25     526   112556     1.0   
26     526   134853     1.5   

                                                title  \
0                                    Toy

### 4.3 Normalizing features

In [137]:
# Retrieve the top five recommendations for user 526 using normalized genre vectors

# Generate user profiles from the rescaled ratings; the third argument (1)
# turns on the `normalization` option of user_profiles
user_profile = user_profiles(ratings_rescaled, movies, 1)
# Calculate scores for all movies from the user profiles, again with normalization on
score_matrix = score(user_profile, movies, 1)
# Get top 5 scored movies for user 526
# (`ratings` is only used to exclude movies the user has already rated)
top5 = top_N_score(score_matrix, 526, 5, movies, ratings)
print("Top 5 movie recommendations for user 526:")
print(top5)

# Retrieve user rating history for user 14
user_history = get_user_history(ratings_rescaled, movies, 14)
print("User rating history for user 14:")
print(user_history)
# Analyze the rating history for user 14
analysis_14 = analyze_ratings(user_history)
print("Analysis of user 14's ratings history:")
print(analysis_14)

# Print the score matrix (rows: users, columns: movies)
print("Score matrix:")
print(score_matrix)

# Retrieve top recommendations for user 14 with normalized scores
top5_14 = top_N_score(score_matrix, 14, 5, movies, ratings)
print("Top 5 movie recommendations for user 14 with normalized scores:")
print(top5_14)


Top 5 movie recommendations for user 526:
[(52328, 'Sunshine (2007)', 'Adventure|Drama|Sci-Fi|Thriller', 14.29021272462629), (8361, 'Day After Tomorrow, The (2004)', 'Action|Adventure|Drama|Sci-Fi|Thriller', 14.221406070274385), (48774, 'Children of Men (2006)', 'Action|Adventure|Drama|Sci-Fi|Thriller', 14.221406070274385), (58025, 'Jumper (2008)', 'Action|Adventure|Drama|Sci-Fi|Thriller', 14.221406070274385), (91500, 'The Hunger Games (2012)', 'Action|Adventure|Drama|Sci-Fi|Thriller', 14.221406070274385)]
User rating history for user 14:
    userId  movieId  rating  \
0       14      594    -2.0   
1       14     1196     1.0   
2       14     1721     0.0   
3       14     2038     0.0   
4       14     2355    -1.0   
5       14     2394     0.0   
6       14     2628     0.0   
7       14     2683    -1.0   
8       14     2716     0.0   
9       14     2720    -1.0   
10      14     2724     0.0   
11      14     2861    -1.0   
12      14     3114     1.0   
13      14     3157  

### 4.4 Accounting for differences in frequency


In [138]:
# Build the normalized user profiles (normalization flag = 1) from the rescaled ratings
user_profile_updated = user_profiles(ratings_rescaled, movies, 1)

# Calculate the per-genre weight returned by idf() (1 / number of movies carrying the genre).
# `movie_encode` is the one-hot genre table created in the exploratory analysis cell.
category_idf = idf(movie_encode)
print("IDF for each movie category:")
print(category_idf)

# Rescale every user's genre vector with the per-genre IDF weights
rescaled_user_vector = rescaling_idf(user_profile_updated, category_idf)
print("Rescaled user vector:")
print(rescaled_user_vector[rescaled_user_vector["userId"] == 526])

# Calculate the updated score matrix from the IDF-weighted profiles (movie vectors normalized)
score_matrix_update = score(rescaled_user_vector, movies, 1)
# Print the row of the score matrix belonging to user 526
print("Updated score matrix for user 526:")
print(score_matrix_update.loc[526, :])

# Get top 5 recommendations for user 526 with updated scores
# (`ratings` is only used to exclude movies the user has already rated)
top5_recommendations_526_rescaled = top_N_score(score_matrix_update, 526, 5, movies, ratings)
print("Top 5 recommendations for user 526 with updated scores:")
print(top5_recommendations_526_rescaled)


IDF for each movie category:
(no genres listed)    0.055556
Action                0.000647
Adventure             0.000895
Animation             0.002237
Children              0.001715
Comedy                0.000302
Crime                 0.000909
Documentary           0.002020
Drama                 0.000229
Fantasy               0.001529
Film-Noir             0.007519
Horror                0.001140
IMAX                  0.006536
Musical               0.002538
Mystery               0.001842
Romance               0.000647
Sci-Fi                0.001263
Thriller              0.000578
War                   0.002725
Western               0.005952
dtype: float64
Rescaled user vector:
     userId  (no genres listed)    Action  Adventure  Animation  Children  \
525     526                 0.0  0.002084   0.004137   0.006809  0.005221   

       Comedy     Crime  Documentary     Drama  ...  Film-Noir    Horror  \
525  0.000712  0.001157          0.0  0.001919  ...        0.0  0.002216   

      

### 4.5 Diversifying the recommendations


In [139]:
# Take the top 200 recommendations with IDF score for user 526 and rerank them for diversity

# Build the normalized user profiles (normalization flag = 1) from the rescaled ratings
user_profile_updated = user_profiles(ratings_rescaled, movies, 1)

# Calculate the per-genre IDF weight from the one-hot movie table
category_idf = idf(movie_encode)

# Rescale the user vectors using the IDF weights
rescaled_user_vector = rescaling_idf(user_profile_updated, category_idf)

# Get all genres for movies
all_genres = list(set('|'.join(movies['genres']).split('|')))

# Keep only the row of user 526 from the rescaled user vectors
rescaled_user526_vector = rescaled_user_vector[rescaled_user_vector["userId"] == 526]

# Calculate the updated score matrix for all users
score_matrix_update = score(rescaled_user_vector, movies, 1)

# Take the top 200 recommendations for user 526 with updated scores
top200_526_rescaled = top_N_score(score_matrix_update, 526, 200, movies, ratings)

# Convert the top 200 recommendations into a DataFrame (the score column is dropped)
ranked_movies = transform_to_matrix(top200_526_rescaled)

# Build the normalized genre matrix for those 200 candidate movies
movie_normalized_matrix = movie_normalization_matrix(ranked_movies, all_genres)

# Rerank the candidates; the user's row is passed as a single vector whose
# first element (userId) is skipped by the similarity helper
top5_reranked = ranking(movie_normalized_matrix, rescaled_user526_vector.iloc[0, :], 5)
print("Top 5 reranked recommendations:")
print(top5_reranked)

# Retrieve movie information for the reranked recommendations
print("Movie information for the reranked recommendations:")
for item in top5_reranked:
    # Each item is a (movieId, combined score) tuple
    movie_id = item[0]
    print(get_movie_data(movies, movie_id))


Top 5 reranked recommendations:
[(5882, 0.40575049627373694), (85510, 0.18284358877109694), (87306, 0.1415741730339744), (81417, 0.1044352730930149), (90746, 0.08707847848750794)]
Movie information for the reranked recommendations:
      movieId                   title  \
4363     5882  Treasure Planet (2002)   

                                        genres  
4363  Adventure|Animation|Children|Sci-Fi|IMAX  
      movieId                title                        genres
7771    85510  Sucker Punch (2011)  Action|Fantasy|Thriller|IMAX
      movieId           title                        genres
7822    87306  Super 8 (2011)  Mystery|Sci-Fi|Thriller|IMAX
      movieId                         title       genres
7654    81417  Paranormal Activity 2 (2010)  Horror|IMAX
      movieId                             title                         genres
7927    90746  Adventures of Tintin, The (2011)  Action|Animation|Mystery|IMAX


In [140]:
# Take the top 200 recommendations with IDF score for user 225 and rerank them for diversity

# Retrieve user history for user 225
user_history = get_user_history(ratings_rescaled, movies, 225)
print("User history for user 225:")
print(user_history)

# Build the normalized user profiles (normalization flag = 1) from the rescaled ratings
user_profile_updated = user_profiles(ratings_rescaled, movies, 1)

# Calculate the per-genre IDF weight from the one-hot movie table
category_idf = idf(movie_encode)

# Rescale the user vectors using the IDF weights
rescaled_user_vector = rescaling_idf(user_profile_updated, category_idf)

# Get all genres for movies
all_genres = list(set('|'.join(movies['genres']).split('|')))

# Keep only the row of user 225 from the rescaled user vectors
rescaled_user225_vector = rescaled_user_vector[rescaled_user_vector["userId"] == 225]

# Calculate the updated score matrix for all users
score_matrix_update = score(rescaled_user_vector, movies, 1)

# Take the top 200 recommendations for user 225 with updated scores
top200_225_rescaled = top_N_score(score_matrix_update, 225, 200, movies, ratings)

# Convert the top 200 recommendations into a DataFrame (the score column is dropped)
ranked_movies = transform_to_matrix(top200_225_rescaled)

# Build the normalized genre matrix for those 200 candidate movies
movie_normalized_matrix = movie_normalization_matrix(ranked_movies, all_genres)

# Rerank the candidates; the user's row is passed as a single vector whose
# first element (userId) is skipped by the similarity helper
top5_reranked = ranking(movie_normalized_matrix, rescaled_user225_vector.iloc[0, :], 5)
print("Top 5 reranked recommendations for user 225:")
print(top5_reranked)

# Retrieve movie information for the reranked recommendations
print("Movie information for the reranked recommendations:")
for item in top5_reranked:
    # Each item is a (movieId, combined score) tuple
    movie_id = item[0]
    print(get_movie_data(movies, movie_id))

# Count how often each genre occurs among the movies rated by user 225
counter = genres_counter(ratings[ratings["userId"] == 225], movies)
print("Genre counter for user 225:")
print(counter)


User history for user 225:
    userId  movieId  rating  \
0      225       47     2.0   
1      225      110     1.0   
2      225      150     1.0   
3      225      153     1.0   
4      225      185     0.0   
5      225      208     1.0   
6      225      231     0.0   
7      225      253     2.0   
8      225      296     0.0   
9      225      316     1.0   
10     225      329     1.0   
11     225      344     1.0   
12     225      356     2.0   
13     225      364     1.0   
14     225      367     2.0   
15     225      380     1.0   
16     225      410     0.0   
17     225      454     0.0   
18     225      457     2.0   
19     225      480     2.0   
20     225      588     2.0   
21     225      589     2.0   
22     225      590     2.0   
23     225      592     0.0   
24     225      593     1.0   
25     225      595     2.0   
26     225      661     0.0   
27     225      724     1.0   

                                                title  \
0               

In [141]:
# Take the top 300 recommendations with IDF score for user 341 and rerank them for diversity

# Build the normalized user profiles (normalization flag = 1) from the rescaled ratings
user_profile_updated = user_profiles(ratings_rescaled, movies, 1)

# Calculate the per-genre IDF weight from the one-hot movie table
category_idf = idf(movie_encode)

# Rescale the user vectors using the IDF weights
rescaled_user_vector = rescaling_idf(user_profile_updated, category_idf)

# Get all genres for movies
all_genres = list(set('|'.join(movies['genres']).split('|')))

# Keep only the row of user 341 from the rescaled user vectors
rescaled_user341_vector = rescaled_user_vector[rescaled_user_vector["userId"] == 341]

# Calculate the updated score matrix for all users
score_matrix_update = score(rescaled_user_vector, movies, 1)

# Take the top 300 recommendations for user 341 with updated scores
top300_341_rescaled = top_N_score(score_matrix_update, 341, 300, movies, ratings)

# Convert the top 300 recommendations into a DataFrame (the score column is dropped)
ranked_movies = transform_to_matrix(top300_341_rescaled)

# Build the normalized genre matrix for those 300 candidate movies
movie_normalized_matrix = movie_normalization_matrix(ranked_movies, all_genres)

# Rerank the candidates; the user's row is passed as a single vector whose
# first element (userId) is skipped by the similarity helper
top5_reranked = ranking(movie_normalized_matrix, rescaled_user341_vector.iloc[0, :], 5)
print("Top 5 reranked recommendations for user 341:")
print(top5_reranked)

# Retrieve movie information for the reranked recommendations
print("Movie information for the reranked recommendations:")
for item in top5_reranked:
    # Each item is a (movieId, combined score) tuple
    movie_id = item[0]
    print(get_movie_data(movies, movie_id))

# Count how often each genre occurs among the movies rated by user 341
counter = genres_counter(ratings[ratings["userId"] == 341], movies)
print("Genre counter for user 341:")
print(counter)


Top 5 reranked recommendations for user 341:
[(389, 0.22930561786888734), (27397, -0.08901210480536509), (1450, -0.10443427852885986), (61236, -0.1335550109109648), (665, -0.14293696970907274)]
Movie information for the reranked recommendations:
     movieId                       title             genres
351      389  Colonel Chabert, Le (1994)  Drama|Romance|War
      movieId                                              title  \
5907    27397  Joint Security Area (Gongdong gyeongbi guyeok ...   

                                genres  
5907  Crime|Drama|Mystery|Thriller|War  
      movieId                                              title genres
1174     1450  Prisoner of the Mountains (Kavkazsky plennik) ...    War
      movieId                                      title  \
7027    61236  Waltz with Bashir (Vals im Bashir) (2008)   

                               genres  
7027  Animation|Documentary|Drama|War  
     movieId               title            genres
571      665  Under