<a href="https://www.kaggle.com/code/nizarbousabat/movie-recommendation-system-project?scriptVersionId=212916812" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Reading the main datasets

In [None]:
df = pd.read_csv('/kaggle/input/movielens-100k-dataset/ml-100k/u.data', sep='\t', header=None, names=['user_id', 'movie_id', 'rating','timestamp'])

In [None]:
df.head()

In [None]:
df.shape

In [None]:
df_info=pd.read_csv('/kaggle/input/movielens-100k-dataset/ml-100k/u.info', sep='\t', header=None)

In [None]:
df_info.head()

In [None]:
column_names = [ 'movie_id', 'movie_title', 'release_date', 'video_release_date', 'IMDb_URL', 'unknown', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western' ] 
# Read the file 
df_items = pd.read_csv( '/kaggle/input/movielens-100k-dataset/ml-100k/u.item', sep='|', header=None, names=column_names, encoding='ISO-8859-1' )

In [None]:
df_items.head()

In [None]:
df_genre=pd.read_csv('/kaggle/input/movielens-100k-dataset/ml-100k/u.genre', sep='|', header=None)

In [None]:
df_genre.head()

In [None]:
df_user= pd.read_csv('/kaggle/input/movielens-100k-dataset/ml-100k/u.user', sep='|', header=None, names=['user_id','age','gender','occupation','zip code'])

In [None]:
df_user.head()

## Merging tables

#### Here we merge all the tables into a single table so we can see the relationship between users and movies

In [None]:
df_merged1 = pd.merge(df, df_user, on='user_id', how='inner')

In [None]:
df_merged1.head()

In [None]:
df_merged1.shape

In [None]:
df_merged = pd.merge(df_merged1, df_items, on='movie_id', how='inner')

## Explore the merged data

In [None]:
df_merged.head()

In [None]:
df_merged.info()

#### We see that there are some NaN values that we need to remove

## Clean the data

In [None]:
df_cleaned = df_merged.dropna(axis=1, how='all')

In [None]:
df_cleaned.info()

In [None]:
df_cleaned = df_cleaned.dropna()

In [None]:
df_cleaned.info()

#### We didn't lose much data, so it's OK to drop the rows that contain NaN values

In [None]:
df_cleaned["unknown"].head()

### we see that unknown column has zeros so we should delete it

In [None]:
df_cleaned= df_cleaned.drop(columns=['unknown'])

## checking for duplicates

In [None]:
duplicate_count = df_cleaned.duplicated().sum()
print(f'Number of duplicate rows: {duplicate_count}')

In [None]:
df_cleaned.head()

### Let's convert timestamp to datetime object

In [None]:
df_cleaned['timestamp'] = pd.to_datetime(df_cleaned['timestamp'], unit='s') 


In [None]:
df_cleaned.head()

### we will merge the data with zip_codes to get more geographical information

In [None]:
zip_codes=pd.read_csv("/kaggle/input/zipcodes-county-fips-crosswalk/ZIP-COUNTY-FIPS_2017-06.csv")

In [None]:
zip_codes.head()

In [None]:
zip_codes["COUNTYNAME"].unique()

### checking for duplicates and nan values

In [None]:
zip_codes.info()

In [None]:
duplicate_zips = zip_codes['ZIP'].duplicated().sum()
print(f'Number of duplicate ZIP codes: {duplicate_zips}')


### Removing duplicates from zip and merging it with the cleaned data

In [None]:
# Keep a single county row per ZIP so the merge below cannot duplicate ratings.
# .copy() gives an independent frame so the column assignment below is safe.
zip_codes = zip_codes.drop_duplicates(subset=['ZIP']).copy()
df_cleaned['zip code'] = df_cleaned['zip code'].astype(str)
# If read_csv parsed ZIP as an integer column, leading zeros were lost
# (e.g. 02139 -> 2139) and those users would silently drop out of the inner
# merge. Left-pad to the 5-digit form; already 5-character values are unchanged.
zip_codes['ZIP'] = zip_codes['ZIP'].astype(str).str.zfill(5)
df_final = pd.merge(df_cleaned, zip_codes, left_on='zip code', right_on='ZIP', how='inner')

In [None]:
df_final.shape

In [None]:
df_cleaned.shape

### We will add a new feature: the number of days between a movie's release and its rating

In [None]:
# Calculate time difference between rating date and release date
df_final['timestamp'] = pd.to_datetime(df_final['timestamp'], errors='coerce') 
df_final['release_date'] = pd.to_datetime(df_final['release_date'], errors='coerce')
df_final['days_since_release'] = (df_final['timestamp'] - df_final['release_date']).dt.days

In [None]:
df_final.head()

### we will now extract the release year

In [None]:
df_final['year_of_release'] = df_final['release_date'].dt.year

In [None]:
df_final.info()

In [None]:
df_final.head()

In [None]:
df_final['rating'] = pd.to_numeric(df_final['rating'], downcast='unsigned')
df_final['age'] = pd.to_numeric(df_final['age'], downcast='unsigned')

# Visualisation part

### Plot 1: the count of ratings by gender

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# Create the countplot
sns.set(style="whitegrid")
plt.figure(figsize=(8, 6))
sns.countplot(x='gender', data=df_final, palette='Set2')

# Add title and labels
plt.title('Countplot by Gender')
plt.xlabel('Gender')
plt.ylabel('Count')

# Show the plot
plt.show()

#### we see that males have more ratings in this dataset

### Plot 2: the number of ratings by occupation

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
# Count the ratings contributed by each occupation (one bar per occupation).
sns.set(style="whitegrid")
plt.figure(figsize=(8, 6))
sns.countplot(x='occupation', data=df_final, palette='Set2')

# Add title and labels; occupation names are long, so rotate the tick labels.
plt.xticks(rotation=90)
plt.title('Countplot by Occupation')
plt.xlabel('occupation')
plt.ylabel('Count')

# Show the plot
plt.show()

#### We see that students are the occupation with the most ratings in this dataset

### Plot 3:the number of ratings by movie type

In [None]:
# Genre indicator columns of df_final (each holds a 0/1 flag per rating row).
movie_type_columns = [
    'Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime',
    'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical',
    'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]
df_movie_types = df_final[movie_type_columns]

# Summing each flag column gives the number of ratings per genre; reshape the
# resulting Series into a two-column frame for seaborn.
movie_counts = (
    df_movie_types.sum()
    .rename_axis('Movie_Type')
    .reset_index(name='Count')
)

# Draw one bar per genre.
sns.set(style="whitegrid")
plt.figure(figsize=(14, 8))
sns.barplot(x='Movie_Type', y='Count', data=movie_counts, palette='Set2')

# Title, axis labels, and vertical tick labels so genre names do not overlap.
plt.title('Count of Movie Types')
plt.xlabel('Movie Type')
plt.ylabel('Count')
plt.xticks(rotation=90)

plt.show()

#### we see that the top 3 are drama,comedy and action

### Plot 4:the number of users by age

In [None]:
# Plot the distribution of user ages
plt.figure(figsize=(10, 6))
sns.histplot(df_final['age'], bins=10, kde=True)
plt.title('Distribution of User Ages')
plt.xlabel('Age')
plt.ylabel('Frequency')
plt.show()


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Define age groups
bins = [0, 18, 25, 35, 45, 50, 56, 100]
labels = ['<18', '18-24', '25-34', '35-44', '45-49', '50-55', '56+']
df_final['age_group'] = pd.cut(df_final['age'], bins=bins, labels=labels, right=False, include_lowest=True)

# Plot the count of users by age group and gender
plt.figure(figsize=(14, 8))
sns.countplot(x='age_group', hue='gender', data=df_final, palette='Set2')
plt.title('User Demographics by Age Group and Gender')
plt.xlabel('Age Group')
plt.ylabel('Count')
plt.show()

#### We see that young people are the majority in this dataset

### Plot 5: the most ratings given in this dataset

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# Remove infinite values
df_final = df_final.replace([np.inf, -np.inf], np.nan).dropna(subset=['rating'])

# Plot the distribution of ratings
plt.figure(figsize=(10, 6))
sns.histplot(df_final['rating'], bins=10, kde=True)
plt.title('Distribution of Movie Ratings')
plt.xlabel('Rating')
plt.ylabel('Frequency')
plt.show()


#### we see that rating 4 is the most common rating in this dataset

### Plot 6: the distribution of ratings by gender

In [None]:

# Box plot of the rating distribution for each gender.
# The x column is 'gender', so the title and axis label say Gender, not Genre.
plt.figure(figsize=(14, 8))
sns.boxplot(x='gender', y='rating', data=df_final, palette='Set2')
plt.title('Ratings by Gender')
plt.xlabel('Gender')
plt.ylabel('Rating')
plt.xticks(rotation=45)
plt.show()


#### We see that both genders have the same rating distribution

### Plot 7: the top 20 most-rated movies

In [None]:
# Count the number of ratings for each movie
popular_movies = df_final['movie_title'].value_counts().reset_index()
popular_movies.columns = ['Movie Title', 'Count']

# Plot the most popular movies
plt.figure(figsize=(14, 8))
sns.barplot(x='Count', y='Movie Title', data=popular_movies.head(20), palette='Set2')
plt.title('Top 20 Most Rated Movies')
plt.xlabel('Number of Ratings')
plt.ylabel('Movie Title')
plt.show()


### Plot 8:the highest rated movies

In [None]:
# Calculate the average rating for each movie
avg_ratings = df_final.groupby('movie_title')['rating'].mean().reset_index()
avg_ratings.columns = ['Movie Title', 'Average Rating']

# Plot the top 20 highest rated movies
plt.figure(figsize=(14, 8))
sns.barplot(x='Average Rating', y='Movie Title', data=avg_ratings.nlargest(20, 'Average Rating'), palette='Set2')
plt.title('Top 20 Highest Rated Movies')
plt.xlabel('Average Rating')
plt.ylabel('Movie Title')
plt.show()


#### We cannot draw much information from this plot, because a movie that was rated only once with a 5 gets a perfect average

### Plot 9:Average rating for each genre

In [None]:

# Genre indicator columns of df_final.
movie_genre_columns = [
    'Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime',
    'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical',
    'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]

# Mean rating over the rows flagged with each genre.
genre_ratings = {}
for genre in movie_genre_columns:
    in_genre = df_final[genre] == 1
    genre_ratings[genre] = df_final.loc[in_genre, 'rating'].mean()

# Two-column frame (genre name, mean rating) for seaborn.
genre_ratings_df = pd.DataFrame(
    {
        'Genre': list(genre_ratings.keys()),
        'Average Rating': list(genre_ratings.values()),
    }
)

# One bar per genre.
plt.figure(figsize=(14, 8))
sns.barplot(x='Genre', y='Average Rating', data=genre_ratings_df, palette='Set2')
plt.xticks(rotation=45)
plt.xlabel('Genre')
plt.ylabel('Average Rating')
plt.title('Average Ratings for Each Genre')
plt.show()


#### we see that war and drama have the highest rating average

### Plot 10:Average rating by genre for each gender

In [None]:

# List of columns representing movie genres
movie_genre_columns = [
    'Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime',
    'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 
    'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'
]

# Calculate average ratings for each genre by gender
genre_ratings_male = {}
genre_ratings_female = {}

for genre in movie_genre_columns:
    genre_ratings_male[genre] = df_final.loc[(df_final[genre] == 1) & (df_final['gender'] == 'M'), 'rating'].mean()
    genre_ratings_female[genre] = df_final.loc[(df_final[genre] == 1) & (df_final['gender'] == 'F'), 'rating'].mean()

# Convert the dictionaries to DataFrames for plotting
genre_ratings_male_df = pd.DataFrame(list(genre_ratings_male.items()), columns=['Genre', 'Average Rating (Male)'])
genre_ratings_female_df = pd.DataFrame(list(genre_ratings_female.items()), columns=['Genre', 'Average Rating (Female)'])

# Plot the average ratings for each genre by gender
fig, axes = plt.subplots(2, 1, figsize=(14, 16), sharex=True)

sns.barplot(ax=axes[0], x='Genre', y='Average Rating (Male)', data=genre_ratings_male_df, palette='Blues_d')
axes[0].set_title('Average Ratings for Each Genre by Male Users')
axes[0].set_xlabel('')
axes[0].set_ylabel('Average Rating')
axes[0].tick_params(axis='x', rotation=45)

sns.barplot(ax=axes[1], x='Genre', y='Average Rating (Female)', data=genre_ratings_female_df, palette='Reds_d')
axes[1].set_title('Average Ratings for Each Genre by Female Users')
axes[1].set_xlabel('Genre')
axes[1].set_ylabel('Average Rating')
axes[1].tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()


#### we see that both male and female like film-noir and war the most 

In [None]:
df_final["gender"].value_counts()

In [None]:

# Function to plot genre preferences by occupation
def plot_genre_preferences(genre_list, title):
    """Plot the mean rating per occupation for each genre in ``genre_list``.

    Reads the module-level ``df_final``. For every genre, the rows flagged
    with that genre are grouped by ``occupation`` and their ratings averaged;
    the results are drawn as one grouped bar chart titled ``title``.
    """
    per_genre_frames = [
        df_final.loc[df_final[genre] == 1]
        .groupby('occupation')['rating']
        .mean()
        .reset_index()
        .assign(Genre=genre)
        for genre in genre_list
    ]
    genre_ratings_df = pd.concat(per_genre_frames, ignore_index=True)

    plt.figure(figsize=(14, 8))
    sns.barplot(
        x='Genre',
        y='rating',
        hue='occupation',
        data=genre_ratings_df,
        palette='Set2',
    )
    plt.title(title)
    plt.xlabel('Genre')
    plt.ylabel('Average Rating')
    plt.legend(title='Occupation')
    plt.show()

# Define genre groups
genre_groups = {
    'Group 1: Action, Adventure, Animation, Children\'s, Comedy': ['Action', 'Adventure', 'Animation', "Children's", 'Comedy'],
    'Group 2: Crime, Documentary, Drama, Fantasy, Film-Noir': ['Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir'],
    'Group 3: Horror, Musical, Mystery, Romance, Sci-Fi': ['Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi'],
    'Group 4: Thriller, War, Western': ['Thriller', 'War', 'Western']
}

# Plot each group
for title, genres in genre_groups.items():
    plot_genre_preferences(genres, title)


In [None]:

# Define age groups
bins = [0, 18, 25, 35, 45, 50, 56, 100]
labels = ['<18', '18-24', '25-34', '35-44', '45-49', '50-55', '56+']
df_final['age_group'] = pd.cut(df_final['age'], bins=bins, labels=labels, right=False, include_lowest=True)

# Function to plot genre preferences by age group
def plot_genre_preferences(genre_list, title):
    """Plot the mean rating per age group for each genre in ``genre_list``.

    Reads the module-level ``df_final`` (which must already carry the
    ``age_group`` column). For every genre, the rows flagged with that genre
    are grouped by ``age_group`` (observed categories only) and their ratings
    averaged; the results are drawn as one grouped bar chart titled ``title``.
    """
    per_genre_frames = [
        df_final.loc[df_final[genre] == 1]
        .groupby('age_group', observed=True)['rating']
        .mean()
        .reset_index()
        .assign(Genre=genre)
        for genre in genre_list
    ]
    genre_ratings_df = pd.concat(per_genre_frames, ignore_index=True)

    plt.figure(figsize=(14, 8))
    sns.barplot(
        x='Genre',
        y='rating',
        hue='age_group',
        data=genre_ratings_df,
        palette='Set2',
    )
    plt.title(title)
    plt.xlabel('Genre')
    plt.ylabel('Average Rating')
    plt.legend(title='Age Group')
    plt.xticks(rotation=45)
    plt.show()

# Define genre groups
genre_groups = {
    'Group 1: Action, Adventure, Animation, Children\'s, Comedy': ['Action', 'Adventure', 'Animation', "Children's", 'Comedy'],
    'Group 2: Crime, Documentary, Drama, Fantasy, Film-Noir': ['Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir'],
    'Group 3: Horror, Musical, Mystery, Romance, Sci-Fi': ['Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi'],
    'Group 4: Thriller, War, Western': ['Thriller', 'War', 'Western']
}

# Plot each group
for title, genres in genre_groups.items():
    plot_genre_preferences(genres, title)


### Implementing collaborative filtering system

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# Assuming the dataframe is already loaded as df_final

# Create user-item interaction matrix: rows are users, columns are movies,
# and 0 marks a (user, movie) pair that has no rating.
interaction_matrix = (
    df_final.pivot(index='user_id', columns='movie_id', values='rating')
    .fillna(0)
    .values.astype(np.float64)
)

# Useful Values
# pivot(index='user_id', columns='movie_id') yields shape (users, movies).
num_users, num_movies = interaction_matrix.shape
num_features = 50  # number of latent factors per user / per movie

# 1.0 where a rating exists (any entry > 0 is treated as observed), else 0.0.
observed_mask = (interaction_matrix > 0).astype(np.float64)

# User Metadata
# NOTE: the user/item feature matrices below are prepared here but are not
# consumed by the matrix-factorization model trained in this cell.
user_metadata = df_final[['age', 'gender', 'occupation']]

# Item Metadata (Genres)
item_metadata = df_final[['movie_id', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime',
                          'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery',
                          'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']].drop_duplicates(subset=['movie_id']).set_index('movie_id')

# Encode categorical features.
# The `sparse` keyword of OneHotEncoder was renamed to `sparse_output` and
# later removed; densifying the default sparse result works on every version.
encoder = OneHotEncoder()
encoded_gender = encoder.fit_transform(user_metadata[['gender']]).toarray()
encoded_occupation = encoder.fit_transform(user_metadata[['occupation']]).toarray()

# Combine user features
user_features = np.hstack((user_metadata[['age']].values, encoded_gender, encoded_occupation))
scaler_user = StandardScaler()
user_features_normalized = scaler_user.fit_transform(user_features)

# Combine item features
item_features = item_metadata.values
scaler_item = StandardScaler()
item_features_normalized = scaler_item.fit_transform(item_features)

# Set Initial Parameters (W, X, b), use tf.Variable to track these variables.
#   X: one latent vector per user   -> (num_users, num_features)
#   W: one latent vector per movie  -> (num_movies, num_features)
#   b: one bias per movie           -> (1, num_movies)
# so that X @ W^T + b has the same shape as interaction_matrix.
tf.random.set_seed(1234) # for consistent results
W = tf.Variable(tf.random.normal((num_movies, num_features), dtype=tf.float64), name='W')
X = tf.Variable(tf.random.normal((num_users, num_features), dtype=tf.float64), name='X')
b = tf.Variable(tf.random.normal((1, num_movies), dtype=tf.float64), name='b')

# Instantiate an optimizer
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)

# Define the loss function with L2 regularization
lambda_reg = 0.01  # Adjusted regularization strength

def loss_fn(train_mask=None):
    """Return the masked squared error plus L2 regularization on W and X.

    Args:
        train_mask: array/tensor shaped like ``interaction_matrix`` holding
            1.0 for entries that may contribute to the loss and 0.0 elsewhere.
            Defaults to every observed rating.

    Returns:
        Scalar float64 tensor with the total loss.
    """
    if train_mask is None:
        train_mask = observed_mask
    mask = tf.convert_to_tensor(train_mask, dtype=tf.float64)
    pred = tf.matmul(X, W, transpose_b=True) + b
    error = mask * (interaction_matrix - pred)
    loss = tf.reduce_sum(tf.square(error))
    reg_loss = lambda_reg * (tf.reduce_sum(tf.square(W)) + tf.reduce_sum(tf.square(X)))
    return loss + reg_loss

# Function to train the model
def train_model(epochs=500, train_mask=None, opt=None):
    """Run ``epochs`` full-batch gradient steps on W, X and b.

    Args:
        epochs: number of optimization steps.
        train_mask: forwarded to ``loss_fn``; restricts which ratings are
            used for fitting (default: all observed ratings).
        opt: optimizer to use; defaults to the module-level ``optimizer``.
    """
    opt = optimizer if opt is None else opt
    for epoch in range(epochs):
        with tf.GradientTape() as tape:
            loss = loss_fn(train_mask)
        gradients = tape.gradient(loss, [W, X, b])
        opt.apply_gradients(zip(gradients, [W, X, b]))
        if (epoch + 1) % 50 == 0:
            print(f'Epoch {epoch + 1}, Loss: {loss.numpy()}')

# K-Fold Cross Validation
# Folds are drawn over the observed (user, movie) ratings rather than over
# whole user rows: held-out ratings are masked out of the training loss, so
# the scores below are computed on ratings the model never fitted, while every
# user still keeps most of their ratings for learning their latent vector.
obs_rows, obs_cols = np.nonzero(observed_mask)
kf = KFold(n_splits=5, shuffle=True, random_state=1234)
r2_scores = []
rmse_scores = []
mae_scores = []

for train_index, test_index in kf.split(obs_rows):
    # Training mask: only the ratings assigned to the training part of the fold.
    train_mask = np.zeros_like(observed_mask)
    train_mask[obs_rows[train_index], obs_cols[train_index]] = 1.0
    train_mask = tf.constant(train_mask)

    test_rows, test_cols = obs_rows[test_index], obs_cols[test_index]

    # Reset variables
    W.assign(tf.random.normal((num_movies, num_features), dtype=tf.float64))
    X.assign(tf.random.normal((num_users, num_features), dtype=tf.float64))
    b.assign(tf.random.normal((1, num_movies), dtype=tf.float64))

    # Fresh optimizer per fold so Adam's moment estimates do not carry over.
    fold_optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)

    # Train the model
    train_model(epochs=500, train_mask=train_mask, opt=fold_optimizer)

    # Make predictions
    pred = (tf.matmul(X, W, transpose_b=True) + b).numpy()

    # Held-out true ratings and the matching predictions
    true_ratings = interaction_matrix[test_rows, test_cols]
    predicted_ratings = pred[test_rows, test_cols]

    # Calculate evaluation metrics
    r2 = r2_score(true_ratings, predicted_ratings)
    # sqrt(MSE) instead of the `squared=False` keyword, which newer
    # scikit-learn releases no longer accept.
    rmse = np.sqrt(mean_squared_error(true_ratings, predicted_ratings))
    mae = mean_absolute_error(true_ratings, predicted_ratings)

    r2_scores.append(r2)
    rmse_scores.append(rmse)
    mae_scores.append(mae)

# Average scores
avg_r2 = np.mean(r2_scores)
avg_rmse = np.mean(rmse_scores)
avg_mae = np.mean(mae_scores)

print(f'Average R² Score across folds: {avg_r2}')
print(f'Average RMSE across folds: {avg_rmse}')
print(f'Average MAE across folds: {avg_mae}')


### Implementing content-based recommendation system

In [None]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Assuming the dataframe is already loaded as df_final

# Item Metadata (Genres and other attributes): one row per movie, indexed by
# movie_id, in order of first appearance in df_final.
item_metadata = df_final[['movie_id', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime',
                          'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery',
                          'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western', 'year_of_release']].drop_duplicates(subset=['movie_id']).set_index('movie_id')

# Normalize numerical features (e.g., year_of_release)
scaler_item = StandardScaler()
item_metadata[['year_of_release']] = scaler_item.fit_transform(item_metadata[['year_of_release']])

# User Metadata: one row per user, indexed by user_id, in order of first
# appearance in df_final.
user_metadata = df_final[['user_id', 'age', 'gender', 'occupation']].drop_duplicates(subset=['user_id']).set_index('user_id')

# Encode categorical features.
# The `sparse` keyword of OneHotEncoder was renamed to `sparse_output` and
# later removed; densifying the default sparse result works on every version.
encoder = OneHotEncoder()
encoded_gender = encoder.fit_transform(user_metadata[['gender']]).toarray()
encoded_occupation = encoder.fit_transform(user_metadata[['occupation']]).toarray()

# Combine user features
user_features = np.hstack((user_metadata[['age']].values, encoded_gender, encoded_occupation))
scaler_user = StandardScaler()
user_features_normalized = scaler_user.fit_transform(user_features)

# Combine item features (all columns, including the genre flags, are
# standardized here; year_of_release is already standardized above).
item_features = item_metadata.values
scaler_item = StandardScaler()
item_features_normalized = scaler_item.fit_transform(item_features)

# Convert to sparse matrices
item_features_sparse = csr_matrix(item_features_normalized)
user_features_sparse = csr_matrix(user_features_normalized)

# Compute cosine similarity matrices for users and items in batches
def compute_similarity_in_batches_sparse(features, batch_size=500):
    """Return the dense pairwise cosine-similarity matrix of the rows of ``features``.

    Args:
        features: sparse matrix exposing ``.shape`` and ``.toarray()``, one
            row per entity.
        batch_size: number of rows compared against the full matrix per step.

    Returns:
        ``(n, n)`` NumPy array where entry ``[i, j]`` is the cosine similarity
        between rows ``i`` and ``j``.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    num_rows = features.shape[0]
    similarity_matrix = np.zeros((num_rows, num_rows))

    # Densify once; the original rebuilt the full dense matrix on every batch.
    dense_features = features.toarray()

    for start in range(0, num_rows, batch_size):
        end = min(start + batch_size, num_rows)
        similarity_matrix[start:end] = cosine_similarity(dense_features[start:end], dense_features)

    return similarity_matrix

user_sim_matrix = compute_similarity_in_batches_sparse(user_features_sparse)
item_sim_matrix = compute_similarity_in_batches_sparse(item_features_sparse)

# Function to predict ratings based on content-based similarity
def predict_ratings(user_id, movie_id):
    """Predict ``user_id``'s rating of ``movie_id`` from content similarity.

    Looks up the five movies most similar to ``movie_id`` in the module-level
    ``item_sim_matrix`` (rows/columns ordered like ``item_metadata.index``)
    and averages the ratings this user gave to them in ``df_final``. When the
    user rated none of them, the user's overall mean rating is returned.

    Args:
        user_id: value from the ``user_id`` column of ``df_final``.
        movie_id: label present in ``item_metadata.index``.

    Returns:
        The predicted rating (NaN when the user has no ratings at all).

    Raises:
        KeyError: if ``movie_id`` is not in ``item_metadata.index``.
    """
    movie_idx = item_metadata.index.get_loc(movie_id)

    # Rank every movie by descending similarity. Negation builds a new array,
    # so the shared similarity matrix is never modified (the original zeroed
    # the diagonal entry in place through a view).
    ranked = np.argsort(-item_sim_matrix[movie_idx])

    # Drop the movie itself explicitly: setting its score to 0 does not
    # exclude it when other scores are negative or fewer than 5 are positive.
    top_similar_indices = ranked[ranked != movie_idx][:5]
    top_similar_movies = item_metadata.index[top_similar_indices]

    # All ratings by this user, filtered once and reused for the fallback.
    user_ratings = df_final.loc[df_final['user_id'] == user_id, ['movie_id', 'rating']]
    actual_ratings = user_ratings.loc[user_ratings['movie_id'].isin(top_similar_movies), 'rating']

    if not actual_ratings.empty:
        # Average rating the user gave to the most similar movies
        return actual_ratings.mean()

    # No similar movie rated: fall back to the user's overall mean rating
    return user_ratings['rating'].mean()

# Evaluate the model using R²
def evaluate_model(df_final):
    """Score ``predict_ratings`` on every row of ``df_final``.

    Args:
        df_final: frame with ``user_id``, ``movie_id`` and ``rating`` columns.

    Returns:
        Tuple ``(r2, rmse, mae)`` comparing true and predicted ratings.

    Note:
        Predictions are made for the same rows the predictor reads its ratings
        from, so this is an in-sample score, not a held-out estimate.
    """
    true_ratings = df_final['rating'].to_numpy(dtype=float)

    # Iterate the two needed columns directly; iterrows() builds a full
    # Series per row, which is far slower and can change value dtypes.
    predicted = [
        predict_ratings(user_id, movie_id)
        for user_id, movie_id in zip(df_final['user_id'], df_final['movie_id'])
    ]

    r2 = r2_score(true_ratings, predicted)
    # sqrt(MSE) instead of the `squared=False` keyword, which newer
    # scikit-learn releases no longer accept.
    rmse = np.sqrt(mean_squared_error(true_ratings, predicted))
    mae = mean_absolute_error(true_ratings, predicted)

    return r2, rmse, mae

# Example usage
r2, rmse, mae = evaluate_model(df_final)
print(f'R² Score: {r2}')
print(f'RMSE: {rmse}')
print(f'MAE: {mae}')


### Implementing hybrid filtering system

In [None]:
from sklearn.decomposition import TruncatedSVD
from scipy.sparse.linalg import svds

# Create user-item interaction matrix. The pivoted frame is kept so that the
# row/column positions of the SVD reconstruction can be mapped back to ids:
# pivot orders rows by user_id and columns by movie_id, which is NOT the
# first-appearance order used by user_metadata / item_metadata.
ratings_pivot = df_final.pivot(index='user_id', columns='movie_id', values='rating').fillna(0)
interaction_matrix = ratings_pivot.values.astype(np.float64)
interaction_matrix_sparse = csr_matrix(interaction_matrix)

# Perform SVD
U, sigma, Vt = svds(interaction_matrix_sparse, k=50)

# Reconstruct the original matrix (rank-50 approximation)
sigma = np.diag(sigma)
predicted_ratings = np.dot(np.dot(U, sigma), Vt)

# Function to predict ratings using a hybrid approach
def hybrid_predict_ratings(user_id, movie_id):
    """Average a content-based and an SVD-based rating prediction.

    The content part averages the user's ratings of the five movies most
    similar to ``movie_id`` (falling back to the user's mean rating); the
    collaborative part reads the rank-50 SVD reconstruction at the cell of
    ``(user_id, movie_id)``.

    Args:
        user_id: label present in ``ratings_pivot.index``.
        movie_id: label present in ``item_metadata.index`` and
            ``ratings_pivot.columns``.

    Returns:
        Mean of the two predictions.

    Raises:
        KeyError: if ``user_id`` or ``movie_id`` is unknown.
    """
    # Content-based similarity (positions follow item_metadata's row order,
    # which is how item_sim_matrix was built).
    movie_idx = item_metadata.index.get_loc(movie_id)

    # Rank on a negated copy so the shared similarity matrix is not mutated,
    # and drop the movie itself explicitly instead of zeroing its score.
    ranked = np.argsort(-item_sim_matrix[movie_idx])
    top_similar_indices = ranked[ranked != movie_idx][:5]
    top_similar_movies = item_metadata.index[top_similar_indices]

    # Content-based prediction
    user_ratings = df_final.loc[df_final['user_id'] == user_id, ['movie_id', 'rating']]
    actual_ratings = user_ratings.loc[user_ratings['movie_id'].isin(top_similar_movies), 'rating']
    if not actual_ratings.empty:
        content_pred_rating = actual_ratings.mean()
    else:
        content_pred_rating = user_ratings['rating'].mean()

    # Collaborative prediction: positions must come from the pivot that
    # produced predicted_ratings, not from the metadata frames.
    cf_user_idx = ratings_pivot.index.get_loc(user_id)
    cf_movie_idx = ratings_pivot.columns.get_loc(movie_id)
    collaborative_pred_rating = predicted_ratings[cf_user_idx, cf_movie_idx]

    # Hybrid prediction
    hybrid_pred_rating = (content_pred_rating + collaborative_pred_rating) / 2
    return hybrid_pred_rating

# Evaluate the hybrid model
def evaluate_hybrid_model(df_final):
    """Score ``hybrid_predict_ratings`` on every row of ``df_final``.

    Args:
        df_final: frame with ``user_id``, ``movie_id`` and ``rating`` columns.

    Returns:
        Tuple ``(r2, rmse, mae)`` comparing true and predicted ratings.

    Note:
        Predictions are made for rows the predictors were built from, so this
        is an in-sample score, not a held-out estimate.
    """
    true_ratings = df_final['rating'].to_numpy(dtype=float)

    # Named `hybrid_predictions` so it cannot be confused with the
    # module-level `predicted_ratings` matrix that the predictor reads.
    # Iterating the two columns directly avoids the per-row Series that
    # iterrows() builds.
    hybrid_predictions = [
        hybrid_predict_ratings(user_id, movie_id)
        for user_id, movie_id in zip(df_final['user_id'], df_final['movie_id'])
    ]

    r2 = r2_score(true_ratings, hybrid_predictions)
    # sqrt(MSE) instead of the `squared=False` keyword, which newer
    # scikit-learn releases no longer accept.
    rmse = np.sqrt(mean_squared_error(true_ratings, hybrid_predictions))
    mae = mean_absolute_error(true_ratings, hybrid_predictions)

    return r2, rmse, mae

# Example usage
r2, rmse, mae = evaluate_hybrid_model(df_final)
print(f'Hybrid R² Score: {r2}')
print(f'Hybrid RMSE: {rmse}')
print(f'Hybrid MAE: {mae}')
