In [1]:
import pandas as pd
import numpy as np
from sklearn import model_selection, metrics, preprocessing
import torch
import torch.nn as nn
import matplotlib.pyplot as plt 
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F


In [2]:
df = pd.read_csv("/kaggle/input/movielens-9000-movies-dataset/ml-latest-small/ratings.csv")

In [3]:
df.info() # basically show schema

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [4]:
df.userId.nunique()

610

In [5]:
df.movieId.nunique()

9724

In [6]:
df.rating.value_counts() #check value distribution

rating
4.0    26818
3.0    20047
5.0    13211
3.5    13136
4.5     8551
2.0     7551
2.5     5550
1.0     2811
1.5     1791
0.5     1370
Name: count, dtype: int64

In [7]:
df.shape

(100836, 4)

In [8]:
# Get the number of unique users and movies
n_users = df.userId.nunique()
n_movies = df.movieId.max() + 1



In [9]:
class MovieDataset:
    def __init__(self, df, n_users, n_movies):
        self.users = df.userId.values
        self.movies = df.movieId.values
        self.ratings = df.rating.values
        self.n_users = n_users
        self.n_movies = n_movies

    def __len__(self):
        return len(self.users)

    def __getitem__(self, item):
        user = self.users[item]
        movie = self.movies[item]
        rating = self.ratings[item]

        # Check and correct user index
        if user >= self.n_users:
            user = self.n_users - 1

        # One-hot encode user and movie IDs
        user_onehot = torch.zeros(self.n_users)
        user_onehot[user] = 1.0

        movie_onehot = torch.zeros(self.n_movies)
        movie_onehot[movie] = 1.0

        return {
            "users_onehot": user_onehot,
            "movies_onehot": movie_onehot,
            "ratings": torch.tensor(rating, dtype=torch.float32),
        }

# Create an instance of MovieDataset with your data
dataset = MovieDataset(df, n_users, n_movies)

# Check the max indices
print("Max user index:", dataset.users.max())
print("Max movie index:", dataset.movies.max())

Max user index: 610
Max movie index: 193609


In [10]:
class RecSysModel(nn.Module):
    def __init__(self, n_users, n_movies, emb_size=32):
        super().__init__()
        self.user_embed = nn.Linear(n_users, emb_size, bias=False)
        self.movie_embed = nn.Linear(n_movies, emb_size, bias=False)
        self.hidden1 = nn.Linear(emb_size * 2, 32)
        self.hidden2 = nn.Linear(32, 16)
        self.hidden3 = nn.Linear(16, 8)
        self.out = nn.Linear(8, 1)

    def forward(self, users_onehot, movies_onehot):
        user_embeds = self.user_embed(users_onehot)
        movie_embeds = self.movie_embed(movies_onehot)
        user_embeds = user_embeds.view(-1, user_embeds.size(1))
        movie_embeds = movie_embeds.view(-1, movie_embeds.size(1))
        embedding = torch.cat([user_embeds, movie_embeds], dim=1)
        embedding = F.relu(self.hidden1(embedding))
        embedding = F.relu(self.hidden2(embedding))
        embedding = F.relu(self.hidden3(embedding))
        output = self.out(embedding)
        return output

    def predict_ratings(self, users_onehot, movies_onehot):
        with torch.no_grad():
            output = self(users_onehot, movies_onehot)
        return output.squeeze().tolist()  # Convert tensor to list of ratings

In [11]:

from sklearn.model_selection import train_test_split

# Assuming 'userId' is the column representing users in your DataFrame
df_train, df_valid = train_test_split(df, test_size=0.1, random_state=42, stratify=df['userId'].values)
print("Size of Training Set:", len(df_train))
print("Size of Validation Set:", len(df_valid))
# Extract highest rated movie for each user in the training set
highest_ratings_train = df_train.loc[df_train.groupby('userId')['rating'].idxmax()]

# Remove these highest rated movies from the training set
df_train = df_train.drop(highest_ratings_train.index)

# Add these highest rated movies to the validation set
df_valid = pd.concat([df_valid, highest_ratings_train], ignore_index=True)
# Extract lowest rated movie for each user in the validation set
lowest_ratings_valid = df_valid.loc[df_valid.groupby('userId')['rating'].idxmin()]

# Remove these lowest rated movies from the validation set
df_valid = df_valid.drop(lowest_ratings_valid.index)

# Add these lowest rated movies to the training set
df_train = pd.concat([df_train, lowest_ratings_valid], ignore_index=True)

print("Size of Training Set after removing highest ratings:", len(df_train))
print("Size of Validation Set after adding highest ratings:", len(df_valid))

# Create datasets
train_dataset = MovieDataset(df_train, n_users, n_movies)
valid_dataset = MovieDataset(df_valid, n_users, n_movies)

# Create data loaders
train_loader = DataLoader(dataset=train_dataset, batch_size=4, shuffle=True, num_workers=2)
validation_loader = DataLoader(dataset=valid_dataset, batch_size=4, shuffle=True, num_workers=2)
#print("Unique User IDs in Training Set (Sorted):")
#print(sorted(df_train['userId'].unique()))

#print("\nUnique User IDs in Validation Set (Sorted):")
#print(sorted(df_valid['userId'].unique()))


Size of Training Set: 90752
Size of Validation Set: 10084
Size of Training Set after removing highest ratings: 90752
Size of Validation Set after adding highest ratings: 10084


In [12]:
# Choose the first user ID (assuming user IDs start from 1)
first_user_id = 1

# Filter the training set for the first user
training_ratings_first_user = df_train[df_train['userId'] == first_user_id]

# Filter the validation set for the first user
validation_ratings_first_user = df_valid[df_valid['userId'] == first_user_id]

# Print information for the training set
print(f"Training Set - User {first_user_id} Ratings:")
print(training_ratings_first_user[['movieId', 'rating']])

# Print information for the validation set
print(f"\nValidation Set - User {first_user_id} Ratings:")
print(validation_ratings_first_user[['movieId', 'rating']])


Training Set - User 1 Ratings:
       movieId  rating
1097      1552     4.0
1525      2253     2.0
2051      1219     2.0
2808       954     5.0
2861      1805     4.0
...        ...     ...
88061     2899     5.0
88246     2099     4.0
88534     2617     2.0
89683       47     5.0
90142     2389     2.0

[209 rows x 2 columns]

Validation Set - User 1 Ratings:
       movieId  rating
328       1777     4.0
495        590     4.0
743       3479     4.0
1095      1029     5.0
1307      2174     4.0
1761      2596     5.0
1868      2985     4.0
2178      2273     4.0
2468      1136     5.0
2523      1240     5.0
2577       943     4.0
2775      3247     3.0
2854      2944     5.0
3170      1377     3.0
3662       356     4.0
5534      1517     5.0
5833      2078     5.0
6361      2478     4.0
6882      3740     4.0
8634      2502     5.0
8681      1804     5.0
9929      1258     3.0
10084     2115     5.0


In [13]:
dataiter = iter(train_loader)

for dataloader_data in dataiter:
    print(dataloader_data)
    break

{'users_onehot': tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]]), 'movies_onehot': tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]]), 'ratings': tensor([3.5000, 0.5000, 5.0000, 4.0000])}


In [14]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [15]:
model = RecSysModel(
    n_users, n_movies, emb_size=32
).to(device)

# Optimizer and scheduler
#optimizer = torch.optim.Adam(model.parameters())
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)  # Setting learning rate to 0.001

scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.7)

# Loss function
loss_func = nn.MSELoss()

In [16]:
print(n_users)
print(n_movies)
print(df.movieId.max() + 1)  # Adding 1 because movieId starts from 0 after one-hot encoding
print(len(train_dataset))


610
193610
193610
90752


In [17]:
with torch.no_grad():
    model_output = model(dataloader_data['users_onehot'], 
                         dataloader_data['movies_onehot'])

    print(f"model_output: {model_output}, size: {model_output.size()}")


model_output: tensor([[0.3654],
        [0.3654],
        [0.3657],
        [0.3651]]), size: torch.Size([4, 1])


In [18]:
rating = dataloader_data["ratings"]
print(rating)
print(rating.view(4, -1))
print(model_output)

print(rating.sum())

print(model_output.sum() - rating.sum())

tensor([3.5000, 0.5000, 5.0000, 4.0000])
tensor([[3.5000],
        [0.5000],
        [5.0000],
        [4.0000]])
tensor([[0.3654],
        [0.3654],
        [0.3657],
        [0.3651]])
tensor(13.)
tensor(-11.5384)


In [None]:

epochs = 1
total_loss = 0
plot_steps, print_steps = 5000, 5000
step_cnt = 0
all_losses_list = [] 

model.train() 
for epoch_i in range(epochs):
    for i, train_data in enumerate(train_loader):
        output = model(train_data["users_onehot"], 
                       train_data["movies_onehot"]
                      ) 
        
        rating = train_data["ratings"].view(4, -1).to(torch.float32)

        loss = loss_func(output, rating)
        total_loss = total_loss + loss.sum().item()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        step_cnt = step_cnt + len(train_data["users_onehot"])
        

        if(step_cnt % plot_steps == 0):
            avg_loss = total_loss/(len(train_data["users_onehot"]) * plot_steps)
            print(f"epoch {epoch_i} loss at step: {step_cnt} is {avg_loss}")
            all_losses_list.append(avg_loss)
            total_loss = 0  # reset total_loss

epoch 0 loss at step: 5000 is 0.5115638348847628
epoch 0 loss at step: 10000 is 0.07607634874880313


In [None]:
from sklearn.metrics import mean_squared_error

model_output_list = []
target_rating_list = []

model.eval()

with torch.no_grad():
    for i, batched_data in enumerate(validation_loader): 
        model_output = model(batched_data['users_onehot'], 
                             batched_data['movies_onehot'])
        
        model_output_list.append(model_output.sum().item() / len(batched_data['users_onehot']))

        target_rating = batched_data["ratings"]
        
        target_rating_list.append(target_rating.sum().item() / len(batched_data['users_onehot']))

# squared If True returns MSE value, if False returns RMSE value.
rms = mean_squared_error(target_rating_list, model_output_list, squared=False)
print(f"rms: {rms}")


In [None]:
predicted_ratings_list = []

with torch.no_grad():
    for i, batched_data in enumerate(validation_loader): 
        
        model_output = model(batched_data['users_onehot'], batched_data['movies_onehot'])
        
        for user_idx in range(len(batched_data['users_onehot'])):
            # Get the index of the test item (movie with the highest rating)
            test_item_index = torch.argmax(batched_data["ratings"][user_idx]).item()

            # Predict rating for the test item
            predicted_rating = model_output[user_idx, test_item_index].item()
            predicted_ratings_list.append(predicted_rating)

        
# Calculate average predicted rating
average_predicted_rating = np.mean(predicted_ratings_list)
print(f"Average Predicted Rating of Test Item: {average_predicted_rating}")


In [None]:
print(df.movieId.max() + 1)

In [None]:
# Set the model to evaluation mode
model.eval()

# Initialize variables to track hits
hits_count = 0
total_users = 0

# Iterate over each user
for user_id in df['userId'].unique():
    if user_id == 610:
        continue 
    # Get movies rated by the user
    rated_movies = df[df['userId'] == user_id]
    
    # Sample 99 unrated movie IDs from the overall dataset
    unrated_movie_ids = df[~df['movieId'].isin(rated_movies['movieId'])]['movieId'].unique()
    sampled_movie_ids = np.random.choice(unrated_movie_ids, size=min(99, len(unrated_movie_ids)), replace=False)
    
    # Append the highest rated movie for the user from the validation set
    highest_rated_movie_id_valid = df_valid[df_valid['userId'] == user_id].nlargest(1, 'rating')['movieId'].values[0]
    sampled_movie_ids = np.append(sampled_movie_ids, highest_rated_movie_id_valid)
    
    # Initialize list to store movie ratings
    movie_ratings = []
    
    # Iterate over each sampled movie
    for movie_id in sampled_movie_ids:
        # One-hot encode user and movie
        user_tensor = torch.tensor([user_id], dtype=torch.long)
        movie_tensor = torch.tensor([movie_id], dtype=torch.long)
        
        user_onehot = torch.zeros(1, n_users)
        user_onehot[0, user_id] = 1.0
        
        movie_onehot = torch.zeros(1, n_movies)
        movie_onehot[0, movie_id] = 1.0
        
        # Get predictions from the model
        model_output = model(user_onehot, movie_onehot)
        
        # Store the movie ID and its predicted rating
        movie_ratings.append((movie_id, model_output.item()))  # Assuming model_output contains the predicted rating
    
    # Sort the movie ratings based on predicted ratings
    movie_ratings.sort(key=lambda x: x[1], reverse=True)
    
    # Check if the highest rated item is among the top 10 recommendations
    top_10_recommendations = [movie_id for movie_id, _ in movie_ratings[:10]]
    if highest_rated_movie_id_valid in top_10_recommendations:
        hits_count += 1
    
    total_users += 1

# Calculate hit rate
hit_rate = hits_count / total_users if total_users > 0 else 0
print("Hit rate:", hit_rate)


In [None]:
# Set the model to evaluation mode
model.eval()

# Initialize variables to track hits
hits_count_top20 = 0
hits_count_bottom20 = 0
total_users_top20 = 0
total_users_bottom20 = 0

# Get the number of ratings provided by each user
user_rating_counts = df['userId'].value_counts()

# Sort users based on the number of ratings they have provided
sorted_users = user_rating_counts.index.tolist()

# Select the top 20 and bottom 20 frequent users
top20_users = sorted_users[:122]
bottom20_users = sorted_users[-122:]

# Iterate over each user
for user_id in df['userId'].unique():
    # Skip user 610
    if user_id == 610:
        continue
    
    # Get movies rated by the user
    rated_movies = df[df['userId'] == user_id]
    
    # Sample 100 unrated movie IDs if available
    unrated_movie_ids = df[~df['movieId'].isin(rated_movies['movieId'])]['movieId'].unique()
    sampled_movie_ids = np.random.choice(unrated_movie_ids, size=min(100, len(unrated_movie_ids)), replace=False)
    
    # Append the highest rated movie for the user
    highest_rated_movie_id_valid = df_valid[df_valid['userId'] == user_id].nlargest(1, 'rating')['movieId'].values[0]
    sampled_movie_ids = np.append(sampled_movie_ids, highest_rated_movie_id_valid)
    
    # Initialize list to store movie ratings
    movie_ratings = []
    
    # Iterate over each sampled movie
    for movie_id in sampled_movie_ids:
        # One-hot encode user and movie
        user_tensor = torch.tensor([user_id], dtype=torch.long)
        movie_tensor = torch.tensor([movie_id], dtype=torch.long)
        
        user_onehot = torch.zeros(1, n_users)
        user_onehot[0, user_id] = 1.0
        
        movie_onehot = torch.zeros(1, n_movies)
        movie_onehot[0, movie_id] = 1.0
        
        # Get predictions from the model
        model_output = model(user_onehot, movie_onehot)
        
        # Store the movie ID and its predicted rating
        movie_ratings.append((movie_id, model_output.item()))  # Assuming model_output contains the predicted rating
    
    # Sort the movie ratings based on predicted ratings
    movie_ratings.sort(key=lambda x: x[1], reverse=True)
    
    # Check if the highest rated item is among the top 10 recommendations
    top_10_recommendations = [movie_id for movie_id, _ in movie_ratings[:10]]
    if user_id in top20_users:
        if highest_rated_movie_id_valid in top_10_recommendations:
            hits_count_top20 += 1
        total_users_top20 += 1
    elif user_id in bottom20_users:
        if highest_rated_movie_id_valid in top_10_recommendations:
            hits_count_bottom20 += 1
        total_users_bottom20 += 1

# Calculate hit rate for top 20 and bottom 20 users
hit_rate_top20 = hits_count_top20 / total_users_top20 if total_users_top20 > 0 else 0
hit_rate_bottom20 = hits_count_bottom20 / total_users_bottom20 if total_users_bottom20 > 0 else 0

print("Hit rate for top 20 frequent users:", hit_rate_top20)
print("Hit rate for bottom 20 frequent users:", hit_rate_bottom20)





In [None]:
# Set the model to evaluation mode
model.eval()

# Initialize variables to track hits
hits_count_top20 = 0
hits_count_bottom20 = 0
total_users_top20 = 0
total_users_bottom20 = 0

# Get the number of ratings provided by each user
user_rating_counts = df['userId'].value_counts()

# Sort users based on the number of ratings they have provided
sorted_users = user_rating_counts.index.tolist()

# Select the top 20 and bottom 20 frequent users
top20_users = sorted_users[:61]
bottom20_users = sorted_users[-61:]

# Iterate over each user
for user_id in df['userId'].unique():
    # Skip user 610
    if user_id == 610:
        continue
    
    # Get movies rated by the user
    rated_movies = df[df['userId'] == user_id]
    
    # Sample 100 unrated movie IDs if available
    unrated_movie_ids = df[~df['movieId'].isin(rated_movies['movieId'])]['movieId'].unique()
    sampled_movie_ids = np.random.choice(unrated_movie_ids, size=min(100, len(unrated_movie_ids)), replace=False)
    
    # Append the highest rated movie for the user
    highest_rated_movie_id_valid = df_valid[df_valid['userId'] == user_id].nlargest(1, 'rating')['movieId'].values[0]
    sampled_movie_ids = np.append(sampled_movie_ids, highest_rated_movie_id_valid)
    
    # Initialize list to store movie ratings
    movie_ratings = []
    
    # Iterate over each sampled movie
    for movie_id in sampled_movie_ids:
        # One-hot encode user and movie
        user_tensor = torch.tensor([user_id], dtype=torch.long)
        movie_tensor = torch.tensor([movie_id], dtype=torch.long)
        
        user_onehot = torch.zeros(1, n_users)
        user_onehot[0, user_id] = 1.0
        
        movie_onehot = torch.zeros(1, n_movies)
        movie_onehot[0, movie_id] = 1.0
        
        # Get predictions from the model
        model_output = model(user_onehot, movie_onehot)
        
        # Store the movie ID and its predicted rating
        movie_ratings.append((movie_id, model_output.item()))  # Assuming model_output contains the predicted rating
    
    # Sort the movie ratings based on predicted ratings
    movie_ratings.sort(key=lambda x: x[1], reverse=True)
    
    # Check if the highest rated item is among the top 10 recommendations
    top_10_recommendations = [movie_id for movie_id, _ in movie_ratings[:10]]
    if user_id in top20_users:
        if highest_rated_movie_id_valid in top_10_recommendations:
            hits_count_top20 += 1
        total_users_top20 += 1
    elif user_id in bottom20_users:
        if highest_rated_movie_id_valid in top_10_recommendations:
            hits_count_bottom20 += 1
        total_users_bottom20 += 1

# Calculate hit rate for top 20 and bottom 20 users
hit_rate_top20 = hits_count_top20 / total_users_top20 if total_users_top20 > 0 else 0
hit_rate_bottom20 = hits_count_bottom20 / total_users_bottom20 if total_users_bottom20 > 0 else 0

print("Hit rate for top 10 frequent users:", hit_rate_top20)
print("Hit rate for bottom 10 frequent users:", hit_rate_bottom20)






In [None]:
# Set the model to evaluation mode
model.eval()

# Initialize variables to track hits
hits_count_top20 = 0
hits_count_bottom20 = 0
total_users_top20 = 0
total_users_bottom20 = 0

# Get the number of ratings provided by each user
user_rating_counts = df['userId'].value_counts()

# Sort users based on the number of ratings they have provided
sorted_users = user_rating_counts.index.tolist()

# Select the top 20 and bottom 20 frequent users
top20_users = sorted_users[:31]
bottom20_users = sorted_users[-31:]

# Iterate over each user
for user_id in df['userId'].unique():
    # Skip user 610
    if user_id == 610:
        continue
    
    # Get movies rated by the user
    rated_movies = df[df['userId'] == user_id]
    
    # Sample 100 unrated movie IDs if available
    unrated_movie_ids = df[~df['movieId'].isin(rated_movies['movieId'])]['movieId'].unique()
    sampled_movie_ids = np.random.choice(unrated_movie_ids, size=min(100, len(unrated_movie_ids)), replace=False)
    
    # Append the highest rated movie for the user
    highest_rated_movie_id_valid = df_valid[df_valid['userId'] == user_id].nlargest(1, 'rating')['movieId'].values[0]
    sampled_movie_ids = np.append(sampled_movie_ids, highest_rated_movie_id_valid)
    
    # Initialize list to store movie ratings
    movie_ratings = []
    
    # Iterate over each sampled movie
    for movie_id in sampled_movie_ids:
        # One-hot encode user and movie
        user_tensor = torch.tensor([user_id], dtype=torch.long)
        movie_tensor = torch.tensor([movie_id], dtype=torch.long)
        
        user_onehot = torch.zeros(1, n_users)
        user_onehot[0, user_id] = 1.0
        
        movie_onehot = torch.zeros(1, n_movies)
        movie_onehot[0, movie_id] = 1.0
        
        # Get predictions from the model
        model_output = model(user_onehot, movie_onehot)
        
        # Store the movie ID and its predicted rating
        movie_ratings.append((movie_id, model_output.item()))  # Assuming model_output contains the predicted rating
    
    # Sort the movie ratings based on predicted ratings
    movie_ratings.sort(key=lambda x: x[1], reverse=True)
    
    # Check if the highest rated item is among the top 10 recommendations
    top_10_recommendations = [movie_id for movie_id, _ in movie_ratings[:10]]
    if user_id in top20_users:
        if highest_rated_movie_id_valid in top_10_recommendations:
            hits_count_top20 += 1
        total_users_top20 += 1
    elif user_id in bottom20_users:
        if highest_rated_movie_id_valid in top_10_recommendations:
            hits_count_bottom20 += 1
        total_users_bottom20 += 1

# Calculate hit rate for top 20 and bottom 20 users
hit_rate_top20 = hits_count_top20 / total_users_top20 if total_users_top20 > 0 else 0
hit_rate_bottom20 = hits_count_bottom20 / total_users_bottom20 if total_users_bottom20 > 0 else 0

print("Hit rate for top 5 frequent users:", hit_rate_top20)
print("Hit rate for bottom 5 frequent users:", hit_rate_bottom20)







In [None]:
# Set the model to evaluation mode
model.eval()

# Initialize variables to track hits
hits_count_top20 = 0
hits_count_bottom20 = 0
total_users_top20 = 0
total_users_bottom20 = 0

# Get the number of ratings provided by each user
user_rating_counts = df['userId'].value_counts()

# Sort users based on the number of ratings they have provided
sorted_users = user_rating_counts.index.tolist()

# Select the top 20 and bottom 20 frequent users
top20_users = sorted_users[:20]
bottom20_users = sorted_users[-20:]

# Iterate over each user
for user_id in df['userId'].unique():
    # Skip user 610
    if user_id == 610:
        continue
    
    # Get movies rated by the user
    rated_movies = df[df['userId'] == user_id]
    
    # Sample 100 unrated movie IDs if available
    unrated_movie_ids = df[~df['movieId'].isin(rated_movies['movieId'])]['movieId'].unique()
    sampled_movie_ids = np.random.choice(unrated_movie_ids, size=min(100, len(unrated_movie_ids)), replace=False)
    
    # Append the highest rated movie for the user
    highest_rated_movie_id_valid = df_valid[df_valid['userId'] == user_id].nlargest(1, 'rating')['movieId'].values[0]
    sampled_movie_ids = np.append(sampled_movie_ids, highest_rated_movie_id_valid)
    
    # Initialize list to store movie ratings
    movie_ratings = []
    
    # Iterate over each sampled movie
    for movie_id in sampled_movie_ids:
        # One-hot encode user and movie
        user_tensor = torch.tensor([user_id], dtype=torch.long)
        movie_tensor = torch.tensor([movie_id], dtype=torch.long)
        
        user_onehot = torch.zeros(1, n_users)
        user_onehot[0, user_id] = 1.0
        
        movie_onehot = torch.zeros(1, n_movies)
        movie_onehot[0, movie_id] = 1.0
        
        # Get predictions from the model
        model_output = model(user_onehot, movie_onehot)
        
        # Store the movie ID and its predicted rating
        movie_ratings.append((movie_id, model_output.item()))  # Assuming model_output contains the predicted rating
    
    # Sort the movie ratings based on predicted ratings
    movie_ratings.sort(key=lambda x: x[1], reverse=True)
    
    # Check if the highest rated item is among the top 10 recommendations
    top_10_recommendations = [movie_id for movie_id, _ in movie_ratings[:10]]
    if user_id in top20_users:
        if highest_rated_movie_id_valid in top_10_recommendations:
            hits_count_top20 += 1
        total_users_top20 += 1
    elif user_id in bottom20_users:
        if highest_rated_movie_id_valid in top_10_recommendations:
            hits_count_bottom20 += 1
        total_users_bottom20 += 1

# Calculate hit rate for top 20 and bottom 20 users
hit_rate_top20 = hits_count_top20 / total_users_top20 if total_users_top20 > 0 else 0
hit_rate_bottom20 = hits_count_bottom20 / total_users_bottom20 if total_users_bottom20 > 0 else 0

print("Hit rate for top 20 frequent users:", hit_rate_top20)
print("Hit rate for bottom 20 frequent users:", hit_rate_bottom20)








In [None]:
# Set the model to evaluation mode
model.eval()

# Initialize variables to track hits
hits_count_top20 = 0
hits_count_bottom20 = 0
total_users_top20 = 0
total_users_bottom20 = 0

# Get the number of ratings provided by each user
user_rating_counts = df['userId'].value_counts()

# Sort users based on the number of ratings they have provided
sorted_users = user_rating_counts.index.tolist()

# Select the top 20 and bottom 20 frequent users
top20_users = sorted_users[:10]
bottom20_users = sorted_users[-10:]

# Iterate over each user
for user_id in df['userId'].unique():
    # Skip user 610
    if user_id == 610:
        continue
    
    # Get movies rated by the user
    rated_movies = df[df['userId'] == user_id]
    
    # Sample 100 unrated movie IDs if available
    unrated_movie_ids = df[~df['movieId'].isin(rated_movies['movieId'])]['movieId'].unique()
    sampled_movie_ids = np.random.choice(unrated_movie_ids, size=min(100, len(unrated_movie_ids)), replace=False)
    
    # Append the highest rated movie for the user
    highest_rated_movie_id_valid = df_valid[df_valid['userId'] == user_id].nlargest(1, 'rating')['movieId'].values[0]
    sampled_movie_ids = np.append(sampled_movie_ids, highest_rated_movie_id_valid)
    
    # Initialize list to store movie ratings
    movie_ratings = []
    
    # Iterate over each sampled movie
    for movie_id in sampled_movie_ids:
        # One-hot encode user and movie
        user_tensor = torch.tensor([user_id], dtype=torch.long)
        movie_tensor = torch.tensor([movie_id], dtype=torch.long)
        
        user_onehot = torch.zeros(1, n_users)
        user_onehot[0, user_id] = 1.0
        
        movie_onehot = torch.zeros(1, n_movies)
        movie_onehot[0, movie_id] = 1.0
        
        # Get predictions from the model
        model_output = model(user_onehot, movie_onehot)
        
        # Store the movie ID and its predicted rating
        movie_ratings.append((movie_id, model_output.item()))  # Assuming model_output contains the predicted rating
    
    # Sort the movie ratings based on predicted ratings
    movie_ratings.sort(key=lambda x: x[1], reverse=True)
    
    # Check if the highest rated item is among the top 10 recommendations
    top_10_recommendations = [movie_id for movie_id, _ in movie_ratings[:10]]
    if user_id in top20_users:
        if highest_rated_movie_id_valid in top_10_recommendations:
            hits_count_top20 += 1
        total_users_top20 += 1
    elif user_id in bottom20_users:
        if highest_rated_movie_id_valid in top_10_recommendations:
            hits_count_bottom20 += 1
        total_users_bottom20 += 1

# Calculate hit rate for top 20 and bottom 20 users
hit_rate_top20 = hits_count_top20 / total_users_top20 if total_users_top20 > 0 else 0
hit_rate_bottom20 = hits_count_bottom20 / total_users_bottom20 if total_users_bottom20 > 0 else 0

print("Hit rate for top 10 frequent users:", hit_rate_top20)
print("Hit rate for bottom 10 frequent users:", hit_rate_bottom20)








In [None]:
# Set the model to evaluation mode
model.eval()

# Initialize variables to track hits
hits_count_top20 = 0
hits_count_bottom20 = 0
total_users_top20 = 0
total_users_bottom20 = 0

# Get the number of ratings provided by each user
user_rating_counts = df['userId'].value_counts()

# Sort users based on the number of ratings they have provided
sorted_users = user_rating_counts.index.tolist()

# Select the top 20 and bottom 20 frequent users
top20_users = sorted_users[:5]
bottom20_users = sorted_users[-5:]

# Iterate over each user
for user_id in df['userId'].unique():
    # Skip user 610
    if user_id == 610:
        continue
    
    # Get movies rated by the user
    rated_movies = df[df['userId'] == user_id]
    
    # Sample 100 unrated movie IDs if available
    unrated_movie_ids = df[~df['movieId'].isin(rated_movies['movieId'])]['movieId'].unique()
    sampled_movie_ids = np.random.choice(unrated_movie_ids, size=min(100, len(unrated_movie_ids)), replace=False)
    
    # Append the highest rated movie for the user
    highest_rated_movie_id_valid = df_valid[df_valid['userId'] == user_id].nlargest(1, 'rating')['movieId'].values[0]
    sampled_movie_ids = np.append(sampled_movie_ids, highest_rated_movie_id_valid)
    
    # Initialize list to store movie ratings
    movie_ratings = []
    
    # Iterate over each sampled movie
    for movie_id in sampled_movie_ids:
        # One-hot encode user and movie
        user_tensor = torch.tensor([user_id], dtype=torch.long)
        movie_tensor = torch.tensor([movie_id], dtype=torch.long)
        
        user_onehot = torch.zeros(1, n_users)
        user_onehot[0, user_id] = 1.0
        
        movie_onehot = torch.zeros(1, n_movies)
        movie_onehot[0, movie_id] = 1.0
        
        # Get predictions from the model
        model_output = model(user_onehot, movie_onehot)
        
        # Store the movie ID and its predicted rating
        movie_ratings.append((movie_id, model_output.item()))  # Assuming model_output contains the predicted rating
    
    # Sort the movie ratings based on predicted ratings
    movie_ratings.sort(key=lambda x: x[1], reverse=True)
    
    # Check if the highest rated item is among the top 10 recommendations
    top_10_recommendations = [movie_id for movie_id, _ in movie_ratings[:10]]
    if user_id in top20_users:
        if highest_rated_movie_id_valid in top_10_recommendations:
            hits_count_top20 += 1
        total_users_top20 += 1
    elif user_id in bottom20_users:
        if highest_rated_movie_id_valid in top_10_recommendations:
            hits_count_bottom20 += 1
        total_users_bottom20 += 1

# Calculate hit rate for top 20 and bottom 20 users
hit_rate_top20 = hits_count_top20 / total_users_top20 if total_users_top20 > 0 else 0
hit_rate_bottom20 = hits_count_bottom20 / total_users_bottom20 if total_users_bottom20 > 0 else 0

print("Hit rate for top 5 frequent users:", hit_rate_top20)
print("Hit rate for bottom 5 frequent users:", hit_rate_bottom20)








In [None]:
# Set the model to evaluation mode
model.eval()

# Initialize variables to track hits
hits_count_top20 = 0
hits_count_bottom20 = 0
total_users_top20 = 0
total_users_bottom20 = 0

# Get the number of ratings provided by each user
user_rating_counts = df['userId'].value_counts()

# Sort users based on the number of ratings they have provided
sorted_users = user_rating_counts.index.tolist()

# Select the top 20 and bottom 20 frequent users
top20_users = sorted_users[:31]
bottom20_users = sorted_users[-31:]

# Iterate over each user
for user_id in df['userId'].unique():
    # Skip user 610
    if user_id == 610:
        continue
    
    # Get movies rated by the user
    rated_movies = df[df['userId'] == user_id]
    
    # Sample 100 unrated movie IDs if available
    unrated_movie_ids = df[~df['movieId'].isin(rated_movies['movieId'])]['movieId'].unique()
    sampled_movie_ids = np.random.choice(unrated_movie_ids, size=min(100, len(unrated_movie_ids)), replace=False)
    
    # Append the highest rated movie for the user
    highest_rated_movie_id_valid = df_valid[df_valid['userId'] == user_id].nlargest(1, 'rating')['movieId'].values[0]
    sampled_movie_ids = np.append(sampled_movie_ids, highest_rated_movie_id_valid)
    
    # Initialize list to store movie ratings
    movie_ratings = []
    
    # Iterate over each sampled movie
    for movie_id in sampled_movie_ids:
        # One-hot encode user and movie
        user_tensor = torch.tensor([user_id], dtype=torch.long)
        movie_tensor = torch.tensor([movie_id], dtype=torch.long)
        
        user_onehot = torch.zeros(1, n_users)
        user_onehot[0, user_id] = 1.0
        
        movie_onehot = torch.zeros(1, n_movies)
        movie_onehot[0, movie_id] = 1.0
        
        # Get predictions from the model
        model_output = model(user_onehot, movie_onehot)
        
        # Store the movie ID and its predicted rating
        movie_ratings.append((movie_id, model_output.item()))  # Assuming model_output contains the predicted rating
    
    # Sort the movie ratings based on predicted ratings
    movie_ratings.sort(key=lambda x: x[1], reverse=True)
    
    # Check if the highest rated item is among the top 10 recommendations
    top_10_recommendations = [movie_id for movie_id, _ in movie_ratings[:10]]
    if user_id in top20_users:
        if highest_rated_movie_id_valid in top_10_recommendations:
            hits_count_top20 += 1
        total_users_top20 += 1
    elif user_id in bottom20_users:
        if highest_rated_movie_id_valid in top_10_recommendations:
            hits_count_bottom20 += 1
        total_users_bottom20 += 1

# Calculate hit rate for top 20 and bottom 20 users
hit_rate_top20 = hits_count_top20 / total_users_top20 if total_users_top20 > 0 else 0
hit_rate_bottom20 = hits_count_bottom20 / total_users_bottom20 if total_users_bottom20 > 0 else 0

print("Hit rate for top 10 frequent users:", hit_rate_top20)
print("Hit rate for bottom 10 frequent users:", hit_rate_bottom20)








In [None]:
# Print the indices of the top 10 users and their total rating count
print("Indices and total rating count of the top 10 users:")
for index, user_id in enumerate(sorted_users[:10]):
    print("Index:", index, "User ID:", user_id, "Total Rating Count:", user_rating_counts[user_id])

# Print the indices of the bottom 10 users and their total rating count
print("Indices and total rating count of the bottom 10 users:")
for index, user_id in enumerate(sorted_users[-10:], start=len(sorted_users) - 10):
    print("Index:", index, "User ID:", user_id, "Total Rating Count:", user_rating_counts[user_id])


In [None]:
# Print the indices of the top 10 users
print("Indices of the top 10 users:")
for index, user_id in enumerate(sorted_users[:10]):
    print(index, user_id)

# Print the indices of the bottom 10 users
print("Indices of the bottom 10 users:")
for index, user_id in enumerate(sorted_users[-10:], start=len(sorted_users) - 10):
    print(index, user_id)



In [None]:
# Set the model to evaluation mode
model.eval()

# Initialize variables to track hits
hits_count_top20 = 0
hits_count_bottom20 = 0
total_users_top20 = 0
total_users_bottom20 = 0

# Get the number of ratings provided by each user
user_rating_counts = df['userId'].value_counts()

# Sort users based on the number of ratings they have provided
sorted_users = user_rating_counts.index.tolist()

# Select the top 20 and bottom 20 frequent users
top20_users = sorted_users[:122]
bottom20_users = sorted_users[-122:]

# Iterate over each user
for user_id in df['userId'].unique():
    # Skip user 610
    if user_id == 610:
        continue
    
    # Get movies rated by the user
    rated_movies = df[df['userId'] == user_id]
    
    # Sample 100 unrated movie IDs if available
    unrated_movie_ids = df[~df['movieId'].isin(rated_movies['movieId'])]['movieId'].unique()
    sampled_movie_ids = np.random.choice(unrated_movie_ids, size=min(100, len(unrated_movie_ids)), replace=False)
    
    # Append the highest rated movie for the user
    highest_rated_movie_id = rated_movies.nlargest(1, 'rating')['movieId'].values[0]
    sampled_movie_ids = np.append(sampled_movie_ids, highest_rated_movie_id)
    
    # Initialize list to store movie ratings
    movie_ratings = []
    
    # Iterate over each sampled movie
    for movie_id in sampled_movie_ids:
        # One-hot encode user and movie
        user_tensor = torch.tensor([user_id], dtype=torch.long)
        movie_tensor = torch.tensor([movie_id], dtype=torch.long)
        
        user_onehot = torch.zeros(1, n_users)
        user_onehot[0, user_id] = 1.0
        
        movie_onehot = torch.zeros(1, n_movies)
        movie_onehot[0, movie_id] = 1.0
        
        # Get predictions from the model
        model_output = model(user_onehot, movie_onehot)
        
        # Store the movie ID and its predicted rating
        movie_ratings.append((movie_id, model_output.item()))  # Assuming model_output contains the predicted rating
    
    # Sort the movie ratings based on predicted ratings
    movie_ratings.sort(key=lambda x: x[1], reverse=True)
    
    # Check if the highest rated item is among the top 10 recommendations
    top_10_recommendations = [movie_id for movie_id, _ in movie_ratings[:10]]
    if user_id in top20_users:
        if highest_rated_movie_id in top_10_recommendations:
            hits_count_top20 += 1
        total_users_top20 += 1
    elif user_id in bottom20_users:
        if highest_rated_movie_id in top_10_recommendations:
            hits_count_bottom20 += 1
        total_users_bottom20 += 1

# Calculate hit rate for top 20 and bottom 20 users
hit_rate_top20 = hits_count_top20 / total_users_top20 if total_users_top20 > 0 else 0
hit_rate_bottom20 = hits_count_bottom20 / total_users_bottom20 if total_users_bottom20 > 0 else 0

print("Hit rate for top 5 frequent users:", hit_rate_top20)
print("Hit rate for bottom 5 frequent users:", hit_rate_bottom20)





In [None]:
# Set the model to evaluation mode
model.eval()

# Initialize variables to track hits
hits_count_top20 = 0
hits_count_bottom20 = 0
total_users_top20 = 0
total_users_bottom20 = 0

# Get the number of ratings provided by each user
user_rating_counts = df['userId'].value_counts()

# Sort users based on the number of ratings they have provided
sorted_users = user_rating_counts.index.tolist()

# Select the top 20 and bottom 20 frequent users
top20_users = sorted_users[:61]
bottom20_users = sorted_users[-61:]

# Iterate over each user
for user_id in df['userId'].unique():
    # Skip user 610
    if user_id == 610:
        continue
    
    # Get movies rated by the user
    rated_movies = df[df['userId'] == user_id]
    
    # Sample 100 unrated movie IDs if available
    unrated_movie_ids = df[~df['movieId'].isin(rated_movies['movieId'])]['movieId'].unique()
    sampled_movie_ids = np.random.choice(unrated_movie_ids, size=min(100, len(unrated_movie_ids)), replace=False)
    
    # Append the highest rated movie for the user
    highest_rated_movie_id = rated_movies.nlargest(1, 'rating')['movieId'].values[0]
    sampled_movie_ids = np.append(sampled_movie_ids, highest_rated_movie_id)
    
    # Initialize list to store movie ratings
    movie_ratings = []
    
    # Iterate over each sampled movie
    for movie_id in sampled_movie_ids:
        # One-hot encode user and movie
        user_tensor = torch.tensor([user_id], dtype=torch.long)
        movie_tensor = torch.tensor([movie_id], dtype=torch.long)
        
        user_onehot = torch.zeros(1, n_users)
        user_onehot[0, user_id] = 1.0
        
        movie_onehot = torch.zeros(1, n_movies)
        movie_onehot[0, movie_id] = 1.0
        
        # Get predictions from the model
        model_output = model(user_onehot, movie_onehot)
        
        # Store the movie ID and its predicted rating
        movie_ratings.append((movie_id, model_output.item()))  # Assuming model_output contains the predicted rating
    
    # Sort the movie ratings based on predicted ratings
    movie_ratings.sort(key=lambda x: x[1], reverse=True)
    
    # Check if the highest rated item is among the top 10 recommendations
    top_10_recommendations = [movie_id for movie_id, _ in movie_ratings[:10]]
    if user_id in top20_users:
        if highest_rated_movie_id in top_10_recommendations:
            hits_count_top20 += 1
        total_users_top20 += 1
    elif user_id in bottom20_users:
        if highest_rated_movie_id in top_10_recommendations:
            hits_count_bottom20 += 1
        total_users_bottom20 += 1

# Calculate hit rate for top 20 and bottom 20 users
hit_rate_top20 = hits_count_top20 / total_users_top20 if total_users_top20 > 0 else 0
hit_rate_bottom20 = hits_count_bottom20 / total_users_bottom20 if total_users_bottom20 > 0 else 0

print("Hit rate for top 10 frequent users:", hit_rate_top20)
print("Hit rate for bottom 10 frequent users:", hit_rate_bottom20)




In [None]:
# Set the model to evaluation mode
model.eval()

# Initialize variables to track hits
hits_count_top20 = 0
hits_count_bottom20 = 0
total_users_top20 = 0
total_users_bottom20 = 0

# Get the number of ratings provided by each user
user_rating_counts = df['userId'].value_counts()

# Sort users based on the number of ratings they have provided
sorted_users = user_rating_counts.index.tolist()

# Select the top 20 and bottom 20 frequent users
top20_users = sorted_users[:31]
bottom20_users = sorted_users[-31:]

# Iterate over each user
for user_id in df['userId'].unique():
    # Skip user 610
    if user_id == 610:
        continue
    
    # Get movies rated by the user
    rated_movies = df[df['userId'] == user_id]
    
    # Sample 100 unrated movie IDs if available
    unrated_movie_ids = df[~df['movieId'].isin(rated_movies['movieId'])]['movieId'].unique()
    sampled_movie_ids = np.random.choice(unrated_movie_ids, size=min(100, len(unrated_movie_ids)), replace=False)
    
    # Append the highest rated movie for the user
    highest_rated_movie_id = rated_movies.nlargest(1, 'rating')['movieId'].values[0]
    sampled_movie_ids = np.append(sampled_movie_ids, highest_rated_movie_id)
    
    # Initialize list to store movie ratings
    movie_ratings = []
    
    # Iterate over each sampled movie
    for movie_id in sampled_movie_ids:
        # One-hot encode user and movie
        user_tensor = torch.tensor([user_id], dtype=torch.long)
        movie_tensor = torch.tensor([movie_id], dtype=torch.long)
        
        user_onehot = torch.zeros(1, n_users)
        user_onehot[0, user_id] = 1.0
        
        movie_onehot = torch.zeros(1, n_movies)
        movie_onehot[0, movie_id] = 1.0
        
        # Get predictions from the model
        model_output = model(user_onehot, movie_onehot)
        
        # Store the movie ID and its predicted rating
        movie_ratings.append((movie_id, model_output.item()))  # Assuming model_output contains the predicted rating
    
    # Sort the movie ratings based on predicted ratings
    movie_ratings.sort(key=lambda x: x[1], reverse=True)
    
    # Check if the highest rated item is among the top 10 recommendations
    top_10_recommendations = [movie_id for movie_id, _ in movie_ratings[:10]]
    if user_id in top20_users:
        if highest_rated_movie_id in top_10_recommendations:
            hits_count_top20 += 1
        total_users_top20 += 1
    elif user_id in bottom20_users:
        if highest_rated_movie_id in top_10_recommendations:
            hits_count_bottom20 += 1
        total_users_bottom20 += 1

# Calculate hit rate for top 20 and bottom 20 users
hit_rate_top20 = hits_count_top20 / total_users_top20 if total_users_top20 > 0 else 0
hit_rate_bottom20 = hits_count_bottom20 / total_users_bottom20 if total_users_bottom20 > 0 else 0

print("Hit rate for top 5 frequent users:", hit_rate_top20)
print("Hit rate for bottom 5 frequent users:", hit_rate_bottom20)





In [None]:
# Set the model to evaluation mode
model.eval()

# Initialize variables to track hits
hits_count_top20 = 0
hits_count_bottom20 = 0
total_users_top20 = 0
total_users_bottom20 = 0

# Get the number of ratings provided by each user
user_rating_counts = df['userId'].value_counts()

# Sort users based on the number of ratings they have provided
sorted_users = user_rating_counts.index.tolist()

# Select the top 20 and bottom 20 frequent users
top20_users = sorted_users[:10]
bottom20_users = sorted_users[-10:]

# Iterate over each user
for user_id in df['userId'].unique():
    # Skip user 610
    if user_id == 610:
        continue
    
    # Get movies rated by the user
    rated_movies = df[df['userId'] == user_id]
    
    # Sample 100 unrated movie IDs if available
    unrated_movie_ids = df[~df['movieId'].isin(rated_movies['movieId'])]['movieId'].unique()
    sampled_movie_ids = np.random.choice(unrated_movie_ids, size=min(100, len(unrated_movie_ids)), replace=False)
    
    # Append the highest rated movie for the user
    highest_rated_movie_id = rated_movies.nlargest(1, 'rating')['movieId'].values[0]
    sampled_movie_ids = np.append(sampled_movie_ids, highest_rated_movie_id)
    
    # Initialize list to store movie ratings
    movie_ratings = []
    
    # Iterate over each sampled movie
    for movie_id in sampled_movie_ids:
        # One-hot encode user and movie
        user_tensor = torch.tensor([user_id], dtype=torch.long)
        movie_tensor = torch.tensor([movie_id], dtype=torch.long)
        
        user_onehot = torch.zeros(1, n_users)
        user_onehot[0, user_id] = 1.0
        
        movie_onehot = torch.zeros(1, n_movies)
        movie_onehot[0, movie_id] = 1.0
        
        # Get predictions from the model
        model_output = model(user_onehot, movie_onehot)
        
        # Store the movie ID and its predicted rating
        movie_ratings.append((movie_id, model_output.item()))  # Assuming model_output contains the predicted rating
    
    # Sort the movie ratings based on predicted ratings
    movie_ratings.sort(key=lambda x: x[1], reverse=True)
    
    # Check if the highest rated item is among the top 10 recommendations
    top_10_recommendations = [movie_id for movie_id, _ in movie_ratings[:10]]
    if user_id in top20_users:
        if highest_rated_movie_id in top_10_recommendations:
            hits_count_top20 += 1
        total_users_top20 += 1
    elif user_id in bottom20_users:
        if highest_rated_movie_id in top_10_recommendations:
            hits_count_bottom20 += 1
        total_users_bottom20 += 1

# Calculate hit rate for top 20 and bottom 20 users
hit_rate_top20 = hits_count_top20 / total_users_top20 if total_users_top20 > 0 else 0
hit_rate_bottom20 = hits_count_bottom20 / total_users_bottom20 if total_users_bottom20 > 0 else 0

print("Hit rate for top 10 frequent users:", hit_rate_top20)
print("Hit rate for bottom 10 frequent users:", hit_rate_bottom20)


In [None]:
# Set the model to evaluation mode
model.eval()

# Initialize variables to track hits
hits_count_top20 = 0
hits_count_bottom20 = 0
total_users_top20 = 0
total_users_bottom20 = 0

# Get the number of ratings provided by each user
user_rating_counts = df['userId'].value_counts()

# Sort users based on the number of ratings they have provided
sorted_users = user_rating_counts.index.tolist()

# Select the top 20 and bottom 20 frequent users
top20_users = sorted_users[:10]
bottom20_users = sorted_users[-10:]

# Iterate over each user
for user_id in df['userId'].unique():
    # Skip user 610
    if user_id == 610:
        continue
    
    # Get movies rated by the user
    rated_movies = df[df['userId'] == user_id]
    
    # Sample 100 unrated movie IDs if available
    unrated_movie_ids = df[~df['movieId'].isin(rated_movies['movieId'])]['movieId'].unique()
    sampled_movie_ids = np.random.choice(unrated_movie_ids, size=min(100, len(unrated_movie_ids)), replace=False)
    
    # Append the highest rated movie for the user
    highest_rated_movie_id_valid = df_valid[df_valid['userId'] == user_id].nlargest(1, 'rating')['movieId'].values[0]
    sampled_movie_ids = np.append(sampled_movie_ids, highest_rated_movie_id_valid)
    
    # Initialize list to store movie ratings
    movie_ratings = []
    
    # Iterate over each sampled movie
    for movie_id in sampled_movie_ids:
        # One-hot encode user and movie
        user_tensor = torch.tensor([user_id], dtype=torch.long)
        movie_tensor = torch.tensor([movie_id], dtype=torch.long)
        
        user_onehot = torch.zeros(1, n_users)
        user_onehot[0, user_id] = 1.0
        
        movie_onehot = torch.zeros(1, n_movies)
        movie_onehot[0, movie_id] = 1.0
        
        # Get predictions from the model
        model_output = model(user_onehot, movie_onehot)
        
        # Store the movie ID and its predicted rating
        movie_ratings.append((movie_id, model_output.item()))  # Assuming model_output contains the predicted rating
    
    # Sort the movie ratings based on predicted ratings
    movie_ratings.sort(key=lambda x: x[1], reverse=True)
    
    # Check if the highest rated item is among the top 10 recommendations
    top_10_recommendations = [movie_id for movie_id, _ in movie_ratings[:10]]
    if user_id in top20_users:
        if highest_rated_movie_id_valid in top_10_recommendations:
            hits_count_top20 += 1
        total_users_top20 += 1
    elif user_id in bottom20_users:
        if highest_rated_movie_id_valid in top_10_recommendations:
            hits_count_bottom20 += 1
        total_users_bottom20 += 1

# Calculate hit rate for top 20 and bottom 20 users
hit_rate_top20 = hits_count_top20 / total_users_top20 if total_users_top20 > 0 else 0
hit_rate_bottom20 = hits_count_bottom20 / total_users_bottom20 if total_users_bottom20 > 0 else 0

print("Hit rate for top 10 frequent users:", hit_rate_top20)
print("Hit rate for bottom 10 frequent users:", hit_rate_bottom20)



In [None]:
# Set the model to evaluation mode
model.eval()

# Initialize variables to track hits
hits_count_top20 = 0
hits_count_bottom20 = 0
total_users_top20 = 0
total_users_bottom20 = 0

# Get the number of ratings provided by each user
user_rating_counts = df['userId'].value_counts()

# Sort users based on the number of ratings they have provided
sorted_users = user_rating_counts.index.tolist()

# Select the top 20 and bottom 20 frequent users
top20_users = sorted_users[:5]
bottom20_users = sorted_users[-5:]

# Iterate over each user
for user_id in df['userId'].unique():
    # Skip user 610
    if user_id == 610:
        continue
    
    # Get movies rated by the user
    rated_movies = df[df['userId'] == user_id]
    
    # Sample 100 unrated movie IDs if available
    unrated_movie_ids = df[~df['movieId'].isin(rated_movies['movieId'])]['movieId'].unique()
    sampled_movie_ids = np.random.choice(unrated_movie_ids, size=min(100, len(unrated_movie_ids)), replace=False)
    
    # Append the highest rated movie for the user
    highest_rated_movie_id_valid = df_valid[df_valid['userId'] == user_id].nlargest(1, 'rating')['movieId'].values[0]
    sampled_movie_ids = np.append(sampled_movie_ids, highest_rated_movie_id_valid)
    
    # Initialize list to store movie ratings
    movie_ratings = []
    
    # Iterate over each sampled movie
    for movie_id in sampled_movie_ids:
        # One-hot encode user and movie
        user_tensor = torch.tensor([user_id], dtype=torch.long)
        movie_tensor = torch.tensor([movie_id], dtype=torch.long)
        
        user_onehot = torch.zeros(1, n_users)
        user_onehot[0, user_id] = 1.0
        
        movie_onehot = torch.zeros(1, n_movies)
        movie_onehot[0, movie_id] = 1.0
        
        # Get predictions from the model
        model_output = model(user_onehot, movie_onehot)
        
        # Store the movie ID and its predicted rating
        movie_ratings.append((movie_id, model_output.item()))  # Assuming model_output contains the predicted rating
    
    # Sort the movie ratings based on predicted ratings
    movie_ratings.sort(key=lambda x: x[1], reverse=True)
    
    # Check if the highest rated item is among the top 10 recommendations
    top_10_recommendations = [movie_id for movie_id, _ in movie_ratings[:10]]
    if user_id in top20_users:
        if highest_rated_movie_id_valid in top_10_recommendations:
            hits_count_top20 += 1
        total_users_top20 += 1
    elif user_id in bottom20_users:
        if highest_rated_movie_id_valid in top_10_recommendations:
            hits_count_bottom20 += 1
        total_users_bottom20 += 1

# Calculate hit rate for top 20 and bottom 20 users
hit_rate_top20 = hits_count_top20 / total_users_top20 if total_users_top20 > 0 else 0
hit_rate_bottom20 = hits_count_bottom20 / total_users_bottom20 if total_users_bottom20 > 0 else 0

print("Hit rate for top 5 frequent users:", hit_rate_top20)
print("Hit rate for bottom 5 frequent users:", hit_rate_bottom20)




In [None]:
# Iterate over each user
for user_id in df['userId'].unique():
    # Skip user 610
    if user_id == 610:
        continue
    
    # Get movies rated by the user
    rated_movies = df[df['userId'] == user_id]
    
    # Sample 100 unrated movie IDs if available
    unrated_movie_ids = df[~df['movieId'].isin(rated_movies['movieId'])]['movieId'].unique()
    sampled_movie_ids = np.random.choice(unrated_movie_ids, size=min(100, len(unrated_movie_ids)), replace=False)
    
    # Append the highest rated movie for the user
    highest_rated_movie_id_valid = df_valid[df_valid['userId'] == user_id].nlargest(1, 'rating')['movieId'].values[0]
    sampled_movie_ids = np.append(sampled_movie_ids, highest_rated_movie_id_valid)
    
    # Initialize list to store movie ratings
    movie_ratings = []
    
    # Iterate over each sampled movie
    for movie_id in sampled_movie_ids:
        # One-hot encode user and movie
        user_tensor = torch.tensor([user_id], dtype=torch.long)
        movie_tensor = torch.tensor([movie_id], dtype=torch.long)
        
        user_onehot = torch.zeros(1, n_users)
        user_onehot[0, user_id] = 1.0
        
        movie_onehot = torch.zeros(1, n_movies)
        movie_onehot[0, movie_id] = 1.0
        
        # Get predictions from the model
        model_output = model(user_onehot, movie_onehot)
        
        # Store the movie ID and its predicted rating
        movie_ratings.append((movie_id, model_output.item()))  # Assuming model_output contains the predicted rating
    
    # Sort the movie ratings based on predicted ratings
    movie_ratings.sort(key=lambda x: x[1], reverse=True)
    
    # Check if the highest rated item is among the top 10 recommendations
    top_10_recommendations = [movie_id for movie_id, _ in movie_ratings[:10]]
    if user_id in top20_users:
        if highest_rated_movie_id_valid in top_10_recommendations:
            hits_count_top20 += 1
            # Print highest_rated_movie_id_valid and top_10_recommendations for each user
        print(f"User top {user_id}:")
        print("Highest Rated Movie ID (Valid):", highest_rated_movie_id_valid)
        print("Top 10 Recommendations:", top_10_recommendations)
        print()
        total_users_top20 += 1
    elif user_id in bottom20_users:
        if highest_rated_movie_id_valid in top_10_recommendations:
            hits_count_bottom20 += 1
            # Print highest_rated_movie_id_valid and top_10_recommendations for each user
        print(f"User bottom {user_id}:")
        print("Highest Rated Movie ID (Valid):", highest_rated_movie_id_valid)
        print("Top 10 Recommendations:", top_10_recommendations)
        print()
        total_users_bottom20 += 1
    
    

# Calculate hit rate for top 20 and bottom 20 users
hit_rate_top20 = hits_count_top20 / total_users_top20 if total_users_top20 > 0 else 0
hit_rate_bottom20 = hits_count_bottom20 / total_users_bottom20 if total_users_bottom20 > 0 else 0

print("Hit rate for top 5 frequent users:", hit_rate_top20)
print("Hit rate for bottom 5 frequent users:", hit_rate_bottom20)


In [None]:
# Iterate over each user
for user_id in df['userId'].unique():
    # Skip user 610
    if user_id == 610:
        continue
    
    # Get movies rated by the user
    rated_movies = df[df['userId'] == user_id]
    
    # Sample 100 unrated movie IDs if available
    unrated_movie_ids = df[~df['movieId'].isin(rated_movies['movieId'])]['movieId'].unique()
    sampled_movie_ids = np.random.choice(unrated_movie_ids, size=min(100, len(unrated_movie_ids)), replace=False)
    
    # Append the highest rated movie for the user
    highest_rated_movie_id_valid = df_valid[df_valid['userId'] == user_id].nlargest(1, 'rating')['movieId'].values[0]
    sampled_movie_ids = np.append(sampled_movie_ids, highest_rated_movie_id_valid)
    
    # Initialize list to store movie ratings
    movie_ratings = []
    
    # Iterate over each sampled movie
    for movie_id in sampled_movie_ids:
        # One-hot encode user and movie
        user_tensor = torch.tensor([user_id], dtype=torch.long)
        movie_tensor = torch.tensor([movie_id], dtype=torch.long)
        
        user_onehot = torch.zeros(1, n_users)
        user_onehot[0, user_id] = 1.0
        
        movie_onehot = torch.zeros(1, n_movies)
        movie_onehot[0, movie_id] = 1.0
        
        # Get predictions from the model
        model_output = model(user_onehot, movie_onehot)
        
        # Store the movie ID and its predicted rating
        movie_ratings.append((movie_id, model_output.item()))  # Assuming model_output contains the predicted rating
    
    # Sort the movie ratings based on predicted ratings
    movie_ratings.sort(key=lambda x: x[1], reverse=True)
    
    # Check if the highest rated item is among the top 10 recommendations
    top_10_recommendations = [movie_id for movie_id, _ in movie_ratings[:10]]
    if user_id in top20_users:
        if highest_rated_movie_id_valid in top_10_recommendations:
            hits_count_top20 += 1
        total_users_top20 += 1
    elif user_id in bottom20_users:
        if highest_rated_movie_id_valid in top_10_recommendations:
            hits_count_bottom20 += 1
        total_users_bottom20 += 1
    
    # Print highest_rated_movie_id_valid and top_10_recommendations for each user
    print(f"User {user_id}:")
    print("Highest Rated Movie ID (Valid):", highest_rated_movie_id_valid)
    print("Top 10 Recommendations:", top_10_recommendations)
    print()

# Calculate hit rate for top 20 and bottom 20 users
hit_rate_top20 = hits_count_top20 / total_users_top20 if total_users_top20 > 0 else 0
hit_rate_bottom20 = hits_count_bottom20 / total_users_bottom20 if total_users_bottom20 > 0 else 0

print("Hit rate for top 5 frequent users:", hit_rate_top20)
print("Hit rate for bottom 5 frequent users:", hit_rate_bottom20)


In [None]:
# Initialize dictionaries to store top 10 and test movie indices for each user in top 5 and bottom 5
top10_movie_indices_top5 = {}
test_movie_indices_top5 = {}
top10_movie_indices_bottom5 = {}
test_movie_indices_bottom5 = {}

# Iterate over each user
for user_id in df['userId'].unique():
    # Skip user 610
    if user_id == 610:
        continue
    
    # Get movies rated by the user
    rated_movies = df[df['userId'] == user_id]
    
    # Sample 100 unrated movie IDs if available
    unrated_movie_ids = df[~df['movieId'].isin(rated_movies['movieId'])]['movieId'].unique()
    sampled_movie_ids = np.random.choice(unrated_movie_ids, size=min(100, len(unrated_movie_ids)), replace=False)
    
    # Append the highest rated movie for the user
    highest_rated_movie_id_valid = df_valid[df_valid['userId'] == user_id].nlargest(1, 'rating')['movieId'].values[0]
    sampled_movie_ids = np.append(sampled_movie_ids, highest_rated_movie_id_valid)
    
    # Store top 10 and test movie indices for each user in top 5 and bottom 5
    if user_id in top20_users:
        top10_movie_indices_top5[user_id] = sampled_movie_ids[:10]
        test_movie_indices_top5[user_id] = highest_rated_movie_id_valid
    elif user_id in bottom20_users:
        top10_movie_indices_bottom5[user_id] = sampled_movie_ids[:10]
        test_movie_indices_bottom5[user_id] = highest_rated_movie_id_valid

# Print top 10 and test movie indices for each user separately in top 5 and bottom 5
print("Top 10 movie indices for each user in top 5:")
for user_id, movie_indices in top10_movie_indices_top5.items():
    print(f"User {user_id}: {movie_indices}")

print("\nTest movie indices for each user in top 5:")
for user_id, movie_index in test_movie_indices_top5.items():
    print(f"User {user_id}: {movie_index}")

print("\nTop 10 movie indices for each user in bottom 5:")
for user_id, movie_indices in top10_movie_indices_bottom5.items():
    print(f"User {user_id}: {movie_indices}")

print("\nTest movie indices for each user in bottom 5:")
for user_id, movie_index in test_movie_indices_bottom5.items():
    print(f"User {user_id}: {movie_index}")


In [None]:
import numpy as np

# Initialize variables to track NDCG@10
ndcg_sum = 0
num_users = 0

# Iterate over each user
for user_id in df['userId'].unique():
    if user_id == 610:
        continue 
    
    # Get movies rated by the user
    rated_movies = df[df['userId'] == user_id]
    
    # Sample 100 unrated movie IDs if available
    unrated_movie_ids = df[~df['movieId'].isin(rated_movies['movieId'])]['movieId'].unique()
    sampled_movie_ids = np.random.choice(unrated_movie_ids, size=min(100, len(unrated_movie_ids)), replace=False)
    
    # Append the highest rated movie for the user
    highest_rated_movie_id = rated_movies.nlargest(1, 'rating')['movieId'].values[0]
    sampled_movie_ids = np.append(sampled_movie_ids, highest_rated_movie_id)
    
    # Initialize list to store movie ratings
    movie_ratings = []
    
    # Iterate over each sampled movie
    for movie_id in sampled_movie_ids:
        # One-hot encode user and movie
        user_tensor = torch.tensor([user_id], dtype=torch.long)
        movie_tensor = torch.tensor([movie_id], dtype=torch.long)
        
        user_onehot = torch.zeros(1, n_users)
        user_onehot[0, user_id] = 1.0
        
        movie_onehot = torch.zeros(1, n_movies)
        movie_onehot[0, movie_id] = 1.0
        
        # Get predictions from the model
        model_output = model(user_onehot, movie_onehot)
        
        # Store the movie ID and its predicted rating
        movie_ratings.append((movie_id, model_output.item()))  # Assuming model_output contains the predicted rating
    
    # Sort the movie ratings based on predicted ratings
    movie_ratings.sort(key=lambda x: x[1], reverse=True)
    
    # Get the index of the highest rated item for the user
    highest_rated_item_index = np.where(sampled_movie_ids == highest_rated_movie_id)[0][0]
    
    # Compute DCG@10 for the user
    dcg = 0
    for rank, (movie_id, _) in enumerate(movie_ratings[:10], start=1):
        relevance = 1 if np.where(sampled_movie_ids == movie_id)[0][0] == highest_rated_item_index else 0
        dcg += (2 ** relevance - 1) / np.log2(rank + 1)
    
    # Compute ideal DCG@10 for the user
    ideal_dcg = sum((2 ** 1 - 1) / np.log2(rank + 1) for rank in range(1, min(11, len(rated_movies) + 1)))
    
    # Accumulate NDCG@10
    if ideal_dcg > 0:
        ndcg_sum += dcg / ideal_dcg
        num_users += 1

# Calculate average NDCG@10
ndcg_at_10 = ndcg_sum / num_users if num_users > 0 else 0
print("Average NDCG@10:", ndcg_at_10)


In [None]:
import numpy as np

# Initialize variables to track NDCG@10 and the number of users
ndcg_sum = 0
num_users = 0

# Iterate over each user
for user_id in df['userId'].unique():
    if user_id == 610:
        continue 
    
    # Get movies rated by the user
    rated_movies = df[df['userId'] == user_id]
    
    # Sample 100 unrated movie IDs if available
    unrated_movie_ids = df[~df['movieId'].isin(rated_movies['movieId'])]['movieId'].unique()
    sampled_movie_ids = np.random.choice(unrated_movie_ids, size=min(100, len(unrated_movie_ids)), replace=False)
    
    # Append the highest rated movie for the user
    highest_rated_movie_id = rated_movies.nlargest(1, 'rating')['movieId'].values[0]
    sampled_movie_ids = np.append(sampled_movie_ids, highest_rated_movie_id)
    
    # Initialize list to store movie ratings
    movie_ratings = []
    
    # Iterate over each sampled movie
    for movie_id in sampled_movie_ids:
        # One-hot encode user and movie
        user_tensor = torch.tensor([user_id], dtype=torch.long)
        movie_tensor = torch.tensor([movie_id], dtype=torch.long)
        
        user_onehot = torch.zeros(1, n_users)
        user_onehot[0, user_id] = 1.0
        
        movie_onehot = torch.zeros(1, n_movies)
        movie_onehot[0, movie_id] = 1.0
        
        # Get predictions from the model
        model_output = model(user_onehot, movie_onehot)
        
        # Store the movie ID and its predicted rating
        movie_ratings.append((movie_id, model_output.item()))  # Assuming model_output contains the predicted rating
    
    # Sort the movie ratings based on predicted ratings
    movie_ratings.sort(key=lambda x: x[1], reverse=True)
    
    # Check if the test movie is in the top 10 recommendations
    if highest_rated_movie_id in [movie_id for movie_id, _ in movie_ratings[:10]]:
        num_users += 1  # Increment the number of users
    
        # Get the index of the highest rated item for the user
        highest_rated_item_index = np.where(sampled_movie_ids == highest_rated_movie_id)[0][0]
        
        # Compute DCG@10 for the user
        dcg = 0
        for rank, (movie_id, _) in enumerate(movie_ratings[:10], start=1):
            relevance = 1 if np.where(sampled_movie_ids == movie_id)[0][0] == highest_rated_item_index else 0
            dcg += (2 ** relevance - 1) / np.log2(rank + 1)
        
        # Compute ideal DCG@10 for the user
        ideal_dcg = sum((2 ** 1 - 1) / np.log2(rank + 1) for rank in range(1, min(11, len(rated_movies) + 1)))
        
        # Accumulate NDCG@10
        if ideal_dcg > 0:
            ndcg_sum += dcg / ideal_dcg

# Calculate average NDCG@10
ndcg_at_10 = ndcg_sum / num_users if num_users > 0 else 0
print("NDCG@10:", ndcg_at_10)


In [None]:
import numpy as np

# Get the number of ratings provided by each user
user_rating_counts = df['userId'].value_counts()

# Sort users based on the number of ratings they have provided
sorted_users = user_rating_counts.index.tolist()

# Get the top 20 and bottom 20 frequent users
top_20_users = sorted_users[:20]
bottom_20_users = sorted_users[-20:]

# Function to calculate NDCG@10 for a given set of users
def calculate_ndcg(users):
    # Initialize variables to track NDCG@10 and the number of users
    ndcg_sum = 0
    num_users = 0

    # Iterate over each user
    for user_id in users:
        if user_id == 610:
            continue 
        # Get movies rated by the user
        rated_movies = df[df['userId'] == user_id]

        # Sample 100 unrated movie IDs if available
        unrated_movie_ids = df[~df['movieId'].isin(rated_movies['movieId'])]['movieId'].unique()
        sampled_movie_ids = np.random.choice(unrated_movie_ids, size=min(100, len(unrated_movie_ids)), replace=False)

        # Append the highest rated movie for the user
        highest_rated_movie_id = rated_movies.nlargest(1, 'rating')['movieId'].values[0]
        sampled_movie_ids = np.append(sampled_movie_ids, highest_rated_movie_id)

        # Initialize list to store movie ratings
        movie_ratings = []

        # Iterate over each sampled movie
        for movie_id in sampled_movie_ids:
            # One-hot encode user and movie (assuming n_users and n_movies are defined)
            user_tensor = torch.tensor([user_id], dtype=torch.long)
            movie_tensor = torch.tensor([movie_id], dtype=torch.long)
            user_onehot = torch.zeros(1, n_users)
            user_onehot[0, user_id] = 1.0
            movie_onehot = torch.zeros(1, n_movies)
            movie_onehot[0, movie_id] = 1.0

            # Get predictions from the model
            model_output = model(user_onehot, movie_onehot)

            # Store the movie ID and its predicted rating
            movie_ratings.append((movie_id, model_output.item()))

        # Sort the movie ratings based on predicted ratings
        movie_ratings.sort(key=lambda x: x[1], reverse=True)

        # Check if the test movie is in the top 10 recommendations
        if highest_rated_movie_id in [movie_id for movie_id, _ in movie_ratings[:10]]:
            num_users += 1  # Increment the number of users

            # Get the index of the highest rated item for the user
            highest_rated_item_index = np.where(sampled_movie_ids == highest_rated_movie_id)[0][0]

            # Compute DCG@10 for the user
            dcg = 0
            for rank, (movie_id, _) in enumerate(movie_ratings[:10], start=1):
                relevance = 1 if np.where(sampled_movie_ids == movie_id)[0][0] == highest_rated_item_index else 0
                dcg += (2 ** relevance - 1) / np.log2(rank + 1)

            # Compute ideal DCG@10 for the user
            ideal_dcg = sum((2 ** 1 - 1) / np.log2(rank + 1) for rank in range(1, min(11, len(rated_movies) + 1)))

            # Accumulate NDCG@10
            if ideal_dcg > 0:
                ndcg_sum += dcg / ideal_dcg

    # Calculate average NDCG@10
    average_ndcg_at_10 = ndcg_sum / num_users if num_users > 0 else 0
    return average_ndcg_at_10

# Calculate NDCG@10 for the top 20 frequent users
ndcg_top_20 = calculate_ndcg(top_20_users)
print("NDCG@10 for top 20 frequent users:", ndcg_top_20)

# Calculate NDCG@10 for the bottom 20 frequent users
ndcg_bottom_20 = calculate_ndcg(bottom_20_users)
print("NDCG@10 for bottom 20 frequent users:", ndcg_bottom_20)


In [None]:
import numpy as np

# Initialize variables to track hits, DCG, and IDCG
hits_count = 0
dcg_sum = 0
idcg_sum = 0

# Iterate over each user
for user_id in df['userId'].unique():
    if user_id == 610:
        continue 
    # Get movies rated by the user
    rated_movies = df[df['userId'] == user_id]
    
    # Sample 100 unrated movie IDs if available
    unrated_movie_ids = df[~df['movieId'].isin(rated_movies['movieId'])]['movieId'].unique()
    sampled_movie_ids = np.random.choice(unrated_movie_ids, size=min(100, len(unrated_movie_ids)), replace=False)
    
    # Append the highest rated movie for the user
    highest_rated_movie_id = rated_movies.nlargest(1, 'rating')['movieId'].values[0]
    sampled_movie_ids = np.append(sampled_movie_ids, highest_rated_movie_id)
    
    # Initialize list to store movie ratings
    movie_ratings = []
    
    # Iterate over each sampled movie
    for movie_id in sampled_movie_ids:
        # One-hot encode user and movie
        user_tensor = torch.tensor([user_id], dtype=torch.long)
        movie_tensor = torch.tensor([movie_id], dtype=torch.long)
        
        user_onehot = torch.zeros(1, n_users)
        user_onehot[0, user_id] = 1.0
        
        movie_onehot = torch.zeros(1, n_movies)
        movie_onehot[0, movie_id] = 1.0
        
        # Get predictions from the model
        model_output = model(user_onehot, movie_onehot)
        
        # Store the movie ID and its predicted rating
        movie_ratings.append((movie_id, model_output.item()))  # Assuming model_output contains the predicted rating
    
    # Sort the movie ratings based on predicted ratings
    movie_ratings.sort(key=lambda x: x[1], reverse=True)
    
    # Get the index of the highest rated item for the user
    highest_rated_item_index = np.where(sampled_movie_ids == highest_rated_movie_id)[0][0]
    
    # Compute DCG@10 for the user
    dcg = 0
    for rank, (movie_id, _) in enumerate(movie_ratings[:10], start=1):
        relevance = 1 if np.where(sampled_movie_ids == movie_id)[0][0] == highest_rated_item_index else 0
        dcg += (2 ** relevance - 1) / np.log2(rank + 1)
    
    # Compute ideal DCG@10 for the user
    ideal_dcg = sum((2 ** 1 - 1) / np.log2(rank + 1) for rank in range(1, min(11, len(rated_movies) + 1)))
    
    # Update sums
    hits_count += 1 if highest_rated_movie_id in [movie_id for movie_id, _ in movie_ratings[:10]] else 0
    dcg_sum += dcg
    idcg_sum += ideal_dcg

# Calculate NDCG@10
ndcg = dcg_sum / idcg_sum if idcg_sum > 0 else 0
hit_rate = hits_count / total_users if total_users > 0 else 0

#print("Hit rate:", hit_rate)
print("NDCG@10:", ndcg)


In [None]:
import numpy as np

# Initialize variables to track DCG and IDCG
dcg_sum = 0
idcg_sum = 0

# Iterate over each user
for user_id in df['userId'].unique():
    if user_id == 610:
        continue 
    # Get movies rated by the user
    rated_movies = df[df['userId'] == user_id]
    
    # Sample 100 unrated movie IDs if available
    unrated_movie_ids = df[~df['movieId'].isin(rated_movies['movieId'])]['movieId'].unique()
    sampled_movie_ids = np.random.choice(unrated_movie_ids, size=min(100, len(unrated_movie_ids)), replace=False)
    
    # Append the highest rated movie for the user
    highest_rated_movie_id = rated_movies.nlargest(1, 'rating')['movieId'].values[0]
    sampled_movie_ids = np.append(sampled_movie_ids, highest_rated_movie_id)
    
    # Initialize list to store movie ratings
    movie_ratings = []
    
    # Iterate over each sampled movie
    for movie_id in sampled_movie_ids:
        # One-hot encode user and movie
        user_tensor = torch.tensor([user_id], dtype=torch.long)
        movie_tensor = torch.tensor([movie_id], dtype=torch.long)
        
        user_onehot = torch.zeros(1, n_users)
        user_onehot[0, user_id] = 1.0
        
        movie_onehot = torch.zeros(1, n_movies)
        movie_onehot[0, movie_id] = 1.0
        
        # Get predictions from the model
        model_output = model(user_onehot, movie_onehot)
        
        # Store the movie ID and its predicted rating
        movie_ratings.append((movie_id, model_output.item()))  # Assuming model_output contains the predicted rating
    
    # Sort the movie ratings based on predicted ratings
    movie_ratings.sort(key=lambda x: x[1], reverse=True)
    
    # Get the relevance scores (ratings) for the recommended movies
    relevance_scores = [rated_movies[rated_movies['movieId'] == movie_id]['rating'].values[0] if movie_id in rated_movies['movieId'].values else 0 for movie_id, _ in movie_ratings[:10]]
    
    # Calculate DCG
    dcg = np.sum(relevance_scores / np.log2(np.arange(2, len(relevance_scores) + 2)))
    
    # Calculate IDCG
    ideal_relevance_scores = sorted(rated_movies['rating'].values, reverse=True)[:min(10, len(rated_movies))]
    idcg = np.sum(ideal_relevance_scores / np.log2(np.arange(2, len(ideal_relevance_scores) + 2)))
    
    # Update sums
    dcg_sum += dcg
    idcg_sum += idcg

# Calculate NDCG@10
# Calculate NDCG@10
ndcg = (dcg_sum * 10) / idcg_sum if idcg_sum > 0 else 0
print("NDCG@10:", ndcg)



In [None]:
# Set the model to evaluation mode
model.eval()

# Initialize variables to track hits
hits_count = 0
total_users = 0

# Specify the user for evaluation
user_id = 2  # Assuming the user ID is 123

# Get movies rated by the user
rated_movies = df[df['userId'] == user_id]

# Sample 100 unrated movie IDs if available
unrated_movie_ids = df[~df['movieId'].isin(rated_movies['movieId'])]['movieId'].unique()
sampled_movie_ids = np.random.choice(unrated_movie_ids, size=min(100, len(unrated_movie_ids)), replace=False)

# Append the highest rated movie for the user
highest_rated_movie_id = rated_movies.nlargest(1, 'rating')['movieId'].values[0]
sampled_movie_ids = np.append(sampled_movie_ids, highest_rated_movie_id)

# Initialize list to store movie ratings
movie_ratings = []

# Iterate over each sampled movie
for movie_id in sampled_movie_ids:
    # One-hot encode user and movie
    user_tensor = torch.tensor([user_id], dtype=torch.long)
    movie_tensor = torch.tensor([movie_id], dtype=torch.long)
    
    user_onehot = torch.zeros(1, n_users)
    user_onehot[0, user_id] = 1.0
    
    movie_onehot = torch.zeros(1, n_movies)
    movie_onehot[0, movie_id] = 1.0
    
    # Get predictions from the model
    model_output = model(user_onehot, movie_onehot)
    
    # Store the movie ID and its predicted rating
    movie_ratings.append((movie_id, model_output.item()))  # Assuming model_output contains the predicted rating

# Sort the movie ratings based on predicted ratings
movie_ratings.sort(key=lambda x: x[1], reverse=True)

# Print the test movie ID
print("Test Movie ID:", highest_rated_movie_id)

# Print the movie indices of top 10 recommendations
top_10_recommendations = [movie_id for movie_id, _ in movie_ratings[:10]]
print("Top 10 Recommended Movie IDs:", top_10_recommendations)

# Map the indices back to original movieIds
top_10_original_movieIds = [sampled_movie_ids[np.where(sampled_movie_ids == movie_id)[0][0]] for movie_id in top_10_recommendations]
print("Top 10 Recommended Original Movie IDs:", top_10_original_movieIds)


In [None]:
# Check if the highest rated item is among the top 10 recommendations for this user
if highest_rated_movie_id in top_10_original_movieIds:
    print("Hit: Highest rated movie is among the top 10 recommended movies for User", user_id)
    hits_count += 1
else:
    print("Miss: Highest rated movie is not among the top 10 recommended movies for User", user_id)


In [None]:
# Choose the first user ID (assuming user IDs start from 1)
first_user_id = 414

# Filter the training set for the first user
training_ratings_first_user = df_train[df_train['userId'] == first_user_id]

# Filter the validation set for the first user
validation_ratings_first_user = df_valid[df_valid['userId'] == first_user_id]

# Print information for the training set
print(f"Training Set - User {first_user_id} Ratings:")
print(training_ratings_first_user[['movieId', 'rating']])

# Print information for the validation set
print(f"\nValidation Set - User {first_user_id} Ratings:")
print(validation_ratings_first_user[['movieId', 'rating']])

# Check if movie ID 100 is present in the training set for the first user
is_movie_100_in_training = 100 in training_ratings_first_user['movieId'].values
print(f"Is Movie ID 100 present in the training set for User {first_user_id}? {is_movie_100_in_training}")

# Check if movie ID 100 is present in the validation set for the first user
is_movie_100_in_validation = 100 in validation_ratings_first_user['movieId'].values
print(f"Is Movie ID 100 present in the validation set for User {first_user_id}? {is_movie_100_in_validation}")


In [None]:
# Set the model to evaluation mode
model.eval()

# Initialize variables to track hits
hits_count = 0
total_users = 0

# Iterate over each user
for user_id in df['userId'].unique():
    if user_id == 610:
        continue 
    # Get movies rated by the user
    rated_movies = df[df['userId'] == user_id]
    
    # Check if the highest rated movie for the user is 5
    if rated_movies['rating'].max() == 5:
        # Sample 100 unrated movie IDs if available
        unrated_movie_ids = df[~df['movieId'].isin(rated_movies['movieId'])]['movieId'].unique()
        sampled_movie_ids = np.random.choice(unrated_movie_ids, size=min(100, len(unrated_movie_ids)), replace=False)
        
        # Append the highest rated movie for the user
        highest_rated_movie_id = rated_movies.nlargest(1, 'rating')['movieId'].values[0]
        sampled_movie_ids = np.append(sampled_movie_ids, highest_rated_movie_id)
        
        # Initialize list to store movie ratings
        movie_ratings = []
        
        # Iterate over each sampled movie
        for movie_id in sampled_movie_ids:
            # One-hot encode user and movie
            user_tensor = torch.tensor([user_id], dtype=torch.long)
            movie_tensor = torch.tensor([movie_id], dtype=torch.long)
            
            user_onehot = torch.zeros(1, n_users)
            user_onehot[0, user_id] = 1.0
            
            movie_onehot = torch.zeros(1, n_movies)
            movie_onehot[0, movie_id] = 1.0
            
            # Get predictions from the model
            model_output = model(user_onehot, movie_onehot)
            
            # Store the movie ID and its predicted rating
            movie_ratings.append((movie_id, model_output.item()))  # Assuming model_output contains the predicted rating
        
        # Sort the movie ratings based on predicted ratings
        movie_ratings.sort(key=lambda x: x[1], reverse=True)
        
        # Check if the highest rated item is among the top 10 recommendations
        top_10_recommendations = [movie_id for movie_id, _ in movie_ratings[:5]]
        if highest_rated_movie_id in top_10_recommendations:
            hits_count += 1
        
        total_users += 1

# Calculate hit rate
hit_rate = hits_count / total_users if total_users > 0 else 0
print("Hit rate:", hit_rate)


In [None]:
# Set the model to evaluation mode
model.eval()

# Initialize variables to track hits
hits_count_top20 = 0
hits_count_bottom20 = 0
total_users_top20 = 0
total_users_bottom20 = 0

# Get the number of ratings provided by each user
user_rating_counts = df['userId'].value_counts()

# Sort users based on the number of ratings they have provided
sorted_users = user_rating_counts.index.tolist()

# Select the top 20 and bottom 20 frequent users
top20_users = sorted_users[:20]
bottom20_users = sorted_users[-20:]

# Iterate over each user
for user_id in df['userId'].unique():
    # Skip user 610
    if user_id == 610:
        continue
    
    # Get movies rated by the user
    rated_movies = df[df['userId'] == user_id]
    
    # Sample 100 unrated movie IDs if available
    unrated_movie_ids = df[~df['movieId'].isin(rated_movies['movieId'])]['movieId'].unique()
    sampled_movie_ids = np.random.choice(unrated_movie_ids, size=min(100, len(unrated_movie_ids)), replace=False)
    
    # Append the highest rated movie for the user
    highest_rated_movie_id = rated_movies.nlargest(1, 'rating')['movieId'].values[0]
    sampled_movie_ids = np.append(sampled_movie_ids, highest_rated_movie_id)
    
    # Initialize list to store movie ratings
    movie_ratings = []
    
    # Iterate over each sampled movie
    for movie_id in sampled_movie_ids:
        # One-hot encode user and movie
        user_tensor = torch.tensor([user_id], dtype=torch.long)
        movie_tensor = torch.tensor([movie_id], dtype=torch.long)
        
        user_onehot = torch.zeros(1, n_users)
        user_onehot[0, user_id] = 1.0
        
        movie_onehot = torch.zeros(1, n_movies)
        movie_onehot[0, movie_id] = 1.0
        
        # Get predictions from the model
        model_output = model(user_onehot, movie_onehot)
        
        # Store the movie ID and its predicted rating
        movie_ratings.append((movie_id, model_output.item()))  # Assuming model_output contains the predicted rating
    
    # Sort the movie ratings based on predicted ratings
    movie_ratings.sort(key=lambda x: x[1], reverse=True)
    
    # Check if the highest rated item is among the top 10 recommendations
    top_10_recommendations = [movie_id for movie_id, _ in movie_ratings[:5]]
    if user_id in top20_users:
        if highest_rated_movie_id in top_10_recommendations:
            hits_count_top20 += 1
        total_users_top20 += 1
    elif user_id in bottom20_users:
        if highest_rated_movie_id in top_10_recommendations:
            hits_count_bottom20 += 1
        total_users_bottom20 += 1

# Calculate hit rate for top 20 and bottom 20 users
hit_rate_top20 = hits_count_top20 / total_users_top20 if total_users_top20 > 0 else 0
hit_rate_bottom20 = hits_count_bottom20 / total_users_bottom20 if total_users_bottom20 > 0 else 0

print("Hit rate for top 20 frequent users:", hit_rate_top20)
print("Hit rate for bottom 20 frequent users:", hit_rate_bottom20)