In [1]:
import pandas as pd
import numpy as np
from sklearn import model_selection, metrics, preprocessing
import torch
import torch.nn as nn
import matplotlib.pyplot as plt 
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F


In [2]:
df = pd.read_csv("/kaggle/input/movielens-9000-movies-dataset/ml-latest-small/ratings.csv")

In [3]:
df.info() # basically show schema

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [4]:
df.userId.nunique()

610

In [5]:
df.movieId.nunique()

9724

In [6]:
df.rating.value_counts() #check value distribution

rating
4.0    26818
3.0    20047
5.0    13211
3.5    13136
4.5     8551
2.0     7551
2.5     5550
1.0     2811
1.5     1791
0.5     1370
Name: count, dtype: int64

In [7]:
df.shape

(100836, 4)

In [8]:
# Get the number of unique users and movies
n_users = df.userId.nunique()
n_movies = df.movieId.max() + 1



In [9]:
class MovieDataset:
    def __init__(self, df, n_users, n_movies):
        self.users = df.userId.values
        self.movies = df.movieId.values
        self.ratings = df.rating.values
        self.n_users = n_users
        self.n_movies = n_movies

    def __len__(self):
        return len(self.users)

    def __getitem__(self, item):
        user = self.users[item]
        movie = self.movies[item]
        rating = self.ratings[item]

        # Check and correct user index
        if user >= self.n_users:
            user = self.n_users - 1

        # One-hot encode user and movie IDs
        user_onehot = torch.zeros(self.n_users)
        user_onehot[user] = 1.0

        movie_onehot = torch.zeros(self.n_movies)
        movie_onehot[movie] = 1.0

        return {
            "users_onehot": user_onehot,
            "movies_onehot": movie_onehot,
            "ratings": torch.tensor(rating, dtype=torch.float32),
        }

# Create an instance of MovieDataset with your data
dataset = MovieDataset(df, n_users, n_movies)

# Check the max indices
print("Max user index:", dataset.users.max())
print("Max movie index:", dataset.movies.max())

Max user index: 610
Max movie index: 193609


In [10]:
class RecSysModel(nn.Module):
    def __init__(self, n_users, n_movies, emb_size=32):
        super().__init__()
        self.user_embed = nn.Linear(n_users, emb_size, bias=False)
        self.movie_embed = nn.Linear(n_movies, emb_size, bias=False)
        self.hidden1 = nn.Linear(emb_size * 2, 32)
        self.hidden2 = nn.Linear(32, 16)
        self.hidden3 = nn.Linear(16, 8)
        self.out = nn.Linear(8, 1)

    def forward(self, users_onehot, movies_onehot):
        user_embeds = self.user_embed(users_onehot)
        movie_embeds = self.movie_embed(movies_onehot)
        user_embeds = user_embeds.view(-1, user_embeds.size(1))
        movie_embeds = movie_embeds.view(-1, movie_embeds.size(1))
        embedding = torch.cat([user_embeds, movie_embeds], dim=1)
        embedding = F.relu(self.hidden1(embedding))
        embedding = F.relu(self.hidden2(embedding))
        embedding = F.relu(self.hidden3(embedding))
        output = self.out(embedding)
        return output

    def predict_ratings(self, users_onehot, movies_onehot):
        with torch.no_grad():
            output = self(users_onehot, movies_onehot)
        return output.squeeze().tolist()  # Convert tensor to list of ratings


In [11]:

from sklearn.model_selection import train_test_split

# Assuming 'userId' is the column representing users in your DataFrame
df_train, df_valid = train_test_split(df, test_size=0.1, random_state=42, stratify=df['userId'].values)
print("Size of Training Set:", len(df_train))
print("Size of Validation Set:", len(df_valid))

# Create datasets
train_dataset = MovieDataset(df_train, n_users, n_movies)
valid_dataset = MovieDataset(df_valid, n_users, n_movies)

# Create data loaders
train_loader = DataLoader(dataset=train_dataset, batch_size=4, shuffle=True, num_workers=2)
validation_loader = DataLoader(dataset=valid_dataset, batch_size=4, shuffle=True, num_workers=2)
#print("Unique User IDs in Training Set (Sorted):")
#print(sorted(df_train['userId'].unique()))

#print("\nUnique User IDs in Validation Set (Sorted):")
#print(sorted(df_valid['userId'].unique()))


Size of Training Set: 90752
Size of Validation Set: 10084


In [12]:
# Choose the first user ID (assuming user IDs start from 1)
first_user_id = 1

# Filter the training set for the first user
training_ratings_first_user = df_train[df_train['userId'] == first_user_id]

# Filter the validation set for the first user
validation_ratings_first_user = df_valid[df_valid['userId'] == first_user_id]

# Print information for the training set
print(f"Training Set - User {first_user_id} Ratings:")
print(training_ratings_first_user[['movieId', 'rating']])

# Print information for the validation set
print(f"\nValidation Set - User {first_user_id} Ratings:")
print(validation_ratings_first_user[['movieId', 'rating']])


Training Set - User 1 Ratings:
     movieId  rating
134     2115     5.0
99      1552     4.0
143     2253     2.0
76      1219     2.0
48       954     5.0
..       ...     ...
204     3168     4.0
186     2899     5.0
132     2099     4.0
170     2617     2.0
3         47     5.0

[209 rows x 2 columns]

Validation Set - User 1 Ratings:
     movieId  rating
109     1777     4.0
32       590     4.0
216     3479     4.0
53      1029     5.0
141     2174     4.0
168     2596     5.0
193     2985     4.0
145     2273     4.0
67      1136     5.0
81      1240     5.0
47       943     4.0
207     3247     3.0
188     2944     5.0
92      1377     3.0
20       356     4.0
98      1517     5.0
127     2078     5.0
160     2478     4.0
226     3740     4.0
162     2502     5.0
152     2389     2.0
111     1804     5.0
83      1258     3.0


In [13]:
dataiter = iter(train_loader)

for dataloader_data in dataiter:
    print(dataloader_data)
    break

{'users_onehot': tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]]), 'movies_onehot': tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]]), 'ratings': tensor([3.5000, 5.0000, 4.5000, 3.5000])}


In [14]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [15]:
model = RecSysModel(
    n_users, n_movies, emb_size=32
).to(device)

# Optimizer and scheduler
#optimizer = torch.optim.Adam(model.parameters())
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)  # Setting learning rate to 0.001

scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.7)

# Loss function
loss_func = nn.MSELoss()

In [16]:
print(n_users)
print(n_movies)
print(df.movieId.max() + 1)  # Adding 1 because movieId starts from 0 after one-hot encoding
print(len(train_dataset))


610
193610
193610
90752


In [17]:
with torch.no_grad():
    model_output = model(dataloader_data['users_onehot'], 
                         dataloader_data['movies_onehot'])

    print(f"model_output: {model_output}, size: {model_output.size()}")


model_output: tensor([[0.1391],
        [0.1391],
        [0.1388],
        [0.1384]]), size: torch.Size([4, 1])


In [18]:
rating = dataloader_data["ratings"]
print(rating)
print(rating.view(4, -1))
print(model_output)

print(rating.sum())

print(model_output.sum() - rating.sum())

tensor([3.5000, 5.0000, 4.5000, 3.5000])
tensor([[3.5000],
        [5.0000],
        [4.5000],
        [3.5000]])
tensor([[0.1391],
        [0.1391],
        [0.1388],
        [0.1384]])
tensor(16.5000)
tensor(-15.9447)


In [19]:
epochs = 1
total_loss = 0
plot_steps, print_steps = 5000, 5000
step_cnt = 0
all_losses_list = [] 

model.train() 
for epoch_i in range(epochs):
    for i, train_data in enumerate(train_loader):
        output = model(train_data["users_onehot"], 
                       train_data["movies_onehot"]
                      ) 
        
        rating = train_data["ratings"].view(4, -1).to(torch.float32)

        loss = loss_func(output, rating)
        total_loss = total_loss + loss.sum().item()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        step_cnt = step_cnt + len(train_data["users_onehot"])
        

        if(step_cnt % plot_steps == 0):
            avg_loss = total_loss/(len(train_data["users_onehot"]) * plot_steps)
            print(f"epoch {epoch_i} loss at step: {step_cnt} is {avg_loss}")
            all_losses_list.append(avg_loss)
            total_loss = 0  # reset total_loss

epoch 0 loss at step: 5000 is 0.5233647870644927
epoch 0 loss at step: 10000 is 0.07531063436465338
epoch 0 loss at step: 15000 is 0.06002317675082013
epoch 0 loss at step: 20000 is 0.05626815644307062
epoch 0 loss at step: 25000 is 0.05493996717752889
epoch 0 loss at step: 30000 is 0.05164325619270094
epoch 0 loss at step: 35000 is 0.05170601395922713
epoch 0 loss at step: 40000 is 0.05100405651591718
epoch 0 loss at step: 45000 is 0.04932908554757014
epoch 0 loss at step: 50000 is 0.0489154177788645
epoch 0 loss at step: 55000 is 0.049780421742517504
epoch 0 loss at step: 60000 is 0.04939890119056217
epoch 0 loss at step: 65000 is 0.04796836871029809
epoch 0 loss at step: 70000 is 0.04958104000575841
epoch 0 loss at step: 75000 is 0.04787509916299023
epoch 0 loss at step: 80000 is 0.04890434596342966
epoch 0 loss at step: 85000 is 0.04848163078334183
epoch 0 loss at step: 90000 is 0.04865908454684541


In [20]:
from sklearn.metrics import mean_squared_error

model_output_list = []
target_rating_list = []

model.eval()

with torch.no_grad():
    for i, batched_data in enumerate(validation_loader): 
        model_output = model(batched_data['users_onehot'], 
                             batched_data['movies_onehot'])
        
        model_output_list.append(model_output.sum().item() / len(batched_data['users_onehot']))

        target_rating = batched_data["ratings"]
        
        target_rating_list.append(target_rating.sum().item() / len(batched_data['users_onehot']))

# squared If True returns MSE value, if False returns RMSE value.
rms = mean_squared_error(target_rating_list, model_output_list, squared=False)
print(f"rms: {rms}")


rms: 0.43832233830271006


In [21]:
predicted_ratings_list = []

with torch.no_grad():
    for i, batched_data in enumerate(validation_loader): 
        
        model_output = model(batched_data['users_onehot'], batched_data['movies_onehot'])
        
        for user_idx in range(len(batched_data['users_onehot'])):
            # Get the index of the test item (movie with the highest rating)
            test_item_index = torch.argmax(batched_data["ratings"][user_idx]).item()

            # Predict rating for the test item
            predicted_rating = model_output[user_idx, test_item_index].item()
            predicted_ratings_list.append(predicted_rating)

        
# Calculate average predicted rating
average_predicted_rating = np.mean(predicted_ratings_list)
print(f"Average Predicted Rating of Test Item: {average_predicted_rating}")


Average Predicted Rating of Test Item: 3.4544063937753875


In [22]:
import pandas as pd
import numpy as np

# Group by user ID and collect rated movie IDs and their ratings for each user
rated_movies_by_user = df.groupby('userId').apply(lambda x: x.nlargest(1, 'rating')).reset_index(drop=True)

# Initialize a list to store HR evaluation data
hr_evaluation_data = []

# Iterate through each user
for user_id, rated_movie_data in rated_movies_by_user.iterrows():
    rated_movie_id = rated_movie_data['movieId']
    rated_movie_rating = rated_movie_data['rating']
    
    # Extract unrated movie IDs
    all_movie_ids = df['movieId'].unique()
    rated_movie_ids = df[df['userId'] == user_id]['movieId'].unique()
    unrated_movie_ids = [movie_id for movie_id in all_movie_ids if movie_id not in rated_movie_ids]
    
    # Sample 100 unrated movie IDs if available
    if len(unrated_movie_ids) >= 100:
        sampled_movie_ids = np.random.choice(unrated_movie_ids, size=100, replace=False)
    else:
        sampled_movie_ids = unrated_movie_ids
    
    # Create HR evaluation data for the user
    for movie_id in sampled_movie_ids:
        hr_evaluation_data.append({'userId': user_id, 'movieId': movie_id, 'rating': 0})
    
    # Append the movie with the highest rating for the user
    hr_evaluation_data.append({'userId': user_id, 'movieId': rated_movie_id, 'rating': rated_movie_rating})

# Create a DataFrame from the HR evaluation data
hr_evaluation_df = pd.DataFrame(hr_evaluation_data)

# Save the DataFrame to a CSV file
hr_evaluation_df.to_csv('/kaggle/working/hr_evaluation_data.csv', index=False)


In [23]:
hr_evaluation_df['movieId'] = hr_evaluation_df['movieId'].astype(int)


In [24]:
hr_evaluation_df

Unnamed: 0,userId,movieId,rating
0,0,116738,0.0
1,0,2672,0.0
2,0,138036,0.0
3,0,42191,0.0
4,0,110773,0.0
...,...,...,...
61605,609,58870,0.0
61606,609,99813,0.0
61607,609,596,0.0
61608,609,33188,0.0


In [25]:
# Get the number of unique users and movies
nn_users = hr_evaluation_df.userId.nunique()
nn_movies = hr_evaluation_df.movieId.max() + 1




In [26]:
print(nn_users)
print(nn_movies)

610
193610


In [27]:
hr_eval_dataset = MovieDataset(hr_evaluation_df, nn_users, nn_movies)

In [28]:
hr_eval_loader = DataLoader(dataset=hr_eval_dataset, batch_size=4, shuffle=True, num_workers=2)

In [29]:
dataitern = iter(hr_eval_loader)

for dataloader_dataa in dataitern:
    print(dataloader_dataa)
    break

{'users_onehot': tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]]), 'movies_onehot': tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]]), 'ratings': tensor([0., 0., 0., 0.])}


In [30]:
# Get all user IDs
all_user_ids = df['userId'].unique()


In [31]:
model_output_list = []
target_rating_list = []

model.eval()

with torch.no_grad():
    for i, batched_data in enumerate(hr_eval_loader): 
        model_output = model(batched_data['users_onehot'], 
                             batched_data['movies_onehot'])
        
        model_output_list.append(model_output.sum().item() / len(batched_data['users_onehot']))

        target_rating = batched_data["ratings"]
        
        target_rating_list.append(target_rating.sum().item() / len(batched_data['users_onehot']))



In [32]:
import pandas as pd

# Create an empty list to store predictions for each user
predictions_per_user = []

# Iterate over the predictions and store them in a list
for i, batched_data in enumerate(hr_eval_loader): 
    model_output = model(batched_data['users_onehot'], 
                         batched_data['movies_onehot'])
    
    # Store the predictions for the current batch
    predictions_per_user.extend(model_output.tolist())

# Initialize a variable to count hits
hits_count = 0

# Iterate over each user's predictions
for user_predictions, batched_data in zip(predictions_per_user, hr_eval_loader):
    # Rank the top 10 movies according to predicted ratings
    top_10_recommendations = sorted(enumerate(user_predictions), key=lambda x: x[1], reverse=True)[:10]
    
    # Get the index of the highest rated item for the user
    highest_rated_item_index = batched_data['ratings'].argmax().item()
    
    # Check if the index of the highest rated item is present in the top 10 recommendations
    if highest_rated_item_index in [idx for idx, _ in top_10_recommendations]:
        hits_count += 1

# Calculate hit rate
hit_rate = hits_count / len(predictions_per_user)
print("Hit rate:", hit_rate)


Hit rate: 0.24278526213277066


In [33]:
# Initialize a variable to count hits and total users with the highest rating of 5
hits_count = 0
users_with_highest_rating_5 = 0

# Iterate over each user's predictions
for user_predictions, batched_data in zip(predictions_per_user, hr_eval_loader):
    # Get the index of the highest rated item for the user
    highest_rated_item_index = batched_data['ratings'].argmax().item()
    
    # Check if the highest rating for the user is 5
    if batched_data['ratings'][highest_rated_item_index] == 5:
        users_with_highest_rating_5 += 1
        
        # Rank the top 10 movies according to predicted ratings
        top_10_recommendations = sorted(enumerate(user_predictions), key=lambda x: x[1], reverse=True)[:10]
        
        # Check if the index of the highest rated item is present in the top 10 recommendations
        if highest_rated_item_index in [idx for idx, _ in top_10_recommendations]:
            hits_count += 1

# Calculate hit rate only for users with the highest rating of 5
if users_with_highest_rating_5 > 0:
    hit_rate = hits_count / users_with_highest_rating_5
else:
    hit_rate = 0  # To avoid division by zero error

print("Hit rate for users with highest rating of 5:", hit_rate)


Hit rate for users with highest rating of 5: 0.2596491228070175


In [34]:
# Initialize a variable to count hits
hits_count = 0

# Iterate over each user's predictions
for user_predictions, batched_data in zip(predictions_per_user, hr_eval_loader):
    # Rank the movies according to predicted ratings
    ranked_movies = sorted(enumerate(user_predictions), key=lambda x: x[1], reverse=True)
    
    # Get the index of the highest rated item for the user
    highest_rated_item_index = batched_data['ratings'].argmax().item()
    
    # Initialize k to 0
    k = 0
    
    # Track the previous rating to detect rank changes
    previous_rating = None
    
    # Iterate through the top 10 recommendations
    for idx, (movie_idx, rating) in enumerate(ranked_movies[:10]):
        if rating != previous_rating:
            # Include all movies with the same rating until a change in the ranking
            k += 1
            if k == 11:  # Limit to top 10 recommendations
                break
        # Check if the index of the highest rated item is present in the top k recommendations
        if movie_idx == highest_rated_item_index:
            hits_count += 1
            break  # No need to continue if hit is found
        previous_rating = rating
    
# Calculate hit rate
hit_rate = hits_count / len(predictions_per_user)
print("Hit rate considering rank ties and top 10 recommendations:", hit_rate)


Hit rate considering rank ties and top 10 recommendations: 0.24301249797110858
