In [None]:
import pandas as pd
import numpy as np
from sklearn import model_selection, metrics, preprocessing
import torch
import torch.nn as nn
import matplotlib.pyplot as plt 
from torch.utils.data import Dataset, DataLoader

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#torch.cuda.is_available()
print(device)

In [3]:
df = pd.read_csv("rating_complete.csv")

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57633278 entries, 0 to 57633277
Data columns (total 3 columns):
 #   Column    Dtype
---  ------    -----
 0   user_id   int64
 1   anime_id  int64
 2   rating    int64
dtypes: int64(3)
memory usage: 1.3 GB


In [5]:
df.isnull().sum()

user_id     0
anime_id    0
rating      0
dtype: int64

In [6]:
df['user_id'].describe()

count    5.763328e+07
mean     1.768878e+05
std      1.020117e+05
min      0.000000e+00
25%      8.827800e+04
50%      1.772910e+05
75%      2.654190e+05
max      3.534040e+05
Name: user_id, dtype: float64

In [7]:
df['anime_id'].describe()

count    5.763328e+07
mean     1.583147e+04
std      1.326114e+04
min      1.000000e+00
25%      3.091000e+03
50%      1.188700e+04
75%      2.899900e+04
max      4.845600e+04
Name: anime_id, dtype: float64

In [8]:
df.user_id.nunique()

310059

In [9]:
df.anime_id.nunique()

16872

In [10]:
df.rating.value_counts() #check value distribution

8     14642156
7     13325549
9      9773857
6      6849293
10     6716048
5      3436250
4      1455102
3       696048
2       405556
1       333419
Name: rating, dtype: int64

In [11]:
df.shape

(57633278, 3)

#### Training Dataset Class Wrapper

In [12]:
class AnimeDataset:
    def __init__(self, users, animes, ratings):
        self.users = users
        self.animes = animes
        self.ratings = ratings
    # len(movie_dataset)
    def __len__(self):
        return len(self.users)
    # movie_dataset[1] 
    def __getitem__(self, item):

        users = self.users[item] 
        animes = self.animes[item]
        ratings = self.ratings[item]
        
        return {
            "users": torch.tensor(users, dtype=torch.long),
            "animes": torch.tensor(animes, dtype=torch.long),
            "ratings": torch.tensor(ratings, dtype=torch.long),
        }

#### Create the model

In [13]:
class RecSysModel(nn.Module):
    def __init__(self, n_users, n_animes):
        super().__init__()
        # trainable lookup matrix for shallow embedding vectors
        
        self.user_embed = nn.Embedding(n_users, 32)
        self.anime_embed = nn.Embedding(n_animes, 32)
        # user, movie embedding concat
        self.out = nn.Linear(64, 1)

    
    def forward(self, users, animes, ratings=None):
        user_embeds = self.user_embed(users)
        anime_embeds = self.anime_embed(animes)
        output = torch.cat([user_embeds, anime_embeds], dim=1)
        
        output = self.out(output)
        
        return output

In [14]:
# encode the user and movie id to start from 0 so we don't run into index out of bound with Embedding
lbl_user = preprocessing.LabelEncoder()
lbl_anime = preprocessing.LabelEncoder()
df.user_id = lbl_user.fit_transform(df.user_id.values)
df.anime_id = lbl_anime.fit_transform(df.anime_id.values)

df_train, df_valid = model_selection.train_test_split(
    df, test_size=0.1, random_state=42, stratify=df.rating.values
)

train_dataset = AnimeDataset(
    users=df_train.user_id.values,
    animes=df_train.anime_id.values,
    ratings=df_train.rating.values
)

valid_dataset = AnimeDataset(
    users=df_valid.user_id.values,
    animes=df_valid.anime_id.values,
    ratings=df_valid.rating.values
)


In [15]:
for i in range(5):  # Print the first 5 data points
    data_point = train_dataset[i]
    print(data_point)

print(len(train_dataset))
print(len(valid_dataset))

{'users': tensor(211007), 'animes': tensor(13830), 'ratings': tensor(7)}
{'users': tensor(123627), 'animes': tensor(3532), 'ratings': tensor(9)}
{'users': tensor(40441), 'animes': tensor(9151), 'ratings': tensor(1)}
{'users': tensor(251116), 'animes': tensor(20), 'ratings': tensor(9)}
{'users': tensor(85875), 'animes': tensor(7254), 'ratings': tensor(5)}
51869950
5763328


In [17]:
from torch.utils.data import DataLoader

train_loader = DataLoader(dataset=train_dataset,
                          batch_size=4,
                          shuffle=True)

validation_loader = DataLoader(dataset=valid_dataset,
                               batch_size=4,
                               shuffle=True)

dataiter = iter(train_loader)
dataloader_data = next(dataiter)
print(dataloader_data)
print(len(train_loader))
print(len(validation_loader))

{'users': tensor([ 58097, 227042, 295765, 163845]), 'animes': tensor([11583,  1571,  7383,    31]), 'ratings': tensor([ 8,  9, 10,  5])}
12967488
1440832


In [18]:
model = RecSysModel(
    n_users=len(lbl_user.classes_),
    n_animes=len(lbl_anime.classes_),
).to(device)

optimizer = torch.optim.Adam(model.parameters())  
sch = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.7)

loss_func = nn.MSELoss()

In [19]:
print(len(lbl_user.classes_))
print(len(lbl_anime.classes_))
print(df.anime_id.max())
print(len(train_dataset))

310059
16872
16871
51869950


#### Manually run a forward path

In [20]:
print(dataloader_data['users'])

print(dataloader_data['users'].size())
print(dataloader_data['animes'] )
print(dataloader_data['animes'].size())

user_embed = nn.Embedding(len(lbl_user.classes_), 32)
anime_embed = nn.Embedding(len(lbl_anime.classes_), 32)

out = nn.Linear(64, 1)

tensor([ 58097, 227042, 295765, 163845])
torch.Size([4])
tensor([11583,  1571,  7383,    31])
torch.Size([4])


In [21]:
user_embeds = user_embed(dataloader_data['users'])
anime_embeds = anime_embed(dataloader_data['animes'])
print(f"user_embeds {user_embeds.size()}")
print(f"user_embeds {user_embeds}")
print(f"anime_embeds {anime_embeds.size()}")
print(f"anime_embeds {anime_embeds}")

user_embeds torch.Size([4, 32])
user_embeds tensor([[ 7.7243e-01, -1.5965e+00, -6.0004e-01,  5.0266e-01, -1.0823e+00,
         -1.1506e+00,  8.5844e-01,  8.4515e-01,  7.3376e-01, -2.9686e-01,
         -1.1021e+00,  3.6744e-02, -1.3023e+00, -4.6652e-01,  5.8638e-01,
          1.2709e-01,  6.8724e-01,  1.2631e-01, -7.3367e-01,  7.6715e-01,
          7.6379e-01, -5.8569e-01, -1.0590e+00,  1.2386e-01,  1.6112e+00,
         -2.8967e-01,  1.0036e+00, -8.9182e-01, -1.3622e-01, -2.3797e-01,
          2.9906e-02, -4.5136e-01],
        [-1.5184e-01, -1.7212e+00,  6.7305e-01,  3.6604e-01,  6.2090e-01,
          6.3496e-01,  2.9518e-01, -1.4230e+00, -1.8566e+00,  2.4161e+00,
         -2.7963e-01,  6.9170e-01, -7.8318e-01, -1.3372e+00,  6.4404e-01,
          1.1244e+00, -1.0874e+00, -1.9663e+00, -6.1607e-01, -2.1759e+00,
         -3.7866e-01,  1.8794e+00, -1.8901e-01, -1.1275e+00,  1.5761e-01,
          1.8646e+00,  7.7471e-01, -1.7129e+00, -8.9713e-01, -8.6602e-01,
          1.4229e+00,  4.1028e-0

In [22]:
output = torch.cat([user_embeds, anime_embeds], dim=1) 
print(f"output: {output.size()}")
print(f"output: {output}")
output = out(output)
print(f"output: {output}")

output: torch.Size([4, 64])
output: tensor([[ 7.7243e-01, -1.5965e+00, -6.0004e-01,  5.0266e-01, -1.0823e+00,
         -1.1506e+00,  8.5844e-01,  8.4515e-01,  7.3376e-01, -2.9686e-01,
         -1.1021e+00,  3.6744e-02, -1.3023e+00, -4.6652e-01,  5.8638e-01,
          1.2709e-01,  6.8724e-01,  1.2631e-01, -7.3367e-01,  7.6715e-01,
          7.6379e-01, -5.8569e-01, -1.0590e+00,  1.2386e-01,  1.6112e+00,
         -2.8967e-01,  1.0036e+00, -8.9182e-01, -1.3622e-01, -2.3797e-01,
          2.9906e-02, -4.5136e-01, -1.1740e+00, -3.8761e-01,  6.6059e-01,
         -7.4245e-01,  3.8504e-01,  2.5229e+00, -7.4707e-01,  2.0519e+00,
          4.9144e-02, -1.1749e+00,  1.0898e+00,  8.6778e-01, -6.7003e-01,
          1.7910e+00,  7.7294e-01,  5.2404e-01, -5.2329e-01,  7.3376e-01,
          3.8422e-01, -7.0036e-01, -1.1896e+00, -1.5485e+00,  6.1016e-01,
         -7.1759e-01,  1.4432e+00,  1.1927e+00,  1.1731e+00, -1.1786e+00,
          9.8466e-01, -2.5896e-02, -1.3171e+00, -1.0724e+00],
        [-1.51

In [24]:
import torch

# Assuming you have a GPU available (cuda:0)
device = torch.device("cuda:0")

# Move the model to the GPU
model.to(device)

# Move the input tensors to the GPU
dataloader_data['users'] = dataloader_data['users'].to(device)
dataloader_data['animes'] = dataloader_data['animes'].to(device)

# Now you can use the model and input tensors together without any device mismatch error
with torch.no_grad():
    model_output = model(dataloader_data['users'], dataloader_data['animes'])
    print(f"model_output: {model_output}, size: {model_output.size()}")


model_output: tensor([[ 0.7031],
        [-0.5308],
        [-0.5884],
        [-0.4143]], device='cuda:0'), size: torch.Size([4, 1])


In [25]:
rating = dataloader_data["ratings"]
print(rating)
print(rating.view(4, -1))
print(model_output)

print(rating.sum())

print(model_output.sum() - rating.sum())

tensor([ 8,  9, 10,  5])
tensor([[ 8],
        [ 9],
        [10],
        [ 5]])
tensor([[ 0.7031],
        [-0.5308],
        [-0.5884],
        [-0.4143]], device='cuda:0')
tensor(32)
tensor(-32.8303, device='cuda:0')


#### Run the training loop

In [None]:
epochs = 1
total_loss = 0
plot_steps, print_steps = 5000, 5000
step_cnt = 0
all_losses_list = [] 

model.train() 

# Assuming you have a GPU available (cuda:0)
device = torch.device("cuda:0")

for epoch_i in range(epochs):
    for i, train_data in enumerate(train_loader):

        # Move train_data to the GPU
        train_data = {key: value.to(device) for key, value in train_data.items()}

        output = model(train_data["users"], train_data["animes"])
        
        # .view(4, -1) is to reshape the rating to match the shape of model output which is 4x1
        rating = train_data["ratings"].view(4, -1).to(torch.float32).to(device)

        loss = loss_func(output, rating)
        total_loss = total_loss + loss.sum().item()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        step_cnt = step_cnt + len(train_data["users"])
        
        if(step_cnt % plot_steps == 0):
            avg_loss = total_loss / (len(train_data["users"]) * plot_steps)
            print(f"epoch {epoch_i} loss at step: {step_cnt} is {avg_loss}")
            all_losses_list.append(avg_loss)
            total_loss = 0  # reset total_loss


epoch 0 loss at step: 5000 is 3.023882845926285
epoch 0 loss at step: 10000 is 1.9771920445680617
epoch 0 loss at step: 15000 is 1.1762241766512394
epoch 0 loss at step: 20000 is 0.6962746537368745
epoch 0 loss at step: 25000 is 0.4473756550785154
epoch 0 loss at step: 30000 is 0.30860093593299387
epoch 0 loss at step: 35000 is 0.2442272994570434
epoch 0 loss at step: 40000 is 0.1912742623878643
epoch 0 loss at step: 45000 is 0.18337213502191008
epoch 0 loss at step: 50000 is 0.1768284473521635
epoch 0 loss at step: 55000 is 0.16804007335193455
epoch 0 loss at step: 60000 is 0.16960205589663238
epoch 0 loss at step: 65000 is 0.16165804571742193
epoch 0 loss at step: 70000 is 0.16408938187602906
epoch 0 loss at step: 75000 is 0.1562148043267429
epoch 0 loss at step: 80000 is 0.15470766914086417
epoch 0 loss at step: 85000 is 0.16074717778009362
epoch 0 loss at step: 90000 is 0.16170650744186715
epoch 0 loss at step: 95000 is 0.16077488029636441
epoch 0 loss at step: 100000 is 0.16046097

In [None]:
import torch
import torch.optim as optim

# Your code for model and data loaders goes here...

# Assuming you have a GPU available (cuda:0)
device = torch.device("cuda:0")

# Initialize optimizer and loss function
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
loss_func = torch.nn.MSELoss()

# Number of epochs and steps to continue training
epochs = 1
total_steps_to_continue = 6250000  # Number of steps the model has already trained

# Your code for other variables and setup...

model.train()

# Load the model checkpoint if available
try:
    checkpoint = torch.load("model_checkpoint.pth")
    model.load_state_dict(checkpoint["model_state_dict"])
    optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
    step_cnt = checkpoint["step_cnt"]
    all_losses_list = checkpoint["all_losses_list"]
    print("Model checkpoint loaded successfully.")
except FileNotFoundError:
    step_cnt = 0
    all_losses_list = []
    print("No previous model checkpoint found. Starting training from scratch.")

for epoch_i in range(epochs):
    for i, train_data in enumerate(train_loader):
        # Move train_data to the GPU
        train_data = {key: value.to(device) for key, value in train_data.items()}

        output = model(train_data["users"], train_data["animes"])
        rating = train_data["ratings"].view(4, -1).to(torch.float32).to(device)

        loss = loss_func(output, rating)
        total_loss = loss.sum().item()  # Calculate the loss per batch correctly
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        step_cnt = step_cnt + len(train_data["users"])

        if step_cnt >= total_steps_to_continue and step_cnt % plot_steps == 0:
            avg_loss = total_loss / (len(train_data["users"]) * plot_steps)
            print(f"epoch {epoch_i} loss at step: {step_cnt} is {avg_loss}")
            all_losses_list.append(avg_loss)
            total_loss = 0  # reset total_loss

        # Save the model checkpoint after a certain number of steps
        if step_cnt % total_steps_to_continue == 0:
            torch.save({
                "model_state_dict": model.state_dict(),
                "optimizer_state_dict": optimizer.state_dict(),
                "step_cnt": step_cnt,
                "all_losses_list": all_losses_list
            }, "model_checkpoint.pth")
            print("Model checkpoint saved successfully.")
