In [4]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from torch.utils.data import DataLoader, TensorDataset
import torch.nn.functional as F
from tqdm import tqdm


device = torch.device("cuda" if torch.cuda.is_available() else
                      "mps" if torch.backends.mps.is_available() else
                      "cpu")

device


device(type='mps')

In [5]:
anime_data = pd.read_csv('./Anime Dataset 2023/anime-dataset-2023.csv')
user_data = pd.read_csv('./Anime Dataset 2023/users-details-2023.csv')
user_scores = pd.read_csv('./Anime Dataset 2023/users-score-2023.csv')

In [6]:
print("Anime data shape:", anime_data.shape)
print("User data shape:", user_data.shape)
print("User scores shape:", user_scores.shape)

print("\nSample of anime data:")
display(anime_data.head())
print("\nSample of user data:")
display(user_data.head())
print("\nSample of user scores:")
display(user_scores.head())

Anime data shape: (24905, 24)
User data shape: (731290, 16)
User scores shape: (24325191, 5)

Sample of anime data:


Unnamed: 0,anime_id,Name,English name,Other name,Score,Genres,Synopsis,Type,Episodes,Aired,...,Studios,Source,Duration,Rating,Rank,Popularity,Favorites,Scored By,Members,Image URL
0,1,Cowboy Bebop,Cowboy Bebop,カウボーイビバップ,8.75,"Action, Award Winning, Sci-Fi","Crime is timeless. By the year 2071, humanity ...",TV,26.0,"Apr 3, 1998 to Apr 24, 1999",...,Sunrise,Original,24 min per ep,R - 17+ (violence & profanity),41.0,43,78525,914193.0,1771505,https://cdn.myanimelist.net/images/anime/4/196...
1,5,Cowboy Bebop: Tengoku no Tobira,Cowboy Bebop: The Movie,カウボーイビバップ 天国の扉,8.38,"Action, Sci-Fi","Another day, another bounty—such is the life o...",Movie,1.0,"Sep 1, 2001",...,Bones,Original,1 hr 55 min,R - 17+ (violence & profanity),189.0,602,1448,206248.0,360978,https://cdn.myanimelist.net/images/anime/1439/...
2,6,Trigun,Trigun,トライガン,8.22,"Action, Adventure, Sci-Fi","Vash the Stampede is the man with a $$60,000,0...",TV,26.0,"Apr 1, 1998 to Sep 30, 1998",...,Madhouse,Manga,24 min per ep,PG-13 - Teens 13 or older,328.0,246,15035,356739.0,727252,https://cdn.myanimelist.net/images/anime/7/203...
3,7,Witch Hunter Robin,Witch Hunter Robin,Witch Hunter ROBIN (ウイッチハンターロビン),7.25,"Action, Drama, Mystery, Supernatural",Robin Sena is a powerful craft user drafted in...,TV,26.0,"Jul 3, 2002 to Dec 25, 2002",...,Sunrise,Original,25 min per ep,PG-13 - Teens 13 or older,2764.0,1795,613,42829.0,111931,https://cdn.myanimelist.net/images/anime/10/19...
4,8,Bouken Ou Beet,Beet the Vandel Buster,冒険王ビィト,6.94,"Adventure, Fantasy, Supernatural",It is the dark century and the people are suff...,TV,52.0,"Sep 30, 2004 to Sep 29, 2005",...,Toei Animation,Manga,23 min per ep,PG - Children,4240.0,5126,14,6413.0,15001,https://cdn.myanimelist.net/images/anime/7/215...



Sample of user data:


Unnamed: 0,Mal ID,Username,Gender,Birthday,Location,Joined,Days Watched,Mean Score,Watching,Completed,On Hold,Dropped,Plan to Watch,Total Entries,Rewatched,Episodes Watched
0,1,Xinil,Male,1985-03-04T00:00:00+00:00,California,2004-11-05T00:00:00+00:00,142.3,7.37,1.0,233.0,8.0,93.0,64.0,399.0,60.0,8458.0
1,3,Aokaado,Male,,"Oslo, Norway",2004-11-11T00:00:00+00:00,68.6,7.34,23.0,137.0,99.0,44.0,40.0,343.0,15.0,4072.0
2,4,Crystal,Female,,"Melbourne, Australia",2004-11-13T00:00:00+00:00,212.8,6.68,16.0,636.0,303.0,0.0,45.0,1000.0,10.0,12781.0
3,9,Arcane,,,,2004-12-05T00:00:00+00:00,30.0,7.71,5.0,54.0,4.0,3.0,0.0,66.0,0.0,1817.0
4,18,Mad,,,,2005-01-03T00:00:00+00:00,52.0,6.27,1.0,114.0,10.0,5.0,23.0,153.0,42.0,3038.0



Sample of user scores:


Unnamed: 0,user_id,Username,anime_id,Anime Title,rating
0,1,Xinil,21,One Piece,9
1,1,Xinil,48,.hack//Sign,7
2,1,Xinil,320,A Kite,5
3,1,Xinil,49,Aa! Megami-sama!,8
4,1,Xinil,304,Aa! Megami-sama! Movie,8


In [7]:
data = pd.merge(user_scores, user_data, left_on='user_id', right_on='Mal ID')
data = pd.merge(data, anime_data, on='anime_id')

popular_anime = data['anime_id'].value_counts()
popular_anime_ids = popular_anime[popular_anime > 500].index
data = data[data['anime_id'].isin(popular_anime_ids)]

active_users = data['user_id'].value_counts()
active_user_ids = active_users[active_users > 100].index
data = data[data['user_id'].isin(active_user_ids)]

columns_to_drop = [
    'Gender', 'Birthday', 'Location', 'Username_x', 'Username_y',
    'Mal ID', 'Anime Title', 'Name', 'English name', 'Other name',
    'Synopsis', 'Aired', 'Premiered', 'Producers', 'Licensors',
    'Studios', 'Source', 'Image URL', 'Joined', 'Status',
    'Duration', 'Rank', 'Scored By'
]
data = data.drop(columns=columns_to_drop)

num_cols = [
    'Days Watched', 'Mean Score', 'Watching', 'Completed',
    'On Hold', 'Dropped', 'Plan to Watch', 'Total Entries',
    'Rewatched', 'Episodes Watched'
]
for col in num_cols:
    data[col].fillna(data[col].mean(), inplace=True)

print("Processed data shape:", data.shape)
print("\nSample of processed data:")
display(data.head())

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].mean(), inplace=True)


Processed data shape: (17870130, 21)

Sample of processed data:


Unnamed: 0,user_id,anime_id,rating,Days Watched,Mean Score,Watching,Completed,On Hold,Dropped,Plan to Watch,...,Rewatched,Episodes Watched,Score,Genres,Type,Episodes,Rating,Popularity,Favorites,Members
0,1,21,9,142.3,7.37,1.0,233.0,8.0,93.0,64.0,...,60.0,8458.0,8.69,"Action, Adventure, Fantasy",TV,UNKNOWN,PG-13 - Teens 13 or older,20,198986,2168904
1,1,48,7,142.3,7.37,1.0,233.0,8.0,93.0,64.0,...,60.0,8458.0,6.95,"Adventure, Fantasy, Mystery",TV,26.0,PG-13 - Teens 13 or older,1243,1343,178659
2,1,320,5,142.3,7.37,1.0,233.0,8.0,93.0,64.0,...,60.0,8458.0,6.54,"Action, Drama, Hentai",OVA,2.0,Rx - Hentai,2485,302,66821
3,1,49,8,142.3,7.37,1.0,233.0,8.0,93.0,64.0,...,60.0,8458.0,7.29,"Comedy, Romance, Supernatural",OVA,5.0,PG-13 - Teens 13 or older,2832,246,52627
4,1,304,8,142.3,7.37,1.0,233.0,8.0,93.0,64.0,...,60.0,8458.0,7.54,"Comedy, Romance, Supernatural",Movie,1.0,PG-13 - Teens 13 or older,3195,99,42124


In [8]:
class WideAndDeep(nn.Module):
    def __init__(self, n_users, n_animes, embed_dim, num_features):
        super(WideAndDeep, self).__init__()

        # Embeddings
        self.user_embed = nn.Embedding(n_users, embed_dim)
        self.anime_embed = nn.Embedding(n_animes, embed_dim)

        # Dropout layers
        self.embed_dropout = nn.Dropout(0.2)
        self.deep_dropout = nn.Dropout(0.3)

        # Wide component
        self.wide = nn.Sequential(
            nn.Linear(num_features, 64),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(64, 1)
        )

        # Deep component
        self.deep = nn.Sequential(
            nn.Linear(embed_dim * 2, 256),
            nn.ReLU(),
            nn.BatchNorm1d(256),
            nn.Dropout(0.3),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.BatchNorm1d(128),
            nn.Dropout(0.3),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.BatchNorm1d(64),
            nn.Linear(64, 1)
        )

        self._init_weights()

    def _init_weights(self):
        for module in self.modules():
            if isinstance(module, nn.Linear):
                nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    nn.init.zeros_(module.bias)

    def forward(self, user_id, anime_id, features):
        # Wide path
        wide_out = self.wide(features)

        # Deep path
        user_emb = self.user_embed(user_id)
        anime_emb = self.anime_embed(anime_id)

        user_emb = self.embed_dropout(user_emb)
        anime_emb = self.embed_dropout(anime_emb)

        deep_input = torch.cat([user_emb, anime_emb], dim=1)
        deep_out = self.deep(deep_input)

        output = wide_out + deep_out
        return torch.clamp(output, 0, 10)


In [9]:
data_sampled, _ = train_test_split(
    data,
    train_size=0.5,
    stratify=data['rating'],
    random_state=42
)

numeric_features = [
    'Days Watched', 'Mean Score', 'Watching', 'Completed',
    'On Hold', 'Dropped', 'Plan to Watch', 'Total Entries',
    'Rewatched', 'Episodes Watched', 'Popularity', 'Favorites', 'Members'
]

scaler = StandardScaler()
X_numeric = scaler.fit_transform(data_sampled[numeric_features])

X = data_sampled[['user_id', 'anime_id']].values
X = np.hstack((X, X_numeric))
y = data_sampled['rating'].values

X = torch.tensor(X, dtype=torch.float32).to(device)
y = torch.tensor(y, dtype=torch.float32).to(device).unsqueeze(1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

train_dataset = TensorDataset(X_train, y_train)
test_dataset = TensorDataset(X_test, y_test)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

In [10]:
n_users = data_sampled['user_id'].nunique()
n_animes = data_sampled['anime_id'].nunique()
embed_dim = 32
num_features = X.shape[1] - 2

model = WideAndDeep(n_users, n_animes, embed_dim, num_features).to(device)
criterion = nn.MSELoss()
optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=0.01)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=2)

In [12]:
def train_epoch(model, train_loader, criterion, optimizer, device):
    model.train()
    total_loss = 0
    with tqdm(train_loader, desc="Training") as pbar:
        for X_batch, y_batch in pbar:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            user_id = X_batch[:, 0].long()
            anime_id = X_batch[:, 1].long()
            features = X_batch[:, 2:]

            optimizer.zero_grad()
            predictions = model(user_id, anime_id, features)
            loss = criterion(predictions, y_batch)

            l2_lambda = 0.01
            l2_reg = torch.tensor(0.).to(device)
            for param in model.parameters():
                l2_reg += torch.norm(param)
            loss += l2_lambda * l2_reg

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()

            total_loss += loss.item()
            pbar.set_postfix({"loss": f"{loss.item():.4f}"})

    return total_loss / len(train_loader)

def evaluate_model(model, test_loader, criterion, device):
    model.eval()
    total_loss = 0
    all_predictions = []
    all_targets = []

    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            user_id = X_batch[:, 0].long()
            anime_id = X_batch[:, 1].long()
            features = X_batch[:, 2:]

            predictions = model(user_id, anime_id, features)
            loss = criterion(predictions, y_batch)
            total_loss += loss.item()

            all_predictions.extend(predictions.cpu().numpy())
            all_targets.extend(y_batch.cpu().numpy())

    all_predictions = np.array(all_predictions)
    all_targets = np.array(all_targets)

    test_loss = total_loss / len(test_loader)
    rmse = np.sqrt(((all_predictions - all_targets) ** 2).mean())
    mae = np.abs(all_predictions - all_targets).mean()
    accuracy = np.mean(np.abs(all_predictions - all_targets) <= 1.0)

    return test_loss, rmse, mae, accuracy

In [13]:
epochs = 10
best_loss = float('inf')

for epoch in range(epochs):
    print(f"\nEpoch {epoch + 1}/{epochs}")

    # Train
    train_loss = train_epoch(model, train_loader, criterion, optimizer, device)

    # Evaluate
    test_loss, rmse, mae, accuracy = evaluate_model(model, test_loader, criterion, device)

    print(f"Train Loss: {train_loss:.4f}")
    print(f"Test Loss: {test_loss:.4f}")
    print(f"RMSE: {rmse:.4f}")
    print(f"MAE: {mae:.4f}")
    print(f"Accuracy (±1): {accuracy:.2%}")

    scheduler.step(test_loss)

    if test_loss < best_loss:
        best_loss = test_loss
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': best_loss,
            'scaler': scaler
        }, 'lab-1_best_model.pth')
        print("Saved new best model")


Epoch 1/10


Training: 100%|██████████| 111689/111689 [29:01<00:00, 64.13it/s, loss=1.4674]


Train Loss: 2.4222
Test Loss: 1.8266
RMSE: 1.3515
MAE: 1.0373
Accuracy (±1): 57.66%
Saved new best model

Epoch 2/10


Training: 100%|██████████| 111689/111689 [28:56<00:00, 64.30it/s, loss=3.2871]


Train Loss: 2.0503
Test Loss: 1.8151
RMSE: 1.3472
MAE: 1.0297
Accuracy (±1): 58.27%
Saved new best model

Epoch 3/10


Training: 100%|██████████| 111689/111689 [29:03<00:00, 64.04it/s, loss=4.1322]


Train Loss: 2.0495
Test Loss: 1.8080
RMSE: 1.3446
MAE: 1.0225
Accuracy (±1): 58.81%
Saved new best model

Epoch 4/10


Training: 100%|██████████| 111689/111689 [29:09<00:00, 63.86it/s, loss=1.7132]


Train Loss: 2.0505
Test Loss: 1.8298
RMSE: 1.3527
MAE: 1.0402
Accuracy (±1): 57.47%

Epoch 5/10


Training: 100%|██████████| 111689/111689 [29:06<00:00, 63.95it/s, loss=1.8861]


Train Loss: 2.0502
Test Loss: 1.8095
RMSE: 1.3452
MAE: 1.0215
Accuracy (±1): 58.93%

Epoch 6/10


Training: 100%|██████████| 111689/111689 [29:09<00:00, 63.85it/s, loss=2.9525]


Train Loss: 2.0492
Test Loss: 1.8082
RMSE: 1.3447
MAE: 1.0209
Accuracy (±1): 58.96%

Epoch 7/10


Training: 100%|██████████| 111689/111689 [29:09<00:00, 63.84it/s, loss=2.4775]


Train Loss: 1.9368
Test Loss: 1.7934
RMSE: 1.3392
MAE: 1.0244
Accuracy (±1): 58.44%
Saved new best model

Epoch 8/10


Training: 100%|██████████| 111689/111689 [29:03<00:00, 64.04it/s, loss=1.5858]


Train Loss: 1.9307
Test Loss: 1.7810
RMSE: 1.3345
MAE: 1.0154
Accuracy (±1): 59.11%
Saved new best model

Epoch 9/10


Training: 100%|██████████| 111689/111689 [29:04<00:00, 64.02it/s, loss=4.4104]


Train Loss: 1.9313
Test Loss: 1.7821
RMSE: 1.3350
MAE: 1.0161
Accuracy (±1): 59.03%

Epoch 10/10


Training: 100%|██████████| 111689/111689 [29:06<00:00, 63.95it/s, loss=1.0999]


Train Loss: 1.9310
Test Loss: 1.7832
RMSE: 1.3354
MAE: 1.0115
Accuracy (±1): 59.51%


In [18]:
def get_recommendations(user_id, data_sampled, model, scaler, numeric_features, anime_data, top_n=10):
    user_features = data_sampled[data_sampled['user_id'] == user_id].iloc[0]

    all_anime_ids = data_sampled['anime_id'].unique()
    user_ids = np.full_like(all_anime_ids, user_id)

    user_features_array = np.tile(user_features[numeric_features].values, (len(all_anime_ids), 1))

    user_features_scaled = scaler.transform(user_features_array)

    X_pred = np.column_stack((user_ids, all_anime_ids, user_features_scaled))
    X_pred = torch.tensor(X_pred, dtype=torch.float32).to(device)

    model.eval()
    with torch.no_grad():
        predictions = model(
            X_pred[:, 0].long(),
            X_pred[:, 1].long(),
            X_pred[:, 2:]
        )

    pred_ratings = predictions.cpu().numpy()
    top_indices = pred_ratings.flatten().argsort()[-top_n:][::-1]

    recommended_anime = pd.DataFrame({
        'anime_id': all_anime_ids[top_indices],
        'predicted_rating': pred_ratings[top_indices].flatten()
    })

    recommended_anime = pd.merge(
        recommended_anime,
        anime_data[['anime_id', 'Name', 'Score', 'Members']],
        on='anime_id'
    )

    return recommended_anime

In [19]:
saved_model = torch.load('lab-1_best_model.pth', map_location=device)

n_users = data_sampled['user_id'].nunique()
n_animes = data_sampled['anime_id'].nunique()
embed_dim = 32
num_features = len(numeric_features)

model = WideAndDeep(n_users, n_animes, embed_dim, num_features).to(device)
model.load_state_dict(saved_model['model_state_dict'])
model.eval()

scaler = saved_model['scaler']

sample_user_id = data_sampled['user_id'].iloc[0]
recommendations = get_recommendations(
    sample_user_id,
    data_sampled,
    model,
    scaler,
    numeric_features,
    anime_data
)

print(f"\nRecommendations for user {sample_user_id}:")
display(recommendations[['Name', 'predicted_rating', 'Score', 'Members']].sort_values(by='predicted_rating', ascending=False))

  saved_model = torch.load('lab-1_best_model.pth', map_location=device)



Recommendations for user 1201045:




Unnamed: 0,Name,predicted_rating,Score,Members
0,Evangelion: 2.0 You Can (Not) Advance,8.908804,8.31,517144
1,Aria the Origination,8.869763,8.48,123798
2,Baccano! Specials,8.846051,8.09,175205
3,Koukaku Kidoutai: Stand Alone Complex 2nd GIG,8.845908,8.52,220872
4,Tengen Toppa Gurren Lagann Movie 2: Lagann-hen,8.842599,8.57,207140
5,Rurouni Kenshin: Meiji Kenkaku Romantan - Tsui...,8.835432,8.71,268621
6,Kara no Kyoukai Movie 5: Mujun Rasen,8.831157,8.53,230852
7,Cowboy Bebop: Tengoku no Tobira,8.813509,8.38,360978
8,Ginga Eiyuu Densetsu,8.807959,9.02,305491
9,Ginga Eiyuu Densetsu: Waga Yuku wa Hoshi no Ta...,8.801358,7.88,37629
