
# 0. Imports, seed, and device config


In [1]:

import numpy as np, pandas as pd, torch, warnings
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

warnings.filterwarnings("ignore")          # squash SettingWithCopyWarning etc.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("➡️ Using", device, torch.cuda.get_device_name(0) if device.type=="cuda" else "")


➡️ Using cuda NVIDIA L4


# 1. Load raw datasets

In [2]:

anime_df   = pd.read_csv("anime-dataset-2023.csv", low_memory=True)
users_df   = pd.read_csv("users-details-2023.csv", low_memory=True)
ratings_df = pd.read_csv("users-score-2023.csv", low_memory=True)


# 2. Standardise column names we’ll join on

In [3]:

ratings_df.rename(columns={"Anime Title": "Anime_Title"}, inplace=True)
users_df.rename(columns={"Mal ID": "user_id"}, inplace=True)


# 3-A. Impute missing *Rank* from *Popularity* using a quick

In [4]:
anime_df["Rank"] = pd.to_numeric(anime_df["Rank"], errors="coerce")
pop_known        = anime_df.dropna(subset=["Rank"])[["Popularity", "Rank"]]
if pop_known.size:          # skip if already complete
    reg   = LinearRegression().fit(pop_known[["Popularity"]], pop_known["Rank"])
    mask  = anime_df["Rank"].isna()
    anime_df.loc[mask, "Rank"] = reg.predict(anime_df.loc[mask, ["Popularity"]])

# ------------------------------------------------------------------
# 3-B. Drop obvious high-cardinality text columns to save RAM
# ------------------------------------------------------------------
anime_df.drop(columns=[
    "Synopsis", "Image URL", "Licensors", "Producers", "Studios",
    "Duration", "Other name", "English name"
], errors="ignore", inplace=True)

users_df.drop(columns=["Gender", "Location"], errors="ignore", inplace=True)


# 4. Merge => (user × anime × rating) super-table

In [5]:
merged_df = ratings_df.merge(anime_df,  on="anime_id", how="left") \
                      .merge(users_df,   on="user_id",  how="left")


# 5. Sample 100 000 rows (matches your later code)

In [None]:
chunk_merge_df = merged_df.sample(n=100_000, random_state=SEED).reset_index(drop=True)


# 5. Sample 100 000 rows (matches your later code)

In [6]:

chunk_merge_df = merged_df.sample(n=100000, random_state=SEED).reset_index(drop=True)


# 6. Numeric wide features

In [7]:
wide_numeric_cols       = ["Score", "Popularity", "Rank"]
chunk_merge_df[wide_numeric_cols] = (
        chunk_merge_df[wide_numeric_cols]
        .replace("UNKNOWN", np.nan)
        .apply(pd.to_numeric, errors="coerce")
)

# Mean-impute then **scale 0-1** (best practice from code 2)
scaler_wide = MinMaxScaler()
chunk_merge_df[wide_numeric_cols] = scaler_wide.fit_transform(
        chunk_merge_df[wide_numeric_cols].fillna(chunk_merge_df[wide_numeric_cols].mean())
)


# 7. Genres one-hot for the WIDE part

In [9]:
chunk_merge_df["Genres"] = chunk_merge_df["Genres"].fillna("Unknown")

# Use the new parameter name sparse_output=False
from sklearn.preprocessing import OneHotEncoder
genres_encoder = OneHotEncoder(handle_unknown="ignore",
                               sparse_output=False)

# Fit & transform in one go
genres_encoded = genres_encoder.fit_transform(
    chunk_merge_df[["Genres"]]
)  # → numpy array of shape (100000, n_genre_categories)


In [10]:
# ------------------------------------------------------------------
# 8. Label-encode IDs for the DEEP part
# ------------------------------------------------------------------
user_encoder  = LabelEncoder()
anime_encoder = LabelEncoder()

chunk_merge_df["user_id_enc"]  = user_encoder.fit_transform(chunk_merge_df["user_id"])
chunk_merge_df["anime_id_enc"] = anime_encoder.fit_transform(chunk_merge_df["anime_id"])


# 9. Horizontally stack numeric + genre one-hots  → wide_features

In [11]:
wide_features = np.hstack([
    chunk_merge_df[wide_numeric_cols].values.astype(np.float32),
    genres_encoded.astype(np.float32)
])


# 10. Ensure rating is numeric, then 80 / 20 split

In [12]:
chunk_merge_df["rating"] = pd.to_numeric(chunk_merge_df["rating"], errors="coerce") \
                             .fillna(0).astype(np.float32)

train_df, test_df = train_test_split(
    chunk_merge_df, test_size=0.20, random_state=SEED
)

# KEEP indices aligned with wide_features
train_wide = wide_features[train_df.index]
test_wide  = wide_features[test_df.index]


#11. Define the PyTorch Dataset Class

In [14]:
from torch.utils.data import Dataset, DataLoader  # ← add this

# Class to convert and return tensors for features.
class AnimeDataset(Dataset):
    def __init__(self, df, wide_features, target):
        self.df = df
        self.wide_features = wide_features
        self.target = target

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        user_id   = self.df.iloc[idx]['user_id_enc']
        anime_id  = self.df.iloc[idx]['anime_id_enc']
        wide_feat = self.wide_features[idx]
        target    = self.target[idx]

        return (
            torch.tensor(user_id,  dtype=torch.long),
            torch.tensor(anime_id, dtype=torch.long),
            torch.tensor(wide_feat, dtype=torch.float32),
            torch.tensor(target,    dtype=torch.float32),
        )


# 12. Reset Index & Instantiate DataLoaders

In [15]:
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

train_wide_features = wide_features[:len(train_df)]
test_wide_features = wide_features[len(train_df):]

train_dataset = AnimeDataset(train_df, train_wide_features, train_df['rating'].values.astype(np.float32))
test_dataset = AnimeDataset(test_df, test_wide_features, test_df['rating'].values.astype(np.float32))

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)


# 13. Define the Wide & Deep PyTorch Model

In [17]:
import torch
import torch.nn as nn


In [18]:
class WideAndDeep(nn.Module):
    def __init__(self, num_users, num_animes, embedding_dim, wide_input_dim):
        super(WideAndDeep, self).__init__()

        # Using Linear layer as Wide component
        self.wide_layer = nn.Linear(wide_input_dim, 1)

        # Embedding layers user_id and anime_id for Depth component
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.anime_embedding = nn.Embedding(num_animes, embedding_dim)

        self.deep_layers = nn.Sequential(
            nn.Linear(embedding_dim * 2, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )

    def forward(self, user_id, anime_id, wide_feat):

        # Linear transformation of wide features
        wide_output = self.wide_layer(wide_feat)

        # Combine user and anime embeddings
        user_embed = self.user_embedding(user_id)
        anime_embed = self.anime_embedding(anime_id)
        deep_input = torch.cat([user_embed, anime_embed], dim=-1)
        deep_output = self.deep_layers(deep_input)

        # wide and deep outputs combined
        output = wide_output + deep_output
        return output


# 14. Train, Evaluate & Recommend

In [20]:
from sklearn.metrics import mean_squared_error


In [21]:
num_users = len(user_encoder.classes_)
num_animes = len(anime_encoder.classes_)
embedding_dim = 32
wide_input_dim = wide_features.shape[1]

print("Wide and Deep starting to go")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = WideAndDeep(num_users, num_animes, embedding_dim, wide_input_dim).to(device) #Using GPU if available.

criterion = nn.MSELoss()  # Mean Squared Error Loss
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

def train_model(model, train_loader, optimizer, criterion, num_epochs=10):
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for user_id, anime_id, wide_feat, target in train_loader:
            user_id, anime_id, wide_feat, target = user_id.to(device), anime_id.to(device), wide_feat.to(device), target.to(device)
            optimizer.zero_grad()
            output = model(user_id, anime_id, wide_feat)
            loss = criterion(output.squeeze(), target)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {total_loss/len(train_loader)}")

print("Starting to train the model")
train_model(model, train_loader, optimizer, criterion, num_epochs=20)
print("Done train the model")

def evaluate_model(model, test_loader):
    model.eval()
    predictions, targets = [], []
    with torch.no_grad():
        for user_id, anime_id, wide_feat, target in test_loader:
            user_id, anime_id, wide_feat, target = user_id.to(device), anime_id.to(device), wide_feat.to(device), target.to(device)
            output = model(user_id, anime_id, wide_feat)
            predictions.extend(output.squeeze().cpu().tolist())
            targets.extend(target.tolist())

    mse = mean_squared_error(targets, predictions)
    rmse = np.sqrt(mse)
    print(f"Test RMSE: {rmse}")

evaluate_model(model, test_loader)


Wide and Deep starting to go
Starting to train the model
Epoch 1/20, Loss: 4.1883456382751465
Epoch 2/20, Loss: 2.568796852016449
Epoch 3/20, Loss: 2.3280571353912354
Epoch 4/20, Loss: 2.113866033363342
Epoch 5/20, Loss: 1.89037212100029
Epoch 6/20, Loss: 1.6660076401233672
Epoch 7/20, Loss: 1.429991399717331
Epoch 8/20, Loss: 1.2068084759235382
Epoch 9/20, Loss: 1.0004486026287078
Epoch 10/20, Loss: 0.8181359896659851
Epoch 11/20, Loss: 0.6629150069594383
Epoch 12/20, Loss: 0.5308498921394348
Epoch 13/20, Loss: 0.4201848334670067
Epoch 14/20, Loss: 0.33043968440294263
Epoch 15/20, Loss: 0.2619627589404583
Epoch 16/20, Loss: 0.2094462308883667
Epoch 17/20, Loss: 0.16852748655974864
Epoch 18/20, Loss: 0.13792461249530316
Epoch 19/20, Loss: 0.1168020709335804
Epoch 20/20, Loss: 0.1010082170277834
Done train the model
Test RMSE: 1.988932661336076


#15. Recommendation helper

In [22]:
def recommend_top_n(model, user_id, n=10):

    model.eval()
    num_animes = len(anime_encoder.classes_)
    all_anime_ids = torch.arange(num_animes).to(device)
    user_ids = torch.tensor([user_id] * num_animes).to(device)
    wide_feat_for_all_animes = torch.tensor(wide_features[:num_animes]).to(device)

    with torch.no_grad():
        predictions = model(user_ids, all_anime_ids, wide_feat_for_all_animes)

    top_n_scores, top_n_anime_ids = torch.topk(predictions.squeeze(), n)
    top_n_anime_ids = top_n_anime_ids.cpu().numpy()
    recommended_animes = anime_encoder.inverse_transform(top_n_anime_ids)

    return list(zip(recommended_animes, top_n_scores.cpu().numpy()))


In [23]:
def get_anime_name(anime_id, anime_name):
    return anime_name.get(anime_id, "Anime ID not found")

In [24]:
chunk_anime_dict = dict(zip(chunk_merge_df['anime_id'], chunk_merge_df['Anime_Title']))

In [25]:
user_id = 42
recommend_top_n(model, user_id)

[(np.int64(2870), np.float32(13.130423)),
 (np.int64(9001), np.float32(11.185523)),
 (np.int64(15307), np.float32(10.977295)),
 (np.int64(2669), np.float32(10.838011)),
 (np.int64(7270), np.float32(10.5741)),
 (np.int64(37716), np.float32(10.565096)),
 (np.int64(1044), np.float32(10.556259)),
 (np.int64(682), np.float32(10.54083)),
 (np.int64(6171), np.float32(10.522857)),
 (np.int64(1878), np.float32(10.436265))]

In [26]:

top_10_recommendations = recommend_top_n(model, user_id)
print("Top 10 Anime Recommendations for User:", user_id)
for anime_id, score in top_10_recommendations:
    anime_name = get_anime_name(anime_id, chunk_anime_dict)
    print(f"Anime: {anime_name}")

Top 10 Anime Recommendations for User: 42
Anime: Yagami Yuu
Anime: Je T'aime
Anime: Smile Precure! Movie: Ehon no Naka wa Minna Chiguhagu!
Anime: Doraemon Movie 06: Nobita no Little Star Wars
Anime: Kidou Senshi Gundam 00 Special Edition
Anime: Beelzebub-jou no Okinimesu mama.
Anime: Taiyou no Ouji: Horus no Daibouken
Anime: Otogi Story Tenshi no Shippo
Anime: Saint Seiya: The Lost Canvas - Meiou Shinwa
Anime: Kaze no Shoujo Emily
