In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from torch.utils.data import DataLoader, TensorDataset
import torch.nn.functional as F
from tqdm import tqdm
import warnings
import pickle

warnings.filterwarnings('ignore')

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else
                     "mps" if torch.backends.mps.is_available() else
                     "cpu")
print(f"Using device: {device}")

Using device: mps


In [2]:
# Load datasets
print("Loading datasets...")
anime_data = pd.read_csv('./Anime Dataset 2023/anime-dataset-2023.csv')
user_data = pd.read_csv('./Anime Dataset 2023/users-details-2023.csv')
user_scores = pd.read_csv('./Anime Dataset 2023/users-score-2023.csv')

print("Dataset shapes:")
print(f"Anime data: {anime_data.shape}")
print(f"User data: {user_data.shape}")
print(f"User scores: {user_scores.shape}")

Loading datasets...


KeyboardInterrupt: 

In [8]:
# Merge datasets
print("Preprocessing data...")
data = pd.merge(user_scores, user_data, left_on='user_id', right_on='Mal ID')
data = pd.merge(data, anime_data, on='anime_id')

# Filter popular anime and active users
popular_anime = data['anime_id'].value_counts()
popular_anime_ids = popular_anime[popular_anime > 500].index
data = data[data['anime_id'].isin(popular_anime_ids)]

active_users = data['user_id'].value_counts()
active_user_ids = active_users[active_users > 100].index
data = data[data['user_id'].isin(active_user_ids)]

# Drop unnecessary columns
columns_to_drop = [
    'Gender', 'Birthday', 'Location', 'Username_x', 'Username_y',
    'Mal ID', 'Anime Title', 'Name', 'English name', 'Other name',
    'Synopsis', 'Aired', 'Premiered', 'Producers', 'Licensors',
    'Studios', 'Source', 'Image URL', 'Joined', 'Status',
    'Duration', 'Rank', 'Scored By'
]
data = data.drop(columns=columns_to_drop)

# Fill missing values
num_cols = [
    'Days Watched', 'Mean Score', 'Watching', 'Completed',
    'On Hold', 'Dropped', 'Plan to Watch', 'Total Entries',
    'Rewatched', 'Episodes Watched'
]
for col in num_cols:
    data[col].fillna(data[col].mean(), inplace=True)

print(f"Processed data shape: {data.shape}")

Preprocessing data...
Processed data shape: (17870130, 21)


In [9]:
# Create genre features
print("Creating genre features...")
genres = anime_data['Genres'].str.split(',', expand=True).stack()
genre_dummies = pd.get_dummies(genres, prefix='genre')
genre_features = genre_dummies.groupby(level=0).sum()

# Keep top genres
top_genres = genre_features.sum().nlargest(15).index
genre_features = genre_features[top_genres]

# Create interaction features
print("Creating interaction features...")
data['user_rating_diff'] = data['rating'] - data['Mean Score']
data['popularity_score'] = np.log1p(data['Popularity']) * data['rating']
data['completion_rate'] = data['Completed'] / (data['Total Entries'] + 1)
data['watch_intensity'] = data['Episodes Watched'] / (data['Days Watched'] + 1)

# Merge genre features
data = pd.merge(data, genre_features, left_on='anime_id', right_index=True)

# Define final feature list
numeric_features = [
    'Days Watched', 'Mean Score', 'Watching', 'Completed',
    'On Hold', 'Dropped', 'Plan to Watch', 'Total Entries',
    'Rewatched', 'Episodes Watched', 'Popularity', 'Favorites',
    'Members', 'completion_rate', 'watch_intensity', 'user_rating_diff',
    'popularity_score'
] + genre_features.columns.tolist()

# Scale numeric features
scaler = StandardScaler()
data[numeric_features] = scaler.fit_transform(data[numeric_features])

print(f"Number of features: {len(numeric_features)}")

Creating genre features...
Creating interaction features...
Number of features: 32


In [10]:
# Create ID mappings
print("Creating ID mappings...")
user_encoder = LabelEncoder()
anime_encoder = LabelEncoder()

data['user_id_mapped'] = user_encoder.fit_transform(data['user_id'])
data['anime_id_mapped'] = anime_encoder.fit_transform(data['anime_id'])

# Save encoders
encoders = {
    'user_encoder': user_encoder,
    'anime_encoder': anime_encoder,
    'scaler': scaler
}

with open('encoders.pkl', 'wb') as f:
    pickle.dump(encoders, f)

print("Number of unique users:", len(user_encoder.classes_))
print("Number of unique anime:", len(anime_encoder.classes_))

Creating ID mappings...
Number of unique users: 75459
Number of unique anime: 3361


In [11]:
# Create ID mappings
print("Creating ID mappings...")
user_encoder = LabelEncoder()
anime_encoder = LabelEncoder()

data['user_id_mapped'] = user_encoder.fit_transform(data['user_id'])
data['anime_id_mapped'] = anime_encoder.fit_transform(data['anime_id'])

# Save encoders
encoders = {
    'user_encoder': user_encoder,
    'anime_encoder': anime_encoder,
    'scaler': scaler
}

with open('encoders.pkl', 'wb') as f:
    pickle.dump(encoders, f)

print("Number of unique users:", len(user_encoder.classes_))
print("Number of unique anime:", len(anime_encoder.classes_))

Creating ID mappings...
Number of unique users: 75459
Number of unique anime: 3361


In [12]:
class WideAndDeepRecommender(nn.Module):
    def __init__(self, n_users, n_items, n_features, embed_dim=64):
        super(WideAndDeepRecommender, self).__init__()

        # Wide Component
        self.wide = nn.Sequential(
            nn.Linear(n_features, 128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )

        # Deep Component
        self.user_embedding = nn.Embedding(n_users, embed_dim)
        self.item_embedding = nn.Embedding(n_items, embed_dim)

        self.deep = nn.Sequential(
            nn.Linear(embed_dim * 2, 256),
            nn.ReLU(),
            nn.BatchNorm1d(256),
            nn.Dropout(0.3),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.BatchNorm1d(128),
            nn.Dropout(0.3),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.BatchNorm1d(64),
            nn.Linear(64, 1)
        )

        self._init_weights()

    def _init_weights(self):
        for module in self.modules():
            if isinstance(module, nn.Linear):
                nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    nn.init.zeros_(module.bias)
            elif isinstance(module, nn.Embedding):
                nn.init.normal_(module.weight, std=0.01)

    def forward(self, user_ids, anime_ids, features):
        # Wide Component
        wide_out = self.wide(features)

        # Deep Component
        user_emb = self.user_embedding(user_ids)
        anime_emb = self.item_embedding(anime_ids)
        deep_input = torch.cat([user_emb, anime_emb], dim=1)
        deep_out = self.deep(deep_input)

        # Combine outputs
        combined_out = wide_out + deep_out
        return torch.clamp(combined_out, 0, 10)

In [13]:
# Prepare features and targets
X = data[numeric_features].values
user_ids = data['user_id_mapped'].values
anime_ids = data['anime_id_mapped'].values
ratings = data['rating'].values

# Convert to tensors
X = torch.FloatTensor(X)
user_ids = torch.LongTensor(user_ids)
anime_ids = torch.LongTensor(anime_ids)
ratings = torch.FloatTensor(ratings).view(-1, 1)

# Create dataset
dataset = TensorDataset(user_ids, anime_ids, X, ratings)

# Split data
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(
    dataset, [train_size, test_size]
)

# Create dataloaders
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=128)

print("Data preparation completed.")

Data preparation completed.


In [14]:
def train_epoch(model, train_loader, criterion, optimizer, device):
    model.train()
    total_loss = 0

    with tqdm(train_loader, desc="Training") as pbar:
        for user_id, anime_id, features, rating in pbar:
            user_id = user_id.to(device)
            anime_id = anime_id.to(device)
            features = features.to(device)
            rating = rating.to(device)

            optimizer.zero_grad()
            prediction = model(user_id, anime_id, features)
            loss = criterion(prediction, rating)

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()

            total_loss += loss.item()
            pbar.set_postfix({"loss": f"{loss.item():.4f}"})

    return total_loss / len(train_loader)

def evaluate_model(model, test_loader, criterion, device):
    model.eval()
    total_loss = 0
    predictions = []
    actuals = []

    with torch.no_grad():
        for user_id, anime_id, features, rating in test_loader:
            user_id = user_id.to(device)
            anime_id = anime_id.to(device)
            features = features.to(device)
            rating = rating.to(device)

            prediction = model(user_id, anime_id, features)
            loss = criterion(prediction, rating)
            total_loss += loss.item()

            predictions.extend(prediction.cpu().numpy())
            actuals.extend(rating.cpu().numpy())

    predictions = np.array(predictions)
    actuals = np.array(actuals)

    rmse = np.sqrt(((predictions - actuals) ** 2).mean())
    mae = np.abs(predictions - actuals).mean()
    accuracy = np.mean(np.abs(predictions - actuals) <= 1.0)

    return {
        'test_loss': total_loss / len(test_loader),
        'rmse': rmse,
        'mae': mae,
        'accuracy': accuracy
    }

In [15]:
# Initialize model
n_users = len(user_encoder.classes_)
n_items = len(anime_encoder.classes_)
n_features = len(numeric_features)

model = WideAndDeepRecommender(
    n_users=n_users,
    n_items=n_items,
    n_features=n_features,
    embed_dim=64
).to(device)

# Training setup
criterion = nn.MSELoss()
optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=0.01)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='min', patience=2, factor=0.5, min_lr=1e-6
)

# Training loop
epochs = 10
best_loss = float('inf')

for epoch in range(epochs):
    print(f"\nEpoch {epoch + 1}/{epochs}")

    # Train
    train_loss = train_epoch(model, train_loader, criterion, optimizer, device)

    # Evaluate
    metrics = evaluate_model(model, test_loader, criterion, device)

    print(f"Train Loss: {train_loss:.4f}")
    print(f"Test Loss: {metrics['test_loss']:.4f}")
    print(f"RMSE: {metrics['rmse']:.4f}")
    print(f"MAE: {metrics['mae']:.4f}")
    print(f"Accuracy (±1): {metrics['accuracy']:.2%}")

    # Learning rate scheduling
    scheduler.step(metrics['test_loss'])

    # Save best model
    if metrics['test_loss'] < best_loss:
        best_loss = metrics['test_loss']
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': best_loss,
            'encoders': encoders
        }, 'best_model.pth')
        print("Saved new best model")


Epoch 1/1


Training: 100%|██████████| 93785/93785 [30:22<00:00, 51.47it/s, loss=0.0002]


Train Loss: 0.0152
Test Loss: 0.0570
RMSE: 0.2387
MAE: 0.1509
Accuracy (±1): 99.55%
Saved new best model


In [16]:
def get_recommendations(model, user_id, top_n=10):
    """Generate anime recommendations for a user"""
    model.eval()

    # Get user features
    user_data = data[data['user_id'] == user_id].iloc[0]
    user_id_mapped = user_encoder.transform([user_id])[0]

    # Create prediction matrix for all anime
    all_anime_ids = anime_encoder.transform(anime_encoder.classes_)
    user_ids = np.full_like(all_anime_ids, user_id_mapped)

    # Prepare features
    user_features = user_data[numeric_features].values
    feature_matrix = np.tile(user_features, (len(all_anime_ids), 1))

    # Convert to tensors
    user_ids = torch.LongTensor(user_ids).to(device)
    anime_ids = torch.LongTensor(all_anime_ids).to(device)
    features = torch.FloatTensor(feature_matrix).to(device)

    # Get predictions
    with torch.no_grad():
        predictions = model(user_ids, anime_ids, features)

    # Get top N recommendations
    predictions = predictions.cpu().numpy()
    top_indices = predictions.flatten().argsort()[-top_n:][::-1]

    # Get recommended anime details
    recommended_anime_ids = anime_encoder.inverse_transform(all_anime_ids[top_indices])
    recommendations = anime_data[anime_data['anime_id'].isin(recommended_anime_ids)][
        ['anime_id', 'Anime Title', 'Type', 'Genres']
    ].copy()

    recommendations['predicted_rating'] = predictions[top_indices]

    return recommendations

# Example usage
sample_user_id = data['user_id'].iloc[0]
recommendations = get_recommendations(model, sample_user_id)
print("\nTop 10 Recommendations:")
display(recommendations)

TypeError: can't convert np.ndarray of type numpy.object_. The only supported types are: float64, float32, float16, complex64, complex128, int64, int32, int16, int8, uint64, uint32, uint16, uint8, and bool.