In [2]:

import pandas as pd
from google.colab import drive
import random as random
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [3]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [4]:
# === PYTORCH DEEPFM-LIKE MODEL WITH SEPARATED CF AND CB CLASSES ===
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd

# === LOAD DATA ===
df = pd.read_csv("/content/drive/My Drive/deepfm_preprocessed_data.csv")
text_embeddings = np.load("/content/drive/My Drive/deepfm_text_embeddings.npy")
features_embeddings = np.load("/content/drive/My Drive/deepfm_features_embeddings.npy")

In [9]:
df.columns

Index(['user_id_encoded', 'rating_scaled', 'parent_asin', 'product_title',
       'features', 'price_scaled', 'Item Weight_scaled', 'length_scaled',
       'width_scaled', 'height_scaled', 'Color_encoded', 'Material_encoded',
       'Manufacturer_encoded', 'sentiment', 'helpful', 'u_idx', 'i_idx'],
      dtype='object')

In [10]:
# Define feature columns used in X_meta
features = [
    'rating', 'price_scaled', 'Item Weight_scaled',
    'length_scaled', 'width_scaled', 'height_scaled',
    'sentiment', 'Color_encoded', 'Material_encoded', 'Manufacturer_encoded'
]

# Encode user and item IDs
df['u_idx'] = df['user_id_encoded'].astype(int)
df['i_idx'] = df['parent_asin'].astype('category').cat.codes.astype(int)

# Binary label: rating > 4
y = (df['rating'] > 4).astype(int).values

# Combine structured + text + feature embeddings
X_meta = np.hstack([df[features].values, text_embeddings, features_embeddings])

In [11]:
# === CUSTOM DATASET ===
class RecommenderDataset(Dataset):
    def __init__(self, df, meta_features, labels):
        self.u_idx = torch.tensor(df['u_idx'].values, dtype=torch.long)
        self.i_idx = torch.tensor(df['i_idx'].values, dtype=torch.long)
        self.meta = torch.tensor(meta_features, dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.float32)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            'u_idx': self.u_idx[idx],
            'i_idx': self.i_idx[idx],
            'meta': self.meta[idx]
        }, self.labels[idx]

# === COLLABORATIVE FILTERING MODULE ===
class CollaborativeTower(nn.Module):
    def __init__(self, n_users, n_items, emb_dim):
        super().__init__()
        self.user_emb = nn.Embedding(n_users, emb_dim)
        self.item_emb = nn.Embedding(n_items, emb_dim)
        self.user_bias = nn.Embedding(n_users, 1)
        self.item_bias = nn.Embedding(n_items, 1)

    def forward(self, u_idx, i_idx):
        user_vec = self.user_emb(u_idx)
        item_vec = self.item_emb(i_idx)
        user_bias = self.user_bias(u_idx).squeeze()
        item_bias = self.item_bias(i_idx).squeeze()

        fm1 = (user_bias + item_bias).unsqueeze(1)
        fm2 = 0.5 * (torch.pow(user_vec + item_vec, 2) - torch.pow(user_vec, 2) - torch.pow(item_vec, 2))
        return fm1, fm2

In [28]:
# === CONTENT-BASED MODULE ===
class ContentTower(nn.Module):
    def __init__(self, meta_dim, hidden_dim=64):
        super().__init__()
        self.fc = nn.Sequential(
            nn.Linear(meta_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.2)
        )

    def forward(self, meta):
        return self.fc(meta)

# === FINAL DEEPFM MODEL ===
class DeepFM(nn.Module):
    def __init__(self, n_users, n_items, emb_dim, meta_dim, hidden_dim=64):
        super().__init__()
        self.cf = CollaborativeTower(n_users, n_items, emb_dim)
        self.cb = ContentTower(meta_dim, hidden_dim)
        self.output = nn.Linear(1 + emb_dim + hidden_dim, 1)

    def forward(self, inputs):
        fm1, fm2 = self.cf(inputs['u_idx'], inputs['i_idx'])
        cb_out = self.cb(inputs['meta'])
        x = torch.cat([fm1, fm2, cb_out], dim=1)
        return torch.sigmoid(self.output(x)).squeeze()


# === TRAINING FUNCTION ===
def train_model(model, train_loader, val_loader, epochs, criterion, optimizer, device):
    best_auc = 0.0
    best_state = None

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for batch_x, y in train_loader:
            for key in batch_x:
                batch_x[key] = batch_x[key].to(device)
            y = y.to(device)

            optimizer.zero_grad()
            preds = model(batch_x)
            loss = criterion(preds, y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        model.eval()
        y_true, y_pred = [], []
        with torch.no_grad():
            for batch_x, y in val_loader:
                for key in batch_x:
                    batch_x[key] = batch_x[key].to(device)
                preds = model(batch_x)
                y_true.extend(y.numpy())
                y_pred.extend(preds.cpu().numpy())

        from sklearn.metrics import roc_auc_score
        auc = roc_auc_score(y_true, y_pred)
        print(f"Epoch {epoch+1} | Train Loss: {total_loss/len(train_loader):.4f} | Val AUC: {auc:.4f}")

        if auc > best_auc:
            best_auc = auc
            best_state = model.state_dict()

    if best_state:
        model.load_state_dict(best_state)
        print("Best model restored with AUC:", best_auc)
    return model

# === USAGE ===
from sklearn.model_selection import train_test_split

df_train, df_temp, y_train, y_temp, X_train_meta, X_temp_meta = train_test_split(df, y, X_meta, test_size=0.2, random_state=42)
df_val, df_test, y_val, y_test, X_val_meta, X_test_meta = train_test_split(df_temp, y_temp, X_temp_meta, test_size=0.5, random_state=42)

train_data = RecommenderDataset(df_train, X_train_meta, y_train)
val_data = RecommenderDataset(df_val, X_val_meta, y_val)
test_data = RecommenderDataset(df_test, X_test_meta, y_test)

train_loader = DataLoader(train_data, batch_size=512, shuffle=True)
val_loader = DataLoader(val_data, batch_size=512)
test_loader = DataLoader(test_data, batch_size=512)

# Initialize model
model = DeepFM(n_users=df['u_idx'].max()+2, n_items=df['i_idx'].max()+2, emb_dim=16, meta_dim=X_meta.shape[1])
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Loss and optimizer
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Train
model = train_model(model, train_loader, val_loader, epochs=10, criterion=criterion, optimizer=optimizer, device=device)

# Evaluate
model.eval()
y_true, y_pred = [], []
with torch.no_grad():
    for batch_x, y in test_loader:
        for key in batch_x:
            batch_x[key] = batch_x[key].to(device)
        preds = model(batch_x)
        y_true.extend(y.numpy())
        y_pred.extend(preds.cpu().numpy())

from sklearn.metrics import roc_auc_score, classification_report
auc = roc_auc_score(y_true, y_pred)
print(f"Test AUC: {auc:.4f}")

Epoch 1 | Train Loss: 0.6374 | Val AUC: 0.5113
Epoch 2 | Train Loss: 0.6034 | Val AUC: 0.5695
Epoch 3 | Train Loss: 0.5758 | Val AUC: 0.6351
Epoch 4 | Train Loss: 0.4999 | Val AUC: 0.8412
Epoch 5 | Train Loss: 0.4035 | Val AUC: 0.8803
Epoch 6 | Train Loss: 0.3549 | Val AUC: 0.8224
Epoch 7 | Train Loss: 0.3072 | Val AUC: 0.8068
Epoch 8 | Train Loss: 0.2824 | Val AUC: 0.9578
Epoch 9 | Train Loss: 0.2189 | Val AUC: 0.8903
Epoch 10 | Train Loss: 0.1934 | Val AUC: 0.8453
Best model restored with AUC: 0.9578443629907057
Test AUC: 0.8423
