# 324321214214122141

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import sparse
from sklearn.model_selection import train_test_split

# PyTorch
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

# Set Device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Paths (Match your notebook)
FOLDER_PATH = "/home/juan/Work/Midterm project/splited/"
X_PATH = os.path.join(FOLDER_PATH, "X_data.npz")
Y_PATH = os.path.join(FOLDER_PATH, "Y_data.npz")

Using device: cuda


In [2]:
print("Loading Data...")
X = sparse.load_npz(X_PATH)
Y = sparse.load_npz(Y_PATH)

print(f"Features: {X.shape}")
print(f"Targets:  {Y.shape}")  # Should be (661271, 10488)

# Split keeping sparse matrix to save RAM
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
print("Data successfully split.")

Loading Data...
Features: (661271, 5286)
Targets:  (661271, 10488)
Data successfully split.


In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class FocalLoss(nn.Module):
    def __init__(self, alpha=0.25, gamma=2.0, reduction='mean'):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, inputs, targets):
        # BCE with logits
        BCE_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction='none')
        
        # pt is the probability of being right
        pt = torch.exp(-BCE_loss)
        
        # Focal term: (1-pt)^gamma. 
        # If pt is high (easy example), this term approaches 0.
        # If pt is low (hard example), this term stays high.
        F_loss = self.alpha * (1-pt)**self.gamma * BCE_loss

        if self.reduction == 'mean': return torch.mean(F_loss)
        elif self.reduction == 'sum': return torch.sum(F_loss)
        else: return F_loss

print("Focal Loss initialized.")

Focal Loss initialized.


In [4]:
class ResidualBlock(nn.Module):
    def __init__(self, dim, dropout=0.3):
        super().__init__()
        self.block = nn.Sequential(
            nn.Linear(dim, dim),
            nn.BatchNorm1d(dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(dim, dim),
            nn.BatchNorm1d(dim),
            nn.ReLU(),
            nn.Dropout(dropout)
        )

    def forward(self, x):
        # The "Residual" connection: x + block(x)
        return x + self.block(x)

class DeepDrugResNet(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(DeepDrugResNet, self).__init__()
        
        # 1. Project Input up to Hidden Dimension
        self.input_layer = nn.Sequential(
            nn.Linear(input_dim, 2048),
            nn.BatchNorm1d(2048),
            nn.ReLU(),
            nn.Dropout(0.2)
        )
        
        # 2. Residual Blocks (Deep Learning Magic)
        self.res1 = ResidualBlock(2048)
        self.res2 = ResidualBlock(2048)
        
        # 3. Bottleneck to Output
        self.bottleneck = nn.Sequential(
            nn.Linear(2048, 1024),
            nn.BatchNorm1d(1024),
            nn.ReLU()
        )
        
        self.output_layer = nn.Linear(1024, output_dim)

    def forward(self, x):
        x = self.input_layer(x)
        x = self.res1(x)
        x = self.res2(x)
        x = self.bottleneck(x)
        return self.output_layer(x)

print("DeepDrugResNet Architecture defined.")

DeepDrugResNet Architecture defined.


In [5]:
class DrugMultilabelDataset(Dataset):
    def __init__(self, X, Y):
        self.X = X
        self.Y = Y
        
    def __len__(self):
        return self.X.shape[0]
    
    def __getitem__(self, idx):
        # Convert only one row to dense at a time
        x_row = torch.tensor(self.X[idx].toarray(), dtype=torch.float32).squeeze()
        y_row = torch.tensor(self.Y[idx].toarray(), dtype=torch.float32).squeeze()
        return x_row, y_row

# DataLoaders
batch_size = 128  # Increase to 256 or 512 if your GPU has >8GB VRAM
train_ds = DrugMultilabelDataset(X_train, Y_train)
test_ds = DrugMultilabelDataset(X_test, Y_test)

train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=2)
test_dl = DataLoader(test_ds, batch_size=batch_size, shuffle=False, num_workers=2)
print("DataLoaders ready.")

DataLoaders ready.


In [6]:
# 1. Setup Model
input_size = X.shape[1]
output_size = Y.shape[1]
model = DeepDrugResNet(input_size, output_size).to(device)

# 2. Setup Smart Loss
# We use Focal Loss which handles imbalance automatically
criterion = FocalLoss(alpha=0.75, gamma=2) 

# 3. Optimizer with Weight Decay (Regularization)
optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=1e-4)

# 4. Scheduler (FIXED: Removed 'verbose=True')
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, 
    mode='min', 
    factor=0.1, 
    patience=1
)

# 5. Training Loop
epochs = 10 
print(f"Training Deep ResNet on {device}...")

model.train()
for epoch in range(epochs):
    total_loss = 0
    
    for i, (xb, yb) in enumerate(train_dl):
        xb, yb = xb.to(device), yb.to(device)
        
        optimizer.zero_grad()
        logits = model(xb)
        loss = criterion(logits, yb)
        
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        
    avg_loss = total_loss / len(train_dl)
    
    # Get current Learning Rate manually
    current_lr = optimizer.param_groups[0]['lr']
    
    print(f"Epoch {epoch+1}/{epochs} | Loss: {avg_loss:.5f} | LR: {current_lr:.6f}")
    
    # Update learning rate based on loss
    scheduler.step(avg_loss)

print("Training Complete.")

Training Deep ResNet on cuda...
Epoch 1/10 | Loss: 0.00040 | LR: 0.001000
Epoch 2/10 | Loss: 0.00030 | LR: 0.001000
Epoch 3/10 | Loss: 0.00029 | LR: 0.001000
Epoch 4/10 | Loss: 0.00028 | LR: 0.001000
Epoch 5/10 | Loss: 0.00027 | LR: 0.001000
Epoch 6/10 | Loss: 0.00027 | LR: 0.001000
Epoch 7/10 | Loss: 0.00027 | LR: 0.001000
Epoch 8/10 | Loss: 0.00026 | LR: 0.001000
Epoch 9/10 | Loss: 0.00026 | LR: 0.001000
Epoch 10/10 | Loss: 0.00025 | LR: 0.001000
Training Complete.


In [9]:
import torch
import numpy as np

def evaluate_and_log_memory_efficient(model, dataloader, threshold=0.2):
    model.eval()
    
    # 1. Initialize Counters (on GPU to be fast)
    # These take almost 0 RAM compared to storing the whole dataset
    total_tp = 0
    total_fp = 0
    total_fn = 0
    
    # Store counts for plotting later (Size: 10488, very small)
    pred_counts = torch.zeros(output_size).to(device)
    true_counts = torch.zeros(output_size).to(device)
    
    print(f"Streaming evaluation (Threshold={threshold})...")
    
    with torch.no_grad():
        for xb, yb in dataloader:
            xb = xb.to(device)
            yb = yb.to(device) # Move targets to GPU for fast math
            
            # Forward pass
            logits = model(xb)
            probs = torch.sigmoid(logits)
            preds = (probs > threshold).float()
            
            # --- RAM SAVING MAGIC STARTS HERE ---
            # Instead of saving 'preds', we calculate stats immediately & discard data
            
            # 1. Update Metrics Stats (Micro-Average logic)
            total_tp += (preds * yb).sum().item()       # True Positives
            total_fp += (preds * (1 - yb)).sum().item() # False Positives
            total_fn += ((1 - preds) * yb).sum().item() # False Negatives
            
            # 2. Update Plotting Stats (Sum of events)
            pred_counts += preds.sum(dim=0)
            true_counts += yb.sum(dim=0)
            
            # Python automatically frees 'xb', 'preds', etc. here
            # --- END OF BATCH ---

    # Calculate Final Metrics
    epsilon = 1e-7
    precision = total_tp / (total_tp + total_fp + epsilon)
    recall = total_tp / (total_tp + total_fn + epsilon)
    f1 = 2 * (precision * recall) / (precision + recall + epsilon)
    
    print("\n--- Test Set Results (Memory Efficient) ---")
    print(f"Micro Precision: {precision:.4f}")
    print(f"Micro Recall:    {recall:.4f}")
    print(f"Micro F1 Score:  {f1:.4f}")
    
    # Move counts to CPU for plotting
    return true_counts.cpu().numpy(), pred_counts.cpu().numpy()

# Execute
# Note: We do NOT get 'y_pred' back because it's too big for RAM. 
# We get 'counts' instead which allows us to make the plots.
true_counts, pred_counts = evaluate_and_log_memory_efficient(model, test_dl, threshold=0.5)

Streaming evaluation (Threshold=0.5)...

--- Test Set Results (Memory Efficient) ---
Micro Precision: 0.5264
Micro Recall:    0.0240
Micro F1 Score:  0.0459


In [11]:
def evaluate_precision_at_k(model, dataloader, k=10):
    model.eval()
    total_precision = 0.0
    num_samples = 0
    
    with torch.no_grad():
        for xb, yb in dataloader:
            xb, yb = xb.to(device), yb.to(device)
            
            # Get raw logits (ranking scores)
            logits = model(xb)
            probs = torch.sigmoid(logits)
            
            # Get the top K highest probability indices
            # We don't care if prob is 0.01 or 0.99, just the order
            _, top_indices = torch.topk(probs, k=k, dim=1)
            
            # Check if actual truth (1) exists at those indices
            # gather() pulls the values from yb at the positions in top_indices
            relevant = torch.gather(yb, 1, top_indices)
            
            # Average correct in top K
            batch_p_at_k = relevant.sum().item() / (k * xb.size(0))
            
            # Accumulate (weighted by batch size)
            total_precision += batch_p_at_k * xb.size(0)
            num_samples += xb.size(0)

    score = total_precision / num_samples
    print(f"\n Precision@{k}: {score:.2%}")

# Run it
evaluate_precision_at_k(model, test_dl, k=100)


 Precision@100: 1.92%
