In [2]:
# validation with synthetic dataset
import torch, torchtext
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import Layers, loaders
from tqdm import tqdm
print(torch.__version__, torchtext.__version__)

device = "cuda" if torch.cuda.is_available() else "cpu"
model = Layers.HybridCVNN(
    image_channels=3,
    filter_dimension=3,   
    num_classes=101
).to(device)

1.13.1+cu117 0.14.1


## Now trying with food 101 data

In [3]:
import wandb

In [4]:
epochs = 1000
lr = 1e-4

In [5]:
train_loader, test_loader = loaders.get_food101_dataloaders()

In [6]:
# Initialize model, loss, optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=lr, betas=(0.9, 0.999))
scheduler = torch.optim.lr_scheduler.LambdaLR(
    optimizer,
    lr_lambda=lambda e: (e + 1) / 20 if e < 20 else 0.95 ** ((e - 200) / 10)
)
criterion = nn.CrossEntropyLoss()

In [7]:
# set up wandb
wandb.init(project="hpml-final", name="1,3binary")
wandb.config.update({
    "model_name": "custom-BCVNN",
    "batch_size": 64, "lr": 1e-4,
    "optimizer": "Adam", "num_workers": 4,
    "kernel_size": 3,
    "epochs": epochs, "compile_mode": False,
    "device": str(device)
})

[34m[1mwandb[0m: [32m[41mERROR[0m Failed to detect the name of this notebook. You can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mchriszeng[0m ([33mchriszeng-columbia-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
# Training loop

for epoch in range(epochs):
    model.train()
    total_loss = 0.0
    train_correct = 0
    train_total = 0

    # training pass
    for batch_idx, (images, labels) in tqdm(enumerate(train_loader),
                                           total=len(train_loader), desc="training batches"):
        images, labels = images.to(device), labels.to(device)

        outputs = model(images)
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # accumulate train stats
        total_loss += loss.item()
        preds = outputs.argmax(dim=1)
        train_correct += (preds == labels).sum().item()
        train_total += labels.size(0)

    # compute train averages
    avg_train_loss = total_loss / len(train_loader)
    train_acc = train_correct / train_total if train_total > 0 else 0.0

    # validation pass
    model.eval()
    test_loss_sum = 0.0
    test_correct = 0
    test_total = 0
    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            loss = criterion(outputs, labels)

            test_loss_sum += loss.item()
            preds = outputs.argmax(dim=1)
            test_correct += (preds == labels).sum().item()
            test_total += labels.size(0)

    avg_test_loss = test_loss_sum / len(test_loader)
    test_acc = test_correct / test_total if test_total > 0 else 0.0

    # log to wandb
    wandb.log({
      "train/loss": avg_train_loss,
      "train/accuracy": train_acc,
      "test/loss": avg_test_loss,
      "test/accuracy": test_acc,
      "epoch": epoch + 1,
      "device": str(device)
    }, step=epoch + 1)
    if (epoch + 1) % 100 == 0:
        ckpt_path = f"checkpoints/epoch_{epoch+1:04d}.pth"
        torch.save({
            "epoch": epoch + 1,
            "model_state_dict": model.state_dict(),
            "optimizer_state_dict": optimizer.state_dict(),
            "train_loss": avg_train_loss,
            "val_loss": avg_test_loss,
            "train_acc": train_acc,
            "val_acc": test_acc,
            "device": str(device),
        }, ckpt_path)
        print(f"üíæ Saved checkpoint to {ckpt_path}")
    





training batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1184/1184 [06:48<00:00,  2.90it/s]
training batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1184/1184 [06:47<00:00,  2.90it/s]
training batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1184/1184 [06:47<00:00,  2.90it/s]
training batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1184/1184 [06:47<00:00,  2.90it/s]
training batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1184/1184 [06:47<00:00,  2.90it/s]
training batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1184/1184 [06:48<00:00,  2.90it/s]
training batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1184/1184 [06:47<00:00,  2.90it/s]
training batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1184/1184 [06:48<00:00,  2.90it/s]
training batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1184/1184 [06:48<00:00,  2.90it/s]
training batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1184/1184 [06:48<00:00,  2.90it/s]
training batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1184/1184 [06:48<00:00,  2.90it/s]
training b

In [None]:
# Training loop

for epoch in range(epochs):
    model.train()
    total_loss = 0.0
    train_correct = 0
    train_total = 0

    # training pass
    for batch_idx, (images, labels) in tqdm(enumerate(train_loader),
                                           total=len(train_loader), desc="training batches"):
        images, labels = images.to(device), labels.to(device)

        outputs = model(images)
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # accumulate train stats
        total_loss += loss.item()
        preds = outputs.argmax(dim=1)
        train_correct += (preds == labels).sum().item()
        train_total += labels.size(0)

    # compute train averages
    avg_train_loss = total_loss / len(train_loader)
    train_acc = train_correct / train_total if train_total > 0 else 0.0

    # validation pass
    model.eval()
    test_loss_sum = 0.0
    test_correct = 0
    test_total = 0
    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            loss = criterion(outputs, labels)

            test_loss_sum += loss.item()
            preds = outputs.argmax(dim=1)
            test_correct += (preds == labels).sum().item()
            test_total += labels.size(0)

    avg_test_loss = test_loss_sum / len(test_loader)
    test_acc = test_correct / test_total if test_total > 0 else 0.0

    # log to wandb
    wandb.log({
      "train/loss": avg_train_loss,
      "train/accuracy": train_acc,
      "test/loss": avg_test_loss,
      "test/accuracy": test_acc,
      "epoch": epoch + 1,
      "device": str(device)
    }, step=epoch + 1)
    
    if (epoch + 1) % 100 == 0:
        ckpt_path = f"checkpoints/epoch_{epoch+1:04d}.pth"
        torch.save({
            "epoch": epoch + 1,
            "model_state_dict": model.state_dict(),
            "optimizer_state_dict": optimizer.state_dict(),
            "train_loss": avg_train_loss,
            "val_loss": avg_test_loss,
            "train_acc": train_acc,
            "val_acc": test_acc,
            "device": str(device),
        }, ckpt_path)
        print(f"üíæ Saved checkpoint to {ckpt_path}")




training batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1184/1184 [07:49<00:00,  2.52it/s]
training batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1184/1184 [07:48<00:00,  2.53it/s]
training batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1184/1184 [07:48<00:00,  2.53it/s]
training batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1184/1184 [07:48<00:00,  2.53it/s]
training batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1184/1184 [07:48<00:00,  2.53it/s]
training batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1184/1184 [07:48<00:00,  2.53it/s]
training batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1184/1184 [07:48<00:00,  2.52it/s]
training batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1184/1184 [07:48<00:00,  2.52it/s]
training batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1184/1184 [07:48<00:00,  2.52it/s]
training batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1184/1184 [07:48<00:00,  2.53it/s]
training batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1184/1184 [07:48<00:00,  2.53it/s]
training b

In [None]:
wandb.finish()

below made extra for testing, can ignore

In [9]:
wandb.finish()

0,1
epoch,‚ñÅ‚ñÉ‚ñÜ‚ñà
test/accuracy,‚ñÅ‚ñà‚ñÅ‚ñÅ
test/loss,‚ñà‚ñÑ‚ñÑ‚ñÅ
train/accuracy,‚ñÉ‚ñÅ‚ñá‚ñà
train/loss,‚ñà‚ñÉ‚ñÇ‚ñÅ

0,1
device,cuda
epoch,4
test/accuracy,0.0099
test/loss,4.61515
train/accuracy,0.00965
train/loss,4.61519
