In [1]:
# !pip install torch==2.2.2 torchtext==0.17.2 nltk
# !pip install torchdata==0.7.1
# !pip install pyarrow

In [2]:
import torch
import os
import pickle
from model import ClassificationNet, train_epoch, evaluate_epoch, save_list_to_file, load_list_from_file

In [3]:
from dataloader import get_dataloaders, DEVICE

print("Imported successfully!")
print(f"Running on device: {DEVICE}")

BATCH_SIZE = 64
train_dataloader, valid_dataloader, test_dataloader, vocab = get_dataloaders(batch_size=BATCH_SIZE)

# --- Verification Step ---
print("\nVerifying by fetching one batch from train_dataloader...")
labels, texts = next(iter(train_dataloader))

print(f"Labels batch shape: {labels.shape}")
print(f"Texts batch shape: {texts.shape}")

Imported successfully!
Running on device: cuda
Loading data from Parquet files...
Vocabulary Size: 95811

Verifying by fetching one batch from train_dataloader...
Labels batch shape: torch.Size([64])
Texts batch shape: torch.Size([64, 72])


In [4]:
vocab_size = len(vocab)
num_classes = 4

In [5]:
EXPERIMENT_DIR = "runs/adam_from_epoch71"
METRICS_DIR = os.path.join(EXPERIMENT_DIR, "metrics") 
os.makedirs(EXPERIMENT_DIR, exist_ok=True)
os.makedirs(METRICS_DIR, exist_ok=True) 

print(f"Experiment artifacts will be saved in: {EXPERIMENT_DIR}")
print(f"Metrics will be saved in: {METRICS_DIR}")

Experiment artifacts will be saved in: runs/adam_from_epoch71
Metrics will be saved in: runs/adam_from_epoch71/metrics


In [6]:
model = ClassificationNet(vocab_size=vocab_size, num_class=num_classes).to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = torch.nn.CrossEntropyLoss()
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=4, gamma=0.1)

In [7]:
start_epoch = 1
best_val_accuracy = 0.0
history = {'train_loss': [], 'train_acc': [], 'val_loss': [], 'val_acc': []}

In [8]:
BASE_CHECKPOINT_PATH = 'runs/adam_from_epoch51/checkpoint.pth'

In [9]:
if os.path.exists(BASE_CHECKPOINT_PATH):
    print(f"Found base checkpoint. Initializing model with weights from '{BASE_CHECKPOINT_PATH}'...")
    checkpoint = torch.load(BASE_CHECKPOINT_PATH)
    model.load_state_dict(checkpoint['model_state_dict']) # Load weights only
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    
    if 'scheduler_state_dict' in checkpoint:
        scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
        print("Scheduler state loaded.")
        
    start_epoch = checkpoint['epoch'] + 1
    history = checkpoint['history']                       # Continue history
    best_val_accuracy = checkpoint['best_val_accuracy']
else:
    print("No checkpoints found. Starting a completely new training run.")

Found base checkpoint. Initializing model with weights from 'runs/adam_from_epoch51/checkpoint.pth'...


In [10]:
EPOCHS = 90            # Train up to a total of 90 epochs

print(f"Starting training from epoch {start_epoch}...")

for epoch in range(start_epoch, EPOCHS + 1):
    print("-" * 50)
    print(f"Epoch {epoch}/{EPOCHS}")
    
    # Run training and validation
    train_loss, train_acc = train_epoch(model, train_dataloader, criterion, optimizer, DEVICE)
    history['train_loss'].append(train_loss)
    history['train_acc'].append(train_acc)
    
    val_loss, val_acc = evaluate_epoch(model, valid_dataloader, criterion, DEVICE)
    history['val_loss'].append(val_loss)
    history['val_acc'].append(val_acc)
    
    scheduler.step()
    
    # Get current LR to print it
    current_lr = optimizer.param_groups[0]['lr']
    
    # summary
    print(f"\nEpoch Summary:")
    print(f"\tTrain Loss: {train_loss:.4f} | Train Acc: {train_acc*100:.2f}%")
    print(f"\tValid Loss: {val_loss:.4f} | Valid Acc: {val_acc*100:.2f}%")
    print(f"\tLearning Rate: {current_lr:.6f}")

    # Save best model to the experiment folder
    if val_acc > best_val_accuracy:
        best_val_accuracy = val_acc
        best_model_path = os.path.join(EXPERIMENT_DIR, 'best_model.pth')
        torch.save(model.state_dict(), best_model_path)
        print(f"New best model saved to '{best_model_path}'")

print("-" * 50)
print("Training Finished!")
        
        
# Save final history to the experiment folder
for key, value in history.items():
    file_path = os.path.join(METRICS_DIR, f"{key}.pkl")
    save_list_to_file(value, file_path)

Training:   0%|          | 0/1725 [00:00<?, ?it/s]

Starting training from epoch 71...
--------------------------------------------------
Epoch 71/90


Training: 100%|██████████| 1725/1725 [00:42<00:00, 40.40it/s, loss=0.4482]
Evaluating: 100%|██████████| 150/150 [00:01<00:00, 127.30it/s, loss=0.4059]



Epoch Summary:
	Train Loss: 0.4482 | Train Acc: 84.59%
	Valid Loss: 0.4059 | Valid Acc: 87.05%
	Learning Rate: 0.001000


Training:   0%|          | 5/1725 [00:00<00:41, 41.72it/s, loss=0.3902]

New best model saved to 'runs/adam_from_epoch71/best_model.pth'
--------------------------------------------------
Epoch 72/90


Training: 100%|██████████| 1725/1725 [00:41<00:00, 41.63it/s, loss=0.4442]
Evaluating: 100%|██████████| 150/150 [00:01<00:00, 130.68it/s, loss=0.4310]
Training:   0%|          | 4/1725 [00:00<00:45, 37.87it/s, loss=0.4421]


Epoch Summary:
	Train Loss: 0.4442 | Train Acc: 84.62%
	Valid Loss: 0.4310 | Valid Acc: 86.85%
	Learning Rate: 0.001000
--------------------------------------------------
Epoch 73/90


Training: 100%|██████████| 1725/1725 [00:41<00:00, 41.42it/s, loss=0.4230]
Evaluating: 100%|██████████| 150/150 [00:01<00:00, 132.96it/s, loss=0.4063]



Epoch Summary:
	Train Loss: 0.4230 | Train Acc: 85.46%
	Valid Loss: 0.4063 | Valid Acc: 87.33%
	Learning Rate: 0.001000


Training:   0%|          | 5/1725 [00:00<00:39, 43.75it/s, loss=0.4898]

New best model saved to 'runs/adam_from_epoch71/best_model.pth'
--------------------------------------------------
Epoch 74/90


Training: 100%|██████████| 1725/1725 [00:41<00:00, 41.49it/s, loss=0.4211]
Evaluating: 100%|██████████| 150/150 [00:01<00:00, 130.66it/s, loss=0.3909]



Epoch Summary:
	Train Loss: 0.4211 | Train Acc: 85.57%
	Valid Loss: 0.3909 | Valid Acc: 87.43%
	Learning Rate: 0.000100


Training:   0%|          | 3/1725 [00:00<01:02, 27.77it/s, loss=0.3838]

New best model saved to 'runs/adam_from_epoch71/best_model.pth'
--------------------------------------------------
Epoch 75/90


Training: 100%|██████████| 1725/1725 [00:41<00:00, 41.57it/s, loss=0.3934]
Evaluating: 100%|██████████| 150/150 [00:01<00:00, 134.57it/s, loss=0.3830]



Epoch Summary:
	Train Loss: 0.3934 | Train Acc: 86.40%
	Valid Loss: 0.3830 | Valid Acc: 88.10%
	Learning Rate: 0.000100


Training:   0%|          | 3/1725 [00:00<01:00, 28.32it/s, loss=0.4608]

New best model saved to 'runs/adam_from_epoch71/best_model.pth'
--------------------------------------------------
Epoch 76/90


Training: 100%|██████████| 1725/1725 [00:41<00:00, 41.47it/s, loss=0.3847]
Evaluating: 100%|██████████| 150/150 [00:01<00:00, 132.62it/s, loss=0.3743]



Epoch Summary:
	Train Loss: 0.3847 | Train Acc: 86.71%
	Valid Loss: 0.3743 | Valid Acc: 88.25%
	Learning Rate: 0.000100


Training:   0%|          | 5/1725 [00:00<00:42, 40.50it/s, loss=0.3942]

New best model saved to 'runs/adam_from_epoch71/best_model.pth'
--------------------------------------------------
Epoch 77/90


Training: 100%|██████████| 1725/1725 [00:41<00:00, 41.35it/s, loss=0.3803]
Evaluating: 100%|██████████| 150/150 [00:01<00:00, 134.22it/s, loss=0.3778]
Training:   0%|          | 2/1725 [00:00<01:56, 14.83it/s, loss=0.3994]


Epoch Summary:
	Train Loss: 0.3803 | Train Acc: 86.97%
	Valid Loss: 0.3778 | Valid Acc: 88.09%
	Learning Rate: 0.000100
--------------------------------------------------
Epoch 78/90


Training: 100%|██████████| 1725/1725 [00:41<00:00, 41.53it/s, loss=0.3757]
Evaluating: 100%|██████████| 150/150 [00:01<00:00, 131.81it/s, loss=0.3654]



Epoch Summary:
	Train Loss: 0.3757 | Train Acc: 87.11%
	Valid Loss: 0.3654 | Valid Acc: 88.35%
	Learning Rate: 0.000010


Training:   0%|          | 5/1725 [00:00<00:41, 41.25it/s, loss=0.3878]

New best model saved to 'runs/adam_from_epoch71/best_model.pth'
--------------------------------------------------
Epoch 79/90


Training: 100%|██████████| 1725/1725 [00:41<00:00, 41.58it/s, loss=0.3705]
Evaluating: 100%|██████████| 150/150 [00:01<00:00, 131.86it/s, loss=0.3672]



Epoch Summary:
	Train Loss: 0.3705 | Train Acc: 87.26%
	Valid Loss: 0.3672 | Valid Acc: 88.40%
	Learning Rate: 0.000010


Training:   0%|          | 5/1725 [00:00<00:39, 43.39it/s, loss=0.3385]

New best model saved to 'runs/adam_from_epoch71/best_model.pth'
--------------------------------------------------
Epoch 80/90


Training: 100%|██████████| 1725/1725 [00:41<00:00, 41.61it/s, loss=0.3702]
Evaluating: 100%|██████████| 150/150 [00:01<00:00, 128.08it/s, loss=0.3684]
Training:   0%|          | 5/1725 [00:00<00:38, 44.30it/s, loss=0.4257]


Epoch Summary:
	Train Loss: 0.3702 | Train Acc: 87.29%
	Valid Loss: 0.3684 | Valid Acc: 88.36%
	Learning Rate: 0.000010
--------------------------------------------------
Epoch 81/90


Training: 100%|██████████| 1725/1725 [00:41<00:00, 41.49it/s, loss=0.3704]
Evaluating: 100%|██████████| 150/150 [00:01<00:00, 138.11it/s, loss=0.3678]
Training:   0%|          | 4/1725 [00:00<00:44, 39.01it/s, loss=0.2933]


Epoch Summary:
	Train Loss: 0.3704 | Train Acc: 87.35%
	Valid Loss: 0.3678 | Valid Acc: 88.35%
	Learning Rate: 0.000010
--------------------------------------------------
Epoch 82/90


Training: 100%|██████████| 1725/1725 [00:41<00:00, 41.63it/s, loss=0.3695]
Evaluating: 100%|██████████| 150/150 [00:01<00:00, 130.84it/s, loss=0.3672]
Training:   0%|          | 4/1725 [00:00<00:47, 36.28it/s, loss=0.3528]


Epoch Summary:
	Train Loss: 0.3695 | Train Acc: 87.30%
	Valid Loss: 0.3672 | Valid Acc: 88.30%
	Learning Rate: 0.000001
--------------------------------------------------
Epoch 83/90


Training: 100%|██████████| 1725/1725 [00:41<00:00, 41.58it/s, loss=0.3679]
Evaluating: 100%|██████████| 150/150 [00:01<00:00, 135.25it/s, loss=0.3671]
Training:   0%|          | 3/1725 [00:00<01:06, 26.05it/s, loss=0.3893]


Epoch Summary:
	Train Loss: 0.3679 | Train Acc: 87.41%
	Valid Loss: 0.3671 | Valid Acc: 88.34%
	Learning Rate: 0.000001
--------------------------------------------------
Epoch 84/90


Training: 100%|██████████| 1725/1725 [00:41<00:00, 41.57it/s, loss=0.3674]
Evaluating: 100%|██████████| 150/150 [00:01<00:00, 128.75it/s, loss=0.3672]
Training:   0%|          | 4/1725 [00:00<00:50, 33.92it/s, loss=0.3726]


Epoch Summary:
	Train Loss: 0.3674 | Train Acc: 87.43%
	Valid Loss: 0.3672 | Valid Acc: 88.39%
	Learning Rate: 0.000001
--------------------------------------------------
Epoch 85/90


Training: 100%|██████████| 1725/1725 [00:41<00:00, 41.60it/s, loss=0.3698]
Evaluating: 100%|██████████| 150/150 [00:01<00:00, 129.10it/s, loss=0.3671]
Training:   0%|          | 3/1725 [00:00<01:05, 26.39it/s, loss=0.3080]


Epoch Summary:
	Train Loss: 0.3698 | Train Acc: 87.34%
	Valid Loss: 0.3671 | Valid Acc: 88.38%
	Learning Rate: 0.000001
--------------------------------------------------
Epoch 86/90


Training: 100%|██████████| 1725/1725 [00:41<00:00, 41.55it/s, loss=0.3702]
Evaluating: 100%|██████████| 150/150 [00:01<00:00, 120.75it/s, loss=0.3675]
Training:   0%|          | 3/1725 [00:00<01:01, 28.09it/s, loss=0.3665]


Epoch Summary:
	Train Loss: 0.3702 | Train Acc: 87.27%
	Valid Loss: 0.3675 | Valid Acc: 88.40%
	Learning Rate: 0.000000
--------------------------------------------------
Epoch 87/90


Training: 100%|██████████| 1725/1725 [00:41<00:00, 41.61it/s, loss=0.3695]
Evaluating: 100%|██████████| 150/150 [00:01<00:00, 132.05it/s, loss=0.3676]



Epoch Summary:
	Train Loss: 0.3695 | Train Acc: 87.35%
	Valid Loss: 0.3676 | Valid Acc: 88.41%
	Learning Rate: 0.000000


Training:   0%|          | 3/1725 [00:00<00:57, 29.73it/s, loss=0.4301]

New best model saved to 'runs/adam_from_epoch71/best_model.pth'
--------------------------------------------------
Epoch 88/90


Training: 100%|██████████| 1725/1725 [00:41<00:00, 41.27it/s, loss=0.3680]
Evaluating: 100%|██████████| 150/150 [00:01<00:00, 114.77it/s, loss=0.3676]
Training:   0%|          | 3/1725 [00:00<01:07, 25.55it/s, loss=0.3287]


Epoch Summary:
	Train Loss: 0.3680 | Train Acc: 87.45%
	Valid Loss: 0.3676 | Valid Acc: 88.41%
	Learning Rate: 0.000000
--------------------------------------------------
Epoch 89/90


Training: 100%|██████████| 1725/1725 [00:41<00:00, 41.26it/s, loss=0.3672]
Evaluating: 100%|██████████| 150/150 [00:01<00:00, 128.99it/s, loss=0.3676]
Training:   0%|          | 2/1725 [00:00<01:30, 19.13it/s, loss=0.4292]


Epoch Summary:
	Train Loss: 0.3672 | Train Acc: 87.52%
	Valid Loss: 0.3676 | Valid Acc: 88.39%
	Learning Rate: 0.000000
--------------------------------------------------
Epoch 90/90


Training: 100%|██████████| 1725/1725 [00:41<00:00, 41.47it/s, loss=0.3683]
Evaluating: 100%|██████████| 150/150 [00:01<00:00, 135.41it/s, loss=0.3676]



Epoch Summary:
	Train Loss: 0.3683 | Train Acc: 87.38%
	Valid Loss: 0.3676 | Valid Acc: 88.40%
	Learning Rate: 0.000000
--------------------------------------------------
Training Finished!


In [12]:
current_checkpoint = {
    'epoch': epoch,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'scheduler_state_dict': scheduler.state_dict(),
    'history': history,
    'best_val_accuracy': best_val_accuracy
}

torch.save(current_checkpoint, os.path.join(EXPERIMENT_DIR, 'checkpoint.pth'))