In [1]:
from mydatasetclass import FacialKeypointDataset
import myconfig
from tqdm import tqdm
import torch
from torch.utils.data import DataLoader,Dataset
from efficientnet_pytorch import EfficientNet
from torch import nn,optim
import os
from myfuncs import load_checkpoint,get_rmse,get_submission,save_checkpoint
import mlflow
from mlflow.models import infer_signature

# Show the kernel's working directory; the checkpoint lookup further down is
# resolved relative to it.
current_dir = os.getcwd()
print(current_dir)

/Users/nikhilanand/BigDataLabProject/mlflow_part


In [3]:
# Point the MLflow client at the tracking server and select the experiment that
# every run in this notebook is recorded under.
TRACKING_URI = "http://127.0.0.1:8080"
EXPERIMENT_NAME = "BDL Project"

mlflow.set_tracking_uri(uri=TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)

2024/05/17 01:30:19 INFO mlflow.tracking.fluent: Experiment with name 'BDL Project' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/698340650722622115', creation_time=1715889619794, experiment_id='698340650722622115', last_update_time=1715889619794, lifecycle_stage='active', name='BDL Project', tags={}>

In [5]:
# Folder holding the CSV splits. Set the BDL_DATA_DIR environment variable to run
# the notebook on another machine; the default keeps the original location.
DATA_DIR = os.environ.get("BDL_DATA_DIR", "/Users/nikhilanand/BigDataLabProject/data")

# Settings shared by the training and validation loaders.
loader_kwargs = dict(
    batch_size=myconfig.BATCH_SIZE,
    num_workers=myconfig.NUM_WORKERS,
    pin_memory=myconfig.PIN_MEMORY,
)

# Training split: uses the training transforms and is reshuffled every epoch.
train_ds = FacialKeypointDataset(
    csv_file=os.path.join(DATA_DIR, "training_new.csv"),
    transform=myconfig.train_transforms
)
train_loader = DataLoader(train_ds, shuffle=True, **loader_kwargs)

# Validation split: uses the validation transforms and keeps a fixed order.
val_ds = FacialKeypointDataset(
    transform=myconfig.val_transforms,
    csv_file=os.path.join(DATA_DIR, "val_new.csv")
)
val_loader = DataLoader(val_ds, shuffle=False, **loader_kwargs)
# test_ds = FacialKeypointDataset(
#     csv_file="/Users/nikhilanand/JupyterNotebooks/InterIIT2023/CVSelections/test.csv",
#     transform=myconfig.val_transforms,
#     train=False,
# )

# test_loader = DataLoader(
#     test_ds,
#     batch_size=1,
#     num_workers=myconfig.NUM_WORKERS,
#     pin_memory=myconfig.PIN_MEMORY,
#     shuffle=False,
# )

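I'll run three experiments, each with a different learning rate (lr) and weight decay (w_d). I estimated about an hour for all three; in practice a single training epoch of the first trial took roughly 36 minutes, so budget more time.
- First I'll try lr = 8e-6, w_d = 1e-3.
- Then I'll try lr = 8e-5, w_d = 1e-5.
- Then I'll try lr = 8e-4, w_d = 1e-6.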

In [6]:
LEARNING_RATE = 8e-6
WEIGHT_DECAY = 1e-3
NUM_EPOCHS = 1  # single source of truth for both the loop and the logged param
MODEL_NAME = "efficientnet-b0"

# Trial 1: fine-tune a pretrained EfficientNet-B0 for NUM_EPOCHS epoch(s) and
# record the hyperparameters and the final validation metric in MLflow.
# NOTE(review): train_one_epoch is not among the names imported in this
# notebook's import cell -- confirm it is defined before running on a fresh kernel.
with mlflow.start_run(run_name="trial1"):

    # Squared error summed (not averaged) over the batch.
    loss_fn = nn.MSELoss(reduction="sum")
    model = EfficientNet.from_pretrained(MODEL_NAME)
    # Replace the final fully connected layer (`_fc`) with a 30-output linear layer.
    model._fc = nn.Linear(1280, 30)
    model = model.to(myconfig.DEVICE)
    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
    scaler = torch.cuda.amp.GradScaler()

    # Hand an existing checkpoint to load_checkpoint when enabled. The relative
    # path is resolved against the working directory; map_location keeps the
    # load working when the checkpoint was written on a different device.
    if myconfig.LOAD_MODEL and os.path.isfile(myconfig.CHECKPOINT_FILE):
        load_checkpoint(
            torch.load(myconfig.CHECKPOINT_FILE, map_location=myconfig.DEVICE),
            model,
            optimizer,
            LEARNING_RATE,
        )

    # Log the architecture by name: passing the nn.Module itself would log its
    # entire (very long) repr as the parameter value.
    params = {
        "model": MODEL_NAME,
        "learning_rate": LEARNING_RATE,
        "weight_decay": WEIGHT_DECAY,
        "num_epochs": NUM_EPOCHS,
    }
    mlflow.log_params(params)

    for epoch in range(NUM_EPOCHS):
        print("Epoch ",epoch)
        # Validation pass before training this epoch (return value unused here).
        get_rmse(val_loader, model, loss_fn, myconfig.DEVICE)
        train_one_epoch(train_loader, model, optimizer, loss_fn, scaler, myconfig.DEVICE)

        if myconfig.SAVE_MODEL:
            checkpoint = {
                "state_dict": model.state_dict(),
                "optimizer": optimizer.state_dict(),
            }
            save_checkpoint(checkpoint, filename="b0.pth.tar")

    # Final validation metric for this trial, recorded on the MLflow run.
    rmse = get_rmse(val_loader, model, loss_fn, myconfig.DEVICE)
    mlflow.log_metric("rmse_loss", rmse)

Loaded pretrained weights for efficientnet-b0
Loading checkpoint...
Epoch  0




Loss on val: 51.992409905935745


100%|██████████| 84/84 [36:16<00:00, 25.91s/it]

Loss average over epoch: 52.63855821567224
Saving checkpoint...





Loss on val: 51.86037865462463


In [10]:
LEARNING_RATE = 8e-5
WEIGHT_DECAY = 1e-5
NUM_EPOCHS = 1  # single source of truth for both the loop and the logged param
MODEL_NAME = "efficientnet-b0"

# Trial 2: fine-tune a pretrained EfficientNet-B0 for NUM_EPOCHS epoch(s) and
# record the hyperparameters and the final validation metric in MLflow.
# mlflow.autolog() is deliberately not called: parameters and metrics are logged
# explicitly below.
# NOTE(review): train_one_epoch is not among the names imported in this
# notebook's import cell -- confirm it is defined before running on a fresh kernel.
with mlflow.start_run(run_name="trial2"):
    # Squared error summed (not averaged) over the batch.
    loss_fn = nn.MSELoss(reduction="sum")
    model = EfficientNet.from_pretrained(MODEL_NAME)
    # Replace the final fully connected layer (`_fc`) with a 30-output linear layer.
    model._fc = nn.Linear(1280, 30)
    model = model.to(myconfig.DEVICE)
    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
    scaler = torch.cuda.amp.GradScaler()

    # Hand an existing checkpoint to load_checkpoint when enabled. The relative
    # path is resolved against the working directory; map_location keeps the
    # load working when the checkpoint was written on a different device.
    if myconfig.LOAD_MODEL and os.path.isfile(myconfig.CHECKPOINT_FILE):
        load_checkpoint(
            torch.load(myconfig.CHECKPOINT_FILE, map_location=myconfig.DEVICE),
            model,
            optimizer,
            LEARNING_RATE,
        )

    # Log the architecture by name: passing the nn.Module itself would log its
    # entire (very long) repr as the parameter value.
    params = {
        "model": MODEL_NAME,
        "learning_rate": LEARNING_RATE,
        "weight_decay": WEIGHT_DECAY,
        "num_epochs": NUM_EPOCHS,
    }
    mlflow.log_params(params)

    for epoch in range(NUM_EPOCHS):
        print("Epoch ",epoch)
        # Validation pass before training this epoch (return value unused here).
        get_rmse(val_loader, model, loss_fn, myconfig.DEVICE)
        train_one_epoch(train_loader, model, optimizer, loss_fn, scaler, myconfig.DEVICE)

        if myconfig.SAVE_MODEL:
            checkpoint = {
                "state_dict": model.state_dict(),
                "optimizer": optimizer.state_dict(),
            }
            save_checkpoint(checkpoint, filename="b1.pth.tar")

    # Final validation metric for this trial, recorded on the MLflow run.
    rmse = get_rmse(val_loader, model, loss_fn, myconfig.DEVICE)
    mlflow.log_metric("rmse_loss", rmse)
    

2024/05/16 20:38:34 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.


Loaded pretrained weights for efficientnet-b0
Loading checkpoint...
Epoch  0




Loss on val: 51.992409905935745


  2%|▏         | 2/84 [01:06<45:08, 33.04s/it]

In [None]:
LEARNING_RATE = 8e-4
WEIGHT_DECAY = 1e-6
NUM_EPOCHS = 1  # single source of truth for both the loop and the logged param
MODEL_NAME = "efficientnet-b0"

# Trial 3: fine-tune a pretrained EfficientNet-B0 for NUM_EPOCHS epoch(s) and
# record the hyperparameters and the final validation metric in MLflow.
# mlflow.autolog() was removed from this cell: the other trials do not use it,
# it switches on autologging for every supported library for the rest of the
# kernel session, and parameters/metrics are already logged explicitly below.
# NOTE(review): train_one_epoch is not among the names imported in this
# notebook's import cell -- confirm it is defined before running on a fresh kernel.
with mlflow.start_run(run_name="trial3"):
    # Squared error summed (not averaged) over the batch.
    loss_fn = nn.MSELoss(reduction="sum")
    model = EfficientNet.from_pretrained(MODEL_NAME)
    # Replace the final fully connected layer (`_fc`) with a 30-output linear layer.
    model._fc = nn.Linear(1280, 30)
    model = model.to(myconfig.DEVICE)
    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
    scaler = torch.cuda.amp.GradScaler()

    # Hand an existing checkpoint to load_checkpoint when enabled. The relative
    # path is resolved against the working directory; map_location keeps the
    # load working when the checkpoint was written on a different device.
    if myconfig.LOAD_MODEL and os.path.isfile(myconfig.CHECKPOINT_FILE):
        load_checkpoint(
            torch.load(myconfig.CHECKPOINT_FILE, map_location=myconfig.DEVICE),
            model,
            optimizer,
            LEARNING_RATE,
        )

    # Log the architecture by name: passing the nn.Module itself would log its
    # entire (very long) repr as the parameter value.
    params = {
        "model": MODEL_NAME,
        "learning_rate": LEARNING_RATE,
        "weight_decay": WEIGHT_DECAY,
        "num_epochs": NUM_EPOCHS,
    }
    mlflow.log_params(params)

    for epoch in range(NUM_EPOCHS):
        print("Epoch ",epoch)
        # Validation pass before training this epoch (return value unused here).
        get_rmse(val_loader, model, loss_fn, myconfig.DEVICE)
        train_one_epoch(train_loader, model, optimizer, loss_fn, scaler, myconfig.DEVICE)

        if myconfig.SAVE_MODEL:
            checkpoint = {
                "state_dict": model.state_dict(),
                "optimizer": optimizer.state_dict(),
            }
            save_checkpoint(checkpoint, filename="b2.pth.tar")

    # Final validation metric for this trial, recorded on the MLflow run.
    rmse = get_rmse(val_loader, model, loss_fn, myconfig.DEVICE)
    mlflow.log_metric("rmse_loss", rmse)
