# danbooru tagger training

In [1]:
from tagger import *

In [2]:
# local: True selects the small smoke-test setup used further down
# (scheduler patience 1, 10 train / 2 val random samples, batch size 2).
local = True
# reload: True resumes from the saved state dicts and 'losses.csv'
# instead of starting from scratch.
reload = False

In [3]:
# Base patience for the first LR stage: 1 for local runs, 2 otherwise.
init_patience = 1 if local else 2

# ReduceLROnPlateau keyword arguments for each LR stage, keyed by the
# learning rate at which the stage starts. Insertion order matters:
# check_sched treats the LAST key as the final stage.
sched_params = {
    .1: dict(
        threshold=.01,
        patience=init_patience,
        min_lr=.02,
        factor=.2,
    ),
    .02: dict(
        threshold=1e-3,
        patience=init_patience*5,
        min_lr=1e-3,
        factor=.5,
    ),
    1e-3: dict(
        threshold=1e-4,
        patience=init_patience*10,
        min_lr=1e-4,
        factor=2/3,
    ),
}

## data loading

In [4]:
def get_data(train_ds, val_ds, bs):
    """Wrap the train and validation datasets in DataLoaders.

    Args:
        train_ds: training dataset; served shuffled in batches of ``bs``.
        val_ds: validation dataset; served in one batch holding the whole
            set when the module-level ``local`` flag is set, otherwise in
            batches of ``2*bs``.
        bs: training batch size.

    Returns:
        Tuple ``(train_dl, val_dl)``.
    """
    val_bs = len(val_ds) if local else 2*bs
    train_dl = DataLoader(train_ds, batch_size=bs, shuffle=True)
    val_dl = DataLoader(val_ds, batch_size=val_bs)
    return train_dl, val_dl

In [5]:
# Root folder of the image subset; the 'train' and 'val' folders are read
# from beneath it in the next cells.
# NOTE(review): data_path is not defined in this notebook - presumably it
# comes from `from tagger import *`; confirm.
imgs_path = data_path / 'less-images'

In [6]:
# Training split: file stems in the folder are parsed as integer ids and
# used to select the matching rows of all_labels.
train_dir = imgs_path / 'train'
train_ids = [int(img.stem) for img in train_dir.glob('*')]
train_labels = all_labels[all_labels.id.isin(train_ids)]
train_ds = DanbooruDataset(img_dir=train_dir, label_data=train_labels)

In [7]:
# Validation split: same recipe as the training split, on the 'val' folder.
val_dir = imgs_path / 'val'
val_ids = [int(img.stem) for img in val_dir.glob('*')]
val_labels = all_labels[all_labels.id.isin(val_ids)]
val_ds = DanbooruDataset(img_dir=val_dir, label_data=val_labels)

In [8]:
# Full run: whole datasets with a large batch size.
# Local run: a handful of random samples with batch size 2.
if not local:
    bs = 256
    train_len, val_len = len(train_ds), len(val_ds)
    train_dl, val_dl = get_data(train_ds, val_ds, bs)

else:
    bs = 2
    train_samp = get_random_sample(train_ds, 10)
    val_samp = get_random_sample(val_ds, 2)
    train_len = len(train_samp)
    val_len = len(val_samp)
    train_dl, val_dl = get_data(train_samp, val_samp, bs)

## defs

### model and optimizer

In [9]:
def load_model(finetune=False):
    """Build the tagger and, when resuming, restore its saved weights.

    Args:
        finetune: value assigned to ``requires_grad`` for every parameter
            of ``model.base``. False (default) freezes the base.

    Returns:
        The ``Tagger`` instance moved to ``dev``. When the module-level
        ``reload`` flag is set, weights are read from
        ``state-dicts/model_state_dict.pt``; if that load raises
        ``RuntimeError`` the backup copy in the same folder is used.
    """
    model = Tagger().to(dev)
    for param in model.base.parameters():
        param.requires_grad = finetune

    if reload:
        fp = Path()/'state-dicts'
        fn = 'model_state_dict.pt'
        try:
            model.load_state_dict(torch.load(
                fp / fn, map_location=dev))
        except RuntimeError:
            # save_model writes the backup next to the primary file, so it
            # has to be looked up inside `fp`, not the working directory.
            model.load_state_dict(torch.load(
                fp / f'backup_{fn}', map_location=dev))

    return model

In [10]:
def set_optimizer(model):
    """Create the AdamW optimizer and, when resuming, restore its state.

    Only parameters with ``requires_grad`` set are handed to the optimizer.
    The initial learning rate is 0.1.

    Args:
        model: module whose trainable parameters are optimized.

    Returns:
        The optimizer. When the module-level ``reload`` flag is set, its
        state is read from ``state-dicts/opt_state_dict.pt``; if that load
        raises ``RuntimeError`` the backup copy in the same folder is used.
    """
    optimizer = optim.AdamW(lr=.1, params=filter(
        lambda p: p.requires_grad, model.parameters()))

    if reload:
        fp = Path()/'state-dicts'
        fn = 'opt_state_dict.pt'
        try:
            # The saved file holds OPTIMIZER state, so it must be loaded
            # into the optimizer, not into the model.
            optimizer.load_state_dict(torch.load(
                fp / fn, map_location=dev))
        except RuntimeError:
            optimizer.load_state_dict(torch.load(
                fp / f'backup_{fn}', map_location=dev))

    return optimizer

In [11]:
def save_model(model, opt, save_path=Path()/'state-dicts'):
    """Checkpoint the model and optimizer state dicts, each with a backup.

    Writes ``model_state_dict.pt`` and ``opt_state_dict.pt`` into
    ``save_path`` and then copies each to ``backup_<name>`` in the same
    folder.

    Args:
        model: object exposing ``state_dict()`` (the network).
        opt: object exposing ``state_dict()`` (the optimizer).
        save_path: target directory as a ``Path``; created if missing.
    """
    # torch.save does not create missing parent directories.
    save_path.mkdir(parents=True, exist_ok=True)

    names = [f'{n}_state_dict.pt' for n in ['model', 'opt']]
    paths = [save_path / n for n in names]

    for obj, path in zip((model, opt), paths):
        torch.save(obj.state_dict(), path)

    # Backups are copied only after both primary files are written.
    for path, name in zip(paths, names):
        shutil.copy(path, save_path / f'backup_{name}')

### lr scheduler and early stopper

In [12]:
class Scheduler(optim.lr_scheduler.ReduceLROnPlateau):
    """ReduceLROnPlateau whose settings are picked from ``sched_params``.

    The optimizer's current learning rate (first param group, rounded to
    5 decimals) selects the settings. The scheduler state is written to
    disk after every step.
    """

    def __init__(self, opt):
        """Configure the scheduler for the optimizer's current LR.

        Raises:
            KeyError: if the rounded LR is not a key of ``sched_params``.
        """
        current_lr = opt.param_groups[0]['lr']
        self.last_lr = round(current_lr, 5)
        stage = sched_params[self.last_lr]
        super().__init__(opt, verbose=True, **stage)
        # Floor of this stage; compare() checks last_lr against it.
        self.min_lr = self.min_lrs[0]

    def load(self, fp=Path()/'state-dicts'):
        """Restore state from ``fp``, using the backup file on RuntimeError."""
        fn = 'sched_state_dict.pt'
        try:
            state = torch.load(fp / fn, map_location=dev)
            super().load_state_dict(state)
        except RuntimeError:
            state = torch.load(fp / f'backup_{fn}', map_location=dev)
            super().load_state_dict(state)

    def save(self, fp=Path()/'state-dicts'):
        """Write the state dict to ``fp`` twice: primary file, then backup."""
        fn = 'sched_state_dict.pt'
        snapshot = self.state_dict()
        for target in (fp / fn, fp / f'backup_{fn}'):
            torch.save(snapshot, target)

    def step(self, val_loss):
        """Advance the plateau logic, refresh ``last_lr`` and checkpoint."""
        super().step(val_loss)
        self.last_lr = round(self._last_lr[0], 5)
        self.save()

    def compare(self):
        """Return True once the rounded LR equals this stage's ``min_lr``."""
        return self.min_lr == self.last_lr

In [13]:
class EarlyStopper():
    """Flag when the validation loss has stopped improving.

    Call ``step`` once per epoch with the validation loss and read
    ``early_stop`` afterwards.
    """

    def __init__(self, patience=init_patience*50, min_delta=1e-5):
        """Set up the stopper.

        Args:
            patience: number of consecutive non-improving steps after
                which ``early_stop`` becomes True.
            min_delta: minimum decrease of the loss relative to the best
                value seen that counts as an improvement.
        """
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0           # consecutive non-improving steps
        self.best_loss = None      # None until the first step
        self.early_stop = False

    def step(self, val_loss):
        """Record one validation loss and update ``early_stop``."""
        # Identity check: `== None` would dispatch to the loss type's __eq__.
        if self.best_loss is None:
            self.best_loss = val_loss
        elif self.best_loss - val_loss >= self.min_delta:
            self.best_loss = val_loss
            self.counter = 0
            print('INFO: stopper counter reset')
        else:
            self.counter += 1
            # Only fires when patience is even (exact float comparison).
            if self.counter == self.patience/2:
                print('INFO: stopper counter halfway through')
            self.early_stop = \
                self.counter >= self.patience

In [19]:
def check_sched(sched, stopper):
    """Move to the next LR stage, or hand over to early stopping.

    Called by ``fit`` once ``sched.compare()`` reports that the current
    stage has reached its ``min_lr``.

    Args:
        sched: the current ``Scheduler``.
        stopper: the current early stopper (``None`` while LR scheduling
            is still active).

    Returns:
        Tuple ``(sched, stopper)``. If ``sched.last_lr`` is at least the
        last key of ``sched_params``, ``sched`` is a fresh ``Scheduler``
        for that LR and ``stopper`` is returned unchanged. Otherwise
        ``sched`` gets ``last_lr`` reset to 0 and ``stopper`` is a new
        ``EarlyStopper``.
    """
    if sched.last_lr >= list(sched_params.keys())[-1]:
        # Reuse the optimizer already attached to the scheduler instead of
        # relying on a notebook-level global named `opt`.
        sched = Scheduler(sched.optimizer)
        print('INFO: scheduler refreshed')
    else:
        stopper = EarlyStopper()
        sched.last_lr=0
        print('INFO: LR scheduling ended')
    return sched, stopper

### training loop

In [15]:
def calc_batch_loss(xb, yb, model, opt=None, loss_func=nn.MSELoss()):
    """Compute the loss of one batch and optionally take an optimizer step.

    Args:
        xb: input batch passed to ``model``.
        yb: target batch.
        model: callable producing predictions for ``xb``.
        opt: optimizer; when given, the loss is back-propagated, one step
            is taken and the gradients are cleared. ``None`` only evaluates.
        loss_func: loss callable taking ``(prediction, target)``.

    Returns:
        The batch loss as a Python float.
    """
    prediction = model(xb)
    loss = loss_func(prediction, yb)

    # Evaluation only: nothing to update.
    if opt is None:
        return loss.item()

    loss.backward()
    opt.step()
    opt.zero_grad()
    return loss.item()

In [16]:
def fit(model, opt, sched, train_dl, val_dl, res):
    """Train until LR scheduling is exhausted and the early stopper fires.

    Each epoch trains on ``train_dl``, evaluates on ``val_dl``, prints both
    losses, appends them to the loss history and rewrites 'losses.csv'.
    While no early stopper exists, the scheduler is stepped and, once it
    reaches its stage's ``min_lr``, ``check_sched`` either starts the next
    stage or creates the early stopper. From then on only the stopper is
    stepped.

    Args:
        model: network to train.
        opt: optimizer updating ``model``.
        sched: initial ``Scheduler`` bound to ``opt``.
        train_dl: iterable of ``(xb, yb)`` training batches.
        val_dl: iterable of ``(xb, yb)`` validation batches.
        res: DataFrame with the loss history so far; its length is used as
            the epoch number. It is rebound locally, so the caller's object
            is not updated - the history is available in 'losses.csv'.

    Relies on the notebook-level ``train_len``, ``val_len`` and ``dev``.
    """
    stopper=None
    while True:

        model.train()
        running_loss = 0
        for xb, yb in train_dl:
            # Tensor.to returns the moved tensor rather than moving it in
            # place, so the result has to be rebound.
            xb, yb = xb.to(dev), yb.to(dev)
            batch_loss = calc_batch_loss(xb, yb, model, opt)
            running_loss += batch_loss*len(xb)
            # NOTE(review): checkpointing after every batch is I/O heavy;
            # kept as in the original - confirm it is intended.
            save_model(model, opt)
        train_loss = running_loss/train_len

        model.eval()
        with torch.no_grad():
            # Validation batches need the same device placement as training.
            val_loss = np.sum([
                calc_batch_loss(xb.to(dev), yb.to(dev), model)*len(xb)
                    for xb, yb in val_dl
            ]) / val_len

        print(f'epoch: {len(res)}', end=' | ')
        print(f'train MSE: {train_loss:.4e}', end=' | ')
        print(f'val MSE: {val_loss:.4e}')
        res = pd.concat([res, pd.DataFrame({
            'train': [train_loss], 'val':[val_loss]})])
        res.to_csv('losses.csv', index=False)

        if stopper is None:
            sched.step(val_loss)
            if sched.compare():
                sched, stopper = check_sched(sched, stopper)
        else:
            stopper.step(val_loss)
            # Stop as soon as the stopper fires, instead of training one
            # more full epoch before noticing the flag.
            if stopper.early_stop:
                print('INFO: training stopped')
                break

## run

In [20]:
# Build the training objects; each loader restores saved state itself
# when `reload` is set.
model = load_model()
opt = set_optimizer(model)
sched = Scheduler(opt)

# Loss history: continue the saved CSV when resuming, else start empty.
res = pd.read_csv('losses.csv') if reload else pd.DataFrame()
if reload:
    sched.load()

Using cache found in C:\Dev\pytorch_vision_v0.12.0


In [21]:
# Run the training loop; it prints per-epoch losses and rewrites
# 'losses.csv' after every epoch.
fit(model, opt, sched, train_dl, val_dl, res)

epoch: 0 | train MSE: 3.3512e-01 | val MSE: 2.0119e-01
epoch: 1 | train MSE: 2.5710e-01 | val MSE: 2.6501e-01
epoch: 2 | train MSE: 2.5396e-01 | val MSE: 2.6377e-01
Epoch 00003: reducing learning rate of group 0 to 2.0000e-02.
INFO: scheduler refreshed
epoch: 3 | train MSE: 2.3602e-01 | val MSE: 2.6356e-01
epoch: 4 | train MSE: 2.2208e-01 | val MSE: 2.4758e-01
epoch: 5 | train MSE: 2.0321e-01 | val MSE: 2.1664e-01
epoch: 6 | train MSE: 1.9199e-01 | val MSE: 2.0157e-01
epoch: 7 | train MSE: 1.7101e-01 | val MSE: 1.6230e-01
epoch: 8 | train MSE: 1.5522e-01 | val MSE: 2.4174e-01
epoch: 9 | train MSE: 1.6557e-01 | val MSE: 2.2468e-01
epoch: 10 | train MSE: 1.4380e-01 | val MSE: 1.6117e-01
epoch: 11 | train MSE: 1.4500e-01 | val MSE: 1.6372e-01
epoch: 12 | train MSE: 1.4909e-01 | val MSE: 1.5623e-01
epoch: 13 | train MSE: 1.3303e-01 | val MSE: 1.5868e-01
epoch: 14 | train MSE: 1.3513e-01 | val MSE: 2.2582e-01
epoch: 15 | train MSE: 1.2210e-01 | val MSE: 1.5469e-01
epoch: 16 | train MSE: 1.3

epoch: 134 | train MSE: 9.0428e-02 | val MSE: 1.1290e-01
epoch: 135 | train MSE: 8.5190e-02 | val MSE: 1.2400e-01
epoch: 136 | train MSE: 9.5415e-02 | val MSE: 1.8389e-01
epoch: 137 | train MSE: 8.6025e-02 | val MSE: 1.2500e-01
epoch: 138 | train MSE: 8.3576e-02 | val MSE: 1.3121e-01
epoch: 139 | train MSE: 9.0296e-02 | val MSE: 2.1433e-01
epoch: 140 | train MSE: 8.6637e-02 | val MSE: 1.2447e-01
epoch: 141 | train MSE: 9.6391e-02 | val MSE: 1.2458e-01
epoch: 142 | train MSE: 8.2054e-02 | val MSE: 1.4488e-01
epoch: 143 | train MSE: 7.8904e-02 | val MSE: 2.2332e-01
epoch: 144 | train MSE: 8.9342e-02 | val MSE: 1.8338e-01
Epoch 00082: reducing learning rate of group 0 to 1.0000e-04.
INFO: LR scheduling ended
epoch: 145 | train MSE: 8.9303e-02 | val MSE: 1.6678e-01
epoch: 146 | train MSE: 8.4038e-02 | val MSE: 2.2705e-01
epoch: 147 | train MSE: 9.1326e-02 | val MSE: 1.9062e-01
epoch: 148 | train MSE: 8.4645e-02 | val MSE: 1.1399e-01
INFO: stopper counter reset
epoch: 149 | train MSE: 8.710