In [1]:
from config_living import LivingConfig
from torch.utils.data import DataLoader
from data import LivingDataset
from inspect import getsource
from torchnet import meter
from tqdm import tqdm
import numpy as np
import torch as t
import models
import utils
import fire
import time
import csv
import os

# Shared configuration object; the functions below read its attributes
# (paths, frequencies, learning-rate settings) and train() calls opt.parse().
opt = LivingConfig()
# Logging helper from utils; invoked in this file as log(file[, message]).
log = utils.log


def train(**kwargs):
    """Train the model and its `connect` head, logging progress to disk.

    A timestamped log file is created under ``opt.save_log_root`` and one CSV
    row per validated epoch is written to ``opt.result_file``.

    Args:
        **kwargs: Forwarded unchanged to ``opt.parse`` together with the open
            log file (presumably configuration overrides -- confirm against
            ``LivingConfig.parse``).
    """
    name = time.strftime('living_train_%Y%m%d_%H%M%S')
    # NOTE: the log path is built before opt.parse() runs, so it uses the
    # save_log_root value that `opt` holds at call time.
    # The context manager closes the log even if training raises or is
    # interrupted; a bare open()/close() pair leaked the handle in that case.
    with open(f"{opt.save_log_root}/{name}.txt", 'w') as log_file:
        _train(kwargs, log_file)


def _train(kwargs, log_file):
    """Run one complete training session, writing progress to `log_file`.

    Args:
        kwargs: Dict handed to ``opt.parse``.
        log_file: Open, writable text file used by ``log``.
    """
    opt.parse(kwargs, log_file)
    start_time = time.strftime("%b %d %Y %H:%M:%S")
    log(log_file, f'Training start time: {start_time}')

    # step1: configure model
    log(log_file, 'Building model...')
    model = models.model(
        module_name=opt.module_name,
        model_name=opt.model_name,
        input_channel=3,
        output_channel=2,
        pretrained=False
    )
    connect = models.connect(
        module_name=opt.module_name,
        model_name=opt.model_name,
        input_channel=512,
        output_channel=2,
        reshape=True
    )

    if opt.multi_GPU:
        # The parallel wrappers own checkpoint loading/saving in this mode.
        model_parallel = models.ParallelModule(model=model)
        connect_parallel = models.ParallelModule(model=connect)
        if opt.load_model_path:
            model_parallel.load_model(opt.load_model_path)
        if opt.load_connect_path:
            connect_parallel.load_model(opt.load_connect_path)
        model = model_parallel.model
        connect = connect_parallel.model
    else:
        if opt.load_model_path:
            model.load_model(opt.load_model_path)
        if opt.load_connect_path:
            connect.load_model(opt.load_connect_path)
    model.cuda()
    connect.cuda()

    # step2: data
    log(log_file, 'Building dataset...')
    train_data = LivingDataset(
        data_root=opt.train_data_root, mask_size=opt.mask_size)
    val_data = LivingDataset(
        data_root=opt.val_data_root, mask_size=opt.mask_size)

    log(log_file, 'Building data loader...')
    train_dataloader = DataLoader(
        train_data,
        opt.batch_size,
        shuffle=True,
        num_workers=opt.num_workers
    )
    val_dataloader = DataLoader(
        val_data,
        opt.batch_size,
        shuffle=True,
        num_workers=opt.num_workers
    )

    # step3: criterion and optimizer
    log(log_file, 'Building criterion and optimizer...')
    lr = opt.lr_base
    # A single optimizer updates both networks jointly.
    optimizer = t.optim.Adam(
        list(model.parameters()) + list(connect.parameters()),
        lr=lr,
        weight_decay=opt.weight_decay
    )
    current_epoch = opt.current_epoch
    criterion = t.nn.SmoothL1Loss()
    loss_meter = meter.AverageValueMeter()

    # step4: training
    log(log_file, 'Starting to train...')
    # Starting from epoch 0 discards any previous results and writes a new
    # header; a resumed run (current_epoch > 0) appends to the existing file.
    fresh_run = current_epoch == 0
    mode = 'w' if fresh_run else 'a'
    with open(opt.result_file, mode, newline='') as result_file:
        writer = csv.writer(result_file)
        if fresh_run:
            writer.writerow(['Epoch', 'Average Loss', 'Val Loss'])
            result_file.flush()

        while current_epoch < opt.max_epoch:
            current_epoch += 1
            running_loss = 0.0
            loss_meter.reset()
            log(log_file)
            log(log_file, f'Training epoch: {current_epoch}')

            # tqdm wraps the loader itself (not the enumerate object) so it
            # can read the loader's length and display a real progress bar.
            for i, (inputs, targets) in enumerate(tqdm(train_dataloader)):
                inputs = inputs.cuda()
                targets = targets.cuda()
                optimizer.zero_grad()
                score_model = model(inputs)
                score_connect = connect(score_model)
                loss = criterion(score_connect, targets)
                loss.backward()
                optimizer.step()

                # log info: mean loss over the last `print_freq` batches
                running_loss += loss.item()
                if i % opt.print_freq == opt.print_freq - 1:
                    log(log_file, f'loss {running_loss / opt.print_freq:.5f}')
                    running_loss = 0.0
                loss_meter.add(loss.item())

            if current_epoch % opt.save_freq == 0:
                if opt.multi_GPU:
                    model_parallel.save_model(current_epoch)
                    connect_parallel.save_model(current_epoch)
                else:
                    model.save_model(current_epoch)
                    connect.save_model(current_epoch)
            average_loss = round(loss_meter.value()[0], 5)
            log(log_file, f'Average Loss: {average_loss}')

            # validate; only validated epochs produce a CSV row
            if current_epoch % opt.val_freq == 0:
                val_error = val(model, connect, val_dataloader, log_file)
                writer.writerow([current_epoch, average_loss, val_error])
                # Flush so results survive an abrupt stop of a long run.
                result_file.flush()

            # update learning rate: halve it every `lr_decay_freq` epochs
            if opt.update_lr and current_epoch % opt.lr_decay_freq == 0:
                lr = lr * 0.5
                for param_group in optimizer.param_groups:
                    param_group['lr'] = lr
                # Logged once per decay rather than once per param group.
                log(log_file, f'Updating learning rate: {lr}')

    end_time = time.strftime("%b %d %Y %H:%M:%S")
    log(log_file, f'Training end time: {end_time}')


def val(model, connect, dataloader, file):
    """Evaluate `model` + `connect` on `dataloader` and log the mean error.

    Both networks are put in eval mode for the pass and switched back to
    train mode before returning. For each sample the prediction is cast to
    int (truncating) and the Euclidean distance between the first two
    columns of prediction and target (presumably x/y coordinates -- confirm
    against the dataset) is scaled by ``utils.pixel2length``.

    Args:
        model: Backbone network, called on the CUDA input batch.
        connect: Head network, called on the backbone's output.
        dataloader: Iterable yielding ``(input, target)`` batches.
        file: Log file handed to ``log``.

    Returns:
        The first element of the error meter's ``value()`` (the running mean
        for an AverageValueMeter), rounded to 5 decimals.
    """
    model.eval()
    connect.eval()
    error_meter = meter.AverageValueMeter()

    for images, labels in dataloader:
        # Inference only: no autograd graph is needed here.
        with t.no_grad():
            prediction = connect(model(images.cuda()))

        predicted = prediction.cpu().numpy().astype(int)
        expected = labels.numpy()
        dx = predicted[:, 0] - expected[:, 0]
        dy = predicted[:, 1] - expected[:, 1]
        squared_distance = dx**2 + dy**2
        # One meter entry per sample so every sample carries equal weight.
        for sample_sq in squared_distance:
            error_meter.add(sample_sq**0.5 * utils.pixel2length)

    # Restore training mode for the caller's next epoch.
    model.train()
    connect.train()
    val_error = round(error_meter.value()[0], 5)
    log(file, f'Val Error: {val_error}')
    return val_error


def help():
    """Print the command-line usage text, then the config class source.

    The usage text is formatted with this module's ``__file__``; the list of
    available options is shown by printing the source of ``opt``'s class.
    """
    # Built line by line so the exact whitespace of the message is explicit.
    usage_template = (
        "\n"
        "    usage : python file.py <function> [--args=value]\n"
        "    <function> := train | help\n"
        "    example: \n"
        "            python {0} train --lr=0.01\n"
        "            python {0} help\n"
        "    avaiable args:"
    )
    print(usage_template.format(__file__))

    print(getsource(opt.__class__))


if __name__ == '__main__':
    # Hand `train` to Fire as the command-line entry point.
    # NOTE(review): only `train` is passed here, while help() advertises a
    # `<function> := train | help` argument -- confirm which CLI shape is
    # intended before changing either side.
    fire.Fire(train)



user config:
train_data_root: D:\Projects and Training\Projects\floor plan\dataset\dataset\pickle\train
val_data_root: D:\Projects and Training\Projects\floor plan\dataset\dataset\pickle\val
save_log_root: log
result_file: result_living.csv
module_name: living
model_name: resnet34_fc1
load_model_path: None
load_connect_path: None
mask_size: 9
multi_GPU: False
batch_size: 16
num_workers: 2
print_freq: 300
max_epoch: 300
current_epoch: 0
save_freq: 50
val_freq: 5
update_lr: True
lr_decay_freq: 30
lr_base: 0.0001
weight_decay: 0.0001
parse: <bound method LivingConfig.parse of <config_living.LivingConfig object at 0x00000178DD96DA80>>
Training start time: Oct 20 2023 13:45:49
Building model...
Building dataset...
Building data loader...
Building criterion and optimizer...
Starting to train...

Training epoch: 1


299it [04:24,  1.38it/s]

loss 74.13238


599it [08:36,  1.11it/s]

loss 48.39147


899it [12:53,  1.02s/it]

loss 36.89139


1199it [17:20,  1.27it/s]

loss 27.95836


1499it [22:04,  1.54it/s]

loss 20.22199


1799it [26:39,  1.37it/s]

loss 15.28788


2099it [31:01,  2.21it/s]

loss 12.69644


2399it [34:07,  2.24it/s]

loss 11.96378


2699it [37:28,  2.13it/s]

loss 11.29944


2999it [40:05,  2.53it/s]

loss 11.17175


3299it [42:38,  2.70it/s]

loss 11.14345


3599it [45:19,  1.25it/s]

loss 10.95348


3899it [47:40,  2.75it/s]

loss 10.80465


4040it [49:00,  1.37it/s]


Average Loss: 22.85744

Training epoch: 2


299it [02:44,  2.20it/s]

loss 10.40325


599it [06:05,  2.51it/s]

loss 10.47281


899it [09:11,  2.63it/s]

loss 10.21055


1199it [12:25,  1.02s/it]

loss 10.12944


1499it [16:01,  1.48s/it]

loss 10.18387


1799it [19:05,  2.41it/s]

loss 10.04552


2099it [22:28,  1.18s/it]

loss 9.92381


2399it [25:25,  1.85it/s]

loss 9.49989


2493it [26:25,  2.16it/s]