# Training notebook

## Import libraries

Make sure to import model class, dataset class and associated preprocessing operations.

In [None]:
# Standard libraries
import torch
import numpy as np
from torch.utils.data import DataLoader
from torchvision import transforms
import datetime
from tensorboardX import SummaryWriter
import os
import zipfile
import time
import seaborn as sns
import pandas as pd

# Custom functions/classes
from load_data import DataProcesser
from train_utils import accuracy, AverageMeter
from models import ConvNetCam, ConvNetCamBi
from class_dataset import myDataset, ToTensor, Subtract, RandomShift, RandomNoise, RandomCrop, FixedCrop

# Seed the NumPy and PyTorch (CPU and CUDA) random number generators with a single
# fixed value, for reproducibility
myseed = 7
np.random.seed(myseed)
torch.manual_seed(myseed)
torch.cuda.manual_seed(myseed)

## Hyperparameters for training

Hyperparameters for training:
- nepochs: int, number of training epochs
- batch_size: int, number of samples per batch.
- lr: float, initial learning rate.

Hyperparameters to setup the model dimensions:
- length: int, length of a trajectory. This is the length as the model will expect it. Setting it to a smaller value than the actual length can be used for preprocessing/data jittering.
- nclass: int, number of output classes.
- nfeatures: int, size of the input representation before output layer. This also corresponds to the number of filters in the last convolution layer.
- selected_classes: list, select only some classes from input dataset. Leave empty to use all classes.

In [None]:
# --- Training schedule ---
nepochs = 3          # number of training epochs
batch_size = 128     # number of samples per batch
lr = 1e-2            # initial learning rate

# --- Model dimensions ---
length = 200             # trajectory length, as the model expects it
nclass = 6               # number of output classes
nfeatures = 10           # size of the representation before the output layer
selected_classes = []    # classes to keep from the input dataset; empty keeps all of them

## Load and process data, Data augmentation

Define which data to load and whether/how to preprocess the batch. 
- data_file: str, path to a .zip that can be loaded as a DataProcesser. The archive must contain 3 files: one for the
 data, one for the train/validation split, one with the class information. See DataProcesser.read_archive().
- meas_var: list of str, names of the measurement variables. In DataProcesser convention, this is the prefix in a
 column name that contains a measurement (time being the suffix). Pay attention to the order since this is how the dimensions of a sample of data will be ordered (i.e. 1st in the list will form 1st row of measurements in the sample, 2nd is the 2nd, etc...)
- start_time/end_time: int, used to subset the data to a specific time range. Useful to completely exclude acquisition times where irrelevant measurements are acquired.

In [None]:
# Path to the .zip archive loaded by DataProcesser. The default below is a machine-specific
# absolute path; set the TSCLASS_DATA_FILE environment variable before starting the kernel to
# point to another archive without editing the notebook.
data_file = os.environ.get('TSCLASS_DATA_FILE', '/home/marc/Dropbox/Work/TSclass/data/fly_hctsa.zip')
#data_file = '/home/marc/Dropbox/Work/TSclass_GF/data/ErkAkt_6GF_len240_repl2_trim100.zip'
# Names of the measurement variables; their order sets the row order in a sample
meas_var = ['V']
# Time range used to subset the data
start_time = 0
end_time = 599

In [None]:
# Load the archive, then restrict it to the requested measurements and time window
data = DataProcesser(data_file)
data.subset(sel_groups=meas_var, start_time=start_time, end_time=end_time)
# An empty selection keeps every class
if selected_classes:
    rows_in_selection = data.dataset[data.col_class].isin(selected_classes)
    data.dataset = data.dataset[rows_in_selection]
data.get_stats()
# data.process(method='center_train', independent_groups=True)
data.split_sets()

# Statistic subtracted from every sample, read from the training set for both pipelines
# (presumably the training mean of 'V' -- confirm against DataProcesser.get_stats)
mu_train = data.stats['mu']['V']['train']

# Training pipeline: random crop to the model length, subtract mu_train, convert to tensor
train_transform = transforms.Compose([
    RandomCrop(output_size=length, ignore_na_tails=True),
    #transforms.RandomApply([RandomNoise(mu=0, sigma=0.02)]),
    #Subtract([data.stats['mu']['ERK']['train'], data.stats['mu']['AKT']['train']]),
    Subtract(mu_train),
    ToTensor()
])
data_train = myDataset(dataset=data.train_set, transform=train_transform)

# Validation pipeline: same steps as training, applied to the validation set
validation_transform = transforms.Compose([
    RandomCrop(output_size=length, ignore_na_tails=True),
    #Subtract([data.stats['mu']['ERK']['train'], data.stats['mu']['AKT']['train']]),
    Subtract(mu_train),
    ToTensor()
])
data_test = myDataset(dataset=data.validation_set, transform=validation_transform)

Plot some trajectories to check that the data loading and processing is properly done.

In [None]:
n_smpl = 6
# Draw distinct indices. Sampling with replacement could pick the same trajectory twice; both
# draws would share an identifier and be merged (and aggregated) in a single facet of the grid.
# The number of samples is capped by the dataset size since no index can be repeated.
n_smpl = min(n_smpl, len(data_train))
indx_smpl = np.random.choice(len(data_train), size=n_smpl, replace=False)

col_ids = []
col_lab = []
col_mes = []
# Long format for seaborn grid. Each sample is indexed exactly once inside the loop: indexing the
# dataset several times would trigger the preprocessing several times and add randomness.
for i in indx_smpl:
    smpl = data_train[i]
    col_ids.append(smpl['identifier'])
    col_lab.append(smpl['label'].item())
    # One column per measurement variable after transposition
    # (assumes 'series' is laid out as (variable, time) -- TODO confirm against myDataset)
    col_mes.append(smpl['series'].numpy().transpose())
# Repeat the per-sample fields so that every time point of a sample carries them
col_ids = pd.Series(np.repeat(col_ids, length), name='identifier')
col_lab = pd.Series(np.repeat(col_lab, length), name='label')
col_tim = pd.Series(np.tile(np.arange(0, length), n_smpl), name='time')
col_mes = pd.DataFrame(np.vstack(col_mes), columns=meas_var)

df_smpl = pd.concat([col_ids, col_lab, col_tim, col_mes], axis=1)
df_smpl = df_smpl.melt(id_vars=['identifier', 'label', 'time'], value_vars=meas_var)

# One facet per sampled trajectory, one line per measurement variable
sns.set_style('white')
sns.set_context('notebook')
grid = sns.FacetGrid(data=df_smpl, col='identifier', col_wrap=3, sharex=True)
grid.map_dataframe(sns.lineplot, x='time', y='value', hue='variable')
grid.set(xlabel='Time', ylabel='Measurement Value')
grid.add_legend()

## Resume training or new model

Set to None for new model, otherwise provide the path to a saved model file. 

In [None]:
# Path to a saved model file to resume from (e.g. 'path/to/file.pytorch');
# keep None to start from a new model
load_model = None

## Tensorboard logs and model save file

Unique name for the model, built from a timestamp. Training can be followed live in tensorboard using these logs.

In [None]:
file_logs = os.path.splitext(os.path.basename(data_file))[0]  # file name without extension

# Build the log directory and the model path from the same components. Deriving the model path
# from the log path with str.lstrip('logs/') is unsafe: lstrip removes any leading run of the
# characters 'l', 'o', 'g', 's', '/' rather than the literal prefix, so it would also eat the
# beginning of a measurement name made of these letters.
var_dir = '_'.join(meas_var)
run_name = datetime.datetime.now().strftime('%Y-%m-%d-%H:%M:%S') + '_' + file_logs
logs_str = 'logs/' + var_dir + '/' + run_name + '/'
save_model = 'models/' + var_dir + '/' + run_name + '.pytorch'

# Create the directories that are actually written to: the log directory of this run and the
# directory receiving the saved model (not a directory named after the data file).
os.makedirs(logs_str, exist_ok=True)
os.makedirs('models/' + var_dir, exist_ok=True)

writer = SummaryWriter(logs_str)

## Setup model, loss and optimizer

The model dimensions are tuned to fit the previous parameters. 

L2 regularization is controlled by the "weight_decay" in the optimizer object.

In [None]:
model = ConvNetCam(batch_size=batch_size, nclass=nclass, length=length, nfeatures=nfeatures)
if load_model:
    # NOTE: torch.load unpickles the file, only resume from files you trust.
    # map_location='cpu' lets a file written on a GPU machine be restored on a CPU-only one; the
    # model is moved to the GPU below when one is available.
    checkpoint = torch.load(load_model, map_location='cpu')
    # TrainModel writes the whole model object with torch.save(model, ...), so accept both a
    # pickled module and a plain state_dict.
    if isinstance(checkpoint, torch.nn.Module):
        checkpoint = checkpoint.state_dict()
    model.load_state_dict(checkpoint)
# Cast parameters to float64
model.double()
cuda_available = torch.cuda.is_available()
if cuda_available:
    model = model.cuda()

# weight_decay controls the L2 regularization
optimizer = torch.optim.Adam(model.parameters(), lr=lr, betas=(0.9, 0.999), weight_decay=1e-3)
criterion = torch.nn.CrossEntropyLoss()
# Halve the learning rate each time the epoch counter reaches one of the milestones
scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[200, 400, 600, 800, 1000, 1500], gamma=0.5)

## Training loop

In [None]:
def TrainModel(model, optimizer, criterion, scheduler, train_loader, test_loader, nepochs,
               save_model=save_model, logs=True, save_pyfiles=True):
    """Train `model` for `nepochs` epochs and evaluate it on `test_loader` after each epoch.

    Relies on names defined at notebook level: `writer` (tensorboard writer), `logs_str` (log
    directory, ending with '/') and `cuda_available`.

    Args:
        model: network to train, called as model(series).
        optimizer: optimizer updating the parameters of `model`.
        criterion: loss function, called as criterion(prediction, label).
        scheduler: learning rate scheduler, stepped once at the end of every epoch.
        train_loader: DataLoader yielding dict batches with the keys 'series' and 'label'.
        test_loader: DataLoader with the same batch format, used for validation.
        nepochs: int, number of training epochs.
        save_model: str, path where the model is saved after training. A falsy value disables
            saving. The default is the notebook-level `save_model` at definition time.
        logs: bool, whether to write scalars to tensorboard.
        save_pyfiles: bool, whether to archive the .py files of the working directory in the log
            directory.

    Returns:
        The trained model (the object passed in).

    Raises:
        ValueError: if a sample has no measurement row.
    """
    # ------------------------------------------------------------------------------------------------------------------
    # Accuracy trackers, reset at the start of every training and evaluation pass
    top1 = AverageMeter()
    top2 = AverageMeter()

    # Create zip archive with all python files at execution time
    if save_pyfiles:
        lpy = [i for i in os.listdir(".") if i.endswith(".py")]
        with zipfile.ZipFile(logs_str + "AllPyFiles.zip", mode='w') as zipMe:
            for file in lpy:
                zipMe.write(file, compress_type=zipfile.ZIP_DEFLATED)
    if logs:
        print('Train logs saved at: {}'.format(logs_str))

    # ------------------------------------------------------------------------------------------------------------------
    # Get adequate size of sample for nn.Conv layers
    # Add a dummy channel dimension for conv1D layer (if multivariate, treat as a 2D plane with 1 channel)
    sample_shape = train_loader.dataset[0]['series'].shape
    assert len(sample_shape) == 2
    nchannel, univar_length = sample_shape
    if nchannel < 1:
        raise ValueError('Expected at least one measurement variable per sample, got {}'.format(nchannel))
    if nchannel == 1:
        sample_view = (1, univar_length)
    else:
        sample_view = (1, nchannel, univar_length)

    def to_model_input(series):
        # The leading dimension is read from the batch itself instead of the global batch size,
        # so that a smaller last batch (drop_last=False) is reshaped correctly too.
        return series.view((series.size(0),) + sample_view)

    # ------------------------------------------------------------------------------------------------------------------
    # Training loop
    for epoch in range(nepochs):
        model.train()
        top1.reset()
        top2.reset()

        loss_train = []
        for i_batch, sample_batch in enumerate(train_loader):
            series, label = sample_batch['series'], sample_batch['label']
            if cuda_available:
                series, label = series.cuda(), label.cuda()
            series = to_model_input(series)

            prediction = model(series)

            loss = criterion(prediction, label)
            loss_value = loss.item()  # plain float, detached from the graph
            loss_train.append(loss_value)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if i_batch % 25 == 0:
                print('Training epoch: [{0}/{4}][{1}/{2}]; Loss: {3}'.format(epoch + 1, i_batch + 1, len(train_loader),
                                                                             loss_value, nepochs))

            prec1, prec2 = accuracy(prediction, label, topk=(1, 2))
            top1.update(prec1[0], series.size(0))
            top2.update(prec2[0], series.size(0))

            if i_batch % 100 == 0:
                print('Training Accuracy Epoch: [{0}]\t'
                      'Prec@1 {top1.val.data:.3f} ({top1.avg.data:.3f})\t'
                      'Prec@2 {top2.val.data:.3f} ({top2.avg.data:.3f})'.format(
                    epoch, top1=top1, top2=top2))
            if logs:
                global_step = epoch * len(train_loader) + i_batch + 1
                writer.add_scalar('Train/Loss', loss_value, global_step)
                writer.add_scalar('Train/Top1', top1.val, global_step)
                writer.add_scalar('Train/Top2', top2.val, global_step)
        if logs:
            writer.add_scalar('MeanEpoch/Train_Loss', np.mean(loss_train), epoch)
            writer.add_scalar('MeanEpoch/Train_Top1', top1.avg, epoch)
            writer.add_scalar('MeanEpoch/Train_Top2', top2.avg, epoch)

        # --------------------------------------------------------------------------------------------------------------
        # Evaluation loop
        model.eval()
        top1.reset()
        top2.reset()
        loss_eval = []
        # No gradient is needed for validation: skip building the autograd graph
        with torch.no_grad():
            for i_batch, sample_batch in enumerate(test_loader):
                series, label = sample_batch['series'], sample_batch['label']
                if cuda_available:
                    series, label = series.cuda(), label.cuda()
                series = to_model_input(series)

                prediction = model(series)
                loss = criterion(prediction, label)
                loss_eval.append(loss.item())

                prec1, prec2 = accuracy(prediction, label, topk=(1, 2))
                top1.update(prec1[0], series.size(0))
                top2.update(prec2[0], series.size(0))

        # For validation loss, report only after the whole set is processed. Skipped when the
        # loader produced no batch, otherwise the last *training* loss would be logged as Val/Loss.
        if logs and loss_eval:
            val_step = epoch * len(train_loader) + i_batch + 1
            # Val/* hold the values of the last validation batch, MeanEpoch/Val_* the epoch averages
            writer.add_scalar('Val/Loss', loss_eval[-1], val_step)
            writer.add_scalar('Val/Top1', top1.val, val_step)
            writer.add_scalar('Val/Top2', top2.val, val_step)
            writer.add_scalar('MeanEpoch/Val_Loss', np.mean(loss_eval), epoch)
            writer.add_scalar('MeanEpoch/Val_Top1', top1.avg, epoch)
            writer.add_scalar('MeanEpoch/Val_Top2', top2.avg, epoch)

        print('===>>>\t'
              'Prec@1 ({top1.avg.data:.3f})\t'
              'Prec@2 ({top2.avg.data:.3f})'.format(top1=top1, top2=top2))

        # Step the scheduler after the optimizer updates of the epoch. Since PyTorch 1.1, stepping
        # it before the first optimizer.step() skips the first value of the learning rate schedule.
        scheduler.step()

    if save_model:
        # The whole model object is pickled (not only its state_dict)
        torch.save(model, save_model)
        print('Model saved at: {}'.format(save_model))
    return model


## Run the training

Can follow the training in tensorboard with:
```
tensorboard --logdir "path/to/logs"
```

In [None]:
# Both loaders share the same settings; incomplete last batches are dropped so that every batch
# holds exactly batch_size samples
loader_kwargs = dict(batch_size=batch_size, shuffle=True, num_workers=4, drop_last=True)
train_loader = DataLoader(dataset=data_train, **loader_kwargs)
test_loader = DataLoader(dataset=data_test, **loader_kwargs)

# Time the whole training run
t0 = time.time()
mymodel = TrainModel(model, optimizer, criterion, scheduler, train_loader, test_loader, nepochs)
t1 = time.time()

elapsed = t1 - t0
print('Elapsed time: {}'.format(elapsed))