In [1]:
import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt
import time, random
from tqdm import tqdm
from sklearn.metrics import classification_report

import torch
import torchvision
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchvision import transforms, utils, datasets
from torch.utils.data import Dataset, DataLoader

from torchsample.torchsample.modules import ModuleTrainer
from ember_utils import *
from ember_model import *
#from ember_utils import get_dataloader

In [31]:
class EarlyStopping:
    """Early stops the training if validation loss doesn't improve after a given patience."""
    def __init__(self, patience=7, verbose=False, delta=0, trace_func=print):
        """
        Args:
            patience (int): How long to wait after last time validation loss improved.
                            Default: 7
            verbose (bool): If True, prints a message for each validation loss improvement. 
                            Default: False
            delta (float): Minimum change in the monitored quantity to qualify as an improvement.
                            Default: 0
            path (str): Path for the checkpoint to be saved to.
                            Default: 'checkpoint.pt'
            trace_func (function): trace print function.
                            Default: print            
        """
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.val_loss_min = np.Inf
        self.delta = delta
        #self.path = path
        self.trace_func = trace_func
        #self.current_epoch = epoch
    def __call__(self, path, epoch, val_loss, model):

        score = -val_loss

        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(path, epoch, val_loss, model)
        elif score < self.best_score + self.delta:
            self.counter += 1
            self.trace_func(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(path, epoch, val_loss, model)
            self.counter = 0

    def save_checkpoint(self, path, epoch, val_loss, model):
        '''Saves model when validation loss decrease.'''
        if self.verbose:
            self.trace_func(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
        self.delete_previous_saved_model(path)
        path = path + 'best_model_epoch_' + str(epoch) + '.pt'
        torch.save(model.state_dict(), path)
        self.val_loss_min = val_loss
        
    def delete_previous_saved_model(self, path):
        saved_models = os.listdir(path)
        for prev_model in saved_models:
            prev_model = path + prev_model
            print(prev_model)
            if os.path.isfile(prev_model):
                os.remove(prev_model)
            else: pass

In [46]:
def training_early_stopping(model, X_train, y_train, X_valid, y_valid,\
                            patience, batch_size, device, optimizer, num_epoch,\
             criterion, replay_type, current_task, save_dir, exp, earlystopping=True):
 
    
    
    trainloader = get_dataloader(X_train, y_train, batch_size, train_data=True)
    validloader = get_dataloader(X_valid, y_valid, batch_size, train_data=False)
    
    
    train_loss, train_acc = [], []
    valid_loss, valid_acc = [], []
    
    early_stopping = EarlyStopping(patience=patience, verbose=True)
    
    
    start_train = time.time()
    for epoch in range(1,num_epoch+1):
        print(f"Epoch {epoch} of {num_epoch}")
        epoch_train_loss, epoch_train_acc = epoch_training(model, trainloader, batch_size, criterion, optimizer, device)
        epoch_valid_loss, epoch_valid_acc = validation(model, validloader, batch_size, criterion, device)
        
        train_loss.append(epoch_train_loss)
        train_acc.append(epoch_train_acc)
        valid_loss.append(epoch_valid_loss)
        valid_acc.append(epoch_valid_acc)
        
        print(f"Train Loss: {epoch_train_loss:.4f}, Train Acc: {epoch_train_acc:.2f}")
        print(f'Val Loss: {epoch_valid_loss:.4f}, Val Acc: {epoch_valid_acc:.2f}')

        
        save_path = './dummy_test/' + str(exp) + '/'
        create_parent_folder(save_path)
        #lr_scheduler(epoch_valid_loss)
        if earlystopping:
            early_stopping(save_path, epoch, epoch_valid_loss, model)

            if early_stopping.early_stop:
                print("Early stopping")
                break
                
    end = time.time()
    print(f"Training time: {(end-start_train)/60:.3f} minutes")
    
    best_model = os.listdir(save_path)
    best_epoch = int(saved_models[0].split('_')[3].split('.')[0])
    return (end-start_train)/60, best_epoch 

In [47]:
use_cuda = True
print('Torch', torch.__version__, 'CUDA', torch.version.cuda)
use_cuda = use_cuda and torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu");
torch.manual_seed(1);


batch_size = 1024
num_epoch = 500
learning_rate = 0.001

replay_type = 'joint'

result_save_dir = '../../ember2018_exps_store/month_based_partial/'

model = Ember_Net()

if torch.cuda.device_count() > 1:
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    model = nn.DataParallel(model)

model = model.to(device)
print(f'Model has {count_parameters(model)/1000000}m parameters')    
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.BCELoss()
patience = 5

#lr_scheduler = LRScheduler(optimizer, patience)


all_task_months = ['2018-01', '2018-02', '2018-03', '2018-04', '2018-05', '2018-06',
                   '2018-07', '2018-08', '2018-09', '2018-10', '2018-11', '2018-12']

data_dir = '../../ember2018/month_based_processing/'


replay_portion = 0.00


for task in range(len(all_task_months[:1])):
    start_time = time.time()
    current_task = all_task_months[task]
    task_months = all_task_months[:task+1]
    print(f'Current Task {current_task} with Replay {replay_portion*100}%')

    X_train, Y_train, X_valid, Y_valid = get_task_partial_joint_training_data(data_dir, task_months, replay_portion)
    X_test, Y_test = get_task_test_data(data_dir, task_months)
    
    model_save_dir = '../../ember2018_exps_store/month_based_partial/' + '/partial_joint_replay_' + str(replay_portion) + '/'
    results_save_dir = './ember2018_store_results/' + '/replay_' + str(replay_portion) + '/' 
    
    tr_time, best_epoch = training_early_stopping(model, X_train, Y_train, X_valid, Y_valid, patience, batch_size, device, optimizer, num_epoch,\
                 criterion, replay_type, current_task, save_dir, 11, earlystopping=True)

    acc, rocauc = testing_aucscore(model, X_test, Y_test, batch_size, device)
    
    print(tr_time, best_epoch)
    print(acc, rocauc)
    
    end_time = time.time()
    
    print(f'Elapsed time {(end_time - start_time)/60} mins.')    



Torch 1.6.0 CUDA 10.2
Model has 9.210305m parameters
Current Task 2018-01 with Replay 0.0%
Current Task month 2018-01 data X (50149, 2381) Y (50149,)
X_train (50149, 1, 49, 49) Y_train (50149,)

X_valid (5573, 1, 49, 49) Y_valid (5573,)

X_test (6192, 1, 49, 49) Y_test (6192,)
Epoch 1 of 500


48it [00:18,  2.55it/s]
6it [00:01,  5.99it/s]

Train Loss: 0.7124, Train Acc: 51.04
Val Loss: 0.6930, Val Acc: 51.61
Validation loss decreased (inf --> 0.692952).  Saving model ...
./dummy_test/11/best_model_epoch_5.pt
Epoch 2 of 500



48it [00:19,  2.50it/s]
6it [00:01,  5.84it/s]

Train Loss: 0.6997, Train Acc: 50.90
Val Loss: 0.6971, Val Acc: 49.95
EarlyStopping counter: 1 out of 5
Epoch 3 of 500



48it [00:19,  2.45it/s]
6it [00:01,  5.91it/s]

Train Loss: 0.6947, Train Acc: 50.94
Val Loss: 0.6966, Val Acc: 49.64
EarlyStopping counter: 2 out of 5
Epoch 4 of 500



48it [00:19,  2.41it/s]
6it [00:01,  5.79it/s]

Train Loss: 0.6927, Train Acc: 51.23
Val Loss: 0.6957, Val Acc: 50.16
EarlyStopping counter: 3 out of 5
Epoch 5 of 500



48it [00:20,  2.39it/s]
6it [00:00,  6.14it/s]

Train Loss: 0.6918, Train Acc: 51.54
Val Loss: 0.6984, Val Acc: 49.37
EarlyStopping counter: 4 out of 5
Epoch 6 of 500



48it [00:20,  2.38it/s]
6it [00:01,  5.84it/s]
 14%|█▍        | 1/7 [00:00<00:01,  5.99it/s]

Train Loss: 0.6911, Train Acc: 51.21
Val Loss: 0.6987, Val Acc: 49.70
EarlyStopping counter: 5 out of 5
Early stopping
Training time: 2.204 minutes


100%|██████████| 7/7 [00:00<00:00,  7.56it/s]

2.204322302341461 5
0.5243863049095607 0.5128695963079182
Elapsed time 2.27274866104126 mins.



