# New test set 
The aim of this notebook is to create a new test method.

## Libraries

In [1]:
# --- Utils librairies ---
#Generics librairies
import os
import os.path
from os import path
import numpy as np
import copy
import pickle
from statistics import mean
import matplotlib.pyplot as plt

#Measure librairies
import time

#Dataset librairies
import pandas as pd

# --- DL librairies ---
#Pytorch librairies
import torch
import torch.nn as nn
from torch.optim import Adam
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import Dataset, DataLoader

#Sklearn librairies
from sklearn.model_selection import LeaveOneGroupOut, train_test_split
from sklearn.metrics import accuracy_score

## GPU environment

In [2]:
# Select which GPUs this process may see, enumerated in PCI bus order.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "1,2,3,4",
})
# Device indices handed to the model-building code later in the notebook.
gpus_list = list(range(4))

print(torch.cuda.is_available())
print(torch.cuda.device_count())

True
4


## Data

### Load data

In [3]:
# The pickle holds a (spectra, labels, names) triple; assemble it into a single
# frame whose last two columns are 'label' and 'names'.
# NOTE: pd.read_pickle unpickles arbitrary objects - only load trusted files.
_raw_spectra, _raw_labels, _raw_names = pd.read_pickle("../data/dataset_COVID_RAW.pkl")
labels = pd.Series(_raw_labels, name = 'label')
names = pd.Series(_raw_names, name = 'names')
df = pd.concat((pd.DataFrame(_raw_spectra), labels, names), axis = 1)

In [4]:
# Pick the group that is held out entirely as the final test set.
# The draw is seeded so that "Restart & Run All" selects the same group every
# time; the group name is embedded in the cache and checkpoint paths built
# later, so an unseeded draw would silently switch to different files per run.
# Change HOLDOUT_SEED to hold out a different group.
# NOTE: the draw is over all rows, so groups with more rows are more likely.
HOLDOUT_SEED = 0
outside_group = np.random.default_rng(HOLDOUT_SEED).choice(df.names.values)
print(outside_group)

CovNeg_26


In [5]:
# Rows of the held-out group form the final test set; everything else stays in
# `df` for cross-validation.
# NOTE: this cell is not idempotent - running it a second time yields an empty
# `final_test_set`, because `df` no longer contains the held-out rows.
_is_outside = df.names.values == outside_group
final_test_set = df[_is_outside]
df = df[~_is_outside]

In [6]:
# Split the frame into model inputs, targets and group identifiers.
_meta_columns = ['label', "names"]
X_set = df.drop(columns = _meta_columns).values
Y_set = df['label'].values
groups = df['names'].values

In [7]:
# One (train_idx, test_idx) pair per distinct value in `groups`; materialised
# as a list so the folds can be iterated more than once.
_group_splitter = LeaveOneGroupOut()
folds = list(_group_splitter.split(X_set, Y_set, groups = groups))

### Data augmentation

In [8]:
def dataAugment(signal, betashift = 0.24039033704204857, slopeshift = 0.5640435054299953, multishift = 0.0013960388613510225):
    """Return a randomly perturbed copy of a 2-D batch of signals.

    Every row of ``signal`` receives its own random additive baseline (a
    constant shift plus a linear tilt along the row) and its own random
    multiplicative gain.

    Parameters
    ----------
    signal : 2-D numpy array, one signal per row.
    betashift : half-width of the uniform range of the constant baseline shift.
    slopeshift : half-width of the uniform range around 1 of the baseline slope.
    multishift : half-width of the uniform range around 1 of the gain.

    Returns
    -------
    numpy array with the same shape as ``signal``.
    """
    n_rows, n_points = signal.shape[0], signal.shape[1]

    def _uniform_around_zero(half_width):
        # One draw per row, uniform in [-half_width, +half_width).
        return np.random.random(size=(n_rows, 1)) * 2 * half_width - half_width

    # Draw order (beta, slope, multi) matters for seeded reproducibility.
    beta = _uniform_around_zero(betashift)
    slope = _uniform_around_zero(slopeshift) + 1

    # Relative position of every point along the row, in [0, 1).
    axis = np.arange(n_points) / float(n_points)

    # Additive baseline; it vanishes when slope == 1 and beta == 0.
    offset = slope * axis + beta - axis - slope / 2. + 0.5

    multi = _uniform_around_zero(multishift) + 1
    return multi * signal + offset

### Creation of sets

In [9]:
class RamanDataset(Dataset):
    """Map-style dataset pairing each row of ``X`` with the matching entry of ``Y``.

    Parameters
    ----------
    X : numpy array, one sample per row.
    Y : numpy array of labels, same length as ``X``.

    Raises
    ------
    ValueError
        If ``X`` and ``Y`` do not contain the same number of samples.
    """

    def __init__(self, X, Y):
        # Zero-argument super(): the previous `super(RamanDataset).__init__()`
        # created an unbound super object and never reached the base class.
        super().__init__()
        if len(X) != len(Y):
            raise ValueError(
                "X and Y must have the same number of samples "
                "(got {} and {})".format(len(X), len(Y)))
        # torch.from_numpy shares memory with the numpy arrays and keeps their dtype.
        self.raman_spectra = torch.from_numpy(X)
        self.labels = torch.from_numpy(Y)

    def __len__(self):
        """Number of samples."""
        return len(self.raman_spectra)

    def __getitem__(self, idx):
        """Return the ``(spectrum, label)`` pair stored at position ``idx``."""
        spectrum = self.raman_spectra[idx]
        label = self.labels[idx]

        return spectrum, label

In [10]:
class Datasets(Dataset):
    """Indexable container holding one dataset per cross-validation fold.

    Parameters
    ----------
    ramanDatasets : sequence of datasets; ``self[i]`` returns the i-th one.
    """

    def __init__(self, ramanDatasets):
        # Zero-argument super(): the previous `super(Datasets).__init__()`
        # created an unbound super object and never reached the base class.
        super().__init__()
        self.datasets = ramanDatasets

    def __len__(self):
        """Number of stored datasets."""
        return len(self.datasets)

    def __getitem__(self, idx):
        """Return the dataset stored at position ``idx``."""
        return self.datasets[idx]

In [11]:
def setsCreation(pathToFile, X_set, Y_set, folds):
    """Build (or reload from cache) the per-fold train / validation / test sets.

    If ``pathToFile`` exists, the three ``Datasets`` objects are unpickled from
    it and ``X_set``, ``Y_set`` and ``folds`` are ignored. Otherwise, for every
    ``(train_idx, test_idx)`` pair in ``folds``:

    * 10 % of the training rows are split off (stratified on the label) as the
      validation set;
    * the remaining training rows are replaced by 30 independently augmented
      copies produced by ``dataAugment`` (labels repeated accordingly);
    * validation and test rows are kept un-augmented.

    The result is pickled to ``pathToFile`` before being returned.

    Parameters
    ----------
    pathToFile : str, location of the pickle cache.
    X_set : numpy array of samples, one per row.
    Y_set : numpy array of labels aligned with ``X_set``.
    folds : iterable of ``(train_idx, test_idx)`` index pairs.

    Returns
    -------
    (train_dataset, validation_dataset, test_dataset) : three ``Datasets``
        objects, each holding one ``RamanDataset`` per fold.
    """
    # Use os.path explicitly: the short alias `path` is easily shadowed by a
    # plain string in notebook cells (it is also a parameter name of `train`),
    # which would break a bare `path.exists(...)` call here.
    if os.path.exists(pathToFile):
        # NOTE: pickle can execute arbitrary code on load; only open cache
        # files that were produced by this notebook.
        with open(pathToFile, "rb") as inf:
            train_dataset, validation_dataset, test_dataset = pickle.load(inf)
        return train_dataset, validation_dataset, test_dataset

    augment = 30  # number of augmented copies of every training sample
    train_set = []
    validation_set = []
    test_set = []

    for train_idx, test_idx in folds:
        X_train_tmp, X_val_tmp, Y_train_tmp, Y_val_tmp = train_test_split(
            X_set[train_idx], Y_set[train_idx], test_size = .1,
            stratify = Y_set[train_idx])

        # Each call to dataAugment draws fresh random perturbations.
        X_augmented = np.vstack([dataAugment(X_train_tmp) for _ in range(augment)])
        # Labels are repeated in the same block order as the stacked copies.
        Y_augmented = np.concatenate([Y_train_tmp] * augment, axis=0)

        train_set.append(RamanDataset(X_augmented, Y_augmented))
        validation_set.append(RamanDataset(X_val_tmp, Y_val_tmp))
        test_set.append(RamanDataset(X_set[test_idx], Y_set[test_idx]))

    train_dataset = Datasets(train_set)
    validation_dataset = Datasets(validation_set)
    test_dataset = Datasets(test_set)

    # Make sure the cache directory exists before writing into it.
    cache_dir = os.path.dirname(pathToFile)
    if cache_dir:
        os.makedirs(cache_dir, exist_ok=True)
    with open(pathToFile, "wb") as outf:
        pickle.dump((train_dataset, validation_dataset, test_dataset), outf)

    return train_dataset, validation_dataset, test_dataset

In [12]:
# The cache file is specific to the held-out group.
_cache_name_parts = ("../train_settings/training_settings_cov_raw_without_", str(outside_group), ".pckl")
pathToFile = "".join(_cache_name_parts)
train_dataset, validation_dataset, test_dataset = setsCreation(pathToFile, X_set, Y_set, folds)

In [13]:
# Wrap the held-out rows in a dataset for the final, ensemble-level evaluation.
_final_meta_columns = ['label', "names"]
final_X_set = final_test_set.drop(columns = _final_meta_columns).values
final_Y_set = final_test_set['label'].values
test_set_final = RamanDataset(final_X_set, final_Y_set)

## Model

### Utility classes and functions 

In [14]:
class ConvNet(nn.Module):
    """1-D convolutional classifier producing 3 class probabilities per sample.

    ``forward`` takes a 2-D tensor ``(batch, length)``. Three convolution
    stages reduce it to 25 channels; the first dense layer expects 325
    flattened features, i.e. 25 channels x 13 positions, which this layer
    stack produces for input lengths from 942 to 1001 points.
    """

    def __init__(self):
        super(ConvNet, self).__init__()

        # NOTE(review): padding_mode='replicate' has no effect here because
        # `padding` is left at its default of 0.
        # NOTE(review): in PyTorch, BatchNorm `momentum` is the weight given to
        # the *new* batch statistics, so 0.99 makes the running statistics
        # follow the latest batch almost entirely. If these values were ported
        # from a framework with the opposite convention, 0.01 was probably
        # intended - confirm before changing.
        self.cnn_layers = nn.Sequential(
            nn.Conv1d(1, 100, kernel_size=100,
                     stride=1, padding_mode='replicate'),
            nn.ReLU(),
            nn.BatchNorm1d(100, eps=0.001, momentum=0.99),
            nn.Conv1d(100, 102, kernel_size=5,
                     stride=2, padding_mode='replicate'),
            nn.ReLU(),
            nn.MaxPool1d(6, stride=3),
            nn.BatchNorm1d(102, eps=0.001, momentum=0.99),
            nn.Conv1d(102, 25, kernel_size=9,
                     stride=5, padding_mode='replicate'),
            nn.ReLU(),
            nn.MaxPool1d(3, stride=2)
        )

        # NOTE(review): the network ends with Softmax but is trained with
        # nn.CrossEntropyLoss (see createCNN), which applies log-softmax
        # itself. Softmax is therefore applied twice during training. It is
        # kept because the saved checkpoints and tuned hyper-parameters assume
        # it - consider returning raw logits.
        self.dense_layers = nn.Sequential(
            nn.Dropout(p=0.1),
            nn.Linear(325, 732),
            nn.LeakyReLU(),
            nn.Dropout(p=0.7000000000000001),
            nn.Linear(732, 152),
            nn.LeakyReLU(),
            nn.Dropout(p=0.25),
            nn.Linear(152,189),
            nn.LeakyReLU(),
            nn.Dropout(p=0.1),
            nn.Linear(189, 3),
            nn.Softmax(dim=1)
        )

    def forward(self, x):
        """Classify a batch of signals.

        Parameters
        ----------
        x : tensor of shape ``(batch, length)``.

        Returns
        -------
        Tensor of shape ``(batch, 3)`` whose rows sum to 1.
        """
        # Add the channel axis without touching the caller's tensor. The
        # previous in-place `x.resize_(...)` reshaped the input itself, so a
        # second forward pass on the same tensor silently truncated it.
        x = x.unsqueeze(1)
        x = self.cnn_layers(x)
        x = torch.flatten(x, 1)
        x = self.dense_layers(x)
        return x

In [15]:
def createCNN(gpu_ids):
    """Instantiate a fresh double-precision ConvNet with its loss and optimizer.

    Parameters
    ----------
    gpu_ids : list of CUDA device indices. Only read when CUDA is available:
        the model is then wrapped in ``nn.DataParallel`` over these devices and
        placed on the first one.

    Returns
    -------
    (model, loss, optimizer, device)
    """
    use_cuda = torch.cuda.is_available()

    model = ConvNet().double()
    optimizer = Adam(model.parameters(), lr=0.00020441990333108206)
    loss = nn.CrossEntropyLoss()

    if use_cuda:
        device = torch.device('cuda:' + str(gpu_ids[0]))
        model = nn.DataParallel(model, device_ids=gpu_ids)
        loss.cuda()
    else:
        device = torch.device('cpu')

    model.to(device)
    return model, loss, optimizer, device

In [16]:
def train(device, model, loss, optimizer, train_dataset, validation_dataset, epochs, patience, path, verbose=0, batch_size=338):
    """Train ``model`` with early stopping, then restore the last saved checkpoint.

    After every epoch the model is evaluated on ``validation_dataset``. A
    checkpoint (model + optimizer state and the metric histories so far) is
    written to ``path`` whenever the validation accuracy reaches a new maximum
    or the validation loss reaches a new minimum. Training stops early once
    neither metric has improved for at least ``patience`` consecutive epochs.
    Before returning, the most recently written checkpoint is loaded back into
    ``model`` and ``optimizer``.

    Parameters
    ----------
    device : torch.device on which batches are placed.
    model : network to train (updated in place).
    loss : callable ``loss(output, target)`` returning a scalar tensor.
    optimizer : optimizer bound to ``model``'s parameters.
    train_dataset, validation_dataset : datasets yielding ``(sample, label)``.
    epochs : maximum number of epochs.
    patience : early-stopping patience, in epochs.
    path : checkpoint file path. Inside this function the name shadows the
        module-level ``os.path`` alias, so ``os.path`` helpers must not be
        reached through ``path`` here.
    verbose : ``1`` prints per-epoch metrics and a final summary.
    batch_size : mini-batch size used for both data loaders.

    Returns
    -------
    (train_losses, val_losses, train_acc, val_acc) : per-epoch histories as
        stored in the restored checkpoint, i.e. up to and including the epoch
        at which that checkpoint was written.

    Notes
    -----
    The model is left in evaluation mode on return.
    """
    train_losses = []
    val_losses = []
    train_acc = []
    val_acc = []
    # Plain Python infinities: np.Inf / np.NINF were removed in NumPy 2.0.
    min_val_loss = float('inf')
    max_val_acc = float('-inf')
    epochs_no_improve_loss = 0
    epochs_no_improve_acc = 0
    # NOTE(review): the scheduler's `verbose` argument is deprecated since
    # PyTorch 2.2; on newer releases drop it and report `get_last_lr()` instead.
    scheduler = ReduceLROnPlateau(optimizer, factor=0.5, patience=80, cooldown=10, verbose=(verbose == 1))
    # NOTE(review): the training loader does not shuffle; batches are served in
    # dataset order every epoch - confirm this is intended.
    training_generator = DataLoader(train_dataset, batch_size=batch_size)
    validation_generator = DataLoader(validation_dataset, batch_size=batch_size)

    for epoch in range(epochs):
        # --- training pass ---
        # Re-enable training mode each epoch; validation below switches to eval.
        model.train()
        loss_train_epoch = []
        acc_train_epoch = []
        for ramanSpectraTrain, labelTrain in training_generator:
            ramanSpectraTrain = ramanSpectraTrain.to(device)
            labelTrain = labelTrain.to(device)

            optimizer.zero_grad()

            output_train = model(ramanSpectraTrain)

            loss_train = loss(output_train, labelTrain)
            loss_train_epoch.append(loss_train.cpu().item())

            loss_train.backward()
            optimizer.step()

            output_label = torch.argmax(output_train, dim=1)
            acc_train = accuracy_score(labelTrain.cpu().detach().numpy(), output_label.cpu().detach().numpy())
            acc_train_epoch.append(acc_train)

        # Epoch metrics are the unweighted mean over batches.
        loss_train = mean(loss_train_epoch)
        acc_train = mean(acc_train_epoch)
        train_losses.append(loss_train)
        train_acc.append(acc_train)

        # --- validation pass ---
        # eval() disables Dropout and stops BatchNorm from updating its running
        # statistics with validation data; previously validation ran in
        # training mode.
        model.eval()
        with torch.no_grad():
            loss_val_epoch = []
            acc_val_epoch = []
            for ramanSpectraVal, labelVal in validation_generator:
                ramanSpectraVal = ramanSpectraVal.to(device)
                labelVal = labelVal.to(device)

                output_val = model(ramanSpectraVal)

                loss_val = loss(output_val, labelVal)
                loss_val_epoch.append(loss_val.cpu().item())

                val_label = torch.argmax(output_val, dim=1)
                acc_val = accuracy_score(labelVal.cpu().detach().numpy(), val_label.cpu().detach().numpy())
                acc_val_epoch.append(acc_val)

            loss_val = mean(loss_val_epoch)
            acc_val = mean(acc_val_epoch)
        val_losses.append(loss_val)
        val_acc.append(acc_val)
        scheduler.step(loss_val)

        # --- early-stopping bookkeeping ---
        acc_improved = acc_val > max_val_acc
        loss_improved = loss_val < min_val_loss

        if acc_improved:
            epochs_no_improve_acc = 0
            max_val_acc = acc_val
        else:
            epochs_no_improve_acc += 1

        if loss_improved:
            epochs_no_improve_loss = 0
            min_val_loss = loss_val
        else:
            epochs_no_improve_loss += 1

        # One checkpoint per epoch is enough: both original branches wrote the
        # same content to the same file.
        if acc_improved or loss_improved:
            torch.save({'model_state_dict' : model.state_dict(),
                       'optimizer_state_dict' : optimizer.state_dict(),
                       'train_loss' : train_losses,
                       'train_acc' : train_acc,
                       'val_loss' : val_losses,
                       'val_acc' : val_acc}, path)

        if verbose == 1:
            print("Epoch {}:\t train loss : {}; train accuracy : {}; \n validation loss : {}; validation accuracy : {}".format(epoch+1, loss_train, acc_train, loss_val, acc_val))

        if epochs_no_improve_loss >= patience and epochs_no_improve_acc >= patience:
            print("Early stopping at epoch {}".format(epoch+1))
            break

    # Roll back to the last checkpoint written above.
    checkpoint = torch.load(path)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    train_losses = checkpoint['train_loss']
    train_acc = checkpoint['train_acc']
    val_losses = checkpoint['val_loss']
    val_acc = checkpoint['val_acc']

    if verbose == 1:
        print("------------------------------ Final result of the model ! ------------------------------")
        print("Train loss : {}; Train accuracy : {}; \n Validation loss : {}; Validation accuracy : {}".format(train_losses[-1], train_acc[-1], val_losses[-1], val_acc[-1]))

    return train_losses, val_losses, train_acc, val_acc

In [17]:
def testModel(model, test_set, device, batch_size=1):
    """Evaluate ``model`` on ``test_set`` and return one accuracy per batch.

    Parameters
    ----------
    model : network to evaluate; it is switched to evaluation mode.
    test_set : dataset yielding ``(sample, label)`` pairs.
    device : torch.device on which inputs are placed.
    batch_size : batch size of the loader; with the default of 1 every entry
        of the result is the 0/1 correctness of a single sample.

    Returns
    -------
    list of per-batch accuracies, in loader order.
    """
    test_acc = []
    test_generator = DataLoader(test_set, batch_size=batch_size)
    model.eval()
    # Inference only: skip autograd bookkeeping (previously a graph was built
    # for every batch).
    with torch.no_grad():
        for ramanSpectra, label in test_generator:
            output = model(ramanSpectra.to(device))
            labelPredict = torch.argmax(output, dim=1)

            # Labels never need to visit the device; compare on the CPU.
            acc = accuracy_score(label.cpu().numpy(), labelPredict.cpu().numpy())
            test_acc.append(acc)

    return test_acc

In [18]:
def globalTest(models, test_set, devices, batch_size=1):
    """Accuracy of the majority vote of ``models`` on ``test_set``.

    Every model predicts a class for every sample; the final prediction for a
    sample is the class that received the most votes, ties being resolved in
    favour of the lowest class index.

    Parameters
    ----------
    models : non-empty sequence of networks; each is switched to eval mode.
    test_set : non-empty dataset yielding ``(sample, label)`` pairs.
    devices : sequence of torch.device; ``devices[i]`` receives the inputs of
        ``models[i]``.
    batch_size : batch size of the loader (any positive value works).

    Returns
    -------
    float : fraction of samples whose voted class equals the true label.

    Raises
    ------
    ValueError
        If ``models`` or ``test_set`` is empty.
    """
    if len(models) == 0:
        raise ValueError("globalTest needs at least one model")
    if len(test_set) == 0:
        raise ValueError("globalTest needs a non-empty test set")

    # The loader does not shuffle, so every pass yields samples in the same order.
    test_generator = DataLoader(test_set, batch_size=batch_size)

    # One 1-D tensor of predicted classes per model, covering the whole set.
    per_model_predictions = []
    with torch.no_grad():  # inference only
        for idx, model in enumerate(models):
            model.eval()
            batch_predictions = []
            for ramanSpectra, _ in test_generator:
                output = model(ramanSpectra.to(devices[idx]))
                batch_predictions.append(torch.argmax(output, dim=1).cpu())
            per_model_predictions.append(torch.cat(batch_predictions))

    # votes[m, s] = class predicted by model m for sample s.
    votes = torch.stack(per_model_predictions).numpy()

    # counts[c, s] = number of models voting for class c on sample s.
    # np.argmax returns the first maximum, i.e. the lowest class on ties.
    n_classes = int(votes.max()) + 1
    counts = np.stack([(votes == c).sum(axis=0) for c in range(n_classes)])
    finalPrediction = counts.argmax(axis=0)

    realLabels = torch.cat([label.reshape(-1) for _, label in test_generator]).cpu().numpy()

    # Both arrays are plain 1-D integer arrays, so the accuracy is the mean of
    # the element-wise matches.
    acc = float(np.mean(finalPrediction == realLabels))
    return acc

### Train part

In [19]:
# Train one model per cross-validation fold and keep everything needed for the
# evaluation cells below (models, devices, metric histories).
num_epochs = 273
patience = 50
models_list = []
optimizers = []
loss_train_list = []
loss_val_list = []
acc_train_list = []
acc_val_list = []
devices = []
start = time.time()
path_to_directory = '../saved_models/covid_RAW_without_'+str(outside_group)
# makedirs also creates a missing parent directory and is a no-op when the
# directory already exists.
os.makedirs(path_to_directory, exist_ok=True)
for i in range(len(train_dataset)):
    print("------------------------------ Let's train model {} ! ------------------------------".format(i+1))
    model, loss, optimizer, device = createCNN(gpus_list)
    # Deliberately not named `path`: that would rebind the module-level
    # `from os import path` alias and break `path.exists(...)` in setsCreation
    # if it is called again afterwards.
    checkpoint_path = path_to_directory +"/"+str(i+1)+".pckl"
    start_bis = time.time()
    train_loss, val_loss, train_acc, val_acc = train(device, model, loss, optimizer, train_dataset[i], validation_dataset[i], num_epochs, patience, checkpoint_path, verbose=1)
    end_bis = time.time()
    print("Elapsed time for 1 model : ", end_bis - start_bis)
    devices.append(device)
    loss_train_list.append(train_loss)
    loss_val_list.append(val_loss)
    acc_train_list.append(train_acc)
    acc_val_list.append(val_acc)
    models_list.append(model)
    optimizers.append(optimizer)
end = time.time()

------------------------------ Let's train model 1 ! ------------------------------
Epoch 1:	 train loss : 0.9558276391390352; train accuracy : 0.5695577701650576; 
 validation loss : 0.8204464463912797; validation accuracy : 0.7383966244725738
Epoch 2:	 train loss : 0.7624936468577672; train accuracy : 0.7866292951313194; 
 validation loss : 0.7287124671127737; validation accuracy : 0.8143459915611815
Epoch 3:	 train loss : 0.7087735630258768; train accuracy : 0.8411605937921728; 
 validation loss : 0.7299723241713809; validation accuracy : 0.810126582278481
Epoch 4:	 train loss : 0.6823444177060696; train accuracy : 0.8685404339250493; 
 validation loss : 0.7087638442504923; validation accuracy : 0.8481012658227848
Epoch 5:	 train loss : 0.6749435416795745; train accuracy : 0.8751479289940829; 
 validation loss : 0.666149380895674; validation accuracy : 0.8776371308016878
Epoch 6:	 train loss : 0.6582079929666865; train accuracy : 0.8925672168587149; 
 validation loss : 0.71031501640

KeyboardInterrupt: 

In [None]:
# Total wall-clock training time, in hours.
_elapsed_hours = (end - start) / 3600
print("Training time :", _elapsed_hours)

In [None]:
# Break the elapsed training time down into hours, minutes and seconds.
_elapsed_seconds = end - start
hT, _within_hour = divmod(_elapsed_seconds, 3600)
mT, sT = divmod(_within_hour, 60)
print("------------------------------ Total time of training {} h {} m and {} s ------------------------------".format(hT, mT, sT))

### Test part

In [None]:
# Evaluate every fold's model on that fold's own left-out test set.
test_accs = []
for fold_idx in range(len(test_dataset)):
    model_number = fold_idx + 1
    print("------------------------------ Let's predict with model {} ! ------------------------------".format(model_number))
    acc = testModel(models_list[fold_idx], test_dataset[fold_idx], devices[fold_idx])
    fold_accuracy = mean(acc)
    test_accs.append(fold_accuracy)
    print("------------------------------ Model {} predict with {} of accuracy ------------------------------".format(model_number, fold_accuracy))

In [None]:
# Unweighted mean of the per-fold accuracies (accumulated left to right).
total_acc = 0
for fold_accuracy in test_accs:
    total_acc += fold_accuracy
print("The mean accuracy is {}".format(total_acc/len(test_accs)))

### Global test

In [None]:
# Majority vote of all fold models on the group held out from the start.
acc = globalTest(models=models_list, test_set=test_set_final, devices=devices)
print("The accuracy obtain on the final test set is {}".format(acc))