# Fine-tuning experiments

Based on the two notebooks : https://github.com/csho33/bacteria-ID/blob/master/1_reference_finetuning.ipynb & https://github.com/csho33/bacteria-ID/blob/master/3_clinical_finetuning.ipynb

In [417]:
from time import time
t00 = time()
import numpy as np
import os,sys,re
import random

## Loading the data

In [418]:
#Load from directory
os.chdir(os.getcwd())
base_dir = 'Raman_Data/'
als_dir = base_dir + 'ALS/'
ctrl_dir = base_dir + 'CTRL/'

base_dir2 = 'Bacteria_TL'
sys.path.append(base_dir2)

models = ['pretrained_model.ckpt', 'finetuned_model.ckpt', 'clinical_pretrained_model.ckpt']

In [419]:
all_files_als = os.listdir(als_dir)
all_files_als.sort(key=lambda f: int(re.sub('\D', '', f)))

all_files_ctrl = os.listdir(ctrl_dir)
all_files_ctrl.sort(key=lambda f: int(re.sub('\D', '', f)))

In [420]:
def parse_text(file, dir):
    with open(dir + file, 'rt') as fd:
        data=[]
        line = fd.readline()
        nline = re.findall(r"[-+]?\d*\.\d+|\d+", line)
        data.append(nline)
        while line:
            line=fd.readline()
            nline = re.findall(r"[-+]?\d*\.\d+|\d+", line)
            data.append(nline)
    return data

In [421]:
X=[] #actual y of spectra
Y=[] # 1 -> als; 0 -> ctrl
coord=[] #actual x of spectra

sep=[60,78,114,150,194,210,225,241,255,280,299,313,323,333,343,353,363,373,383,393] #Il manque le 227
groups=[] #for GROUP K FOLD
group=0
index=1
for f in all_files_als:
    data=[]
    datab=[]
    for e in parse_text(f, als_dir):
        if len(e) > 0:
            datab.append(float(e[0]))
            data.append(float(e[1]))
    coord.append(datab)
    X.append(data)
    Y.append(1)
    groups.append(group)
    if index in sep:
        group+=1
    index+=1
    
sep=[33,76,91,138,149,158,168,178,188,198]
index=1
for f in all_files_ctrl:
    data=[]
    datab=[]
    for e in parse_text(f, ctrl_dir):
        if len(e) > 0:
            datab.append(float(e[0]))
            data.append(float(e[1]))
    coord.append(datab)
    X.append(data)
    Y.append(0)
    groups.append(group)
    if index in sep:
        group+=1
    index+=1

In [422]:
X=np.array(X)
Y=np.array(Y)
groups=np.array(groups)
print(X.shape, Y.shape)

(591, 1174) (591,)


In [423]:
for i in range(len(X)):
    for j in range (len(X[i])):
        if(X[i][j] < 0):
            X[i][j] = 0

## Split our dataset into a finetunable set and a full test set

First, we decide to split our dataset into :
1) a "finetunable" set for finetune the pretrained model on our data

2) a "full test" set i.e the left over patients to test our resulting finetuned model

The split ratio is 2/3 -> 20 patients for finetune (12 ALS & 8 CTRL) and 10 for test (8 ALS & 2 CTRL). 

In [424]:
def split_dataset(num_als, num_ctrl):
    patient_idxs_finetune = []

    x = list(range(0, 20))
    patient_idxs_finetune = random.sample(x,num_als)
    patient_idxs_test = [i for i in x if i not in patient_idxs_finetune]

    x2 = list(range(20, 30))
    patient_idxs_finetune += random.sample(x2,num_ctrl)
    patient_idxs_test += [i for i in x2 if i not in patient_idxs_finetune]

    #Shuffle to avoid implicit leakage (1 the firsts and 0 the lasts)
    random.shuffle(patient_idxs_finetune)
    random.shuffle(patient_idxs_test)
    return patient_idxs_finetune, patient_idxs_test

In [425]:
#Some test to be sure
patient_idxs_finetune, patient_idxs_test = split_dataset(12, 8)
print(len(patient_idxs_finetune), patient_idxs_finetune)
print(len(patient_idxs_test), patient_idxs_test)

20 [9, 15, 4, 14, 19, 17, 20, 23, 26, 29, 28, 5, 25, 7, 21, 24, 1, 2, 13, 18]
10 [8, 12, 11, 27, 6, 10, 16, 0, 22, 3]


## Load ResNet

In [426]:
from resnet import ResNet
import os
import torch

In [427]:
# CNN parameters
layers = 6
hidden_size = 100
block_size = 2
hidden_sizes = [hidden_size] * layers
num_blocks = [block_size] * layers
input_dim = 1174
in_channels = 64
n_classes = 2 # instead of 30, we use the 2 empiric groupings
os.environ['CUDA_VISIBLE_DEVICES'] = '{}'.format(0)
cuda = torch.cuda.is_available()

In [428]:
#Remove last layers
def removekey(d, listofkeys):
    r = dict(d)
    for key in listofkeys:
        print('key: {} is removed'.format(key))
        r.pop(key)
    return r

In [429]:
def load_model(n=0):
    cnn = ResNet(hidden_sizes, num_blocks, input_dim=input_dim,
                    in_channels=in_channels, n_classes=n_classes)
    if cuda: cnn.cuda()

    checkpoint = torch.load(base_dir2 + '/' + models[n], map_location=lambda storage, loc: storage)
    mod_weights = removekey(checkpoint, ['linear.weight', 'linear.bias'])
    cnn.load_state_dict(mod_weights, strict=False)
    return cnn, mod_weights, checkpoint

## Fine-tuning

### Custom Train/Val/Test split

Based on the "clinical" notebook we decide to implement a custom method of train/val/test split : the 20 patients are grouped into 4 sub-groups. The first 3 patients of each group are assigned to the training set, the 4th to the validation set and the 5th into the test set.

In [430]:
#20 patients into 4 groups of 5 patients
def group_patients(patient_idxs_finetune):
    patient_idxs = []
    x = patient_idxs_finetune
    for i in range(4):
        l = random.sample(x,5)
        patient_idxs.append(l)
        x = [i for i in x if i not in l]
    return patient_idxs

In [431]:
patient_idxs = group_patients(patient_idxs_finetune)

In [432]:
# Sample train/val/te spectra -> each group [train, train, train, val, test]
def custom_split_finetuning(patient_idxs):
    idx_tr, idx_val, idx_te = [], [], []
    for group_idx, patient_list in enumerate(patient_idxs):
        print('Group {} patients'.format(group_idx))
        print(' Tr: {}'.format(patient_list[:3]))
        print(' Val: {}'.format(patient_list[3]))
        print(' Te : {}'.format(patient_list[4]))
        for j, patient in enumerate(patient_list):
            l= np.where(groups == patient)
            start_idx = l[0][0]
            end_idx = l[0][len(l[0])-1]
            idx_range = list(range(start_idx, end_idx+1))
            np.random.shuffle(idx_range) #-> do we shuffle ? 
            print(patient, idx_range, len(idx_range))
            if j < 3:
                idx_tr.extend(idx_range)
            elif j ==3:
                idx_val.extend(idx_range)
            else:
                idx_te.extend(idx_range)
    return idx_tr, idx_val, idx_te

In [433]:
#Outputs len
idx_tr, idx_val, idx_te = custom_split_finetuning(patient_idxs)
print(len(idx_tr))
print(len(idx_val))
print(len(idx_te))

Group 0 patients
 Tr: [5, 9, 4]
 Val: 24
 Te : 13
5 [201, 203, 204, 202, 209, 195, 196, 197, 198, 200, 194, 206, 207, 199, 208, 205] 16
9 [262, 258, 269, 259, 275, 272, 266, 279, 264, 273, 265, 274, 257, 268, 261, 271, 278, 256, 267, 260, 270, 276, 255, 277, 263] 25
4 [179, 165, 170, 191, 156, 152, 167, 175, 158, 178, 183, 193, 159, 162, 190, 169, 172, 173, 161, 154, 192, 160, 189, 151, 168, 150, 187, 171, 153, 164, 181, 177, 155, 184, 174, 185, 166, 182, 176, 188, 163, 186, 157, 180] 44
24 [534, 540, 533, 532, 537, 536, 541, 531, 539, 538, 535] 11
13 [331, 330, 325, 323, 324, 332, 326, 329, 328, 327] 10
Group 1 patients
 Tr: [25, 1, 23]
 Val: 18
 Te : 17
25 [547, 548, 543, 542, 545, 546, 549, 544, 550] 9
1 [63, 71, 60, 74, 75, 66, 61, 69, 70, 73, 65, 67, 68, 64, 72, 62, 77, 76] 18
23 [508, 490, 487, 506, 521, 498, 489, 515, 496, 524, 525, 516, 492, 530, 510, 505, 501, 528, 485, 523, 514, 491, 494, 507, 518, 513, 486, 526, 504, 527, 509, 499, 520, 488, 512, 502, 517, 493, 519, 522, 500

## Finetuning based on custom split

In [434]:
from datasets import spectral_dataloader
from training import run_epoch
from torch import optim

In [435]:
# Fine-tune CNN
def finetune(cnn, idx_tr, idx_val, idx_te):
    epochs = 1
    batch_size = 10
    t0 = time()
    # Set up Adam optimizer
    optimizer = optim.Adam(cnn.parameters(), lr=1e-3, betas=(0.5, 0.999))
    # Set up dataloaders
    dl_tr = spectral_dataloader(X, Y, idxs=idx_tr,
        batch_size=batch_size, shuffle=True)
    dl_val = spectral_dataloader(X, Y, idxs=idx_val,
        batch_size=batch_size, shuffle=False)
    dl_te = spectral_dataloader(X, Y, idxs=idx_te,
        batch_size=batch_size, shuffle=False)
    # Fine-tune CNN for first fold
    best_val = 0
    no_improvement = 0
    max_no_improvement = 5
    print('Starting fine-tuning!')
    for epoch in range(epochs):
        print(' Epoch {}: {:0.2f}s'.format(epoch+1, time()-t0))
        # Train
        acc_tr, loss_tr = run_epoch(epoch, cnn, dl_tr, cuda,
            training=True, optimizer=optimizer)
        print('  Train acc: {:0.2f}'.format(acc_tr))
        # Val
        acc_val, loss_val = run_epoch(epoch, cnn, dl_val, cuda,
            training=False, optimizer=optimizer)
        print('  Val acc: {:0.2f}'.format(acc_val))
        # Test
        acc_te, loss_te = run_epoch(epoch, cnn, dl_te, cuda,
            training=False, optimizer=optimizer)
        print('  Test acc: {:0.2f}'.format(acc_te))
        # Check performance for early stopping
        if acc_val > best_val or epoch == 0:
            best_val = acc_val
            no_improvement = 0
        else:
            no_improvement += 1
        if no_improvement >= max_no_improvement:
            print('Finished after {} epochs!'.format(epoch+1))
            break
    print('Finished: {:0.2f}s'.format(time()-t0))

## Make predictions

In [436]:
from training import get_predictions
from scipy import stats
from statistics import mean

In [437]:
#1/ Getting test indices
def get_test_indices(patient_idxs_test):
    idx_te = []
    for group_idx in patient_idxs_test:
        l= np.where(groups == group_idx)
        start_idx = l[0][0]
        end_idx = l[0][len(l[0])-1]
        idx_te += list(range(start_idx, end_idx+1))
    return idx_te

In [438]:
def predict(cnn, idx_te):
    #1/ Predicting on finetuned model
    dl_te = spectral_dataloader(X, Y, idxs=idx_te,
        batch_size=batch_size, shuffle=False)
    #t0 = time()
    y_hat = get_predictions(cnn, dl_te, cuda)
    #print('Finished: {:0.2f}s'.format(time()-t0))
    #2/ Getting the right Y indices for comparing
    Y_l = []
    for i in range(len(Y)):
        if i in idx_te:
            Y_l.append(Y[i])
    #3/ Computing accuracy and std
    acc = (y_hat == Y_l).mean()
    print('Accuracy: {:0.1f}%'.format(100*acc))
    return acc

## Get average accuracy and std on finetuned models

In [439]:
def results(num_model, idx_tr, idx_val, idx_te, fn_idx_te):
    list_acc=[]
    #Average on 10 times
    for i in range(10):
        print(i+1)
        #Load model
        cnn, _, _ = load_model(num_model)
        #finetune it owith custom split
        finetune(cnn, idx_tr, idx_val, idx_te)
        #get accuracy to make an average and std on test set
        list_acc.append(predict(cnn, fn_idx_te))
    return list_acc

## Trials :

Distribution of 12 ALS & 8 CTRL :

In [440]:
num_als=12
num_ctrl = 8
patient_idxs_finetune, patient_idxs_test = split_dataset(num_als, num_ctrl)
patient_idxs = group_patients(patient_idxs_finetune)
idx_tr, idx_val, idx_te = custom_split_finetuning(patient_idxs)

Group 0 patients
 Tr: [18, 4, 11]
 Val: 17
 Te : 29
18 [375, 376, 380, 382, 379, 378, 377, 381, 374, 373] 10
4 [171, 187, 150, 154, 186, 151, 170, 156, 152, 158, 161, 160, 153, 166, 189, 188, 176, 172, 167, 190, 173, 178, 192, 164, 182, 191, 181, 159, 180, 162, 168, 163, 183, 155, 184, 175, 185, 157, 174, 179, 165, 169, 193, 177] 44
11 [302, 304, 300, 303, 312, 301, 308, 311, 299, 305, 306, 309, 307, 310] 14
17 [368, 369, 367, 365, 371, 366, 364, 363, 370, 372] 10
29 [590, 585, 581, 584, 588, 587, 582, 583, 589, 586] 10
Group 1 patients
 Tr: [25, 20, 1]
 Val: 15
 Te : 13
25 [545, 544, 549, 542, 543, 547, 548, 550, 546] 9
20 [425, 406, 396, 419, 394, 415, 397, 400, 401, 395, 421, 417, 393, 424, 409, 407, 405, 420, 416, 412, 422, 418, 410, 402, 404, 411, 414, 413, 408, 423, 398, 399, 403] 33
1 [64, 76, 69, 71, 63, 62, 66, 73, 67, 75, 65, 72, 60, 74, 61, 77, 70, 68] 18
15 [343, 346, 345, 347, 350, 344, 348, 349, 351, 352] 10
13 [323, 324, 328, 332, 329, 331, 327, 326, 330, 325] 10
Group 2

In [441]:
n1 = len([i for i in idx_tr if i < 393])
n2 = len([i for i in idx_val if i < 393])
n3 = len([i for i in idx_te if i < 393])
print("Number of samples to train the finetune model : ", len(idx_tr), " -> ALS : ", n1, ", CTRL : ", len(idx_tr)-n1)
print("Number of samples to validate the finetune model : ", len(idx_val), " -> ALS : ", n2, ", CTRL : ", len(idx_val)-n2)
print("Number of samples to test the finetune model : ", len(idx_te), " -> ALS : ", n3, ", CTRL : ", len(idx_te)-n3)

Number of samples to train the finetune model :  278  -> ALS :  126 , CTRL :  152
Number of samples to validate the finetune model :  92  -> ALS :  92 , CTRL :  0
Number of samples to test the finetune model :  41  -> ALS :  20 , CTRL :  21


In [442]:
fn_idx_te = get_test_indices(patient_idxs_test)
m= len([i for i in fn_idx_te if i < 393])
print("Number of samples to test : ", len(fn_idx_te), " -> ALS : ", m, ", CTRL : ", len(fn_idx_te)-m)

Number of samples to test :  180  -> ALS :  155 , CTRL :  25


- pretrained_model :

In [443]:
list_acc = results(0, idx_tr, idx_val, idx_te, fn_idx_te)
print(mean(list_acc))
print(np.std(list_acc))

1
key: linear.weight is removed
key: linear.bias is removed
Starting fine-tuning!
 Epoch 1: 0.00s
  Train acc: 83.81
  Val acc: 100.00
  Test acc: 100.00
Finished: 7.53s
Accuracy: 72.8%
2
key: linear.weight is removed
key: linear.bias is removed
Starting fine-tuning!
 Epoch 1: 0.00s
  Train acc: 83.81
  Val acc: 100.00
  Test acc: 97.56
Finished: 7.42s
Accuracy: 66.7%
3
key: linear.weight is removed
key: linear.bias is removed
Starting fine-tuning!
 Epoch 1: 0.00s
  Train acc: 79.14
  Val acc: 94.57
  Test acc: 100.00
Finished: 7.38s
Accuracy: 65.6%
4
key: linear.weight is removed
key: linear.bias is removed
Starting fine-tuning!
 Epoch 1: 0.00s
  Train acc: 80.22
  Val acc: 100.00
  Test acc: 100.00
Finished: 7.34s
Accuracy: 65.0%
5
key: linear.weight is removed
key: linear.bias is removed
Starting fine-tuning!
 Epoch 1: 0.00s
  Train acc: 81.29
  Val acc: 100.00
  Test acc: 97.56
Finished: 7.37s
Accuracy: 68.9%
6
key: linear.weight is removed
key: linear.bias is removed
Starting fine

- finetuned_model :

In [444]:
list_acc = results(1, idx_tr, idx_val, idx_te, fn_idx_te)
print(mean(list_acc))
print(np.std(list_acc))

1
key: linear.weight is removed
key: linear.bias is removed
Starting fine-tuning!
 Epoch 1: 0.00s
  Train acc: 83.45
  Val acc: 100.00
  Test acc: 100.00
Finished: 7.40s
Accuracy: 64.4%
2
key: linear.weight is removed
key: linear.bias is removed
Starting fine-tuning!
 Epoch 1: 0.00s
  Train acc: 87.41
  Val acc: 100.00
  Test acc: 100.00
Finished: 7.30s
Accuracy: 62.8%
3
key: linear.weight is removed
key: linear.bias is removed
Starting fine-tuning!
 Epoch 1: 0.00s
  Train acc: 85.61
  Val acc: 86.96
  Test acc: 100.00
Finished: 7.32s
Accuracy: 62.2%
4
key: linear.weight is removed
key: linear.bias is removed
Starting fine-tuning!
 Epoch 1: 0.00s
  Train acc: 84.89
  Val acc: 100.00
  Test acc: 100.00
Finished: 7.39s
Accuracy: 68.3%
5
key: linear.weight is removed
key: linear.bias is removed
Starting fine-tuning!
 Epoch 1: 0.00s
  Train acc: 85.61
  Val acc: 98.91
  Test acc: 100.00
Finished: 7.38s
Accuracy: 64.4%
6
key: linear.weight is removed
key: linear.bias is removed
Starting fin

- clinical_pretrained_model

In [445]:
list_acc = results(2, idx_tr, idx_val, idx_te, fn_idx_te)
print(mean(list_acc))
print(np.std(list_acc))

1
key: linear.weight is removed
key: linear.bias is removed
Starting fine-tuning!
 Epoch 1: 0.00s
  Train acc: 78.42
  Val acc: 100.00
  Test acc: 100.00
Finished: 7.97s
Accuracy: 66.1%
2
key: linear.weight is removed
key: linear.bias is removed
Starting fine-tuning!
 Epoch 1: 0.00s
  Train acc: 79.86
  Val acc: 86.96
  Test acc: 100.00
Finished: 7.35s
Accuracy: 67.8%
3
key: linear.weight is removed
key: linear.bias is removed
Starting fine-tuning!
 Epoch 1: 0.00s
  Train acc: 83.09
  Val acc: 98.91
  Test acc: 100.00
Finished: 7.38s
Accuracy: 71.7%
4
key: linear.weight is removed
key: linear.bias is removed
Starting fine-tuning!
 Epoch 1: 0.00s
  Train acc: 82.37
  Val acc: 100.00
  Test acc: 95.12
Finished: 7.27s
Accuracy: 73.3%
5
key: linear.weight is removed
key: linear.bias is removed
Starting fine-tuning!
 Epoch 1: 0.00s
  Train acc: 85.97
  Val acc: 98.91
  Test acc: 100.00
Finished: 7.32s
Accuracy: 68.9%
6
key: linear.weight is removed
key: linear.bias is removed
Starting fine-

Distribution of 15 ALS & 5 CTRL

In [446]:
num_als=15
num_ctrl = 5
patient_idxs_finetune, patient_idxs_test = split_dataset(num_als, num_ctrl)
patient_idxs = group_patients(patient_idxs_finetune)
idx_tr, idx_val, idx_te = custom_split_finetuning(patient_idxs)

Group 0 patients
 Tr: [0, 10, 21]
 Val: 12
 Te : 14
0 [8, 21, 38, 48, 32, 10, 45, 7, 33, 11, 12, 55, 2, 49, 54, 30, 46, 59, 35, 25, 23, 22, 52, 53, 56, 43, 39, 47, 18, 36, 50, 4, 31, 51, 34, 29, 15, 9, 37, 13, 5, 27, 6, 26, 16, 17, 1, 14, 24, 57, 3, 58, 40, 0, 19, 44, 28, 41, 20, 42] 60
10 [291, 286, 289, 283, 288, 280, 285, 297, 296, 287, 294, 282, 298, 290, 292, 293, 295, 281, 284] 19
21 [430, 445, 437, 457, 426, 444, 443, 441, 456, 428, 448, 439, 468, 466, 451, 433, 442, 438, 459, 465, 446, 427, 460, 431, 467, 458, 464, 463, 447, 462, 434, 440, 453, 450, 436, 455, 435, 432, 449, 461, 454, 452, 429] 43
12 [314, 321, 318, 319, 322, 317, 316, 320, 313, 315] 10
14 [334, 333, 338, 341, 342, 336, 335, 337, 339, 340] 10
Group 1 patients
 Tr: [8, 27, 7]
 Val: 1
 Te : 29
8 [246, 248, 253, 252, 242, 245, 250, 251, 243, 254, 241, 247, 249, 244] 14
27 [561, 564, 566, 565, 563, 567, 569, 568, 570, 562] 10
7 [240, 225, 228, 236, 234, 239, 235, 230, 232, 233, 231, 229, 237, 238, 227, 226] 16
1 [74

In [447]:
n1 = len([i for i in idx_tr if i < 393])
n2 = len([i for i in idx_val if i < 393])
n3 = len([i for i in idx_te if i < 393])
print("Number of samples to train the finetune model : ", len(idx_tr), " -> ALS : ", n1, ", CTRL : ", len(idx_tr)-n1)
print("Number of samples to validate the finetune model : ", len(idx_val), " -> ALS : ", n2, ", CTRL : ", len(idx_val)-n2)
print("Number of samples to test the finetune model : ", len(idx_te), " -> ALS : ", n3, ", CTRL : ", len(idx_te)-n3)

Number of samples to train the finetune model :  255  -> ALS :  191 , CTRL :  64
Number of samples to validate the finetune model :  48  -> ALS :  48 , CTRL :  0
Number of samples to test the finetune model :  79  -> ALS :  54 , CTRL :  25


In [448]:
fn_idx_te = get_test_indices(patient_idxs_test)
m= len([i for i in fn_idx_te if i < 393])
print("Number of samples to test : ", len(fn_idx_te), " -> ALS : ", m, ", CTRL : ", len(fn_idx_te)-m)

Number of samples to test :  209  -> ALS :  100 , CTRL :  109


In [449]:
list_acc = results(1, idx_tr, idx_val, idx_te, fn_idx_te)
print(mean(list_acc))
print(np.std(list_acc))

1
key: linear.weight is removed
key: linear.bias is removed
Starting fine-tuning!
 Epoch 1: 0.00s
  Train acc: 90.98
  Val acc: 100.00
  Test acc: 84.81
Finished: 7.69s
Accuracy: 40.2%
2
key: linear.weight is removed
key: linear.bias is removed
Starting fine-tuning!
 Epoch 1: 0.00s
  Train acc: 87.06
  Val acc: 100.00
  Test acc: 89.87
Finished: 7.51s
Accuracy: 35.9%
3
key: linear.weight is removed
key: linear.bias is removed
Starting fine-tuning!
 Epoch 1: 0.00s
  Train acc: 88.63
  Val acc: 100.00
  Test acc: 83.54
Finished: 7.30s
Accuracy: 42.1%
4
key: linear.weight is removed
key: linear.bias is removed
Starting fine-tuning!
 Epoch 1: 0.00s
  Train acc: 88.63
  Val acc: 100.00
  Test acc: 86.08
Finished: 7.29s
Accuracy: 35.9%
5
key: linear.weight is removed
key: linear.bias is removed
Starting fine-tuning!
 Epoch 1: 0.00s
  Train acc: 89.02
  Val acc: 100.00
  Test acc: 82.28
Finished: 7.30s
Accuracy: 40.7%
6
key: linear.weight is removed
key: linear.bias is removed
Starting fine-t

The accuracy is corely correlated to the number of samples -> unbalanced data 