# Fine-tuning experiments

Based on the two notebooks : https://github.com/csho33/bacteria-ID/blob/master/1_reference_finetuning.ipynb & https://github.com/csho33/bacteria-ID/blob/master/3_clinical_finetuning.ipynb

In [370]:
from time import time
t00 = time()
import numpy as np
import os,sys,re
import random

## Loading the data

In [371]:
#Load from directory
os.chdir(os.getcwd())
base_dir = 'Raman_Data/'
als_dir = base_dir + 'ALS/'
ctrl_dir = base_dir + 'CTRL/'

base_dir2 = 'Bacteria_TL'
sys.path.append(base_dir2)

models = ['pretrained_model.ckpt', 'finetuned_model.ckpt', 'clinical_pretrained_model.ckpt']

In [372]:
all_files_als = os.listdir(als_dir)
all_files_als.sort(key=lambda f: int(re.sub('\D', '', f)))

all_files_ctrl = os.listdir(ctrl_dir)
all_files_ctrl.sort(key=lambda f: int(re.sub('\D', '', f)))

In [373]:
def parse_text(file, dir):
    with open(dir + file, 'rt') as fd:
        data=[]
        line = fd.readline()
        nline = re.findall(r"[-+]?\d*\.\d+|\d+", line)
        data.append(nline)
        while line:
            line=fd.readline()
            nline = re.findall(r"[-+]?\d*\.\d+|\d+", line)
            data.append(nline)
    return data

In [374]:
X=[] #actual y of spectra
Y=[] # 1 -> als; 0 -> ctrl
coord=[] #actual x of spectra

sep=[60,78,114,150,194,210,225,241,255,280,299,313,323,333,343,353,363,373,383,393] #Il manque le 227
groups=[] #for GROUP K FOLD
group=0
index=1
for f in all_files_als:
    data=[]
    datab=[]
    for e in parse_text(f, als_dir):
        if len(e) > 0:
            datab.append(float(e[0]))
            data.append(float(e[1]))
    coord.append(datab)
    X.append(data)
    Y.append(1)
    groups.append(group)
    if index in sep:
        group+=1
    index+=1
    
sep=[33,76,91,138,149,158,168,178,188,198]
index=1
for f in all_files_ctrl:
    data=[]
    datab=[]
    for e in parse_text(f, ctrl_dir):
        if len(e) > 0:
            datab.append(float(e[0]))
            data.append(float(e[1]))
    coord.append(datab)
    X.append(data)
    Y.append(0)
    groups.append(group)
    if index in sep:
        group+=1
    index+=1

In [375]:
X=np.array(X)
Y=np.array(Y)
groups=np.array(groups)
print(X.shape, Y.shape)

(591, 1174) (591,)


In [376]:
for i in range(len(X)):
    for j in range (len(X[i])):
        if(X[i][j] < 0):
            X[i][j] = 0

## Split our dataset into a finetunable set and a full test set

First split 2/3 -> 20 patients for finetune and 10 for test

In [377]:
patient_idxs_finetune = []

x = list(range(0, 20))
patient_idxs_finetune = random.sample(x,12)
patient_idxs_test = [i for i in x if i not in patient_idxs_finetune]

x2 = list(range(20, 30))
patient_idxs_finetune += random.sample(x2,8)
patient_idxs_test += [i for i in x2 if i not in patient_idxs_finetune]


random.shuffle(patient_idxs_finetune)
random.shuffle(patient_idxs_test)

print(len(patient_idxs_finetune), patient_idxs_finetune)
print(len(patient_idxs_test), patient_idxs_test)

20 [23, 15, 20, 0, 24, 28, 16, 5, 1, 26, 29, 7, 12, 2, 3, 8, 27, 22, 11, 6]
10 [10, 14, 9, 19, 13, 18, 21, 25, 17, 4]


## Load ResNet

In [378]:
from resnet import ResNet
import os
import torch

In [379]:
# CNN parameters
layers = 6
hidden_size = 100
block_size = 2
hidden_sizes = [hidden_size] * layers
num_blocks = [block_size] * layers
input_dim = 1174
in_channels = 64
n_classes = 2 # instead of 30, we use the 2 empiric groupings
os.environ['CUDA_VISIBLE_DEVICES'] = '{}'.format(0)
cuda = torch.cuda.is_available()

In [380]:
#Remove last layers
def removekey(d, listofkeys):
    r = dict(d)
    for key in listofkeys:
        print('key: {} is removed'.format(key))
        r.pop(key)
    return r

In [381]:
def load_model():
    cnn = ResNet(hidden_sizes, num_blocks, input_dim=input_dim,
                    in_channels=in_channels, n_classes=n_classes)
    if cuda: cnn.cuda()

    checkpoint = torch.load(base_dir2 + '/' + models[1], map_location=lambda storage, loc: storage)
    mod_weights = removekey(checkpoint, ['linear.weight', 'linear.bias'])
    cnn.load_state_dict(mod_weights, strict=False)
    return cnn, mod_weights, checkpoint

## Fine-tuning

### Custom Train/Val/Test split

In [382]:
patient_idxs = []
x = patient_idxs_finetune
for i in range(4):
    l = random.sample(x,5)
    patient_idxs.append(l)
    x = [i for i in x if i not in l]
print(patient_idxs)

[[3, 29, 23, 11, 1], [12, 2, 8, 7, 24], [5, 26, 28, 0, 20], [22, 15, 6, 27, 16]]


In [383]:
# Sample train/val/te spectra
idx_tr, idx_val, idx_te = [], [], []
for group_idx, patient_list in enumerate(patient_idxs):
    print('Group {} patients'.format(group_idx))
    print(' Tr: {}'.format(patient_list[:3]))
    print(' Val: {}'.format(patient_list[3]))
    print(' Te : {}'.format(patient_list[4]))
    for j, patient in enumerate(patient_list):
        l= np.where(groups == patient)
        start_idx = l[0][0]
        end_idx = l[0][len(l[0])-1]
        idx_range = list(range(start_idx, end_idx+1))
        print(patient, idx_range, len(idx_range))
        np.random.shuffle(idx_range)
        #idx_sample = idx_range[:5]
        if j < 3:
            idx_tr.extend(idx_range)
        elif j ==3:
            idx_val.extend(idx_range)
        else:
            idx_te.extend(idx_range)

Group 0 patients
 Tr: [3, 29, 23]
 Val: 11
 Te : 1
3 [114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149] 36
29 [581, 582, 583, 584, 585, 586, 587, 588, 589, 590] 10
23 [484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, 510, 511, 512, 513, 514, 515, 516, 517, 518, 519, 520, 521, 522, 523, 524, 525, 526, 527, 528, 529, 530] 47
11 [299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312] 14
1 [60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77] 18
Group 1 patients
 Tr: [12, 2, 8]
 Val: 7
 Te : 24
12 [313, 314, 315, 316, 317, 318, 319, 320, 321, 322] 10
2 [78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113] 36
8 [241, 242, 243, 244, 245, 246, 247, 

In [384]:
print(len(idx_tr))
print(len(idx_val))
print(len(idx_te))

229
100
72


## Finetuning

In [385]:
from datasets import spectral_dataloader
from training import run_epoch
from torch import optim

In [386]:
cnn, _, _ = load_model()
# Fine-tune CNN
epochs = 1 # Change this number to ~30 for full training
batch_size = 10
t0 = time()
# Set up Adam optimizer
optimizer = optim.Adam(cnn.parameters(), lr=1e-3, betas=(0.5, 0.999))
# Set up dataloaders
dl_tr = spectral_dataloader(X, Y, idxs=idx_tr,
    batch_size=batch_size, shuffle=True)
dl_val = spectral_dataloader(X, Y, idxs=idx_val,
    batch_size=batch_size, shuffle=False)
dl_te = spectral_dataloader(X, Y, idxs=idx_te,
    batch_size=batch_size, shuffle=False)
# Fine-tune CNN for first fold
best_val = 0
no_improvement = 0
max_no_improvement = 5
print('Starting fine-tuning!')
for epoch in range(epochs):
    print(' Epoch {}: {:0.2f}s'.format(epoch+1, time()-t0))
    # Train
    acc_tr, loss_tr = run_epoch(epoch, cnn, dl_tr, cuda,
        training=True, optimizer=optimizer)
    print('  Train acc: {:0.2f}'.format(acc_tr))
    # Val
    acc_val, loss_val = run_epoch(epoch, cnn, dl_val, cuda,
        training=False, optimizer=optimizer)
    print('  Val acc: {:0.2f}'.format(acc_val))
    # Test
    acc_te, loss_te = run_epoch(epoch, cnn, dl_te, cuda,
        training=False, optimizer=optimizer)
    print('  Test acc: {:0.2f}'.format(acc_te))
    # Check performance for early stopping
    if acc_val > best_val or epoch == 0:
        best_val = acc_val
        no_improvement = 0
    else:
        no_improvement += 1
    if no_improvement >= max_no_improvement:
        print('Finished after {} epochs!'.format(epoch+1))
        break
print('Finished: {:0.2f}s'.format(time()-t0))

key: linear.weight is removed
key: linear.bias is removed
Starting fine-tuning!
 Epoch 1: 0.00s
  Train acc: 92.14
  Val acc: 99.00
  Test acc: 87.50
Finished: 20.65s


## Make predictions

In [387]:
from training import get_predictions
from scipy import stats
from statistics import mean

In [388]:
# Getting test indices
idx_te = []
for group_idx in patient_idxs_test:
    l= np.where(groups == group_idx)
    start_idx = l[0][0]
    end_idx = l[0][len(l[0])-1]
    idx_te += list(range(start_idx, end_idx+1))
dl_te = spectral_dataloader(X, Y, idxs=idx_te,
    batch_size=batch_size, shuffle=False)

In [389]:
t0 = time()
# Make predictions on the 5 test patients
y_hat = get_predictions(cnn, dl_te, cuda)
print('Finished: {:0.2f}s'.format(time()-t0))

Finished: 5.98s


In [390]:
Y_l = []
for i in range(len(Y)):
    if i in idx_te:
        Y_l.append(Y[i])

In [391]:
# Computing accuracy
acc = (y_hat == Y_l).mean()
print('Accuracy: {:0.1f}%'.format(100*acc))

Accuracy: 55.3%


In [392]:
print(Y_l)
print(y_hat)

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1
 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 0 0 0 1 0 0 0 1 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1
 1 1 1 0 0 0 0 0 0 0 0 1 1 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1]
