# Fine-tuning on the reference dataset

Based on the first notebook of the bacteria-ID repository: https://github.com/csho33/bacteria-ID

## Libraries and imports

In [30]:
from time import time
t00 = time()
import numpy as np
import os, re, sys
import tensorflow as tf
import torch
from sklearn.preprocessing import StandardScaler
from sklearn import decomposition
from statistics import mean

## Loading the data

In [31]:
# Option 1 (disabled): read everything from Google Drive when running on Colab.
#drive.mount('/content/drive', force_remount=True)
#root_dir = "/content/drive/My Drive/"
#base_dir = root_dir + 'Raman_Data/'
#base_dir2 = root_dir + 'Bacteria_TL'

# Option 2 (active): read from folders located next to the notebook.
# chdir() to the current working directory leaves the location unchanged;
# every path below is relative to it.
os.chdir(os.getcwd())
base_dir = 'Raman_Data/'
# One sub-folder per class
als_dir, ctrl_dir = (base_dir + sub for sub in ('ALS/', 'CTRL/'))

# Folder holding the checkpoints and the project modules imported further down
# (resnet, training, datasets); it is appended to the import path.
base_dir2 = 'Bacteria_TL'
sys.path.append(base_dir2)

# Checkpoint file names; load_model() reads models[0].
models = [
    'pretrained_model.ckpt',
    'finetuned_model.ckpt',
    'clinical_pretrained_model.ckpt',
]

In [32]:
# List the spectrum files of each class and print how many were found.
# os.listdir() returns the entries in arbitrary order.
all_files_als = os.listdir(als_dir)
print(len(all_files_als))
all_files_ctrl = os.listdir(ctrl_dir)
print(len(all_files_ctrl))

393
198


Sort the files in ascending order of the number embedded in each file name.

In [33]:
def _file_number(name):
    """Return the integer formed by all the digits contained in ``name``.

    Every non-digit character is stripped and the remaining digits are read as
    one integer (so 'a1b2.txt' gives 12). A name without any digit raises
    ValueError, because int('') is invalid.
    """
    # Raw string: '\D' inside a plain literal is an invalid escape sequence,
    # which newer Python versions report with a warning.
    return int(re.sub(r'\D', '', name))


# Sort both listings in place, numerically rather than alphabetically.
# The group boundaries used when building the dataset refer to positions in
# this sorted order.
all_files_als.sort(key=_file_number)
all_files_ctrl.sort(key=_file_number)

### Utility function to parse data files

In [34]:
def parse_text(file, dir):
    """Extract the numeric tokens found on every line of a text file.

    Args:
        file: Name of the file to read.
        dir: Prefix placed directly in front of ``file`` (plain string
            concatenation), so it must already end with a path separator.

    Returns:
        A list with one entry per line of the file, each entry being the list
        of numeric tokens (as strings, sign included) found on that line.
        Lines without any number yield an empty list. One extra empty list is
        always appended at the end, matching the end-of-file entry this
        function has always produced; callers skip empty entries.
    """
    # The optional sign has to cover both alternatives. Written as
    # "[-+]?\d*\.\d+|\d+" it only applied to decimals, so "-12" came back
    # as "12" and negative integers silently turned positive.
    number = r"[-+]?(?:\d*\.\d+|\d+)"
    with open(dir + file, 'rt') as fd:
        data = [re.findall(number, line) for line in fd]
    # End-of-file entry kept for backward compatibility.
    data.append([])
    return data

### Create X and Y

In [47]:
X = []      # y values of each spectrum (second number of every line)
Y = []      # labels: 1 -> als; 0 -> ctrl
coord = []  # x values of each spectrum (first number of every line)

# 1-based positions in the sorted ALS file list after which the group id is
# incremented. Original author's note: 227 is missing.
sep = [60, 78, 114, 150, 194, 210, 225, 241, 255, 280,
       299, 313, 323, 333, 343, 353, 363, 373, 383, 393]
groups = []  # group id of every spectrum, for GROUP K FOLD
group = 0
index = 1
for f in all_files_als:
    # Keep only the lines on which parse_text found at least one number
    rows = [e for e in parse_text(f, als_dir) if len(e) > 0]
    datab = [float(e[0]) for e in rows]
    data = [float(e[1]) for e in rows]
    coord.append(datab)
    X.append(data)
    Y.append(1)
    groups.append(group)
    # The file at a boundary position still belongs to the current group;
    # the next file starts a new one.
    if index in sep:
        group += 1
    index += 1

In [48]:
# Append the control spectra to the lists started in the ALS cell.
# `group` is deliberately not reset, so control group ids continue after the
# ALS ones; only the boundaries and the position counter are restarted.
sep = [33, 76, 91, 138, 149, 158, 168, 178, 188, 198]
index = 1
for f in all_files_ctrl:
    # Keep only the lines on which parse_text found at least one number
    rows = [e for e in parse_text(f, ctrl_dir) if len(e) > 0]
    datab = [float(e[0]) for e in rows]
    data = [float(e[1]) for e in rows]
    coord.append(datab)
    X.append(data)
    Y.append(0)
    groups.append(group)
    if index in sep:
        group += 1
    index += 1

In [49]:
# Turn the three Python lists into NumPy arrays (same names are rebound),
# then show the resulting shapes of the features and the labels.
X, Y, groups = (np.array(seq) for seq in (X, Y, groups))
print(X.shape, Y.shape)

(591, 1174) (591,)


### Remove negative values from spectra

In [50]:
# Set every negative intensity to zero, in place.
# A boolean mask does in one vectorised step what the previous element-by-
# element Python loop did over the whole (spectra x points) array.
# Assumes X is the 2-D numeric array built in the cell above.
X[X < 0] = 0

## PCA

In [51]:
# Create a scaler object
sc = StandardScaler()

# Fit the scaler to the features and transform
X_std = sc.fit_transform(X)

In [52]:
# Create a PCA object that keeps 500 components
pca = decomposition.PCA(n_components=500)

# Fit the PCA on the standardised spectra and project them;
# the printed shape is (number of spectra, number of components).
X_std_pca = pca.fit_transform(X_std)
print(X_std_pca.shape)

(591, 500)


## Loading pre-trained CNN

In [53]:
# CNN parameters
layers = 6
hidden_size = 100
block_size = 2
hidden_sizes = [hidden_size] * layers
num_blocks = [block_size] * layers
input_dim = 1174
in_channels = 64
n_classes = 2 # 2 classes -> 0 : ctrl & 1 : als
os.environ['CUDA_VISIBLE_DEVICES'] = '{}'.format(0)
cuda = torch.cuda.is_available()

In [54]:
#Remove last layers
def removekey(d, listofkeys):
    """Return a copy of ``d`` without the entries named in ``listofkeys``.

    ``d`` itself is left untouched. A message is printed for each key before
    it is dropped, and a key that is not present raises KeyError.
    """
    remaining = dict(d)
    for name in listofkeys:
        print('key: {} is removed'.format(name))
        del remaining[name]
    return remaining

In [55]:
from resnet import ResNet
# Load pre trained model
def load_model():
    """Build a fresh ResNet and initialise it from the pre-trained checkpoint.

    The network is created with the module-level CNN parameters and moved to
    the GPU when ``cuda`` is true. The checkpoint ``models[0]`` is read from
    ``base_dir2``; its ``linear.weight`` and ``linear.bias`` entries are
    dropped because the checkpoint's number of classes differs from
    ``n_classes``, and the remaining entries are loaded non-strictly.

    Returns:
        tuple: ``(cnn, mod_weights, checkpoint)`` - the network, the state
        dict that was actually loaded (without the final linear layer) and
        the complete checkpoint as read from disk.
    """
    resnet_kwargs = dict(input_dim=input_dim, in_channels=in_channels,
                         n_classes=n_classes)
    cnn = ResNet(hidden_sizes, num_blocks, **resnet_kwargs)
    if cuda:
        cnn.cuda()

    def _keep_storage(storage, loc):
        # Return the storage unchanged so the tensors are not moved back to
        # the device they were saved from.
        return storage

    checkpoint_path = '{}/{}'.format(base_dir2, models[0])
    checkpoint = torch.load(checkpoint_path, map_location=_keep_storage)

    # Loading the checkpoint as-is fails on the final linear layer (differing
    # number of classes), hence these two entries are removed first.
    head_keys = ['linear.weight', 'linear.bias']
    mod_weights = removekey(checkpoint, head_keys)
    # strict=False: the keys removed above are allowed to be missing.
    cnn.load_state_dict(mod_weights, strict=False)
    return cnn, mod_weights, checkpoint

In [56]:
# Build the network and load the pre-trained weights; the final linear layer
# is not taken from the checkpoint (see load_model).
cnn, mod_weights, checkpoint = load_model()

key: linear.weight is removed
key: linear.bias is removed


In [57]:
# Names of the checkpoint entries that are loaded into the network
for key in mod_weights:
    print(key)

# Shapes of one batch-norm statistic and of the checkpoint's final linear
# layer (the two entries that are not loaded)
for name in ('encoder.5.1.bn2.running_var', 'linear.weight', 'linear.bias'):
    print(checkpoint[name].shape)

# Architecture summary
print(cnn)

conv1.weight
bn1.weight
bn1.bias
bn1.running_mean
bn1.running_var
encoder.0.0.conv1.weight
encoder.0.0.bn1.weight
encoder.0.0.bn1.bias
encoder.0.0.bn1.running_mean
encoder.0.0.bn1.running_var
encoder.0.0.conv2.weight
encoder.0.0.bn2.weight
encoder.0.0.bn2.bias
encoder.0.0.bn2.running_mean
encoder.0.0.bn2.running_var
encoder.0.0.shortcut.0.weight
encoder.0.0.shortcut.1.weight
encoder.0.0.shortcut.1.bias
encoder.0.0.shortcut.1.running_mean
encoder.0.0.shortcut.1.running_var
encoder.0.1.conv1.weight
encoder.0.1.bn1.weight
encoder.0.1.bn1.bias
encoder.0.1.bn1.running_mean
encoder.0.1.bn1.running_var
encoder.0.1.conv2.weight
encoder.0.1.bn2.weight
encoder.0.1.bn2.bias
encoder.0.1.bn2.running_mean
encoder.0.1.bn2.running_var
encoder.1.0.conv1.weight
encoder.1.0.bn1.weight
encoder.1.0.bn1.bias
encoder.1.0.bn1.running_mean
encoder.1.0.bn1.running_var
encoder.1.0.conv2.weight
encoder.1.0.bn2.weight
encoder.1.0.bn2.bias
encoder.1.0.bn2.running_mean
encoder.1.0.bn2.running_var
encoder.1.0.shortcu

## Making predictions with pre trained CNN

In [58]:
from training import get_predictions
from datasets import spectral_dataloader

In [59]:
# Make predictions on subset of data
t0 = time()
dl = spectral_dataloader(X, Y, batch_size=10, shuffle=False)
y_hat = get_predictions(cnn, dl, cuda)
print('Predicted {} spectra: {:0.2f}s'.format(len(y_hat), time()-t0))

Predicted 591 spectra: 2.51s


In [60]:
# Computing accuracy
acc = (y_hat == Y).mean()
print('Accuracy: {:0.1f}%'.format(100*acc))

Accuracy: 49.4%


## Fine-tuning

In [61]:
from training import run_epoch
from torch import optim
from sklearn.model_selection import GroupKFold

### Train/val

In [62]:
# Number of folds used by the GroupKFold cross-validation below
n_splits=8

In [63]:
# Cross-validation that splits on the group ids in `groups`.
group_kfold = GroupKFold(n_splits=n_splits)
group_kfold.get_n_splits(X, Y, groups)
list_accuracy = []  # validation accuracy of the last epoch run in each fold
for train_index, test_index in group_kfold.split(X, Y, groups):
    # Every fold starts again from the pre-trained weights
    cnn, _, _ = load_model()
    # Explicit copies of the split (the dataloaders below index X / Y directly)
    X_train, Y_train = X[train_index], Y[train_index]
    X_test, Y_test = X[test_index], Y[test_index]

    # Fine-tuning settings
    epochs = 1 # Change this number to ~30 for full training
    batch_size = 10
    t0 = time()
    # Adam optimizer over all parameters of the network
    optimizer = optim.Adam(cnn.parameters(), lr=1e-3, betas=(0.5, 0.999))
    # Dataloaders restricted to the indices of this fold
    dl_tr = spectral_dataloader(
        X, Y, idxs=train_index, batch_size=batch_size, shuffle=True)
    dl_val = spectral_dataloader(
        X, Y, idxs=test_index, batch_size=batch_size, shuffle=False)
    # Early-stopping state for the current fold
    best_val, no_improvement = 0, 0
    max_no_improvement = 5
    print('Starting fine-tuning!')
    for epoch in range(epochs):
        # Time elapsed since the start of this fold, printed before the epoch runs
        print(' Epoch {}: {:0.2f}s'.format(epoch+1, time()-t0))
        # Training pass
        acc_tr, loss_tr = run_epoch(
            epoch, cnn, dl_tr, cuda, training=True, optimizer=optimizer)
        print('  Train acc: {:0.2f}'.format(acc_tr))
        # Validation pass
        acc_val, loss_val = run_epoch(
            epoch, cnn, dl_val, cuda, training=False, optimizer=optimizer)
        print('  Val acc  : {:0.2f}'.format(acc_val))
        # The first epoch always sets the reference; afterwards only a strictly
        # better validation accuracy resets the patience counter.
        if epoch == 0 or acc_val > best_val:
            best_val, no_improvement = acc_val, 0
        else:
            no_improvement += 1
        if no_improvement >= max_no_improvement:
            print('Finished after {} epochs!'.format(epoch+1))
            break

    # Record the validation accuracy of the last epoch that ran (not best_val)
    list_accuracy.append(acc_val)

# Total run time since the first cell (t00), then the mean accuracy over folds
print('\n This demo was completed in: {:0.2f}s'.format(time()-t00))
print(mean(list_accuracy))

key: linear.weight is removed
key: linear.bias is removed
Starting fine-tuning!
 Epoch 1: 0.00s
  Train acc: 88.48
  Val acc  : 98.57
key: linear.weight is removed
key: linear.bias is removed
Starting fine-tuning!
 Epoch 1: 0.00s
  Train acc: 87.50
  Val acc  : 98.59
key: linear.weight is removed
key: linear.bias is removed
Starting fine-tuning!
 Epoch 1: 0.00s
  Train acc: 91.81
  Val acc  : 88.46
key: linear.weight is removed
key: linear.bias is removed
Starting fine-tuning!
 Epoch 1: 0.00s
  Train acc: 85.96
  Val acc  : 88.46
key: linear.weight is removed
key: linear.bias is removed
Starting fine-tuning!
 Epoch 1: 0.00s
  Train acc: 85.16
  Val acc  : 100.00
key: linear.weight is removed
key: linear.bias is removed
Starting fine-tuning!
 Epoch 1: 0.00s
  Train acc: 87.86
  Val acc  : 100.00
key: linear.weight is removed
key: linear.bias is removed
Starting fine-tuning!
 Epoch 1: 0.00s
  Train acc: 89.79
  Val acc  : 95.83
key: linear.weight is removed
key: linear.bias is removed
St