<a href="https://colab.research.google.com/github/lphohmann/DL_microbial_gene_classifier/blob/main/1D_ResNet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Setting everything up

In [1]:
#hide
!pip install fastai --upgrade
#!pip install -Uqq fastbook --upgrade
#!pip install torchtext==0.8.1

Collecting fastai
[?25l  Downloading https://files.pythonhosted.org/packages/bd/ca/bc9f4e04adcdfda1357f5c63bc67a7bf4f315883ca544726f3376b1ed068/fastai-2.4-py3-none-any.whl (187kB)
[K     |████████████████████████████████| 194kB 5.1MB/s 
Collecting fastcore<1.4,>=1.3.8
[?25l  Downloading https://files.pythonhosted.org/packages/d8/b0/f1fbf554e0bf3c76e1bdc3b82eedfe41fcf656479586be38c64421082b1b/fastcore-1.3.20-py3-none-any.whl (53kB)
[K     |████████████████████████████████| 61kB 4.5MB/s 
Installing collected packages: fastcore, fastai
  Found existing installation: fastai 1.0.61
    Uninstalling fastai-1.0.61:
      Successfully uninstalled fastai-1.0.61
Successfully installed fastai-2.4 fastcore-1.3.20


In [2]:
import numpy as np

In [3]:
#hide
#import fastbook
#fastbook.setup_book()

In [4]:
#hide
#import fastai
from fastai import *
from fastai.vision.all import *
from fastai.text.all import *
#from fastai.callback import *

In [5]:
# mount Google Drive to access the project files and set the correct working directory
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
%cd drive/MyDrive/ColabNotebooks/DL_project/
!pwd

Mounted at /content/drive
/content/drive/MyDrive/ColabNotebooks/DL_project
/content/drive/MyDrive/ColabNotebooks/DL_project


# Define the functions required for creating the DataBlock

In [6]:
# get the training and validation datasets

In [7]:
# defining functions required for building the DataBlock
# one hot encoding function
from sklearn.preprocessing import OneHotEncoder
def OH_enc(seq: str, max_len: int = 300):
    """One-hot encode an amino-acid sequence.

    Args:
        seq: protein sequence written with one-letter amino-acid codes.
        max_len: sequences longer than this are truncated to their first
            `max_len` residues (default 300).

    Returns:
        A float64 array of shape (20, min(len(seq), max_len)). Each column is
        one residue; rows follow the alphabetically sorted amino-acid codes.
        Characters that are not one of the 20 codes produce an all-zero column.
        An empty sequence yields an array of shape (20, 0).
    """
    # the 20 amino-acid codes; sorted so the row order is stable
    cats = ['K', 'D', 'N', 'E', 'R', 'A', 'T', 'L', 'I', 'Q', 'C', 'F', 'G', 'W', 'M', 'S', 'H', 'P', 'V', 'Y']
    row_of = {aa: row for row, aa in enumerate(sorted(cats))}
    trunc_seq = seq[:max_len]  # truncate sequences longer than max_len
    # A plain lookup table replaces fitting a fresh encoder object for every
    # sequence: it is cheaper per call and also works for empty input.
    encoded = np.zeros((len(row_of), len(trunc_seq)), dtype=np.float64)
    for col, residue in enumerate(trunc_seq):
        row = row_of.get(residue)
        if row is not None:  # unknown residues stay all-zero
            encoded[row, col] = 1.0
    return encoded

# zero padding function
def pad_along_axis(array: np.ndarray, target_length: int, axis: int = 0):
    """Zero-pad the end of `array` along `axis` up to `target_length`.

    Arrays whose extent along `axis` already reaches `target_length` are
    returned unchanged; nothing is ever truncated.
    """
    missing = target_length - array.shape[axis]
    if missing > 0:
        # no padding anywhere except after the last element of the chosen axis
        widths = [(0, 0) for _ in range(array.ndim)]
        widths[axis] = (0, missing)
        return np.pad(array, pad_width=widths, mode='constant', constant_values=0)
    return array

# combine in one function
def main_item_tfms(seq):
    """Turn a raw sequence string into the fixed-size model input.

    The sequence is one-hot encoded with `OH_enc` and the result is then
    zero-padded along axis 1 to a length of 300 with `pad_along_axis`.
    """
    return pad_along_axis(OH_enc(seq), 300, 1)

In [8]:
# get_x and y by specifying column in dataframe
def get_y(r):
    """Return the target label of row `r`, read from its 'Knum' field."""
    label = r['Knum']
    return label
def get_x(r):
    """Return the model input for row `r`: its 'Seq' field passed through `main_item_tfms`."""
    sequence = r['Seq']
    # apply the one hot encoding and padding function
    return main_item_tfms(sequence)

In [9]:
# read in my data from which training and validation set will be created
trainval = pd.read_csv('trainval.csv', low_memory=False)

In [10]:
# CategoryBlock -> typetfm: categorize, batchtfm: CUDA, itemtfm: totensor
def CategoryBlock(vocab=None, sort=True, add_na=False):
    """`TransformBlock` for single-label categorical targets.

    The block's only type transform is a `Categorize` built from `vocab`,
    `sort` and `add_na`.
    NOTE(review): fastai appears to export a `CategoryBlock` of its own, which
    this definition would shadow — confirm whether the redefinition is needed.
    """
    categorize = Categorize(vocab=vocab, sort=sort, add_na=add_na)
    return TransformBlock(type_tfms=categorize)

In [11]:
# Build the DataBlock that describes how rows of `trainval` become (x, y) pairs.
# - x: `get_x` produces the one-hot encoded, zero-padded sequence from column 'Seq'.
# - y: `get_y` reads column 'Knum', categorised against the fixed 58-entry vocab below.
# - split: 80/20 train/validation split, stratified on 'Knum', with a fixed seed (42).
# NOTE(review): `IntToFloatTensor` is passed as the batch transform of the input block, yet the
# conv layers are later monkey-patched to cast their input to float — this suggests the
# transform does not convert these inputs; TODO confirm.
dblock = DataBlock(blocks=(TransformBlock(batch_tfms=IntToFloatTensor), CategoryBlock(vocab=['K00024', 'K00121', 'K00122', 'K00123', 'K00124', 'K00126',
       'K00127', 'K00148', 'K00169', 'K00170', 'K00171', 'K00172',
       'K00194', 'K00196', 'K00197', 'K00198', 'K00200', 'K00201',
       'K00202', 'K00317', 'K00320', 'K00441', 'K00600', 'K00625',
       'K00672', 'K00830', 'K00925', 'K01007', 'K01070', 'K01499',
       'K01595', 'K01895', 'K03388', 'K03389', 'K03390', 'K05979',
       'K06034', 'K08097', 'K08691', 'K08692', 'K09733', 'K10713',
       'K10714', 'K11212', 'K11261', 'K11779', 'K11780', 'K12234',
       'K13039', 'K13788', 'K14067', 'K14080', 'K14083', 'K14940',
       'K14941', 'K15228', 'K15229', 'K18277'])),
                 splitter = TrainTestSplitter(test_size=0.2, random_state=42, stratify=trainval[['Knum']]),
                 get_x = get_x,
                 get_y = get_y
                 )

In [12]:
#dblock.summary(trainval)

# Create the dataloader

In [13]:
# create dataloaders from datablock 
dls = dblock.dataloaders(trainval,bs=256)

In [15]:
dls.vocab

58

In [16]:
# check one batch to make sure the dls is constructed right
x,y = dls.one_batch()
#y # these should be the targets 
#x # and these the encoded sequences

In [17]:
# check the shape
x.shape #torch.Size([256, 20, 300])

torch.Size([256, 20, 300])

# Defining the architecture for a 1D CNN (not ResNet)

In [None]:
# monkey patch: make Conv1d cast its input to the layer's own dtype and device
def _conv_forward(self, input: Tensor, weight: Tensor, bias: Optional[Tensor]):
    """Replacement for `nn.Conv1d._conv_forward` that accepts non-float32 input.

    The input is converted to the dtype and device of `weight` before the
    convolution, so the layer works wherever its parameters live (CPU or GPU)
    instead of always forcing the input onto CUDA.

    Args:
        input: batch to convolve; any floating dtype, on any device.
        weight: convolution kernel of the layer.
        bias: optional bias of the layer.

    Returns:
        The result of `F.conv1d` with the layer's stride, padding, dilation
        and groups.
    """
    # imported here so the non-'zeros' padding branch does not rely on a global name
    from torch.nn.modules.utils import _single

    # follow the weights rather than hard-coding FloatTensor + .cuda()
    input = input.to(device=weight.device, dtype=weight.dtype)
    if self.padding_mode != 'zeros':
        return F.conv1d(F.pad(input, self._reversed_padding_repeated_twice, mode=self.padding_mode),
                        weight, bias, self.stride,
                        _single(0), self.dilation, self.groups)
    return F.conv1d(input, weight, bias, self.stride,
                    self.padding, self.dilation, self.groups)
# replace in the module
nn.Conv1d._conv_forward = _conv_forward

In [None]:
# ConvLayer creates a sequence of convolutional (ni to nf), ReLU (if use_activ) and norm_type layers
def block(ni, nf):
    """Build one 1D `ConvLayer` mapping `ni` channels to `nf` with kernel size 5 and stride 2."""
    return ConvLayer(ni, nf, ks=5, stride=2, ndim=1)

In [None]:
def get_model():
    """Assemble the plain 1D CNN.

    Four stride-2 conv blocks widen the 20 input channels (one per amino
    acid) to 256, followed by adaptive average pooling to length 1, a flatten
    and a linear layer with 58 outputs (one per class).
    """
    channels = [20, 32, 64, 128, 256]
    conv_stack = [block(ni, nf) for ni, nf in zip(channels[:-1], channels[1:])]
    head = [nn.AdaptiveAvgPool1d(1), Flatten(), nn.Linear(256, 58)]
    return nn.Sequential(*conv_stack, *head)

In [None]:
# metric: precision computed per label and averaged with weights equal to each label's
# support (the number of true instances of that label), to account for label imbalance
precision = Precision(average='weighted') 
# create the Learner: the 1D CNN from `get_model`, trained with cross-entropy loss and
# monitored with weighted precision (switched from accuracy because of dataset balance)
learn = Learner(dls, get_model(), loss_func=nn.CrossEntropyLoss(), metrics=precision)

In [None]:
#learn.summary()

In [None]:
# in this cell i try to balance the training batches
# import
from torch.utils.data.sampler import WeightedRandomSampler


# Training the 1D CNN

In [None]:
#learn.lr_find() # choose an adequate lr

In [None]:
#from fastai.callbacks import SaveModelCallback

In [None]:
# fit and train (fit_one_cycle makes training more stable)
epochs = 1
lr = 0.1
# `fit_one_cycle` trains the learner in place and does not return it, so the result must
# not be assigned back to `learn`: doing so would rebind `learn` to None and break the
# interpretation and export cells that follow.
learn.fit_one_cycle(epochs, lr)

epoch,train_loss,valid_loss,precision_score,time
0,0.086343,0.08007,0.979263,09:39


In [None]:
len(dls.train_ds)

383591

In [None]:
# confusion matrix
interp = ClassificationInterpretation.from_learner(learn)
interp.plot_confusion_matrix(figsize=(12,12), dpi=60)

# Saving the 1D CNN model

In [None]:
# save the architecture and the parameters
learn.export()

In [None]:
# check that the file exists
path = Path()
path.ls(file_exts='.pkl')

In [None]:
# load the model not for training but for making predictions (inference)
learn_inf = load_learner(path/'export.pkl')

In [None]:
# make a prediction for a single item
# `get_x` reads the 'Seq' field of the item it receives, so `predict` needs a row-like
# object carrying a protein sequence (an image path cannot be processed by this model).
sample = trainval.iloc[0]
# `predict` returns three things: the predicted category in the same format as the labels
# originally provided (here a string), the index of the predicted category, and the
# probabilities of each category. The last two follow the order of the categories in the
# vocab of the DataLoaders.
pred_class, pred_idx, probs = learn_inf.predict(sample)
# check vocab
learn_inf.dls.vocab
