This is official pytorch tutorial: <a href=https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html#sphx-glr-beginner-blitz-cifar10-tutorial-py> Blitz Tutorial<a>

What is done in this tutorial:
    1. Load and normalize the CIFAR10 training and test datasets using torchvision
    2. Define a Convolutional Neural Network
    3. Define a loss function
    4. Train the network on the training data
    5. Test the network on the test data

TORCHVISION:
    The torchvision package consists of popular datasets, model architectures,
    and common image transformations for computer vision.

In [2]:
import torch
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import TensorDataset, DataLoader
from torch.nn.functional import one_hot

import matplotlib.pyplot as plt
import numpy as np
import h5py
import pickle
from itertools import chain
from sklearn.model_selection import train_test_split
import pandas as pd

In [3]:
embeddings = h5py.File("../../data/embeddings.h5", "r")
with open("../../data/seq_anno_hash.pickle", 'rb') as handle:
    proteins_and_hashes = pickle.load(handle)

In [4]:
label_prot = pd.read_csv("../data_splits/train_prot_id_labels.csv")

X_train, X_test = train_test_split(label_prot, test_size = 0.3, train_size=0.7, random_state=42, stratify=label_prot["label"])
print(len(X_train))
print(len(X_test))
X_train

3246
1392


Unnamed: 0,prot_id,label
3831,O16299,G
2899,P03524,SP_TM
2165,B6EU02,G
4334,Q95UE8,G_SP
3760,Q0VD86,G
...,...,...
3631,Q8BXQ2,SP_TM
1467,Q8I7Z8,G
576,P51946,G
433,Q12211,G


In [5]:
train_embeddings = list()
train_labels = list()
train_protein_ids = list()
for index, row in X_train.iterrows():
    hash_code = proteins_and_hashes[row["label"]][row["prot_id"]][2]
    mean_embedding = np.mean(embeddings.get(hash_code), axis=0)
    label = row["label"]
    id = row["prot_id"]
    train_protein_ids.append(id)
    train_embeddings.append(mean_embedding)
    train_labels.append(label)

test_embeddings = list()
test_labels = list()
test_protein_ids = list()
for index, row in X_test.iterrows():
    hash_code = proteins_and_hashes[row["label"]][row["prot_id"]][2]
    mean_embedding = np.mean(embeddings.get(hash_code), axis=0)
    label = row["label"]
    id = row["prot_id"]
    test_protein_ids.append(id)
    test_embeddings.append(mean_embedding)
    test_labels.append(label)

### Create dataloader

In [6]:
label_mappings = {
    'G_SP': 0,
     'G': 1,
     'SP_TM': 2,
     'TM': 3
}
reverse_label_mappings = {val: key for key, val in label_mappings.items()}

In [7]:
train_embeddings = torch.Tensor(np.array(train_embeddings))
train_labels = torch.Tensor([label_mappings[label] for label in  train_labels])

In [8]:
test_embeddings = torch.Tensor(np.array(test_embeddings))
test_labels = torch.Tensor([label_mappings[label] for label in  test_labels])

In [9]:
train_labels = one_hot(train_labels.to(torch.int64), 4)
dataset = TensorDataset(train_embeddings, train_labels)
dataloader = DataLoader(dataset, batch_size=4)

In [10]:
test_labels = one_hot(test_labels.to(torch.int64), 4)
test_dataset = TensorDataset(test_embeddings, test_labels)
test_dataloader = DataLoader(test_dataset, batch_size=4)

In [100]:
import torch.nn as nn
import torch.nn.functional as F


class CNN(nn.Module):
    """ Define a model with two convolution layers each followed by a Max Pooling layer
    Then the embeddings are Flattened and forwarded into two FCls.
    The last layer is a FCL with 10 neurons.
    """
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv1d(1, 512, kernel_size=3, padding=1, stride=1)
        self.batch_norm1 = nn.BatchNorm1d(512)
        #self.pool = nn.MaxPool1d(2)

        self.conv2 = nn.Conv1d(512, 256, kernel_size=3, padding=1, stride=1)
        self.batch_norm2 = nn.BatchNorm1d(256)

        #self.flatten = nn.Flatten()

        self.fc = nn.Linear(256, 4)
    """
    view ->   torch.Size([4, 1, 1024])
    view conv1 +  batch ->   torch.Size([4, 512, 1024])
    view pool 1 ->   torch.Size([4, 512, 512])
    view conv2 +  batch ->   torch.Size([4, 256, 512])
    view pool 2 ->   torch.Size([4, 256, 256])
    """
    def forward(self, x):
        print("view ->  " , x.shape)
        x = F.relu( self.batch_norm1( self.conv1(x) ) )
        print("view conv1 +  batch ->  " , x.shape)
        #x = self.pool(x)
        print("view pool 1 ->  " , x.shape)

        x = F.relu( self.batch_norm2( self.conv2(x) ) )
        print("view conv2 +  batch ->  " , x.shape)
        #x = self.pool(x)
        print("view pool 2 ->  " , x.shape)

        x = self.fc(x)
        print("view fc end ->  " , x.shape)

        #x = F.softmax(x, dim=4)
        return x


net = CNN()

In [70]:
import torch.optim as optim

# The function is indicating the performance of the model.
# During the training process this function should be minimized
criterion = nn.CrossEntropyLoss()

# The minimization is achieved through Stochastic Gradient Descent
optimizer = optim.SGD(net.parameters(), lr=0.01, momentum=0.9)

In [101]:
for epoch in range(4):  # loop over the dataset multiple times

    running_loss = 0.0
    for i, data in enumerate(dataloader, 0):
        """
        shape of inputs: torch.Size([4, 3, 32, 32])
            Batchsize: 4
            Channels: 3 (Red, Green, Blue)
            Image size: 32 x 32

        labels: tensor([9, 3, 0, 3])
            9: class of image 0 in batch
            3: class of image 1 in batch
            ...
        """
        inputs, labels = data

        """ zero the parameter gradients after every batch
        This is necessary because the gradients (directions of how the weigths and biases
        will be updated) are accumulated in each backward pass.
        https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch
        """
        optimizer.zero_grad()  # SGD

        # forward + backward + optimize
        # shape outputs: torch.Size([4, 10])
        # for every image a prediction
        # print(f"{inputs}")
        outputs = net(inputs.unsqueeze(1))
        #print(f"{outputs} \t {labels}")

        # the first iteration CrossEntropy: tensor(2.3100, grad_fn=<NllLossBackward0>)
        print(f"{outputs.shape}")
        break
        loss = criterion(outputs, labels.float())
        loss.backward()
        optimizer.step()

        # running loss after 3 iterations: 6.894119024276733
        # Why is the loss added?
        running_loss += loss.item()
        if i % 100 == 99:    # print every 2000 mini-batches
            print(f'[{epoch + 1}, {i + 1}] loss: {(running_loss / 99):.3f}')
            running_loss = 0.0

print('Finished Training')

view ->   torch.Size([4, 1, 1024])
view conv1 +  batch ->   torch.Size([4, 512, 1024])
view pool 1 ->   torch.Size([4, 512, 1024])
view conv2 +  batch ->   torch.Size([4, 256, 1024])
view pool 2 ->   torch.Size([4, 256, 1024])


RuntimeError: mat1 and mat2 shapes cannot be multiplied (1024x1024 and 256x4)

### Evaluate the model on the test data
This could be done with TorchMetrics but we will do this manually here

In [69]:
correct = 0
total = 0
# since we're not training, we don't need to calculate the gradients for our outputs
with torch.no_grad():
    for data in test_dataloader:
        images, labels = data
        # calculate outputs by running images through the network
        outputs = net(images)

        # the class with the highest energy is what we choose as prediction
        _, predicted = torch.max(outputs.data, 1)
        _, labels = torch.max(labels.data, 1)

        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(f'Accuracy of the network on the {len(test_labels)} test embeddings: {(100 * correct / total):.2f}')

Accuracy of the network on the 516 test embeddings: 95.74


In [70]:
# prepare to count predictions for each class
classes = list(label_mappings.keys())
correct_pred = {classname: 0 for classname in classes}
total_pred = {classname: 0 for classname in classes}

# again no gradients needed
with torch.no_grad():
    for data in test_dataloader:
        images, labels = data
        outputs = net(images)
        _, predictions = torch.max(outputs, 1)
        _, labels = torch.max(labels, 1)
        # collect the correct predictions for each class
        for label, prediction in zip(labels, predictions):
            if label == prediction:
                correct_pred[classes[label]] += 1
            total_pred[classes[label]] += 1


# print accuracy for each class
for classname, correct_count in correct_pred.items():
    accuracy = 100 * float(correct_count) / total_pred[classname]
    print("Accuracy for class {:5s} is: {:.1f} %".format(classname,
                                                         accuracy))

"""
Accuracy for class G_SP  is: 0.0 %
Accuracy for class G     is: 100.0 %
Accuracy for class SP_TM is: 0.0 %
Accuracy for class TM    is: 0.0 %
"""

Accuracy for class G_SP  is: 96.2 %
Accuracy for class G     is: 99.3 %
Accuracy for class SP_TM is: 81.0 %
Accuracy for class TM    is: 89.7 %


'\nAccuracy for class G_SP  is: 0.0 %\nAccuracy for class G     is: 100.0 %\nAccuracy for class SP_TM is: 0.0 %\nAccuracy for class TM    is: 0.0 %\n'

Create hashsum