# Organelle image classification

In [30]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision.transforms import transforms
from PIL import Image
import os
from sklearn.model_selection import train_test_split

## 1. Introduction

This project aims to revolutionize protein classification through advanced image analysis techniques. The primary objective is to create a machine learning model capable of predicting the presence of specific proteins within a given image or multiple images, addressing the complexity of a multi-label classification challenge. The metric of choice for assessing model performance is the mean F1-score, ensuring precision and recall are both accounted for.

## 2. Preprocessing

In [31]:
# load data
train_data_dir = "./train"
train_csv_dir = "./train.csv"

test_data_dir = "./test"
test_csv_dir = "./sub.csv"

In [32]:
# Load the csv files containing the train labels and test ID's in dataframes:
train_df = pd.read_csv(train_csv_dir)
test_df = pd.read_csv(test_csv_dir)
display(train_df.head())
display(test_df.head())

Unnamed: 0,Image,Label
0,12874,1 4
1,21466,0
2,3610,4
3,2095,2 4
4,28909,4


Unnamed: 0,Image,Label
0,25880,0
1,7810,0
2,23748,0
3,24621,0
4,30169,0


In [33]:
# label dictionary
labels = {
    0: "Mitochondria",
    1: "Nuclear bodies",
    2: "Nucleoli",
    3: "Golgi apparatus",
    4: "Nucleoplasm",
    5: "Nucleoli fibrillar center",
    6: "Cytosol",
    7: "Plasma membrane",
    8: "Centrosome",
    9: "Nuclear speckles",
}

In [34]:
# Since we have a multi-label classification problem, images can have more than 1 label. In order to transform the labels to a tensor that can be used for training, we can encode them to the format of a tensor, using a one-hot-encoding.
def encode_label(label: str):
    # create tensor of length 10 for the one-hot-ecoding
    target = torch.zeros(10)
    # now iterate over the classes in the string and set the respective 0's to 1's
    for l in str(label).split(" "):
        target[int(l)] = 1.0
    return target

In [35]:
# function to decode the encoded labels back to its original format


def decode_target(
    target: torch.Tensor, text_labels: bool = False, threshold: float = 0.5
):
    result = []
    for i, x in enumerate(target):
        if x >= threshold:
            if text_labels:
                result.append(labels[i] + "(" + str(i) + ")")
            else:
                result.append(str(i))
    return " ".join(result)

In [36]:
# Image dimensions
image_path = 'train/0.png'
image = Image.open(image_path)
width, height = image.size
print(f'width: {width}\nheight: {height}')

width: 128
height: 128


In [49]:
# We can create a PyTorchDataset that will ease the training process and can be used later for the DataLoader:


class ProteinDataset(Dataset):
    def __init__(self, dataframe, data_dir='./train', transform=None, mode='train'):
        self.data = dataframe
        self.transform = transform
        self.data_dir = data_dir
        self.mode = mode  # 'train' or 'test'

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        if self.mode == 'train':
            img_name = os.path.join(self.data_dir, str(self.data.iloc[idx, 0]) + '.png')
            image = Image.open(img_name)
            label = encode_label(self.data.iloc[idx, 1])

            if self.transform:
                image = self.transform(image)

            return image, label
        elif self.mode == 'test':
            img_name = os.path.join(self.data_dir, str(self.data.iloc[idx, 0]) + '.png')
            image = Image.open(img_name)

            if self.transform:
                image = self.transform(image)

            return image, self.data.iloc[idx, 0]  # Return image name for test mode
        else:
            raise ValueError("Invalid mode. Use 'train' or 'test'.")

In [38]:
class ProteinCNN(nn.Module):
    def __init__(self):
        super(ProteinCNN, self).__init__()
        self.conv1 = nn.Conv2d(
            in_channels=3, out_channels=16, kernel_size=3, stride=1, padding=1
        )
        self.conv2 = nn.Conv2d(
            in_channels=16, out_channels=32, kernel_size=3, stride=1, padding=1
        )
        self.fc1 = nn.Linear(
            32 * 32 * 32, 256
        )
        self.fc2 = nn.Linear(
            256, 10
        )

        self.relu = nn.ReLU()
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)

    def forward(self, x):
        x = self.pool(self.relu(self.conv1(x)))
        x = self.pool(self.relu(self.conv2(x)))
        x = x.view(
            -1, 32 * 32 * 32
        )
        x = self.relu(self.fc1(x))
        x = self.fc2(x)
        return x


In [39]:
transforms_train = transforms.Compose([
    transforms.Resize((128, 128)),  # Resize images to 128x128 pixels if still needed
    transforms.ToTensor(),
    # Add more transformations as needed (e.g., normalization, data augmentation)
])

train_size = int(0.8 * len(train_df))
val_size = len(train_df) - train_size

train_dataset, val_dataset = train_test_split(train_df, test_size=val_size)

train_loader = DataLoader(ProteinDataset(train_dataset, transform=transforms_train), batch_size=32, shuffle=True)
val_loader = DataLoader(ProteinDataset(val_dataset, transform=transforms_train), batch_size=32, shuffle=False)


In [40]:
model = ProteinCNN()

In [41]:

criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


In [42]:

num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    
    for i, data in enumerate(train_loader, 0):
        inputs, labels = data
        optimizer.zero_grad()
        
        outputs = model(inputs)
        loss = criterion(outputs, labels.float())
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
        
    model.eval()
    val_loss = 0.0
    
    with torch.no_grad():
        for inputs, labels in val_loader:
            outputs = model(inputs)
            val_loss += criterion(outputs, labels.float()).item()
    
    print(f"Epoch {epoch+1}/{num_epochs}, Training Loss: {running_loss / len(train_loader)}, Validation Loss: {val_loss / len(val_loader)}")

print('Finished Training')

Epoch 1/10, Training Loss: 0.3553027282287548, Validation Loss: 0.3357306746477933
Epoch 2/10, Training Loss: 0.32751903948071714, Validation Loss: 0.32520058198073476
Epoch 3/10, Training Loss: 0.3094104642217809, Validation Loss: 0.32895459541954947
Epoch 4/10, Training Loss: 0.26700921948854023, Validation Loss: 0.35286656882345063
Epoch 5/10, Training Loss: 0.18574697886194502, Validation Loss: 0.4282612256782571
Epoch 6/10, Training Loss: 0.09338631022286105, Validation Loss: 0.5835017174789586
Epoch 7/10, Training Loss: 0.03867001649379343, Validation Loss: 0.8103485703468323
Epoch 8/10, Training Loss: 0.01986734572096498, Validation Loss: 0.9303343246892556
Epoch 9/10, Training Loss: 0.012905308548602964, Validation Loss: 1.1322299579984134
Epoch 10/10, Training Loss: 0.010490603451526349, Validation Loss: 1.1741705100560926
Finished Training


In [44]:
model.eval()  # Set the model to evaluation mode
correct = 0
total = 0
threshold = 0.5  # Define a threshold for prediction

with torch.no_grad():
    for inputs, labels in val_loader:
        outputs = model(inputs)
        
        # Apply threshold to convert probabilities to binary predictions
        predicted = (outputs > threshold).float()
        
        total += labels.size(0) * labels.size(1)  # Total number of elements in the batch
        correct += (predicted == labels).sum().item()  # Count correct predictions

accuracy = correct / total
print(f"Accuracy on validation set: {accuracy}")


Accuracy on validation set: 0.8558479532163743


In [50]:
# Predictions for the test set

test_dataset = ProteinDataset(test_df, data_dir=test_data_dir, transform=transforms_train, mode='test')
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

In [52]:
# Make predictions for the test images
model.eval()
predictions = []

with torch.no_grad():
    for inputs, img_names in test_loader:
        outputs = model(inputs)
        
        # Apply threshold for binary predictions
        threshold = 0.5
        predicted = (outputs > threshold).squeeze().cpu().numpy().astype(int)
        
        # Convert predictions to label format
        predicted_labels = " ".join([str(i) for i in np.where(predicted == 1)[0]])

        # Append image names and predicted labels to list
        predictions.append((img_names[0].item(), predicted_labels))

# Write predictions to sub.csv
submission_df = pd.DataFrame(predictions, columns=["Image", "Label"])
submission_df.to_csv("sub.csv", index=False)