# Organelle image classification

In [11]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from IPython.display import display
from PIL import Image
from torch.utils.data import Dataset, random_split, DataLoader
from torchvision.transforms import transforms

## 1. Introduction

This project classifies organelles in images. The objective is to train a machine learning model that predicts which of ten organelle classes are present in a given image. Because a single image can carry several labels at once, this is a multi-label classification problem. Model performance is assessed with the mean F1-score, which accounts for both precision and recall.

## 2. Preprocessing

In [3]:
# load data
train_data_dir = './train'
train_csv_dir = './train.csv'

test_data_dir = './test'
test_csv_dir = './submission.csv'

In [7]:
# Load the csv files containing the train labels and test ID's in dataframes:
train_df = pd.read_csv(train_csv_dir)
test_df = pd.read_csv(test_csv_dir)
display(train_df.head())
display(test_df.head())

Unnamed: 0,Image,Label
0,12874,1 4
1,21466,0
2,3610,4
3,2095,2 4
4,28909,4


Unnamed: 0,Image,Label
0,25880,0
1,7810,0
2,23748,0
3,24621,0
4,30169,0


In [8]:
# label dictionary
labels = {
    0: 'Mitochondria',
    1: 'Nuclear bodies',
    2: 'Nucleoli',
    3: 'Golgi apparatus',
    4: 'Nucleoplasm',
    5: 'Nucleoli fibrillar center',
    6: 'Cytosol',
    7: 'Plasma membrane',
    8: 'Centrosome',
    9: 'Nuclear speckles'
}

In [None]:
# Since we have a multi-label classification problem, images can have more than 1 label. In order to transform the labels to a tensor that can be used for training, we can encode them to the format of a tensor, using a one-hot-encoding.
def encode_label(label: str):
    # create tensor of length 10 for the one-hot-ecoding
    target = torch.zeros(10)
    # now iterate over the classes in the string and set the respective 0's to 1's
    for l in str(label).split(' '):
        target[int(l)] =1. 
    return target

In [9]:
# function to decode the encoded labels back to its original format

def decode_target(target: torch.Tensor, text_labels: bool = False, threshold: float = 0.5):
    result = []
    for i, x in enumerate(target):
        if (x >= threshold):
            if text_labels:
                result.append(labels[i] + "(" + str(i) + ")")
            else:
                result.append(str(i))
    return ' '.join(result)

In [None]:
# We can create a PyTorchDataset that will ease the training process and can be used later for the DataLoader:

class ProteinDataset(Dataset):
    def __init__(self, train_labels, transform=None):
        self.data = train_labels
        self.transform = transform
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        img_name = os.path.join(self.root_dir, self.data.iloc[idx, 0])  # Get the image path
        image = Image.open(img_name)  # Open the image
        
        # Apply transformations if specified
        if self.transform:
            image = self.transform(image)
        
        label = encode_label(self.data.iloc[idx, 1])  # Encode labels using the function you defined
        
        return image, label


In [None]:
# Define your CNN model for protein classification
class ProteinCNN(nn.Module):
    def __init__(self):
        super(ProteinCNN, self).__init__()
        # Define the layers of your CNN
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, stride=1, padding=1)
        self.fc1 = nn.Linear(32 * 56 * 56, 256)  # Update input size based on your image dimensions
        self.fc2 = nn.Linear(256, 10)  # 10 output classes for multi-label classification
        
        self.relu = nn.ReLU()
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        
    def forward(self, x):
        # Implement the forward pass of your CNN
        x = self.pool(self.relu(self.conv1(x)))
        x = self.pool(self.relu(self.conv2(x)))
        x = x.view(-1, 32 * 56 * 56)  # Reshape based on your image dimensions after convolutions
        x = self.relu(self.fc1(x))
        x = self.fc2(x)
        return x

    
# Initialize your CNN model
model = ProteinCNN()

# Set up transforms for training and validation data
transforms_train = transforms.Compose([
    transforms.Resize((224, 224)),  # Resize images to required dimensions
    transforms.ToTensor(),
    # Add more transformations as needed (e.g., normalization, data augmentation)
])

# Create instances of ProteinDataset for train and validation sets
train_dataset = ProteinDataset(train_csv_dir, train_data_dir, transform=transforms_train)
val_dataset = ProteinDataset(val_csv_dir, val_data_dir, transform=transforms_train)

# Define dataloaders for training and validation
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()  # Update with appropriate loss function for multi-label classification
optimizer = optim.Adam(model.parameters(), lr=0.001)  # Update with appropriate optimizer and learning rate

# Training loop
num_epochs = 10  # Adjust the number of epochs as needed

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    
    for i, data in enumerate(train_loader, 0):
        inputs, labels = data  # Get inputs and labels from the dataloader
        optimizer.zero_grad()  # Zero the parameter gradients
        
        outputs = model(inputs)  # Forward pass
        loss = criterion(outputs, labels)  # Calculate the loss
        loss.backward()  # Backward pass
        optimizer.step()  # Optimize
        
        running_loss += loss.item()
        
    # Print statistics after each epoch
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {running_loss / len(train_loader)}")

print('Finished Training')
