# Cancer Cell Line Feature Extraction with an Autoencoder

In this Jupyter notebook, an Autoencoder is used to obtain cancer cell line feature representations.

<br>

### File Requirements

The following file is required in the `data/STEP00` folder:

1. [Cell_line_RMA_proc_basalExp.txt](https://www.cancerrxgene.org/gdsc1000/GDSC1000_WebResources/Home.html)

<br>

### Output
The trained model weights (`state_dict`), saved to `models/CCL_AUTOENCODER.pth`.

<br>

### Evaluation
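A plot of the training and validation learning curves (saved as `autoencoder_loss.pdf`) and the MSE, RMSE, MAE and PCC (Pearson correlation coefficient) on the training and validation splits.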


In [None]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import matplotlib.pyplot as plt
import math
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import mean_absolute_error, mean_squared_error
from scipy.stats import pearsonr


In [None]:
# Seed NumPy and PyTorch with the same fixed value so that repeated runs of
# the notebook draw identical random numbers.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

In [None]:
# Load the tab-separated expression table.
ccl_file = pd.read_csv("data/STEP00/Cell_line_RMA_proc_basalExp.txt", sep="\t")

# Keep every row but skip the first two columns (indices 0 and 1), then
# transpose so that each remaining column of the file becomes one row
# (presumably one cell line per row -- confirm against the input file).
ccl_df = ccl_file.iloc[:, 2:].T

# Remove duplicated rows and pass the values on as a nested Python list.
rna_values = ccl_df.drop_duplicates().values.tolist()

## Autoencoder Architecture

In [None]:
class Autoencoder(nn.Module):
    """Fully connected autoencoder with a mirrored encoder and decoder.

    The encoder maps ``input_dim`` features through ``hidden_dims`` (each
    ``nn.Linear`` followed by ``nn.ReLU``), a linear layer projects to
    ``latent_dim``, and the decoder walks ``hidden_dims`` in reverse order
    before a final linear layer restores ``input_dim`` features.

    Submodule names (``encoder``, ``latent``, ``decoder``, ``output_layer``)
    and the order in which layers are created are part of the contract: they
    determine the ``state_dict`` keys and the seeded weight initialisation.

    Args:
        input_dim: Number of features per input sample.
        hidden_dims: Hidden layer widths, ordered from the input side to the
            latent side. Any sequence (list, tuple, ...) is accepted. It may
            be empty, in which case only the latent and output layers remain.
        latent_dim: Width of the latent representation.
    """

    def __init__(self, input_dim, hidden_dims, latent_dim):
        super().__init__()
        # Copy once so tuples and other sequences behave like lists below.
        hidden_dims = list(hidden_dims)

        # Encoder
        encoder_dims = [input_dim] + hidden_dims
        encoder_layers = []
        for in_features, out_features in zip(encoder_dims, encoder_dims[1:]):
            encoder_layers.append(nn.Linear(in_features, out_features))
            encoder_layers.append(nn.ReLU())
        self.encoder = nn.Sequential(*encoder_layers)

        # Code (latent representation); purely linear, no activation.
        # encoder_dims[-1] falls back to input_dim when hidden_dims is empty.
        self.latent = nn.Linear(encoder_dims[-1], latent_dim)

        # Decoder
        decoder_dims = [latent_dim] + hidden_dims[::-1]
        decoder_layers = []
        for in_features, out_features in zip(decoder_dims, decoder_dims[1:]):
            decoder_layers.append(nn.Linear(in_features, out_features))
            decoder_layers.append(nn.ReLU())
        self.decoder = nn.Sequential(*decoder_layers)
        # decoder_dims[-1] is hidden_dims[0], or latent_dim without hidden layers.
        self.output_layer = nn.Linear(decoder_dims[-1], input_dim)

    def encode(self, x):
        """Return the latent representation of ``x``."""
        return self.latent(self.encoder(x))

    def decode(self, z):
        """Reconstruct inputs from the latent representation ``z``."""
        return self.output_layer(self.decoder(z))

    def forward(self, x):
        """Encode ``x`` and return its reconstruction."""
        return self.decode(self.encode(x))


## Training

In [None]:
import os  # used only in this cell, to create the checkpoint directory

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Create training and validation splits
data = torch.tensor(rna_values, dtype=torch.float32)
train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)
train_dataset = TensorDataset(train_data)
val_dataset = TensorDataset(val_data)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)

# Hyperparameters
# Derive the input width from the data instead of hard-coding it (the value
# used to be fixed at 17737), so a different input table does not silently
# mismatch the first layer.
input_dim = data.shape[1]
hidden_dims = [512, 128]
latent_dim = 128
lr = 0.0006
epochs = 300

# Early stopping parameters
patience = 20  # Number of epochs to wait for improvement
min_delta = 1e-4  # Minimum change to qualify as an improvement
best_val_loss = float('inf')
patience_counter = 0

train_losses = []
val_losses = []

# Checkpoint location; create the folder up front so the first save cannot
# fail on a fresh checkout.
checkpoint_path = "models/CCL_AUTOENCODER.pth"
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

# Model definition
model = Autoencoder(input_dim, hidden_dims, latent_dim).to(device)
optimizer = optim.AdamW(model.parameters(), lr=lr)
criterion = nn.MSELoss()


# Training loop
for epoch in range(epochs):
    model.train()
    epoch_train_loss = 0.0

    for batch in train_loader:
        x_batch = batch[0].to(device)

        # Forward pass
        outputs = model(x_batch)
        loss = criterion(outputs, x_batch)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # criterion returns the batch mean; weight it by the batch size so
        # the shorter last batch does not count as much as a full one.
        epoch_train_loss += loss.item() * x_batch.size(0)

    epoch_train_loss /= len(train_dataset)
    train_losses.append(epoch_train_loss)

    # Validation loss (same sample-weighted averaging as above)
    model.eval()
    epoch_val_loss = 0.0
    with torch.no_grad():
        for batch in val_loader:
            x_batch = batch[0].to(device)
            outputs = model(x_batch)
            loss = criterion(outputs, x_batch)
            epoch_val_loss += loss.item() * x_batch.size(0)
    epoch_val_loss /= len(val_dataset)
    val_losses.append(epoch_val_loss)

    print(f"Epoch {epoch + 1}, Training Loss: {epoch_train_loss:.4f}, Validation Loss: {epoch_val_loss:.4f}")

    # Early stopping
    if epoch_val_loss < best_val_loss - min_delta:
        best_val_loss = epoch_val_loss
        patience_counter = 0
        torch.save(model.state_dict(), checkpoint_path)  # Save the best model
    else:
        patience_counter += 1

    if patience_counter >= patience:
        print(f"Early stopping triggered at epoch {epoch + 1}")
        break

# The in-memory model now holds the weights of the last epoch, which may be
# up to `patience` epochs past the best validation loss. Restore the best
# checkpoint so later cells evaluate (and possibly re-save) the best model.
if best_val_loss < float('inf'):
    model.load_state_dict(torch.load(checkpoint_path, map_location=device))


In [None]:
# Saving the in-memory model unconditionally here would overwrite the best
# checkpoint written by the early-stopping logic with the weights of the last
# epoch, which can be up to `patience` epochs past the best validation loss.
# Reload the best checkpoint instead, so the evaluation below uses the same
# weights that are stored on disk.
if best_val_loss < float("inf"):
    model.load_state_dict(torch.load("models/CCL_AUTOENCODER.pth", map_location=device))
else:
    # No checkpoint was written during training; fall back to saving the
    # current weights.
    torch.save(model.state_dict(), "models/CCL_AUTOENCODER.pth")

## Evaluation: Learning curve and performance metrics

In [None]:
# Global plot appearance.
plt.style.use("seaborn-v0_8-ticks")
plt.rc("font", family="Times New Roman", size=12)

fig, ax = plt.subplots(figsize=(8, 6))

# One curve per recorded loss history, training first.
for history, curve_label in (
    (train_losses, "Training Loss"),
    (val_losses, "Validation Loss"),
):
    ax.plot(history, label=curve_label, linewidth=1.5)

# Inward-pointing ticks on both axes.
ax.tick_params(axis="both", which="both", direction="in", length=6, width=1)

# Axis labels, fixed y-range and title in a single call.
ax.set(
    xlabel="Training Epochs",
    ylabel="Mean Squared Error Loss",
    ylim=(0.15, 0.8),
    title="Autoencoder",
)
ax.legend()

# Write the figure to disk before displaying it.
fig.savefig("autoencoder_loss.pdf", format="pdf", bbox_inches="tight")
plt.show()

In [None]:
def calculate_mse(model, data_loader, device):
    """Return the mean squared reconstruction error over ``data_loader``.

    The squared errors of all elements are summed across batches and divided
    by the total element count, so a shorter final batch is weighted by its
    actual size rather than counting as much as a full batch.

    Args:
        model: Module that maps a batch to its reconstruction.
        data_loader: Iterable of batches whose first item is the input tensor.
        device: Device the batches are moved to before the forward pass.

    Returns:
        The mean squared error as ``np.float64``; NaN if the loader yields
        no elements.
    """
    model.eval()
    squared_error_sum = 0.0
    element_count = 0

    with torch.no_grad():
        for batch in data_loader:
            x_batch = batch[0].to(device)
            reconstructed = model(x_batch)
            squared_error_sum += torch.sum((reconstructed - x_batch) ** 2).item()
            element_count += x_batch.numel()

    if element_count == 0:
        return np.float64("nan")
    return np.float64(squared_error_sum / element_count)


def calculate_mae(model, data_loader, device):
    """Return the mean absolute reconstruction error over ``data_loader``.

    The absolute errors of all elements are summed across batches and divided
    by the total element count, so a shorter final batch is weighted by its
    actual size rather than counting as much as a full batch.

    Args:
        model: Module that maps a batch to its reconstruction.
        data_loader: Iterable of batches whose first item is the input tensor.
        device: Device the batches are moved to before the forward pass.

    Returns:
        The mean absolute error as ``np.float64``; NaN if the loader yields
        no elements.
    """
    model.eval()
    absolute_error_sum = 0.0
    element_count = 0

    with torch.no_grad():
        for batch in data_loader:
            x_batch = batch[0].to(device)
            reconstructed = model(x_batch)
            absolute_error_sum += torch.sum(torch.abs(reconstructed - x_batch)).item()
            element_count += x_batch.numel()

    if element_count == 0:
        return np.float64("nan")
    return np.float64(absolute_error_sum / element_count)


def calculate_pcc(model, data_loader, device):
    """Return the mean per-gene Pearson correlation over ``data_loader``.

    All batches are reconstructed first; the correlation between original and
    reconstructed values is then computed once per gene (column) across every
    sample in the loader. This makes the result independent of batch size and
    shuffling, avoids failures on a final batch with a single sample, and
    replaces the per-gene Python loop with vectorised NumPy operations.

    Genes whose original or reconstructed values are constant have an
    undefined correlation and are left out of the average.

    Args:
        model: Module that maps a batch to its reconstruction.
        data_loader: Iterable of batches whose first item is a 2-D input
            tensor (samples x genes).
        device: Device the batches are moved to before the forward pass.

    Returns:
        The mean correlation as ``np.float64``; NaN if no gene has a defined
        correlation (for example with an empty loader or a single sample).
    """
    model.eval()
    originals = []
    reconstructions = []

    with torch.no_grad():
        for batch in data_loader:
            x_batch = batch[0].to(device)
            reconstructed = model(x_batch)
            originals.append(x_batch.cpu().numpy())
            reconstructions.append(reconstructed.cpu().numpy())

    if not originals:
        return np.float64("nan")

    # Accumulate in float64 to keep the sums of squares accurate.
    x_original = np.concatenate(originals).astype(np.float64)
    x_reconstructed = np.concatenate(reconstructions).astype(np.float64)

    # Calculate Pearson correlation coefficient for each gene
    original_centered = x_original - x_original.mean(axis=0)
    reconstructed_centered = x_reconstructed - x_reconstructed.mean(axis=0)
    covariance = (original_centered * reconstructed_centered).sum(axis=0)
    scale = np.sqrt(
        (original_centered ** 2).sum(axis=0)
        * (reconstructed_centered ** 2).sum(axis=0)
    )

    # A zero scale means one side is constant: correlation undefined, skip it.
    defined = scale > 0
    if not defined.any():
        return np.float64("nan")

    # Clip to guard against rounding pushing |r| marginally above 1.
    per_gene_pcc = np.clip(covariance[defined] / scale[defined], -1.0, 1.0)
    return np.float64(per_gene_pcc.mean())

In [None]:
# Compute the MSE once per split and reuse it for the RMSE. This avoids two
# extra passes over the data and guarantees that each printed RMSE is the
# square root of the MSE printed above it, even though the training loader
# reshuffles its batches on every pass.
train_mse = calculate_mse(model, train_loader, device)
val_mse = calculate_mse(model, val_loader, device)

print(f"Training MSE: {train_mse:.4f}")
print(f"Validation MSE: {val_mse:.4f}")
print(f"Training RMSE: {math.sqrt(train_mse):.4f}")
print(f"Validation RMSE: {math.sqrt(val_mse):.4f}")
print(f"Training MAE: {calculate_mae(model, train_loader, device):.4f}")
print(f"Validation MAE: {calculate_mae(model, val_loader, device):.4f}")
print(f"Training PCC: {calculate_pcc(model, train_loader, device):.4f}")
print(f"Validation PCC: {calculate_pcc(model, val_loader, device):.4f}")