# Scatter Plot Evaluation

This separate notebook evaluates both models on exactly the same batch and the same genes, so that their scatter plots are directly comparable.
For this reason, both trained models are loaded here and set to evaluation mode.

<br>

### File Requirements

The following files are required:
1. [`data/STEP00/Cell_line_RMA_proc_basalExp.txt`](https://www.cancerrxgene.org/gdsc1000/GDSC1000_WebResources/Home.html)
2. `models/CCL_AUTOENCODER.pth` - obtained by STEP02_AUTOENCODER.ipynb
3. `models/CCL_TRANSFORMER.pth` - obtained by STEP02_TRANSFORMER.ipynb

<br>

### Output
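A PDF file (`comparison_Autoencoder_TransformerEncoder_scatter.pdf`, written to the current working directory) containing the two scatter plots side by side.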


In [None]:
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset

In [None]:
# One fixed seed for both random number generators so that reruns of this
# notebook are reproducible.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

In [None]:
# Read the tab-separated basal expression table.
ccl_file = pd.read_csv("data/STEP00/Cell_line_RMA_proc_basalExp.txt", sep="\t")

# Keep every row but skip the first two columns (presumably gene annotation
# columns - confirm against the file), then transpose so that each remaining
# original column becomes one row of `ccl_df`.
ccl_df = ccl_file.iloc[:, 2:].transpose()

# Remove duplicated rows and hand the values over as a nested Python list.
rna_values = ccl_df.drop_duplicates().values.tolist()

## Transformer Encoder Architecture

The “CCL_TRANSFORMER.pth” model is loaded and set to evaluation mode.

In [None]:
class MultiHeadAttention(nn.Module):
    """Self-attention layer with a single fused query/key/value projection.

    Args:
        embed_dim: Width of the input and output features.
        num_heads: Number of attention heads; must divide ``embed_dim``.
        dropout: Dropout probability applied to the projected output.

    The attribute names ``qkv_proj``, ``o_proj`` and ``dropout`` are also the
    state-dict keys of this layer, so they must not be renamed.

    NOTE(review): in ``forward`` the attention result has the layout
    (heads, batch, head_dim, seq) when it is reinterpreted with ``view`` as
    (batch, seq, embed_dim), without first moving the head axis back. For a
    batch size above 1 this appears to interleave values of different batch
    elements. It is deliberately left as is, because weights trained with
    this layer are only meaningful with the identical computation - confirm
    against the training notebook before changing it.
    """

    def __init__(self, embed_dim, num_heads, dropout=0.1):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"
        # One linear layer produces queries, keys and values in a single pass.
        self.qkv_proj = nn.Linear(embed_dim, embed_dim * 3)
        self.o_proj = nn.Linear(embed_dim, embed_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply self-attention to ``x`` of shape (batch, seq, embed_dim)."""
        n_batch, n_tokens, width = x.shape

        # (batch, seq, 3*embed) -> (batch, seq, heads, 3*head_dim)
        fused = self.qkv_proj(x).view(n_batch, n_tokens, self.num_heads, 3 * self.head_dim)
        # -> (heads, batch, 3*head_dim, seq)
        fused = fused.permute(2, 0, 3, 1)
        # Split the 3*head_dim axis into query / key / value, each
        # (heads, batch, head_dim, seq).
        query, key, value = torch.chunk(fused, 3, dim=2)

        # Scaled dot-product scores of shape (heads, batch, seq, seq).
        scores = torch.matmul(query.transpose(-1, -2), key) / self.head_dim**0.5
        weights = F.softmax(scores, dim=-1)

        # Weighted sum of the values, transposed back to
        # (heads, batch, head_dim, seq) before flattening.
        context = torch.matmul(weights, value.transpose(-1, -2)).transpose(-1, -2)
        context = context.contiguous().view(n_batch, n_tokens, width)

        return self.dropout(self.o_proj(context))


class TransformerEncoderBlock(nn.Module):
    """Encoder block: self-attention, then a feed-forward net.

    Each sub-layer is wrapped in a residual connection followed by a
    LayerNorm (normalisation is applied after the residual sum).

    Args:
        embed_dim: Feature width of the input and output.
        num_heads: Number of attention heads passed to ``MultiHeadAttention``.
        ff_hidden_dim: Hidden width of the feed-forward network.
        dropout: Dropout probability used in the attention layer and at the
            end of the feed-forward network.
    """

    def __init__(self, embed_dim, num_heads, ff_hidden_dim, dropout):
        super().__init__()
        self.attention = MultiHeadAttention(embed_dim, num_heads, dropout)
        self.norm1 = nn.LayerNorm(embed_dim)
        feed_forward = [
            nn.Linear(embed_dim, ff_hidden_dim),
            nn.ReLU(),
            nn.Linear(ff_hidden_dim, embed_dim),
            nn.Dropout(dropout),
        ]
        self.ff = nn.Sequential(*feed_forward)
        self.norm2 = nn.LayerNorm(embed_dim)

    def forward(self, x):
        """Return the block output; it has the same shape as ``x``."""
        # Residual + norm around the attention sub-layer.
        attended = self.norm1(x + self.attention(x))
        # Residual + norm around the feed-forward sub-layer.
        return self.norm2(attended + self.ff(attended))


class TransformerEncoder(nn.Module):
    """Reconstruction model: linear embedding, encoder blocks, linear output.

    Args:
        input_dim: Number of features per sample (also the output width).
        embed_dim: Width of the embedded representation.
        num_heads: Attention heads per encoder block.
        ff_hidden_dim: Hidden width of each block's feed-forward network.
        num_layers: Number of stacked ``TransformerEncoderBlock`` modules.
        dropout: Dropout probability handed to every block.
    """

    def __init__(self, input_dim, embed_dim, num_heads, ff_hidden_dim, num_layers, dropout=0.1):
        super().__init__()
        self.embedding = nn.Linear(input_dim, embed_dim)
        blocks = []
        for _ in range(num_layers):
            blocks.append(TransformerEncoderBlock(embed_dim, num_heads, ff_hidden_dim, dropout))
        self.layers = nn.ModuleList(blocks)
        self.fc_out = nn.Linear(embed_dim, input_dim)

    def forward(self, x):
        """Map ``x`` of shape (batch, input_dim) to a tensor of the same shape."""
        # Each sample is treated as a sequence of length one: (batch, 1, input_dim).
        hidden = self.embedding(x.unsqueeze(1))
        for block in self.layers:
            hidden = block(hidden)
        # Project back to input_dim and drop the length-one sequence axis.
        return self.fc_out(hidden).squeeze(1)
    
# Rebuild the Transformer so the stored weights can be loaded into it. These
# hyper-parameters presumably have to match the ones used when the checkpoint
# was trained (STEP02_TRANSFORMER.ipynb) - confirm there before changing them.
transformer_model = TransformerEncoder(
    input_dim=17737,
    embed_dim=512,
    num_heads=8,
    ff_hidden_dim=1024,
    num_layers=4,
    dropout=0.5,
)
# map_location="cpu" lets a checkpoint that was saved from a GPU run load on a
# CPU-only machine as well; scatter_plot() moves the model to the evaluation
# device afterwards.
transformer_model.load_state_dict(
    torch.load("models/CCL_TRANSFORMER.pth", map_location="cpu")
)
transformer_model.eval()  # Set the model to evaluation mode

## Autoencoder Architecture
The “CCL_AUTOENCODER.pth” model is loaded here and set to evaluation mode.

In [None]:
class Autoencoder(nn.Module):
    """Fully connected autoencoder with a mirrored encoder and decoder.

    Layout: ``encoder`` (Linear + ReLU per hidden width) -> ``latent``
    (Linear) -> ``decoder`` (Linear + ReLU per hidden width, reversed) ->
    ``output_layer`` (Linear, no activation).

    Args:
        input_dim: Number of features per sample (also the output width).
        hidden_dims: List of hidden layer widths of the encoder, in order;
            the decoder uses the same widths in reverse.
        latent_dim: Width of the bottleneck representation.
    """

    def __init__(self, input_dim, hidden_dims, latent_dim):
        super().__init__()

        # Encoder: input_dim -> hidden_dims[0] -> ... -> hidden_dims[-1].
        widths = [input_dim] + hidden_dims
        encoder_stack = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            encoder_stack += [nn.Linear(fan_in, fan_out), nn.ReLU()]
        self.encoder = nn.Sequential(*encoder_stack)
        self.latent = nn.Linear(hidden_dims[-1], latent_dim)

        # Decoder: latent_dim -> hidden_dims[-1] -> ... -> hidden_dims[0].
        widths = [latent_dim] + hidden_dims[::-1]
        decoder_stack = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            decoder_stack += [nn.Linear(fan_in, fan_out), nn.ReLU()]
        self.decoder = nn.Sequential(*decoder_stack)
        self.output_layer = nn.Linear(hidden_dims[0], input_dim)

    def forward(self, x):
        """Encode ``x`` to the latent space and reconstruct it."""
        code = self.latent(self.encoder(x))
        # The final layer has no activation, so reconstructions are unbounded.
        return self.output_layer(self.decoder(code))

# Rebuild the autoencoder so the stored weights can be loaded into it. These
# sizes presumably have to match the ones used when the checkpoint was trained
# (STEP02_AUTOENCODER.ipynb) - confirm there before changing them.
autoencoder_model = Autoencoder(input_dim=17737, hidden_dims=[512, 128], latent_dim=128)
# map_location="cpu" lets a checkpoint that was saved from a GPU run load on a
# CPU-only machine as well; scatter_plot() moves the model to the evaluation
# device afterwards.
autoencoder_model.load_state_dict(
    torch.load("models/CCL_AUTOENCODER.pth", map_location="cpu")
)
autoencoder_model.eval()  # Set the model to evaluation mode

## Visualization

In [None]:
def _first_batch_reconstructions(model1, model2, data_loader, device):
    """Run both models on the first batch of `data_loader`; return NumPy arrays."""
    first_batch = next(iter(data_loader), None)
    if first_batch is None:
        raise ValueError("data_loader yielded no batches; nothing to plot")

    with torch.no_grad():
        # Both models see the very same input tensor.
        x_batch = first_batch[0].to(device)
        reconstructed1 = model1(x_batch)
        reconstructed2 = model2(x_batch)

    # Move everything to the CPU for NumPy / matplotlib.
    return (
        x_batch.cpu().numpy(),
        reconstructed1.cpu().numpy(),
        reconstructed2.cpu().numpy(),
    )


def _choose_gene_indices(num_available, num_genes, highlights):
    """Sample `num_genes` random gene columns and append missing highlighted ones."""
    sampled = np.random.choice(num_available, num_genes, replace=False)
    seen = set(sampled.tolist())
    missing = []
    for gene_idx in highlights:
        if gene_idx not in seen:
            seen.add(gene_idx)
            missing.append(gene_idx)
    return np.append(sampled, missing) if missing else sampled


def _draw_panel(position, x_original, x_reconstructed, gene_indices, highlight_colors, title):
    """Draw one original-vs-reconstructed scatter panel of the 1x2 figure."""
    plt.subplot(1, 2, position)
    for gene_idx in gene_indices:
        if gene_idx in highlight_colors:
            color = highlight_colors[gene_idx]
            label = f"Gene {gene_idx}"
        else:
            color = "darkgrey"
            label = None
        plt.scatter(x_original[:, gene_idx], x_reconstructed[:, gene_idx],
                    alpha=1.0, label=label, color=color)

    # Only request a legend when there is at least one labelled gene.
    if highlight_colors:
        plt.legend(title="Highlighted Genes", fontsize=12, loc="upper left")

    plt.xlabel("Original Gene Expression Value")
    plt.ylabel("Reconstructed Gene Expression Value")
    # Fixed, identical axes on both panels keep them visually comparable.
    plt.xlim(0, 14)
    plt.ylim(0, 14)
    plt.title(title)
    plt.grid(True)


def scatter_plot(model1, model2, data_loader, device, num_genes=100, highlight_gene_indices=None):
    """Plot original vs. reconstructed values of two models side by side.

    Both models are evaluated on the first batch of `data_loader`, and the
    same gene columns are plotted for both, so the panels are comparable.
    The figure is saved as
    ``comparison_<model1 class>_<model2 class>_scatter.pdf`` in the current
    working directory and then shown.

    Args:
        model1: Model drawn in the left panel.
        model2: Model drawn in the right panel.
        data_loader: Iterable of batches; element 0 of a batch is the input
            tensor. Only the first batch is used.
        device: Device both models and the batch are moved to.
        num_genes: Number of gene columns sampled at random (via the global
            NumPy RNG, without replacement).
        highlight_gene_indices: Optional gene column indices that are always
            plotted, coloured individually and listed in the legend. ``None``
            or an empty sequence disables highlighting.

    Raises:
        ValueError: If `data_loader` yields no batch. NumPy also raises
            ValueError when `num_genes` exceeds the number of gene columns.

    Side effects: both models are switched to eval mode and left on `device`.
    """
    highlights = [] if highlight_gene_indices is None else list(highlight_gene_indices)

    model1.eval()
    model2.eval()
    model1.to(device)
    model2.to(device)

    x_original, x_reconstructed1, x_reconstructed2 = _first_batch_reconstructions(
        model1, model2, data_loader, device
    )

    # One shared gene selection for both panels.
    gene_indices = _choose_gene_indices(x_original.shape[1], num_genes, highlights)

    # One colour per highlighted gene; cycle through the palette so that more
    # highlights than palette entries still get a valid colour index.
    if highlights:
        palette = cm.Dark2(np.arange(len(highlights)) % cm.Dark2.N)
        highlight_colors = dict(zip(highlights, palette))
    else:
        highlight_colors = {}

    plt.figure(figsize=(15, 7))
    _draw_panel(1, x_original, x_reconstructed1, gene_indices, highlight_colors,
                f"{model1.__class__.__name__}")
    _draw_panel(2, x_original, x_reconstructed2, gene_indices, highlight_colors,
                f"{model2.__class__.__name__}")

    plt.tight_layout()
    plt.savefig(f"comparison_{model1.__class__.__name__}_{model2.__class__.__name__}_scatter.pdf", format="pdf", bbox_inches="tight")
    plt.show()

In [None]:
# Prefer the GPU when PyTorch can see one, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Turn the expression rows into a float32 tensor and split off a 20 % hold-out
# set with a fixed random_state; only the hold-out part is used below.
data = torch.tensor(rna_values, dtype=torch.float32)
train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)

# No shuffling, so the first batch (the one scatter_plot evaluates) always
# holds the same samples.
val_dataset = TensorDataset(val_data)
val_loader = DataLoader(dataset=val_dataset, batch_size=32, shuffle=False)

scatter_plot(
    model1=autoencoder_model,
    model2=transformer_model,
    data_loader=val_loader,
    device=device,
    num_genes=100,
    highlight_gene_indices=[1, 2, 3],
)