In [1]:
import sys
import os
import hydra
from omegaconf import OmegaConf

# Make the parent directory importable so that project packages resolve from
# this notebook. The membership check keeps the cell idempotent: re-running it
# does not append a duplicate entry to sys.path on every execution.
parent_dir = os.path.abspath('..')
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

# Hydra itself is initialized in the next cell

In [2]:
# Hydra keeps a process-wide singleton, so calling hydra.initialize() a second
# time in the same kernel raises ("GlobalHydra is already initialized").
# Clearing any previous instance first makes this cell safe to re-run.
from hydra.core.global_hydra import GlobalHydra

GlobalHydra.instance().clear()
hydra.initialize(config_path="../config", version_base="1.1")

# Choose which config to load
config_name = "config"  # Change this to use a different config
print(f"Loading config: {config_name}")

# Compose the config, selecting the experiment and loss config groups
cfg = hydra.compose(
    config_name=config_name,
    overrides=["experiment=essential_genes", "loss=perturbseq"],
)

# Display the loaded config as YAML
print(OmegaConf.to_yaml(cfg))

Loading config: config
dataset:
  _target_: datasets.perturbseq_dataset.PerturbseqDataset
  adata_path: /orcd/data/omarabu/001/Omnicell_datasets/essential_gene_knockouts_raw/essential_gene_knockouts_raw.h5ad
  pert_embedding_path: /orcd/data/omarabu/001/Omnicell_datasets/essential_gene_knockouts_raw/pert_embeddings/GenePT.pt
  control_pert: non-targeting
  pert_key: gene
  cell_key: cell_type
  split_mode: iid
  pca_components: ${experiment.pert_embedding_dim}
  seed: 42
  set_size: 100
  data_shape:
  - 11907
  heldout_perts:
  - SUPT5H
  - ATF5
  - SRSF1
  - PSMA3
  - SNRPD3
  - RPL30
  - EXOSC2
  - CDC73
  - NUP54
  - PRIM2
  - TSR2
  - RPS11
  - KPNB1
  - NACA
  - CSE1L
  - SF3B2
  - PHAX
  - POLR2G
  - RPS15A
  - SF3A2
  heldout_cell_types:
  - k562
encoder:
  _target_: encoder.perturbseq_encoders.DistributionEncoderResNetPertPredictor
  in_dim: ${dataset.data_shape[0]}
  latent_dim: ${experiment.latent_dim}
  hidden_dim: ${experiment.hidden_dim}
  set_size: ${experiment.set_size}

In [3]:
from torch.utils.data import DataLoader

# Build the dataset from the `dataset` config node; the config printed above
# points its _target_ at datasets.perturbseq_dataset.PerturbseqDataset.
dataset = hydra.utils.instantiate(cfg.dataset)



PCA with 16 components explains 0.3266 of variance
Loaded 9220 sets (cell_type x gene combinations)


In [4]:
# Wrap the dataset in a shuffling loader; the batch size is read from the
# composed Hydra config.
dataloader = DataLoader(
    dataset=dataset,
    batch_size=cfg.batch_size,
    shuffle=True,
)

In [5]:
# Create encoder
# The object is built from the `encoder` config node; the config printed above
# points its _target_ at
# encoder.perturbseq_encoders.DistributionEncoderResNetPertPredictor.
encoder = hydra.utils.instantiate(cfg.encoder)

In [6]:
# Create generator (with model already instantiated)
# The wrapped network is accessed as `generator.model` when the optimizer's
# parameter list is assembled.
generator = hydra.utils.instantiate(cfg.generator)

In [7]:
# Gather the parameters of both networks into one flat list
model_parameters = [*encoder.parameters(), *generator.model.parameters()]

# cfg.optimizer and cfg.scheduler each instantiate to a callable, which is
# invoked immediately with the runtime object it still needs
optimizer = hydra.utils.instantiate(cfg.optimizer)(params=model_parameters)
scheduler = hydra.utils.instantiate(cfg.scheduler)(optimizer=optimizer)

# Loss manager built from the `loss` config node
loss_manager = hydra.utils.instantiate(cfg.loss)

# Trainer built from the `training` config node
trainer = hydra.utils.instantiate(cfg.training)

In [8]:
# Inspect the instantiated loss manager; as the cell's last expression, its
# repr is echoed as the cell output.
loss_manager

<loss.perturbseq.PerturbSeqLossManager at 0x1506bdb6bf40>

In [9]:
# Set the trainer's tqdm flag (presumably enables the progress bar seen in
# this cell's output — confirm against the trainer implementation).
trainer.use_tqdm = True
# NOTE(review): the recorded run of this cell stopped with a CUDA
# OutOfMemoryError while another process held almost all GPU memory, so
# `output_dir` and `stats` were never assigned by that run.
# The return value of train() is unpacked into two names. The output directory
# passed in is '../outputs' made absolute against the kernel's current working
# directory.
output_dir, stats = trainer.train(
    encoder=encoder,
    generator=generator,
    dataloader=dataloader,
    optimizer=optimizer,
    scheduler=scheduler,
    loss_manager=loss_manager,
    output_dir=os.path.abspath('../outputs'),
    config=cfg
)

Epoch 1/100:   0%|          | 0/1209 [00:00<?, ?it/s]


torch.Size([8, 128])


OutOfMemoryError: CUDA out of memory. Tried to allocate 38.00 MiB. GPU 0 has a total capacity of 79.10 GiB of which 36.69 MiB is free. Process 281029 has 78.18 GiB memory in use. Including non-PyTorch memory, this process has 882.00 MiB memory in use. Of the allocated memory 203.48 MiB is allocated by PyTorch, and 8.52 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [10]:
import numpy as np
import pandas as pd 
import torch
from sklearn.metrics import r2_score, mean_squared_error
from scipy.stats import pearsonr

def generate_set_mean_predictions(encoder, sets, X, ctrl_key, pert_keys, device=None):
    """Encode the control set and each perturbed set, and compare their means.

    Args:
        encoder: Callable module applied to one set at a time (a leading axis
            of size 1 is added before each call). It must also expose
            ``mean_predictor``, which is applied to the stacked perturbation
            encodings. The encoder is moved to ``device``; its train/eval mode
            is left untouched.
        sets: Mapping from key to the index used to select that set's rows,
            i.e. ``X[sets[key]]``.
        X: Data container indexed as ``X[sets[key]]``; each selection must be
            convertible with ``torch.tensor``.
        ctrl_key: Key of the control set in ``sets``.
        pert_keys: Ordered keys of the perturbed sets. Rows of the stacked
            outputs follow this order.
        device: Torch device to run on. Defaults to ``'cuda'`` when a GPU is
            available and ``'cpu'`` otherwise, so the function also works on
            machines without CUDA.

    Returns:
        A tuple of five NumPy arrays:
        ``(ctrl_X_mean, ctrl_S, pert_X_delta, pert_S_delta, pert_X_delta_recon)``
        where ``ctrl_X_mean`` is the mean of the control rows, ``ctrl_S`` is
        the control encoding, ``pert_X_delta`` is each perturbed set's mean
        minus the control mean, ``pert_S_delta`` is each perturbed encoding
        minus the control encoding, and ``pert_X_delta_recon`` is
        ``encoder.mean_predictor`` applied to the perturbed encodings minus
        the control mean.
    """
    if device is None:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
    encoder = encoder.to(device)

    # Every output is detached and converted to NumPy, so no autograd graph is
    # needed; no_grad avoids holding activations for all sets in memory.
    with torch.no_grad():
        ctrl_X = torch.tensor(X[sets[ctrl_key]]).to(device)
        pert_X = {k: torch.tensor(X[sets[k]]).to(device) for k in pert_keys}

        # The encoder is called on the control set first, then on each
        # perturbed set in the order given by pert_keys.
        ctrl_S = encoder(ctrl_X.unsqueeze(0))
        pert_S = {k: encoder(pert_X[k].unsqueeze(0)) for k in pert_keys}
        pert_S_delta = {k: pert_S[k] - ctrl_S for k in pert_keys}

        # Stack per-perturbation results along the first axis.
        pert_S = torch.cat([pert_S[k] for k in pert_keys], dim=0)
        pert_S_delta = torch.cat([pert_S_delta[k] for k in pert_keys], dim=0)

        # Mean over the rows of each set, and the shift relative to control.
        ctrl_X_mean = torch.mean(ctrl_X, dim=0)
        pert_X_mean = torch.cat(
            [torch.mean(pert_X[k], dim=0).unsqueeze(0) for k in pert_keys], dim=0
        )
        pert_X_delta = pert_X_mean - ctrl_X_mean.unsqueeze(0)

        # Predicted mean from the encoding, expressed as a shift from control.
        pert_X_delta_recon = encoder.mean_predictor(pert_S) - ctrl_X_mean

    return (
        ctrl_X_mean.cpu().detach().numpy(),
        ctrl_S.cpu().detach().numpy(),
        pert_X_delta.cpu().detach().numpy(),
        pert_S_delta.cpu().detach().numpy(),
        pert_X_delta_recon.cpu().detach().numpy(),
    )


def r2_score(y_true, y_pred):
    """Calculate R² using Pearson correlation."""
    r = pearsonr(y_true, y_pred, axis=1)
    return (r[0]**2).mean()

# solve optimal linear predictor
def solve_optimal_linear_predictor(Y, X, bias=True):
    """Fit the least-squares linear map from ``X`` to ``Y``.

    Note the argument order: targets first, features second.

    Args:
        Y: Target array with one row per sample.
        X: Feature array with one row per sample.
        bias: When True, an intercept is fitted by appending a column of ones
            to ``X``.

    Returns:
        ``(weights, intercept)`` when ``bias`` is True, so that predictions
        are ``X @ weights + intercept``; otherwise just ``weights``.
    """
    if bias:
        # A trailing column of ones makes the last coefficient row the intercept.
        X = np.hstack([X, np.ones((X.shape[0], 1))])
    # lstsq minimises the same squared error as inv(X.T @ X) @ X.T @ Y, but
    # without forming an explicit inverse. It stays accurate for
    # ill-conditioned features and returns the minimum-norm solution when the
    # columns are collinear, instead of failing or returning garbage.
    beta, *_ = np.linalg.lstsq(X, Y, rcond=None)
    if bias:
        return beta[:-1], beta[-1]
    return beta

In [11]:
cell_type = 'k562'
ctrl_key = dataset.control_pert

# Keep only perturbations that are not the control and that have an embedding
pert_keys = [
    pert
    for pert in dataset.sets[cell_type]
    if pert != ctrl_key and pert in dataset.pert_embeddings
]
eval_pert_keys = [
    pert
    for pert in dataset.eval_sets[cell_type]
    if pert != ctrl_key and pert in dataset.pert_embeddings
]

# Gradients are not needed for evaluation
with torch.no_grad():
    # Statistics on the training sets of this cell type
    ctrl_X, ctrl_S, X_delta, S_delta, X_delta_recon = generate_set_mean_predictions(
        encoder,
        dataset.sets[cell_type],
        dataset.X,
        ctrl_key,
        pert_keys,
    )
    # Same statistics on the evaluation sets; the control outputs are discarded
    _, _, X_delta_eval, S_delta_eval, X_delta_recon_eval = generate_set_mean_predictions(
        encoder,
        dataset.eval_sets[cell_type],
        dataset.X,
        ctrl_key,
        eval_pert_keys,
    )


In [12]:

# Fit a linear map with intercept from latent deltas to expression deltas.
# The argument order is (targets, features): X_delta is regressed on S_delta.
beta, bias = solve_optimal_linear_predictor(X_delta, S_delta)
# Apply the fitted map to the latent deltas of the evaluation sets.
X_delta_pred_full = S_delta_eval @ beta + bias
# r2_score here is the notebook's Pearson-based helper (it shadows sklearn's);
# mean_squared_error is the sklearn function.
r2_score(X_delta_eval, X_delta_pred_full), mean_squared_error(X_delta_eval, X_delta_pred_full)

(0.942771173643357, 0.10214500899233878)

In [14]:
from sklearn.linear_model import LinearRegression   
from sklearn.kernel_ridge import KernelRidge
import numpy as np

# Perturbation-embedding matrices: one row per perturbation, in key order
Z = np.vstack([dataset.pert_embeddings[pert] for pert in pert_keys])
Z_eval = np.vstack([dataset.pert_embeddings[pert] for pert in eval_pert_keys])

# Route 1: regress the latent deltas on the embeddings, then decode the
# predicted latent deltas with the linear map (beta, bias) fitted earlier.
# The `_kr` suffix is historical; the regressor in use is LinearRegression.
# Alternative regressor left disabled in the original cell:
#   KernelRidge(kernel='polynomial', degree=2, alpha=0.1)
reg = LinearRegression().fit(Z, S_delta)
S_delta_pred_eval_kr = reg.predict(Z_eval).astype(np.float32)
X_delta_pred_gde = S_delta_pred_eval_kr @ beta + bias

# Route 2: regress the expression deltas on the embeddings directly.
# Note that this rebinds both `reg` and `X_delta_pred_full`.
reg = LinearRegression().fit(Z, X_delta)
X_delta_pred_full = reg.predict(Z_eval).astype(np.float32)

# (R², MSE) for route 1, followed by (R², MSE) for route 2
(
    r2_score(X_delta_eval, X_delta_pred_gde),
    mean_squared_error(X_delta_eval, X_delta_pred_gde),
    r2_score(X_delta_eval, X_delta_pred_full),
    mean_squared_error(X_delta_eval, X_delta_pred_full),
)

(0.43110166928780036, 2.2951323160105894, 0.4315087, 2.2952776)