## Propose notebook
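This notebook applies PROPOSE to select gene panels (350 and 500 genes) that discriminate the pre-assigned cluster labels, after removing genes flagged as likely too short for MERSCOPE.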

In [1]:
import numpy as np
import pandas as pd
import anndata as ad
import scanpy as sc
import torch
import pickle
import numpy as np
import torch.nn as nn
from propose import PROPOSE, HurdleLoss, ExpressionDataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the pre-processed AnnData object. Per the original author's note, the
# glutamatergic class/subclass designation and rank_genes_groups steps were
# already performed upstream of this notebook.
gluData = sc.read("../Data/clusterData.h5ad")

# NOTE: pickle.load can execute arbitrary code; only load pickles produced by
# this project.
with open("../Data/shortGenes.pickle", "rb") as f:
    shortGenes = pickle.load(f)

# Remove genes likely to be too short for MERSCOPE.
# A boolean mask keeps the surviving genes in their original var_names order.
# A set difference would return them in arbitrary order, which can change
# between kernel sessions (string hashing is randomized) and would silently
# reshuffle the feature columns handed to PROPOSE.
keep_mask = ~gluData.var_names.isin(set(shortGenes))
keepGenes = list(gluData.var_names[keep_mask])
gluData = gluData[:, keep_mask]



In [3]:
# Binarize expression over the full dataset: 1.0 where a gene has a value > 0
# in a cell, 0.0 otherwise.
# The comparison runs before densifying, so for a sparse X the dense
# intermediate is a boolean array rather than a full-precision copy of X.
# `.toarray()` is used instead of the `.A` shorthand because `.A` is not
# available on every container X may be (e.g. a plain NumPy array).
expressed = gluData.X > 0
if hasattr(expressed, "toarray"):
    expressed = expressed.toarray()
binary = np.asarray(expressed).astype(np.float32)

# Integer code per cell for its "cluster_label" category.
# NOTE(review): pd.Categorical encodes missing labels as -1 — confirm every
# cell has a cluster_label before training on these codes.
labelCodes = pd.Categorical(gluData.obs["cluster_label"]).codes

In [4]:
# Split the rows of `binary` into train / validation / test index sets
# (roughly 80% / 10% / 10%) after a seeded shuffle so the split is repeatable.
n = binary.shape[0]
n_train = int(0.8 * n)
n_test = int(0.1 * n)
all_rows = np.arange(n)
np.random.seed(0)
np.random.shuffle(all_rows)

# Slice with explicit non-negative bounds. Negative slicing (`[-n_test:]`)
# breaks when n_test == 0: `[-0:]` is the whole array, so the test set would
# contain every row and the validation set would be empty.
test_start = n - n_test
train_inds = all_rows[:n_train]
val_inds = all_rows[n_train:test_start]
test_inds = all_rows[test_start:]

# The three index sets must partition the rows exactly.
assert len(train_inds) + len(val_inds) + len(test_inds) == n
print(f'{n} total examples, {len(train_inds)} training examples, {len(val_inds)} validation examples, {len(test_inds)} test examples')

62784 total examples, 50227 training examples, 6279 validation examples, 6278 test examples


## Run PROPOSE

In [5]:
# Build the PROPOSE datasets for the training and validation rows.
# Inputs are the binarized expression rows; targets are the pre-assigned
# cluster label codes for the same rows.
#
# An unsupervised variant was previously sketched here, passing
# `testRaw[train_inds]` / `testRaw[val_inds]` as the second argument instead of
# the label codes. `testRaw` is not defined in the cells shown here.
train_dataset, val_dataset = (
    ExpressionDataset(binary[rows], labelCodes[rows])
    for rows in (train_inds, val_inds)
)

In [6]:
# Choose the compute device: use the GPU when one is visible to PyTorch,
# otherwise fall back to the CPU so later cells do not fail on a machine
# without CUDA. The choice is printed so a CPU fallback is never silent.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

In [7]:
# Panel sizes (number of genes) to select, one selection run per entry.
num_genes = [350, 500]

# Selected gene indices keyed by panel size; filled in by the selection loop.
propose_results = dict()

In [8]:
# Seed PyTorch before the selector is built so the elimination and selection
# runs can be repeated; without this the training is unseeded and each run of
# the notebook may end up with a different gene set.
torch.manual_seed(0)

# Set up selector: trained on the training split, monitored on the validation
# split, with cross-entropy against the cluster label codes.
# NOTE(review): `hidden=[128, 128]` is presumably the hidden-layer widths of
# the PROPOSE model — confirm against the propose package.
selector = PROPOSE(train_dataset,
                   val_dataset,
                   loss_fn=torch.nn.CrossEntropyLoss(),
                   device=device,
                   hidden=[128, 128])

# Eliminate many candidates, aiming for roughly `target` remaining genes.
# This step is expensive: the recorded run took about an hour per 600-epoch
# attempt and stopped at 1160 genes.
candidates, model = selector.eliminate(target=1000, mbsize=128, max_nepochs=600)

using CrossEntropyLoss, starting with lam = 0.0001


Training epochs: 100%|██████████| 600/600 [1:06:26<00:00,  6.64s/it]


lam = 0.000100 yielded 1281 genes
Warm starting model for next iteration
next attempt is lam = 0.000129


Training epochs: 100%|██████████| 600/600 [1:06:24<00:00,  6.64s/it]


lam = 0.000129 yielded 1160 genes
done, lam = 0.000129 yielded 1160 genes


In [9]:
# Persist the genes that survived elimination, stored by name so the list can
# be mapped back to column indices after a kernel restart.
candidateGenes = gluData.var_names[candidates].tolist()
with open("../Data/filtered_PROPOSE_candidates.pickle", 'wb') as out_file:
    pickle.dump(candidateGenes, out_file)

In [None]:
# Recovery path: if something crashes after elimination, reload the saved
# candidate gene names and map them back to column indices in gluData.
# NOTE: pickle.load can execute arbitrary code; only load files written by
# this notebook.
with open("../Data/filtered_PROPOSE_candidates.pickle", 'rb') as f:
    candidateGenes = pickle.load(f)

# Vectorized name -> column-position lookup (replaces a manual counter loop).
candidates1 = gluData.var_names.get_indexer(candidateGenes).astype('int64')

# get_indexer marks names that are absent from var_names with -1; fail loudly
# instead of letting -1 silently index the last column.
missing_genes = [name for name, pos in zip(candidateGenes, candidates1) if pos < 0]
if missing_genes:
    raise KeyError(f"{len(missing_genes)} candidate genes not found in gluData.var_names, e.g. {missing_genes[:5]}")

# TODO: Still need to "set_genes" within PROPOSE object

In [10]:
# Run one selection per requested panel size and record the chosen gene
# indices under that size. Each run is long (hours in the recorded output).
for panel_size in num_genes:
    inds, model = selector.select(
        num_genes=panel_size,
        mbsize=128,
        max_nepochs=600,
    )
    propose_results[panel_size] = inds

Training epochs: 100%|██████████| 600/600 [1:57:25<00:00, 11.74s/it]


done, selected 350 genes


Training epochs: 100%|██████████| 600/600 [2:43:33<00:00, 16.36s/it]  

done, selected 500 genes





In [11]:
# Translate the selected column indices into gene names, keyed by panel size.
# A comprehension is used so that `candidateGenes` (the elimination-stage
# candidate list used by earlier cells) is not rebound to a selected panel.
geneDict = {
    panel_size: list(gluData.var_names[propose_results[panel_size]])
    for panel_size in num_genes
}

# Save the final panels (panel size -> list of gene names).
with open("../Data/cluster_markers_propose.pickle", 'wb') as f:
    pickle.dump(geneDict, f)