## Propose notebook
Notebook to trial implementing PROPOSE for gene panel selection

In [60]:
import numpy as np
import pandas as pd
import anndata as ad
import scanpy as sc
import torch
import pickle
import numpy as np
import torch.nn as nn
from propose import PROPOSE, HurdleLoss, ExpressionDataset

In [61]:
# cuda would be better, unavailable on local machine
device = torch.device('cpu',0)

In [62]:
# Load in pre-processed data from glutamatergic class designation, subclass to all rank_gene_groups already performed
gluData = sc.read("../Data/gluData.h5ad")



In [70]:
# Subset to use
subIndx = 1000

# Prep data
testRaw = gluData.X[0:subIndx,0:800].todense()

# Binarized data
binary = np.asarray((testRaw > 0).astype(np.float32))

# Subclass labels
labels = gluData.obs["subclass_label"][0:subIndx]
labelCodes = pd.Categorical(labels).codes # convert categories to numeric (hopefully won't need to)

In [71]:
# For data splitting
n = testRaw.shape[0]
n_train = int(0.8 * n)
n_test = int(0.1 * n)
all_rows = np.arange(n)
np.random.seed(0)
np.random.shuffle(all_rows)
train_inds = all_rows[:n_train]
val_inds = all_rows[n_train:-n_test]
test_inds = all_rows[-n_test:]
print(f'{n} total examples, {len(train_inds)} training examples, {len(val_inds)} validation examples, {len(test_inds)} test examples')

1000 total examples, 800 training examples, 100 validation examples, 100 test examples


## Run PROPOSE

In [72]:
# Set up datasets

# # Unsupervised version
# train_dataset = ExpressionDataset(binary[train_inds], testRaw[train_inds])
# val_dataset = ExpressionDataset(binary[val_inds], testRaw[val_inds])

# Pre-assigned labels version
train_dataset = ExpressionDataset(binary[train_inds], labelCodes[train_inds])
val_dataset = ExpressionDataset(binary[val_inds], labelCodes[val_inds])

In [67]:
num_genes = (32, 64)
propose_results = {}

In [75]:
# Set up selector
selector = PROPOSE(train_dataset,
                   val_dataset,
                   loss_fn=torch.nn.CrossEntropyLoss(),
                   device=device,
                   hidden=[128, 128])

# Eliminate many candidates
candidates, model = selector.eliminate(target=500, mbsize=128, max_nepochs=500)

for num in num_genes:
    # Select specific number of genes
    inds, model = selector.select(num_genes=num, mbsize=128, max_nepochs=500)
    propose_results[num] = inds

using CrossEntropyLoss, starting with lam = 0.0001


Training epochs:   0%|          | 0/500 [00:00<?, ?it/s]

RuntimeError: expected scalar type Long but found Int