## ActiveSVM
Notebook to implement activeSVM iterative gene panel selection

https://www.nature.com/articles/s43588-022-00263-8  
https://github.com/xqchen/activeSVC

In [1]:
import numpy as np
import time
import random
import os

from sklearn.preprocessing import normalize 
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

import matplotlib.pyplot as plt
from matplotlib import gridspec

from activeSVC import min_complexity, min_acquisition
import pandas as pd
import pickle
import os, psutil
import resource

import anndata as ad
import scanpy as sc


def text_create(path, name, msg):
    full_path = path + "/" + name + '.pickle'
    f=open(full_path,'wb') 
    pickle.dump(msg,f)
    f.close()

class TimerError(Exception):
     """A custom exception used to report errors in use of Timer class"""

class Timer:
    def __init__(self):
        self._start_time = None

    def start(self):
        if self._start_time is not None:
            raise TimerError(f"Timer is running. Use .stop() to stop it")

        self._start_time = time.perf_counter()

    def stop(self):
        if self._start_time is None:
            raise TimerError(f"Timer is not running. Use .start() to start it")

        elapsed_time = time.perf_counter() - self._start_time
        self._start_time = None
        print(f"Total run time: {elapsed_time:0.4f} seconds")
        return elapsed_time
        




## Load and split data

In [2]:
# Load in pre-processed data from glutamatergic class designation, subclass to all rank_gene_groups already performed
gluData = sc.read("../Data/clusterData.h5ad")

with open("../Data/shortGenes.pickle", "rb") as f:
    shortGenes = pickle.load(f)
    
# Remove genes likely to be too short for MERSCOPE
keepGenes = list(set(gluData.var_names) - set(shortGenes))
gluData = gluData[:,keepGenes]



In [3]:
# Subset data
# Load in pre-processed data from glutamatergic class designation, subclass to all rank_gene_groups already performed
data = normalize(gluData.X,axis=1, norm="l2")
target = pd.Categorical(gluData.obs["cluster_label"]).codes
keys = pd.Categorical(gluData.obs["cluster_label"]).categories

idx = np.arange(np.shape(data)[0])
random.shuffle(idx)
X_train = data[idx[:int(np.shape(data)[0]*4/5)],:]
y_train = target[idx[:int(np.shape(data)[0]*4/5)]]
X_test = data[idx[int(np.shape(data)[0]*4/5):],:]
y_test = target[idx[int(np.shape(data)[0]*4/5):]]

print(type(data))
print(np.shape(data),np.shape(target),len(np.unique(target)))
print(np.shape(X_train))
print(np.shape(X_test))
for i in np.unique(target):
    print('class '+keys[i]+': '+str(np.count_nonzero((target==i)*1)))

<class 'scipy.sparse.csr.csr_matrix'>
(62784, 26589) (62784,) 101
(50227, 26589)
(12557, 26589)
class 100_Rxfp1 Epb4_RE: 35
class 101_Rxfp1 Epb4_RE: 226
class 102_Rxfp1 Epb4_MG-SPFp-PP-POL-SGN-PoT-PIL: 57
class 103_Rxfp1 Epb4_MG-SPFp-PP-POL-SGN-PoT-PIL: 57
class 104_Rxfp1 Epb4_VPMpc-VPLpc: 110
class 105_Rxfp1 Epb4_VPMpc-VPLpc: 71
class 106_Rxfp1 Epb4_POL-SGN: 51
class 107_Rxfp1 Epb4_LP: 72
class 108_Rxfp1 Epb4_LP: 50
class 109_Prkcd Grin2c_LP: 168
class 10_MHb_Tac1 Wif1: 89
class 110_Rxfp1 Epb4_IMD-PVT: 270
class 111_Rxfp1 Epb4_IMD-CM: 2244
class 112_Rxfp1 Epb4_PCN-CM: 524
class 113_Rxfp1 Epb4_PCN-CM: 809
class 114_Rxfp1 Epb4_PCN-CM: 453
class 116_Rxfp1 Epb4_POL: 406
class 117_Rxfp1 Epb4_PCN-SPFp: 518
class 118_Rxfp1 Epb4_PVT-IMD: 1998
class 119_Rxfp1 Epb4_IMD-CM-CL: 1783
class 11_MHb_Tac1 Wif1: 282
class 120_Prkcd Grin2c_LP-(midbrain-area-between-LP-LH): 1592
class 121_Prkcd Grin2c_CL-LP-PO-(a-to-p-multiple-nuclei): 527
class 122_Prkcd Grin2c_MD: 3562
class 123_Prkcd Grin2c_LP-POL: 30

## Setup parameters

In [4]:
'''
Parameters
'''
num_features = 500
num_samples=20
init_samples=100
balance=False
gene = gluData.var_names

folder='../Data/activeSVM'
path=folder+'/cluster_'+str(num_features)+'_'+str(num_samples)

try:
    os.mkdir('results')
except OSError:
    print ("Creation of the directory %s failed" % 'results')
else:
    print ("Successfully created the directory %s " % 'results')
try:
    os.mkdir(folder)
except OSError:
    print ("Creation of the directory %s failed" % folder)
else:
    print ("Successfully created the directory %s " % folder)
try:
    os.mkdir(path)
except OSError:
    print ("Creation of the directory %s failed" % path)
else:
    print ("Successfully created the directory %s " % path)

Creation of the directory results failed
Creation of the directory ../Data/activeSVM failed
Creation of the directory ../Data/activeSVM/cluster_500_20 failed


## Select genes

In [5]:
if __name__ == '__main__':

    t=Timer()
    t.start()
    feature_selected, num_samples_list, train_errors,test_errors,train_scores,test_scores, step_times= min_complexity(
        X_train,y_train,X_test,y_test,num_features=num_features,num_samples=num_samples,init_samples=init_samples, balance=balance, max_iter=1000)
    elapsed_time=t.stop()
    
    memorys=[]
    memorys.append(psutil.Process(os.getpid()).memory_info().rss / 1024 ** 2)
    memorys.append(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss/ 1024 ** 2)
    
    text_create(path,'feature_selected',feature_selected)
    text_create(path,'error',train_errors+test_errors)
    text_create(path,'accuracy',train_scores+test_scores)
    text_create(path,'num_samples_list',num_samples_list)
    text_create(path,'genes_name',gene[feature_selected])
    text_create(path,'elapsed_time',elapsed_time)
    text_create(path,'memory',memorys)

Elapsed time: 20.8551 seconds
feature 0 : gene [0]  100 samples
training error=-3.989885917932586 test error=-3.2349287250139365
training accuracy=0.07466103888346905 test accuracy=0.07565501314008123
Elapsed time: 10.4762 seconds
feature 1 : gene 1  120 samples
training error=-3.989885917932586 test error=-3.2349287250139365
training accuracy=0.07466103888346905 test accuracy=0.07565501314008123
Elapsed time: 10.3021 seconds
feature 2 : gene 2  140 samples
training error=2.9959185298743702 test error=3.7508162777733536
training accuracy=0.08726382224699862 test accuracy=0.08847654694592658
Elapsed time: 11.0534 seconds
feature 3 : gene 13846  160 samples
training error=-0.2837119477571824 test error=-0.20028669268137295
training accuracy=0.09522766639456866 test accuracy=0.09070637891216055
Elapsed time: 11.5704 seconds
feature 4 : gene 7496  180 samples
training error=-3.051326975531089 test error=-3.7596559687823525
training accuracy=0.13156270531785694 test accuracy=0.1252687743887

NameError: name 'gene' is not defined

In [8]:
gluData.var_names[feature_selected[0:350]]

Index(['Spns3', '4932411N23Rik', '2810002D19Rik', 'Erbb4', 'Dock10', 'Prlr',
       'Nrp2', 'Gpc5', 'Ebf1', 'Kcnip1',
       ...
       'Tor1aip1', 'Morn2', 'Tmtc4', 'AW551984', 'Gm4924', 'Mitf', 'Ttc28',
       'Fam189a2', 'Hpcal4', 'Col26a1'],
      dtype='object', name='gene', length=350)

In [9]:
clusterDict = {350 : gene[feature_selected[0:350]], 500 : gene[feature_selected]}

with open("../Data/activeSVM_p11_p14_panel.pickle", 'wb') as f:
    pickle.dump(clusterDict, f)