In [1]:
# Dependencies:
# pip: scikit-learn, anndata, scanpy, pygam
#
# Modified from the Python starter kit for the NeurIPS 2021 Single-Cell Competition.
# Parts with `TODO` are supposed to be changed by you.
#
# More documentation:
#
# https://viash.io/docs/creating_components/python/

In [2]:
import logging
import anndata as ad
import sys

from scipy.sparse import csc_matrix

from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LinearRegression
import numpy as np

logging.basicConfig(level=logging.INFO)

In [3]:
## VIASH START
# Anything within this block will be removed by `viash` and will be
# replaced with the parameters as specified in your config.vsh.yaml.
meta = { 'resources_dir': '.' }

# Local-run parameters pointing at the CITE starter sample data.
par = {
    'input_train_mod1': 'sample_data/openproblems_bmmc_cite_starter/openproblems_bmmc_cite_starter.train_mod1.h5ad',
    'input_train_mod2': 'sample_data/openproblems_bmmc_cite_starter/openproblems_bmmc_cite_starter.train_mod2.h5ad',
    'input_test_mod1': 'sample_data/openproblems_bmmc_cite_starter/openproblems_bmmc_cite_starter.test_mod1.h5ad',
    'distance_method': 'minkowski',
    'output': 'output.h5ad',
    'n_pcs': 50,
}
## VIASH END
# Modality-2 file for the test cells, read later in the notebook to score
# predictions. Kept outside the VIASH block because it is not a component
# parameter. The path must not carry trailing whitespace: it is passed
# verbatim to the reader.
test_mod2_file = 'sample_data/openproblems_bmmc_cite_starter/openproblems_bmmc_cite_starter.test_mod2.h5ad'


In [4]:
## VIASH START
# Anything within this block will be removed by `viash` and will be
# replaced with the parameters as specified in your config.vsh.yaml.
meta = { 'resources_dir': '.' }

# Local-run parameters pointing at the multiome starter sample data.
# NOTE: this cell re-assigns `meta`, `par` and `test_mod2_file`, so when the
# notebook is run top to bottom these values replace the ones set by the
# preceding configuration cell.
par = {
    'input_train_mod1': 'sample_data/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.train_mod1.h5ad',
    'input_train_mod2': 'sample_data/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.train_mod2.h5ad',
    'input_test_mod1': 'sample_data/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.test_mod1.h5ad',
    'distance_method': 'minkowski',
    'output': 'output.h5ad',
    'n_pcs': 50,
}
## VIASH END
# Modality-2 file for the test cells; read later in the notebook to score predictions.
test_mod2_file = 'sample_data/openproblems_bmmc_multiome_starter/openproblems_bmmc_multiome_starter.test_mod2.h5ad'


In [5]:
# Put the component's resources directory on the import path so modules placed
# there can be imported by later cells.
sys.path.append(meta['resources_dir'])

# Identifier stored in the output's `uns` to label which method produced it.
method_id = 'basic_beans'

In [6]:
# Load the three input AnnData files named in `par`, in this order:
# training modality 1, training modality 2, test modality 1.
logging.info('Reading `h5ad` files...')
input_train_mod1, input_train_mod2, input_test_mod1 = (
    ad.read_h5ad(par[key])
    for key in ('input_train_mod1', 'input_train_mod2', 'input_test_mod1')
)



INFO:root:Reading `h5ad` files...


In [7]:

# TODO: implement own method
# `method` is imported from the local `bean` module. It receives the training
# data for both modalities plus the test modality-1 data; the result is used
# below as an AnnData-like object (`.uns`, `.X`, `.write_h5ad`).
from bean import method
adata = method(input_train_mod1, input_train_mod2, input_test_mod1)

# Tag the predictions with this method's identifier.
adata.uns["method_id"] = method_id



INFO:root:Performing dimensionality reduction on modality 1 values...
INFO:root:Performing dimensionality reduction on modality 2 values...
INFO:root:Running K nearest neigbors...


In [8]:

# Sanity check: report whether the prediction matrix is stored in a sparse format.
from scipy.sparse import issparse
issparse(adata.X)

True

In [9]:

# Persist the predictions to the path configured in `par['output']`, gzip-compressed.
logging.info('Storing annotated data...')
adata.write_h5ad(par['output'], compression = "gzip")


INFO:root:Storing annotated data...


In [10]:
from pygam import LinearGAM

In [11]:
from sklearn.decomposition import TruncatedSVD

# Stack train and test modality-1 cells so both are embedded with the same
# SVD basis; the `group` column in `.obs` records which rows came from where.
logging.info('Performing dimensionality reduction on modality 1 values...')
input_mod1 = ad.concat(
    {"train": input_train_mod1, "test": input_test_mod1},
    axis=0,
    join="outer",
    label="group",
    fill_value=0,
    index_unique="-"
)

# The number of components comes from the configured `n_pcs` rather than a
# hard-coded 50, so this cell follows the same setting as the rest of the run.
# TruncatedSVD's default solver is randomized; fixing `random_state` makes the
# embeddings (and therefore the RMSE values computed later) reproducible.
embedder_mod1 = TruncatedSVD(n_components=par['n_pcs'], random_state=0)
mod1_pca = embedder_mod1.fit_transform(input_mod1.X)

logging.info('Performing dimensionality reduction on modality 2 values...')
embedder_mod2 = TruncatedSVD(n_components=par['n_pcs'], random_state=0)
mod2_pca = embedder_mod2.fit_transform(input_train_mod2.X)

# Split the joint modality-1 embedding back into its train and test rows.
# The regression target is the modality-2 embedding of the training cells.
X_train = mod1_pca[input_mod1.obs['group'] == 'train']
X_test = mod1_pca[input_mod1.obs['group'] == 'test']
y_train = mod2_pca

# NOTE(review): the message below says "Linear regression", but the model
# fitted in the following cells is a KNeighborsRegressor. The text is left
# unchanged here because it is emitted at runtime.
logging.info('Running Linear regression...')


INFO:root:Performing dimensionality reduction on modality 1 values...
INFO:root:Performing dimensionality reduction on modality 2 values...
INFO:root:Running Linear regression...


In [12]:
# `sqrt` is imported from `math`, where it is documented. `statistics` only
# exposes it as a side effect of its own internal imports, so that import can
# break without notice.
from math import ceil, sqrt

# Neighbourhood size heuristic: k = ceil(sqrt(number of training rows)).
N = X_train.shape[0]
k = ceil(sqrt(N))

In [13]:
# Fit a k-nearest-neighbours regressor that maps modality-1 embeddings
# (X_train) to modality-2 embeddings (y_train), using the `k` computed above.
from sklearn.neighbors import KNeighborsRegressor
neigh = KNeighborsRegressor(n_neighbors=k)
neigh.fit(X_train, y_train)

KNeighborsRegressor(n_neighbors=15)

In [14]:
# Predict modality-2 embeddings for the test rows of the modality-1 embedding.
y_pred = neigh.predict(X_test)

In [15]:
# NOTE(review): this statement is identical to the one in the preceding cell;
# it recomputes `y_pred` from the same model and inputs, so the cell is redundant.
y_pred = neigh.predict(X_test)

In [16]:

# Map the predicted embedding back into modality-2 feature space via the SVD
# components, and wrap the result in a CSC sparse matrix in one step.
# NOTE(review): `y_pred` is overwritten with a differently-shaped value here,
# so this cell should not be re-run without re-running the prediction first.
y_pred = csc_matrix(y_pred @ embedder_mod2.components_)

# Package the predictions: rows are annotated with the test cells' `.obs`,
# columns with the training modality-2 `.var`.
adata2 = ad.AnnData(
    X=y_pred,
    obs=input_test_mod1.obs,
    var=input_train_mod2.var,
    uns=dict(
        dataset_id=input_train_mod1.uns['dataset_id'],
        method_id='starter_kit',
    ),
)


In [17]:
# Load the modality-2 data for the test cells from the path set in the
# configuration cell; it serves as the reference when scoring predictions.
true_test_mod2 = ad.read_h5ad(test_mod2_file)

In [18]:

from sklearn.metrics import mean_squared_error
def calculate_rmse(true_test_mod2, pred_test_mod2):
    """Return the root-mean-squared error between two objects' ``X`` matrices.

    The RMSE is computed separately for every feature (column) and the
    per-feature values are then averaged with equal weight. This matches the
    uniform-average multi-output behaviour of scikit-learn's RMSE metric, but
    is computed with numpy directly so it does not depend on the
    ``squared=False`` argument of ``mean_squared_error``, which recent
    scikit-learn releases have deprecated and removed.

    Parameters
    ----------
    true_test_mod2 : object with an ``X`` attribute
        Reference values; ``X`` may be a scipy sparse matrix or a dense array.
    pred_test_mod2 : object with an ``X`` attribute
        Predicted values, same shape as the reference.

    Returns
    -------
    float
        Mean over features of the per-feature RMSE.

    Raises
    ------
    ValueError
        If the two matrices do not have the same shape.
    """
    def _as_dense(matrix):
        # Sparse matrices expose `toarray`; anything else is treated as dense.
        return matrix.toarray() if hasattr(matrix, "toarray") else np.asarray(matrix)

    y_true = _as_dense(true_test_mod2.X)
    y_pred = _as_dense(pred_test_mod2.X)

    # Guard against silent numpy broadcasting of mismatched inputs.
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"Shape mismatch: true {y_true.shape} vs predicted {y_pred.shape}"
        )

    per_feature_rmse = np.sqrt(np.mean((y_true - y_pred) ** 2, axis=0))
    return per_feature_rmse.mean()

In [19]:
# RMSE of the predictions returned by `bean.method` (`adata`) against the reference.
calculate_rmse(true_test_mod2, adata)

0.2997299

In [20]:
# RMSE of the in-notebook TruncatedSVD + KNN predictions (`adata2`) against the reference.
calculate_rmse(true_test_mod2, adata2)

0.30013496

# Go backwards: swap the roles of the modalities (train on modality 2, predict
# modality 1 for the test cells) and record the RMSE against the real test
# modality 1 for each k in 2, 12, ..., 192. The result maps k -> RMSE.
# NOTE(review): assumes `method` accepts a `k` keyword (presumably the number
# of neighbours) — confirm against the `bean` module.
tests={}
for i in range (2, 200, 10):
    pred_data = method(input_train_mod2, input_train_mod1, true_test_mod2, k=i)
    tests[i]=calculate_rmse(input_test_mod1, pred_data)
tests