In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scanpy as sc
import torch
torch.manual_seed(0)

%load_ext autoreload 
%autoreload 2
%config InlineBackend.figure_format = 'retina'

  from .autonotebook import tqdm as notebook_tqdm


Marker proteins of anatomical regions listed in "The kidney transcriptome and proteome defined by transcriptomics and antibody-based profiling".

In [2]:
hpa_markers = pd.read_csv('./data/hpa_markers.csv',comment='#')
gene_symbols = pd.read_table('./data/gene_symbols.tsv').iloc[:,[0,-1]]
hpa_markers = hpa_markers.merge(gene_symbols,
                                left_on='gene',right_on='Approved symbol',how='inner')
hpa_markers = hpa_markers.set_index(hpa_markers.columns[-1])[['region','gene']]
hpa_markers

Unnamed: 0_level_0,region,gene
Ensembl ID(supplied by Ensembl),Unnamed: 1_level_1,Unnamed: 2_level_1
ENSG00000128567,glomerulus,PODXL
ENSG00000158457,glomerulus,TSPAN33
ENSG00000113578,glomerulus,FGF1
ENSG00000116218,glomerulus,NPHS2
ENSG00000198743,glomerulus,SLC5A3
...,...,...
ENSG00000105707,collecting duct,HPN
ENSG00000214128,collecting duct,TMEM213
ENSG00000100362,collecting duct,PVALB
ENSG00000132677,collecting duct,RHBG


Load embeddings and normalized per-cell-type mean scRNA expression of each gene.

In [3]:
embeddings = np.load('./data/kidney_embeddings.npy')
index = !cat './data/kidney_embeddings.txt'
embeddings = pd.DataFrame(embeddings,index=index)
rna = pd.read_csv('./data/kidney_rna.csv',index_col=0)
df = pd.read_csv('./data/hpa_v21_kidney.csv',index_col=0)
adata = embeddings.join(df, how='inner')
adata = sc.AnnData(adata[embeddings.columns], obs=adata[df.columns])
adata.var_names = pd.Series(range(adata.shape[1])).astype(str)
adata.obs = adata.obs.join(rna, on='Gene', how='left')

adata

Observation names are not unique. To make them unique, call `.obs_names_make_unique`.


AnnData object with n_obs × n_vars = 66138 × 128
    obs: 'Batch', 'Slide', 'Well', 'Tissue', 'Gene name', 'Gene', 'UniProt', 'Antibody', 'nTPM', 'Staining', 'Sex', 'Age', 'Patient', 'URL', 'Level', 'Reliability', 'epithelial cell of proximal tubule', 'fibroblast', 'glomerular visceral epithelial cell', 'kidney capillary endothelial cell', 'kidney connecting tubule epithelial cell', 'kidney distal convoluted tubule epithelial cell', 'kidney loop of Henle thick ascending limb epithelial cell', 'leukocyte', 'mesangial cell', 'parietal epithelial cell', 'renal alpha-intercalated cell', 'renal beta-intercalated cell', 'renal principal cell'

Hold out images of marker proteins.

In [4]:
train_genes = set(adata.obs.dropna()['Gene'])
test_genes = train_genes & set(hpa_markers.index)
train_genes = train_genes - set(hpa_markers['gene'])

train_mask = adata.obs['Gene'].isin(train_genes)
test_mask = adata.obs['Gene'].isin(test_genes)

cell_types = list(rna.columns)

X_train = adata[train_mask].to_df() # embeddings
Y_train = adata[train_mask].obs[cell_types].copy() # RNA

X_test = adata[test_mask].to_df()
Y_test = adata[test_mask].obs[cell_types].copy()

sum(train_mask), sum(test_mask)

(52237, 583)

Fit softmax linear model using gradient descent.

In [5]:
from src.classifier import SoftmaxRegression, platt_scaling

clf = SoftmaxRegression(max_iters=1000, lr=0.01, verbose=True)
clf.fit(X_train, Y_train)

Y_pred = clf.predict_proba(adata.X)
Y_pred /= Y_pred.sum(1,keepdims=True)
Y_pred = pd.DataFrame(Y_pred, index=adata.obs_names, columns=cell_types)

100%|██████████| 1000/1000 [00:03<00:00, 283.36it/s]


In [6]:
Y_pred.to_csv('./data/kidney_scores.csv')
Y_pred.shape

(66138, 13)