In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scanpy as sc
import torch
# Seed PyTorch's global random number generator so torch-based randomness is repeatable.
torch.manual_seed(0)

# Reload imported modules automatically before each cell runs, so edits to
# project code are picked up without restarting the kernel.
%load_ext autoreload 
%autoreload 2
# Render inline figures in the 'retina' (high-resolution) format.
%config InlineBackend.figure_format = 'retina'

  from .autonotebook import tqdm as notebook_tqdm


The marker proteins for each anatomical region are taken from "The kidney transcriptome and proteome defined by transcriptomics and antibody-based profiling".

In [2]:
# Tissue name; interpolated into the paths of the label, embedding, RNA,
# HPA metadata and score files used in this notebook.
TISSUE = 'kidney'
# Version tag; interpolated into the embeddings input path and the scores output path.
VERSION = 'final'

In [3]:
# Marker table for the selected tissue; the first CSV column becomes the index.
markers_csv = f'./data/{TISSUE}_labels.csv'
markers = pd.read_csv(markers_csv, index_col=0)
markers

Unnamed: 0_level_0,region,gene
Ensembl ID(supplied by Ensembl),Unnamed: 1_level_1,Unnamed: 2_level_1
ENSG00000128567,glomerulus,PODXL
ENSG00000158457,glomerulus,TSPAN33
ENSG00000113578,glomerulus,FGF1
ENSG00000116218,glomerulus,NPHS2
ENSG00000198743,glomerulus,SLC5A3
...,...,...
ENSG00000105707,collecting duct,HPN
ENSG00000214128,collecting duct,TMEM213
ENSG00000100362,collecting duct,PVALB
ENSG00000132677,collecting duct,RHBG


Load the embeddings and the normalized per-cell-type mean scRNA-seq expression of each gene.

In [4]:
# Inputs, each indexed by its first CSV column:
#   embeddings - one row of embedding features per image
#   rna        - per-gene expression, one column per cell type
#   df         - HPA annotations, joined to the embeddings on the shared index
embeddings = pd.read_csv(f'./data/{TISSUE}_{VERSION}_embeddings.csv',index_col=0)
rna = pd.read_csv(f'./data/{TISSUE}_rna.csv',index_col=0)
df = pd.read_csv(f'./data/hpa_v21_{TISSUE}.csv',index_col=0)

# Every column of the RNA table is one cell type; these are the regression targets.
cell_types = list(rna.columns)

df_train = (
    embeddings
    # Keep only rows that have both an embedding and annotations.
    .join(df, how='inner')
    # Attach per-cell-type expression by matching the 'Gene' column to the RNA index.
    .join(rna, on='Gene', how='inner')
    # Flag every row whose index label occurs more than once. keep=False marks
    # all copies (not only the repeats), so ambiguous rows are dropped entirely.
    .assign(duplicated=lambda frame: frame.index.duplicated(keep=False))
    # Only train on high-quality images.
    .query('(Staining=="high")|(Staining=="medium")')
    .query('Reliability=="Enhanced"')
    .query('~duplicated')
)
# Hold out images of marker proteins.
df_train = df_train.loc[~df_train['Gene'].isin(set(markers.index))]

# Embedding columns form the data matrix; every remaining column (annotations,
# expression targets, the 'duplicated' flag) is stored as per-observation metadata.
adata_train = sc.AnnData(df_train[embeddings.columns], obs=df_train.drop(columns=embeddings.columns))
# Name the variables by position: '0', '1', ...
adata_train.var_names = pd.Series(range(adata_train.shape[1])).astype(str)

adata_train

AnnData object with n_obs × n_vars = 9299 × 128
    obs: 'Batch', 'Slide', 'Well', 'Tissue', 'Gene name', 'Gene', 'UniProt', 'Antibody', 'nTPM', 'Staining', 'Sex', 'Age', 'Patient', 'URL', 'Level', 'Reliability', 'epithelial cell of proximal tubule', 'fibroblast', 'glomerular visceral epithelial cell', 'kidney capillary endothelial cell', 'kidney connecting tubule epithelial cell', 'kidney distal convoluted tubule epithelial cell', 'kidney loop of Henle thick ascending limb epithelial cell', 'leukocyte', 'mesangial cell', 'parietal epithelial cell', 'renal alpha-intercalated cell', 'renal beta-intercalated cell', 'renal principal cell', 'duplicated'

In [5]:
# Number of distinct values in the 'Gene' column of the training observations.
adata_train.obs['Gene'].nunique()

1898

In [6]:
# `markers[[]]` keeps the marker index but no columns, so the inner join below
# acts purely as a filter on 'Gene' and adds nothing to the frame.
marker_keys = markers[[]]

# Held-out set: images with an embedding and annotations whose gene is a marker.
df_test = (
    embeddings
    .join(df, how='inner')
    .join(marker_keys, on='Gene', how='inner')
)
df_test.shape

(633, 144)

Fit a softmax linear model using gradient descent.

In [7]:
from src.classifier import SoftmaxRegression

# Inputs: the embedding matrix of the training set as a DataFrame.
X_train = adata_train.to_df()
# Targets: the per-cell-type RNA columns, copied out of the observation metadata.
Y_train = adata_train.obs.loc[:, cell_types].copy()

clf = SoftmaxRegression(
    max_iters=1000,
    lr=0.01,
    verbose=True,
)
clf.fit(X_train, Y_train)

100%|██████████| 1000/1000 [00:01<00:00, 779.16it/s]


In [8]:
# Score every embedding in the dataset, not just the training rows.
predicted = clf.predict_proba(embeddings)

# One row per embedding, one column per cell type.
scores = pd.DataFrame(predicted, index=embeddings.index, columns=cell_types)

# Persist the scores alongside the other data files.
scores.to_csv(f'./data/{TISSUE}_{VERSION}_scores.csv')

scores

Unnamed: 0,epithelial cell of proximal tubule,fibroblast,glomerular visceral epithelial cell,kidney capillary endothelial cell,kidney connecting tubule epithelial cell,kidney distal convoluted tubule epithelial cell,kidney loop of Henle thick ascending limb epithelial cell,leukocyte,mesangial cell,parietal epithelial cell,renal alpha-intercalated cell,renal beta-intercalated cell,renal principal cell
135_A_8_5,0.067175,0.099638,0.060604,0.088363,0.083898,0.059190,0.063560,0.068804,0.064528,0.115039,0.061195,0.079349,0.088657
135_A_9_5,0.056025,0.070333,0.086232,0.059097,0.114928,0.047416,0.063425,0.061776,0.065347,0.141700,0.050191,0.055870,0.127660
135_A_7_5,0.055575,0.080684,0.105568,0.061484,0.100784,0.046676,0.060622,0.064008,0.070504,0.134826,0.047050,0.052870,0.119349
31204_A_9_5,0.068164,0.078026,0.069417,0.084450,0.071341,0.077415,0.092955,0.093621,0.069491,0.064949,0.080665,0.077426,0.072080
31204_A_8_5,0.088695,0.075706,0.089836,0.076678,0.073056,0.071267,0.080145,0.077999,0.065340,0.074644,0.079211,0.078130,0.069294
...,...,...,...,...,...,...,...,...,...,...,...,...,...
101305_A_8_5,0.103462,0.083811,0.063365,0.069240,0.066830,0.065430,0.062682,0.100442,0.073917,0.077410,0.078619,0.091256,0.063536
101305_A_9_5,0.155601,0.053652,0.040576,0.070032,0.060602,0.058026,0.052906,0.103153,0.070515,0.076597,0.091172,0.098552,0.068616
25866_A_7_5,0.089510,0.056203,0.073787,0.103354,0.083635,0.075279,0.093007,0.089733,0.064784,0.063060,0.062251,0.064700,0.080696
25866_A_9_5,0.065783,0.075539,0.101608,0.080042,0.075400,0.076294,0.083781,0.095453,0.066955,0.069281,0.074619,0.061260,0.073985
