In [9]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scanpy as sc
import torch
torch.manual_seed(0)

%load_ext autoreload 
%autoreload 2
%config InlineBackend.figure_format = 'retina'

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Marker proteins of anatomical regions listed in "The kidney transcriptome and proteome defined by transcriptomics and antibody-based profiling".

In [10]:
# Both values are interpolated into the ./data/ file names read and written
# by the cells below.
TISSUE, VERSION = 'kidney', 'positives'

In [11]:
# Marker table for the selected tissue; the first CSV column becomes the index.
markers = pd.read_csv(
    f'./data/{TISSUE}_labels.csv',
    index_col=0,
)
markers

Unnamed: 0_level_0,region,gene
Ensembl ID(supplied by Ensembl),Unnamed: 1_level_1,Unnamed: 2_level_1
ENSG00000128567,glomerulus,PODXL
ENSG00000158457,glomerulus,TSPAN33
ENSG00000113578,glomerulus,FGF1
ENSG00000116218,glomerulus,NPHS2
ENSG00000198743,glomerulus,SLC5A3
...,...,...
ENSG00000105707,collecting duct,HPN
ENSG00000214128,collecting duct,TMEM213
ENSG00000100362,collecting duct,PVALB
ENSG00000132677,collecting duct,RHBG


Load the embeddings and the normalized per-cell-type mean scRNA-seq expression of each gene.

In [12]:
# Embedding vectors; the first CSV column is used as the row index.
embeddings = pd.read_csv(f'./data/{TISSUE}_{VERSION}_embeddings.csv',index_col=0)

# Per-gene expression table; its column names serve as the cell-type labels.
rna = pd.read_csv(f'./data/{TISSUE}_rna.csv',index_col=0)
cell_types = list(rna.columns)

# Metadata table (the file name suggests HPA v21 -- confirm provenance).
df = pd.read_csv(f'./data/hpa_v21_{TISSUE}.csv',index_col=0)

# Keep only rows present in both the embeddings and the metadata, then attach
# the expression columns by matching the 'Gene' column against rna's index.
df_train = embeddings.join(df, how='inner').join(rna, on='Gene', how='inner')
# Flag every row whose index label occurs more than once. keep=False marks all
# occurrences (not just the repeats), matching the previous
# `value_counts()[index] > 1` logic without a label lookup or index alignment.
df_train['duplicated'] = df_train.index.duplicated(keep=False)
# Only train on high-quality images.
df_train = ( df_train.query('(Staining=="high")|(Staining=="medium")')
                     .query('Reliability=="Enhanced"')
                     .query('~duplicated') )
# Hold out images of marker proteins.
df_train = df_train.loc[~df_train['Gene'].isin(set(markers.index))]

# X holds the embedding columns; every remaining column goes into obs.
adata_train = sc.AnnData(df_train[embeddings.columns], obs=df_train.drop(columns=embeddings.columns))
# Rename the variables to their positions as strings: '0', '1', ...
adata_train.var_names = pd.Series(range(adata_train.shape[1])).astype(str)

adata_train

AnnData object with n_obs × n_vars = 9299 × 128
    obs: 'Batch', 'Slide', 'Well', 'Tissue', 'Gene name', 'Gene', 'UniProt', 'Antibody', 'nTPM', 'Staining', 'Sex', 'Age', 'Patient', 'URL', 'Level', 'Reliability', 'epithelial cell of proximal tubule', 'fibroblast', 'glomerular visceral epithelial cell', 'kidney capillary endothelial cell', 'kidney connecting tubule epithelial cell', 'kidney distal convoluted tubule epithelial cell', 'kidney loop of Henle thick ascending limb epithelial cell', 'leukocyte', 'mesangial cell', 'parietal epithelial cell', 'renal alpha-intercalated cell', 'renal beta-intercalated cell', 'renal principal cell', 'duplicated'

In [13]:
# Number of distinct non-missing genes among the training rows.
len(adata_train.obs['Gene'].dropna().unique())

1898

In [14]:
# Test rows: embeddings joined with metadata, restricted to genes in `markers`.
# `markers[[]]` has no columns, so the inner join on 'Gene' acts purely as a
# filter and adds nothing to the frame.
df_test = (
    embeddings
    .join(df, how='inner')
    .join(markers[[]], on='Gene', how='inner')
)
df_test.shape

(633, 144)

Fit a softmax linear model using gradient descent.

In [15]:
from src.classifier import SoftmaxRegression

# Inputs are the embeddings; targets are the per-cell-type RNA columns stored
# in obs (copied so the fit works on its own frame).
X_train, Y_train = adata_train.to_df(), adata_train.obs[cell_types].copy()

clf = SoftmaxRegression(
    max_iters=1000,
    lr=0.01,
    verbose=True,
)
clf.fit(X_train, Y_train)

100%|██████████| 1000/1000 [00:05<00:00, 187.75it/s]


In [16]:
# Score every row of `embeddings` (not only the training subset) with the
# fitted model, one column per cell type.
scores = pd.DataFrame(
    data=clf.predict_proba(embeddings),
    columns=cell_types,
    index=embeddings.index,
)

# Persist the scores alongside the other per-tissue data files.
scores.to_csv(f'./data/{TISSUE}_{VERSION}_scores.csv')

scores

Unnamed: 0,epithelial cell of proximal tubule,fibroblast,glomerular visceral epithelial cell,kidney capillary endothelial cell,kidney connecting tubule epithelial cell,kidney distal convoluted tubule epithelial cell,kidney loop of Henle thick ascending limb epithelial cell,leukocyte,mesangial cell,parietal epithelial cell,renal alpha-intercalated cell,renal beta-intercalated cell,renal principal cell
135_A_7_5,0.063070,0.042382,0.113628,0.054086,0.113314,0.058952,0.077605,0.032902,0.043328,0.152136,0.052738,0.037405,0.158454
135_A_9_5,0.084687,0.062490,0.156482,0.080691,0.074087,0.060904,0.078259,0.070253,0.050264,0.102789,0.053602,0.047767,0.077726
135_A_8_5,0.086296,0.090864,0.150998,0.091875,0.054021,0.052620,0.054796,0.106047,0.069173,0.097759,0.050207,0.041310,0.054036
24830_A_8_5,0.047091,0.077461,0.045242,0.083679,0.088508,0.080859,0.093762,0.093549,0.058782,0.066642,0.101111,0.081669,0.081644
24830_A_9_5,0.054573,0.076662,0.048080,0.074463,0.087677,0.074535,0.089473,0.105379,0.048413,0.073023,0.099636,0.084127,0.083959
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2177_A_9_5,0.087656,0.101412,0.078308,0.076056,0.071183,0.075130,0.079069,0.076815,0.069001,0.082945,0.061540,0.075529,0.065355
2177_A_7_5,0.077308,0.076208,0.080435,0.077752,0.078518,0.069758,0.083767,0.089529,0.066955,0.066128,0.075465,0.084176,0.074000
24713_A_8_5,0.105727,0.080281,0.060775,0.076951,0.082655,0.067881,0.092989,0.061747,0.062673,0.082863,0.068709,0.073525,0.083225
24713_A_9_5,0.131571,0.066891,0.076042,0.068146,0.079660,0.072613,0.096482,0.061887,0.051859,0.063545,0.078379,0.074247,0.078678
