In [1]:
import numpy as np
import numpy.random as npr
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
from contextlib import contextmanager
@contextmanager
def ignore_warnings():
    """Suppress every warning raised inside the ``with`` body.

    The warning filters that were active on entry are restored when the
    ``with`` block exits, so the suppression does not leak to later code.
    """
    with warnings.catch_warnings():
        # A blanket "ignore" filter for all Warning subclasses.
        warnings.simplefilter(action="ignore", category=Warning)
        yield

from numba.core.errors import NumbaDeprecationWarning
warnings.filterwarnings('ignore', category=NumbaDeprecationWarning)
import scanpy as sc
import anndata as ad
import mudata as md
import muon as mu
import pyranges as pr

%config InlineBackend.figure_format = 'retina'

2024-12-08 16:24:11.576650: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-12-08 16:24:11.592440: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-12-08 16:24:11.611626: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-12-08 16:24:11.617475: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-08 16:24:11.631330: I tensorflow/core/platform/cpu_feature_guar

In [2]:
from stwg_grn_params import *

RAW_MDATA_PATH=../data/stwg-v2.h5mu
MDATA_PATH=../data/stwg-v2-filtered.h5mu
OUTPUT_DIR=../analysis/
JASPAR_PATH=../data/JASPAR2024_CORE_vertebrates_non-redundant_pfms_jaspar.txt
REFTSS_PATH=../data/reftss.pkl
HPA_PATH=../data/hpa_tfs.pkl
CHIPATLAS_PATH=../data/chipatlas_kidney_promoters.pkl
PLATFORM=batch
SAMPLE=sample
CELLTYPE=celltype
GEX=rna
ACC=atac
MIN_CELLS=30
PROXIMAL_BP=5000
RANDOM_STATE=0
NORMALIZE_TOTAL=False
NUM_CELLS=None
READS_PER_CELL=None
NUM_TOP_GENES=1000
MIN_SAMPLES=-1
NUM_TREES=20
LEARNING_RATE=0.5
MAX_DEPTH=None
EARLY_STOPPING=3
FEATURE_FRACTION=1
BAGGING_FRACTION=1
LEAVE_P_OUT=2
IMPORTANCE_THRESHOLD=0.95
CORRELATION_THRESHOLD=0.2


# Collate single-cell data

In [3]:
# Load the RNA AnnData and blank out the auxiliary slots (obsm / varm / obsp / uns).
rna = sc.read('../data/rna.h5ad')
rna.obsm, rna.varm, rna.obsp, rna.uns = {}, {}, {}, {}

# Expose the annotation column under the name 'celltype'.
rna.obs['celltype'] = rna.obs['Deepscore_Annotation']

# duplicate the multiome snRNA: a copy of the 'snRNA' cells is relabelled
# 'snMultiome' and appended to the original object
is_snrna = rna.obs['batch'] == 'snRNA'
multiome = rna[is_snrna].copy()
multiome.obs['batch'] = 'snMultiome'
rna = ad.concat([rna, multiome])

# Rebuild obs names as <barcode>-<sample>-<batch>, where the barcode is the part
# of the old obs name before the first '-'.
rna.obs['barcode'] = rna.obs_names.str.split('-').str[0]
rna.obs_names = (
    rna.obs['barcode']
    + '-' + rna.obs['sample'].astype(str)
    + '-' + rna.obs['batch'].astype(str)
)

# Keep only the four metadata columns used downstream.
rna.obs = rna.obs.loc[:, ['sample','barcode','batch','celltype']]

# Use the 'counts' layer as the main matrix.
rna.X = rna.layers['counts']

# Drop cells whose rebuilt obs name repeats, keeping the first occurrence.
first_occurrence = ~rna.obs.index.duplicated(keep='first')
rna = rna[first_occurrence]

rna.obs

  utils.warn_names_duplicates("obs")


Unnamed: 0,sample,barcode,batch,celltype
AAACGAAAGGTGCCAA-lib_55-scRNA,lib_55,AAACGAAAGGTGCCAA,scRNA,ICA
AAACGAACACGTAGAG-lib_55-scRNA,lib_55,AAACGAACACGTAGAG,scRNA,ICA
AAACGAAGTACTCGCG-lib_55-scRNA,lib_55,AAACGAAGTACTCGCG,scRNA,TAL
AAACGAATCGAAGAAT-lib_55-scRNA,lib_55,AAACGAATCGAAGAAT,scRNA,aPT
AAACGAATCTCATTAC-lib_55-scRNA,lib_55,AAACGAATCTCATTAC,scRNA,ICB
...,...,...,...,...
TTTGTTGGTTGGATAT-lib_51-snMultiome,lib_51,TTTGTTGGTTGGATAT,snMultiome,CNT
TTTGTTGGTTGGTTGA-lib_15-snMultiome,lib_15,TTTGTTGGTTGGTTGA,snMultiome,ICA
TTTGTTGGTTTACTTG-lib_15-snMultiome,lib_15,TTTGTTGGTTTACTTG,snMultiome,DCT
TTTGTTGGTTTCCGGC-lib_15-snMultiome,lib_15,TTTGTTGGTTTCCGGC,snMultiome,aPT


In [4]:
# Load the ATAC AnnData and blank out the auxiliary slots (obsm / varm / obsp / uns).
atac = sc.read('../data/atac.h5ad',gex_only=False)
atac.obsm, atac.varm, atac.obsp, atac.uns = {}, {}, {}, {}

# Expose the annotation column under the name 'celltype' and tag every cell
# with the 'snMultiome' batch label.
atac.obs['celltype'] = atac.obs['Deepscore_Annotation']
atac.obs['batch'] = 'snMultiome'

# Rebuild obs names as <barcode>-<sample>-<batch>, where the barcode is the part
# of the old obs name before the first '-'.
atac.obs['barcode'] = atac.obs_names.str.split('-').str[0]
atac.obs_names = (
    atac.obs['barcode']
    + '-' + atac.obs['sample'].astype(str)
    + '-' + atac.obs['batch'].astype(str)
)

# Keep only the four metadata columns used downstream.
atac.obs = atac.obs.loc[:, ['sample','barcode','batch','celltype']]

# Use the 'counts' layer as the main matrix.
atac.X = atac.layers['counts']

# Drop cells whose rebuilt obs name repeats, keeping the first occurrence.
first_occurrence = ~atac.obs.index.duplicated(keep='first')
atac = atac[first_occurrence]

atac.obs

  utils.warn_names_duplicates("obs")


Unnamed: 0,sample,barcode,batch,celltype
AAACAGCCAACACCTA-lib_15-snMultiome,lib_15,AAACAGCCAACACCTA,snMultiome,PT
AAACAGCCAAGGCCAA-lib_57-snMultiome,lib_57,AAACAGCCAAGGCCAA,snMultiome,PT
AAACAGCCAATAACGA-lib_57-snMultiome,lib_57,AAACAGCCAATAACGA,snMultiome,PT
AAACAGCCACAGCCAT-lib_51-snMultiome,lib_51,AAACAGCCACAGCCAT,snMultiome,FIB
AAACAGCCACCAAAGG-lib_54-snMultiome,lib_54,AAACAGCCACCAAAGG,snMultiome,aPT
...,...,...,...,...
TTTGTTGGTTGAGCCG-lib_15-snMultiome,lib_15,TTTGTTGGTTGAGCCG,snMultiome,DCT
TTTGTTGGTTGGATAT-lib_51-snMultiome,lib_51,TTTGTTGGTTGGATAT,snMultiome,CNT
TTTGTTGGTTTACTTG-lib_15-snMultiome,lib_15,TTTGTTGGTTTACTTG,snMultiome,DCT
TTTGTTGGTTTCCGGC-lib_15-snMultiome,lib_15,TTTGTTGGTTTCCGGC,snMultiome,aPT


In [5]:
# Assemble both modalities into a single MuData object, keyed by the configured
# modality names (GEX / ACC). Every later modality lookup in this cell uses the
# same constants so the cell keeps working if the names in the params change.
mdata = mu.MuData({
    GEX: rna,
    ACC: atac
})

# Keep a copy of the joint var table; it is re-applied after .obs is replaced below.
var = mdata.var.copy()

# Build unprefixed obs columns: select the columns whose names contain each
# modality key, strip the '<modality>:' prefix, then fill missing GEX values
# from the matching ACC columns.
gex_obs = mdata.obs.filter(like=GEX).astype(np.object_)
gex_obs.columns = gex_obs.columns.str.replace(f'{GEX}:','')
acc_obs = mdata.obs.filter(like=ACC).astype(np.object_)
acc_obs.columns = acc_obs.columns.str.replace(f'{ACC}:','')
obs = gex_obs.fillna(acc_obs)
mdata.obs = pd.concat([mdata.obs,obs],axis=1)

# Restore the saved var table, ordered by the current var_names.
mdata.var = var.loc[mdata.var_names]

# Split the ACC feature names on ':' and '-' into three interval columns
# (assumes names of the form '<chrom>:<start>-<end>' — TODO confirm no
# chromosome name itself contains '-' or ':').
intervals = mdata[ACC].var_names.str.split(':|-',expand=True).to_frame()
intervals.index = mdata[ACC].var_names
intervals.columns = ['Chromosome','Start','End']
intervals['Start'] = intervals['Start'].astype(int)
intervals['End'] = intervals['End'].astype(int)

mdata[ACC].var = mdata[ACC].var.join(intervals)

# Read the GENCODE GTF (tab-separated, no header row, '#' comment lines skipped)
# and keep only the 'gene' records.
gene_annots = pd.read_csv('../data/gencode.v44.basic.annotation.gtf.gz',sep='\t',header=None,comment='#')
gene_annots.columns = ['seqname', 'source', 'feature', 'start', 'end', 'score', 'strand', 'frame', 'attribute']
gene_annots = gene_annots.query('feature=="gene"')
# Expand the free-text attribute field (key "value" pairs) into one column per key;
# when a key repeats within a record only its first value is kept.
attributes = (gene_annots['attribute'].str.extractall(r'(\w+)\s+"([^"]+)"')
              .droplevel(1).reset_index().drop_duplicates(subset=['index',0])
              .pivot(index='index',columns=0,values=1))
gene_annots = gene_annots.join(attributes)

# One row per gene symbol (first occurrence wins), indexed by gene_name so it
# can be joined onto the GEX var index.
gene_annots = gene_annots.drop_duplicates(subset='gene_name',keep='first')
gene_annots = gene_annots.set_index('gene_name')

# 'artif_dupl' is presumably one of the attribute columns extracted above — verify.
mdata[GEX].var = mdata[GEX].var.join(gene_annots).drop(columns='artif_dupl')

mdata.write(RAW_MDATA_PATH)

  self._update_attr("var", axis=0, join_common=join_common)
  self._update_attr("obs", axis=1, join_common=join_common)
  self._update_attr("var", axis=0, join_common=join_common)
  self._update_attr("obs", axis=1, join_common=join_common)


# Filter single-cell data

In [6]:
# Reload the collated MuData from RAW_MDATA_PATH so filtering starts from the on-disk copy.
mdata = mu.read(RAW_MDATA_PATH)

# Display the shape of the loaded object.
mdata.shape

  self._update_attr("var", axis=0, join_common=join_common)
  self._update_attr("obs", axis=1, join_common=join_common)


(182280, 190515)

In [7]:
# limit to biological samples represented in all modalities. there are *three* of these
celltypes = mdata.obs[CELLTYPE].unique()
platforms = mdata.obs[PLATFORM].unique()

# MIN_SAMPLES == -1 means "the sample must appear on every platform"; any other
# value is the minimum number of distinct platforms a sample must appear on.
min_samples = len(platforms) if MIN_SAMPLES == -1 else MIN_SAMPLES

# Number of distinct platforms per sample, broadcast back to every cell.
platforms_per_sample = mdata.obs.groupby(SAMPLE,observed=True)[PLATFORM].transform('nunique')

# Use >= so an explicit MIN_SAMPLES acts as a lower bound, as its name says.
# With the default (-1) this equals the previous == test, because a sample
# cannot appear on more platforms than exist.
mdata = mdata[platforms_per_sample >= min_samples].copy()
mdata.shape

  self._update_attr("var", axis=0, join_common=join_common)
  self._update_attr("obs", axis=1, join_common=join_common)


(42464, 190515)

In [8]:
# downsample each sample x platform to same bulk number of cells
# (ensuring we only use cells with GEX available)
# because cell type bias is a property of the platform, don't touch it
# (currently not doing this - it's confounded with the platform itself?)

if NUM_CELLS is not None:
    indices = []
    for (platform, sample), _df in mdata[GEX].obs.groupby([PLATFORM,SAMPLE],observed=True):
        # Draw NUM_CELLS cells from this platform x sample group. The seed is
        # fixed so the draw is reproducible. This previously referenced an
        # undefined lowercase `num_cells`, which raised NameError.
        indices.extend(_df.sample(int(NUM_CELLS), random_state=RANDOM_STATE).index.tolist())

    mdata = mdata[indices].copy()

mdata.shape

(42464, 190515)

In [9]:
# Print the number of GEX cells in every platform x sample group.
gex_groups = mdata[GEX].obs.groupby([PLATFORM, SAMPLE], observed=True)
for (platform, sample), _df in gex_groups:
    print(platform, sample, len(_df))

scRNA lib_09 1813
scRNA lib_10 1578
scRNA lib_36 1002
scRNA5p lib_09 1080
scRNA5p lib_10 2720
scRNA5p lib_36 2616
snMultiome lib_09 3507
snMultiome lib_10 2722
snMultiome lib_36 8631
snRNA lib_09 3507
snRNA lib_10 2722
snRNA lib_36 8631


In [10]:
# Print the mean per-cell sum of the GEX matrix for every platform x sample group.
gex_groups = mdata[GEX].obs.groupby([PLATFORM, SAMPLE], observed=True)
for (platform, sample), _df in gex_groups:
    per_cell_totals = mdata[GEX][_df.index].X.sum(axis=1)
    print(platform, sample, per_cell_totals.mean())

scRNA lib_09 3107.1494
scRNA lib_10 6582.6426
scRNA lib_36 1360.1996
scRNA5p lib_09 5144.8276
scRNA5p lib_10 4927.6562
scRNA5p lib_36 6101.9565
snMultiome lib_09 3563.9612
snMultiome lib_10 3076.3335
snMultiome lib_36 6483.363
snRNA lib_09 3563.9612
snRNA lib_10 3076.3335
snRNA lib_36 6483.363


In [11]:
# Persist the filtered MuData to MDATA_PATH.
mdata.write(MDATA_PATH)

  self._update_attr("var", axis=0, join_common=join_common)
  self._update_attr("obs", axis=1, join_common=join_common)


# HPA TFs

In [12]:
#!wget https://www.proteinatlas.org/download/normal_tissue.tsv.zip
hpa_annots = pd.read_csv('../data/normal_tissue.tsv.zip',sep='\t')

# The kidney table has no header row, so the column names are supplied here.
hpa = pd.read_csv('../data/hpa_v23_kidney.tsv',sep='\t',header=None).set_axis(
    ['Gene name','Gene','uniprot_id','antibody_id','Tissue','Cell type',
     'staining','intensity','quantity','location','sex','age','donor_id','image_url'],
    axis=1,
)
hpa = pd.merge(hpa, hpa_annots, on=['Gene','Gene name','Tissue','Cell type'])

# Gene x location matrix of mean staining, built from "Enhanced" reliability rows:
#   - comma-separated locations are split into one row per location
#   - staining is 0 for 'not detected' and 1 for anything else (including missing)
#   - gene/location pairs with no rows are filled with 0
df = (
    hpa.query('Reliability=="Enhanced"')
    .loc[:, ['Gene name','location','staining']]
    .drop_duplicates()
    .assign(location=lambda t: t['location'].str.split(','))
    .explode('location')
    .assign(staining=lambda t: t['staining'].map({'not detected':0.}).fillna(1.))
    .pivot_table(index='Gene name',columns='location',values='staining',aggfunc='mean')
    .fillna(0.)
)

df.to_pickle(HPA_PATH)

len(df)

3677

# RefTSS start sites

In [13]:
# Load the refTSS coordinates and annotation tables (both tab-separated).
tss_bed = pd.read_csv('../data/refTSS_v4.1_human_coordinate.hg38.bed.txt.gz',sep='\t')
tss_annot = pd.read_csv('../data/refTSS_v4.1_human_hg38_annotation.txt.gz',sep='\t')

# Join on the TSS identifier, drop TSSs without a gene symbol, capitalize the
# column names, and copy the gene symbol into a 'gene' column.
tss = (
    pd.merge(tss_bed, tss_annot, on='refTSS_ID')
    .dropna(subset='Gene_symbol')
    .rename(columns=str.capitalize)
    .assign(gene=lambda t: t['Gene_symbol'])
)

tss.to_pickle(REFTSS_PATH)

len(tss)

98695

# Kidney ChipSeq links

In [14]:
def read_chipatlas(fn):
    """Read a tab-separated ChIP-Atlas BED file into a tidy peak table.

    The first line of the file is skipped and only the first six columns are
    kept. The ``tf`` label is parsed from the attributes column: the second
    ``;``-separated field, the value after ``=``, up to the first ``%20``.

    Parameters
    ----------
    fn : path to the BED file.

    Returns
    -------
    DataFrame with columns Chromosome, Start, End, score, Strand, tf, where
    Chromosome, Strand and tf are categorical. Rows whose ``tf`` is "Epitope"
    or "GFP" are removed.
    """
    raw = pd.read_csv(fn, sep='\t', engine='pyarrow', header=None, skiprows=1)
    peaks = raw.iloc[:, :6].set_axis(
        ['Chromosome', 'Start', 'End', 'attributes', 'score', 'Strand'], axis=1
    )

    # e.g. "ID=...;Name=CTCF%20(...);..." -> "Name=CTCF%20(...)" -> "CTCF"
    name_field = peaks['attributes'].str.split(';').str[1]
    antigen = name_field.str.split('=').str[1].str.split('%20').str[0]

    peaks = peaks.drop(columns='attributes')
    peaks['tf'] = antigen
    peaks = peaks.astype({'Chromosome': 'category', 'tf': 'category', 'Strand': 'category'})

    # Discard the "Epitope" and "GFP" labels.
    is_tag = peaks['tf'].isin(['Epitope', 'GFP'])
    return peaks[~is_tag]

# Parse the ChIP-Atlas "Oth.Kid" BED file and cache the result as a pickle.
chipseq = read_chipatlas('../data/Oth.Kid.05.AllAg.AllCell.bed')
chipseq.to_pickle('../data/Oth.Kid.05.AllAg.AllCell.pkl')

# Same for the "ATC.Kid" Kidney_Cortex BED file.
# NOTE(review): this rebinds `atac`, which earlier held the ATAC AnnData object.
atac = read_chipatlas('../data/ATC.Kid.05.AllAg.Kidney_Cortex.bed')
atac.to_pickle('../data/ATC.Kid.05.AllAg.Kidney_Cortex.pkl')

In [15]:
# from ChipAtlas
# Load the cached ChIP-seq peak table and wrap it as genomic ranges.
chipseq = pd.read_pickle('../data/Oth.Kid.05.AllAg.AllCell.pkl')
pr_chipseq = pr.PyRanges(chipseq)

# Load the cached ATAC peak table and restrict the ChIP-seq ranges to it.
atac = pd.read_pickle('../data/ATC.Kid.05.AllAg.Kidney_Cortex.pkl')
pr_atac = pr.PyRanges(atac)
pr_chipseq = pr_chipseq.intersect(pr_atac)

# Read the TSS table from REFTSS_PATH, the same configured location the refTSS
# step writes to, so this cell cannot pick up a stale file if the path changes.
tss = pd.read_pickle(REFTSS_PATH)
pr_tss = pr.PyRanges(tss)

# Pair each TSS with the ChIP-seq ranges it joins to, using a slack of 500.
chipseq_grn = pr_tss.join(pr_chipseq, slack=500).df

# Reduce to unique (tf, gene) pairs and save them.
chipseq_grn = chipseq_grn[['tf','gene']].drop_duplicates()
chipseq_grn.to_pickle(CHIPATLAS_PATH)

len(chipseq_grn)

1255823