# task-perturb-multiomics-grn
## Creating resources
### by Jalil Nourisa

# Multiomics

In [1]:
import anndata as ad
import pandas as pd

import numpy as np
# Directory holding the inputs read by this notebook (scRNA, scATAC, preprocess outputs).
data_dir = '../../perturb-multiomics-grn/output/'

# Directory the benchmark resources are written to.
# NOTE(review): later cells use `work_dir`, which is never defined in this notebook;
# it is used with the same sub-paths as `data_dir` — confirm and define it here.
resource_dir = '../resources/grn-benchmark'

In [2]:
# Load the single-cell RNA and ATAC AnnData objects from the input directory.
adata_rna = ad.read_h5ad(f'{data_dir}/scRNA/adata_rna.h5ad')
adata_atac = ad.read_h5ad(f'{data_dir}/scATAC/adata_atac.h5ad')

In [8]:
# Remove the redundant obs columns before publishing the RNA resource.
# errors='ignore' keeps the cell re-runnable: without it a second execution
# raises KeyError because the columns have already been dropped.
adata_rna.obs = adata_rna.obs.drop(
    columns=['cell_type_original', 'Donor', 'Cell type'],
    errors='ignore',
)
adata_rna.write(f'{resource_dir}/multiomics_rna.h5ad')

In [33]:
# np.savetxt(f'{resource_dir}/multiomics_genes.txt', adata_rna.var_names, fmt='%s')

In [9]:
# adata_atac.obs = adata_atac.obs[['obs_id']]
# Move the 'obs_id' column into the obs index. The guard makes the cell
# idempotent: once 'obs_id' is the index, set_index would raise KeyError.
if 'obs_id' in adata_atac.obs.columns:
    adata_atac.obs = adata_atac.obs.set_index('obs_id')

In [7]:
# Keep only the cell-type and donor annotations, indexed by 'obs_id'.
# When the notebook is run top to bottom, the preceding cell has already moved
# 'obs_id' into the index, so selecting it as a column raised KeyError.
# Both layouts (column or index) are handled here.
atac_obs = adata_atac.obs
if 'obs_id' in atac_obs.columns:
    atac_obs = atac_obs.set_index('obs_id')
adata_atac.obs = atac_obs[['cell_type', 'donor_id']]
# errors='ignore' keeps the drop idempotent when the cell is re-run.
adata_atac.var = adata_atac.var.drop(columns=['chrom', 'chromStart', 'chromEnd'], errors='ignore')

In [11]:
# Persist the trimmed ATAC object as a benchmark resource.
adata_atac.write(f'{resource_dir}/multiomics_atac.h5ad')

In [4]:
# Read back the file written above; this rebinds `adata_atac` to the saved resource.
adata_atac = ad.read_h5ad(f'{resource_dir}/multiomics_atac.h5ad')

In [11]:
# adata_atac.obs.reset_index()

# Benchmark

In [2]:
# Load the integrated bulk (perturbation) AnnData from the preprocess outputs.
adata_bulk = ad.read_h5ad(f'{data_dir}/preprocess/bulk_adata_integrated.h5ad')

In [3]:
# Columns and layers that are kept in the published perturbation resource.
kept_obs_columns = ['cell_type', 'sm_name', 'donor_id', 'plate_name', 'row', 'well', 'cell_count']
kept_layer_keys = ['n_counts', 'scgen_pearson', 'scgen_lognorm']

adata_bulk.obs = adata_bulk.obs[kept_obs_columns]
# The dict is built from the current layers before the assignment replaces them.
adata_bulk.layers = {layer_key: adata_bulk.layers[layer_key] for layer_key in kept_layer_keys}

In [5]:
# Display the remaining layers to confirm the subset above.
adata_bulk.layers 

Layers with keys: n_counts, scgen_pearson, scgen_lognorm

In [6]:
# Persist the trimmed bulk object as the perturbation benchmark resource.
adata_bulk.write(f'{resource_dir}/perturbation_data.h5ad')

# Test resources

In [3]:
import os
# Directory for the down-sampled test copies of the resources, given relative to resource_dir.
test_resource_dir = f'{resource_dir}/../../resources_test/grn-benchmark'
# exist_ok=True lets the cell be re-run without failing when the directory exists.
os.makedirs(test_resource_dir, exist_ok=True)


In [2]:
# Reload the published multiomics resources as the starting point for the test subsets.
adata_rna = ad.read_h5ad(f'{resource_dir}/multiomics_rna.h5ad')
adata_atac = ad.read_h5ad(f'{resource_dir}/multiomics_atac.h5ad')

In [5]:
# Only the last expression of a cell is rendered, so this cell shows `adata_rna`;
# the `adata_atac` line produces no output.
adata_atac
adata_rna

AnnData object with n_obs × n_vars = 25034 × 22778
    obs: 'cell_type', 'donor_id'

In [5]:

# Read the prior once instead of re-opening the same .h5ad for every field.
prior_uns = ad.read_h5ad(f'{resource_dir}/prior_data.h5ad').uns
hvgs = prior_uns['hvgs']
genes_multi = prior_uns['gene_names']
tfs = prior_uns['tf_list']
# Genes kept in the test resources: TFs that also occur in the multiomics gene names.
genes = set(tfs) & set(genes_multi)

# Sample up to 1000 *distinct* peaks from the GRANIE peak-gene model.
# The previous np.random.choice call sampled with replacement (its default),
# so the same peak could be drawn repeatedly, and it was unseeded, so the test
# resource changed on every run.
peak_rng = np.random.default_rng(0)
peaks = np.unique(pd.read_csv(f'{resource_dir}/peak_gene_models/granie.csv').peak.to_numpy())
peaks = peak_rng.choice(peaks, size=min(1000, peaks.size), replace=False)


In [6]:
# shorten rna: 1000 randomly chosen donor_0 cells, restricted to the selected genes
mask = adata_rna.obs.donor_id=='donor_0'
adata_rna_s = adata_rna[mask]
# Seeded generator so the test resource is identical on every run
# (the previous np.random.choice call used the unseeded global state).
cell_rng = np.random.default_rng(0)
random_indices = cell_rng.choice(adata_rna_s.obs.index.to_numpy(), 1000, replace=False)
adata_rna_s = adata_rna_s[random_indices, adata_rna_s.var_names.isin(genes)]

In [7]:
# shorten atac: keep the cells selected for the RNA subset and the sampled peaks
shared_cell_mask = adata_atac.obs_names.isin(adata_rna_s.obs_names)
sampled_peak_mask = adata_atac.var_names.isin(peaks)
adata_atac_s = adata_atac[shared_cell_mask, sampled_peak_mask]
adata_atac_s

View of AnnData object with n_obs × n_vars = 1000 × 868
    obs: 'cell_type', 'donor_id'

In [8]:
# Write the down-sampled RNA and ATAC objects to the test resource directory.
adata_rna_s.write(f'{test_resource_dir}/multiomics_rna.h5ad')
adata_atac_s.write(f'{test_resource_dir}/multiomics_atac.h5ad')


In [79]:
# shorten perturbation: first 200 observations, restricted to the selected genes
adata_bulk = ad.read_h5ad(f'{resource_dir}/perturbation_data.h5ad')
selected_gene_mask = adata_bulk.var_names.isin(genes)
adata_bulk_s = adata_bulk[:200, selected_gene_mask]
adata_bulk_s.write(f'{test_resource_dir}/perturbation_data.h5ad')

# Prior

## TF names


In [16]:
# Empty AnnData used purely as a container: the prior lists below are stored in its .uns.
prior_adata = ad.AnnData()

In [17]:
# tfs list 
# Loaded directly from the aertslab cistarget resources URL, so this cell needs network access.
tf_list = np.loadtxt("https://resources.aertslab.org/cistarget/tf_lists/allTFs_hg38.txt", dtype=str)
prior_adata.uns['tf_list'] = tf_list


## gene names

In [18]:
# Store the var names of the perturbation resource.
bulk_adata = ad.read_h5ad(f'{resource_dir}/perturbation_data.h5ad')
prior_adata.uns['gene_names_pert'] = bulk_adata.var_names.to_numpy()


In [19]:
# Store the var names of the multiomics RNA resource.
# NOTE(review): `bulk_adata` is rebound here to the RNA object despite its name.
bulk_adata = ad.read_h5ad(f'{resource_dir}/multiomics_rna.h5ad')
prior_adata.uns['gene_names'] = bulk_adata.var_names.to_numpy()

In [20]:
# Store the var names of the multiomics ATAC resource under 'peak'.
# NOTE(review): `bulk_adata` is rebound here to the ATAC object despite its name;
# any later cell that uses `bulk_adata` sees the ATAC data.
bulk_adata = ad.read_h5ad(f'{resource_dir}/multiomics_atac.h5ad')
prior_adata.uns['peak'] = bulk_adata.var_names.to_numpy()

In [22]:
# Store the list of highly variable genes read from a text file.
# NOTE(review): the HVGs section further down writes `{work_dir}/benchmark/hvgs.txt`,
# whereas this cell reads `{resource_dir}/hvgs.txt` and sits before it — confirm the
# path and the intended cell order.
prior_adata.uns['hvgs'] = np.loadtxt(f'{resource_dir}/hvgs.txt', dtype=str)

In [23]:
# Persist the collected prior lists (stored in .uns) as a single .h5ad file.
prior_adata.write(f'{resource_dir}/prior_data.h5ad')


## Gene names

In [12]:
# Save the gene names of the perturbation data.
# They are read from the perturbation resource explicitly: by this point in the
# notebook `bulk_adata` has been rebound to the ATAC object, so using its
# var_names would write peak names into perturb_gene_names.txt.
# NOTE(review): `work_dir` is not defined anywhere in this notebook — confirm the
# intended directory before running.
perturb_gene_names = ad.read_h5ad(f'{resource_dir}/perturbation_data.h5ad').var_names.values
np.savetxt(f'{work_dir}/benchmark/perturb_gene_names.txt', perturb_gene_names, fmt='%s')

In [1]:
import requests

from pathlib import Path

url = "https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_46/gencode.v46.annotation.gtf.gz"
local_filename = "gencode.v46.annotation.gtf.gz"

# Skip the download when the annotation is already present, so a full re-run
# of the notebook does not fetch the archive again.
if not Path(local_filename).exists():
    # Stream into a temporary ".part" file and rename only after the transfer
    # completed, so an interrupted download never leaves a truncated archive
    # under the final name.
    partial_path = Path(f"{local_filename}.part")
    # The timeout prevents the cell from hanging forever on a stalled connection.
    with requests.get(url, stream=True, timeout=60) as r:
        r.raise_for_status()
        with open(partial_path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
    partial_path.replace(local_filename)

## HVGs

In [5]:
# Number of top-deviance genes kept as highly variable genes further down.
n_hvgs = 3000

In [1]:
# Load the rpy2 IPython extension, which provides the %%R cell magic used by the R cells.
%load_ext rpy2.ipython

In [4]:
%%R -i work_dir
# Compute per-gene deviance with scry and write the result back to .h5ad.
# `work_dir` is passed in from the Python kernel through the -i flag.
library(scry)
library(zellkonverter)
library(SingleCellExperiment)
options(digits=5, max.print=100)  # Adjust numbers as needed



# Read the bulk AnnData file as an R object via zellkonverter.
adata = readH5AD(paste0(work_dir, "/preprocess/bulk_adata_f.h5ad")) # raw counts
# Deviance is computed on assay "X", with plate_name supplied as the batch variable.
sce = devianceFeatureSelection(adata, assay="X", batch=colData(adata)$plate_name)
# Saved so that the following Python cell can read 'binomial_deviance' from .var.
writeH5AD(sce, paste0(work_dir, "/preprocess/adata_sce.h5ad"))


    an issue that caused a segfault when used with rpy2:
    https://github.com/rstudio/reticulate/pull/1188
    Make sure that you use a version of that package that includes
    the fix.
    

  openrlib.rlib.R_tryEval(


Registered S3 methods overwritten by 'zellkonverter':
  method                                             from      
  py_to_r.numpy.ndarray                              reticulate
  py_to_r.pandas.core.arrays.categorical.Categorical reticulate
Loading required package: SummarizedExperiment
Loading required package: MatrixGenerics
Loading required package: matrixStats

Attaching package: ‘MatrixGenerics’

The following objects are masked from ‘package:matrixStats’:

    colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
    colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
    colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
    colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
    colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
    colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
    colWeightedMeans, colWeightedMedians, colWeightedSds,
    colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
    r

In [2]:
# Load the object written by the R cell. read_h5ad is used, as everywhere else
# in this notebook; `ad.read` is a deprecated alias of it.
adata_sce = ad.read_h5ad(f"{work_dir}/preprocess/adata_sce.h5ad")
# Per-gene binomial deviance stored in .var by the deviance feature selection.
binomial_deviance = adata_sce.var['binomial_deviance']




In [7]:
# Inspect the per-gene deviance values.
adata_sce.var['binomial_deviance']

A1BG         3242.560255
A1BG-AS1     3081.247245
A2M         51924.570033
A2M-AS1     47137.741205
A2MP1        4798.622262
                ...     
ZXDB         3123.833965
ZXDC         2762.431522
ZYG11B      14322.309802
ZYX         60193.206178
ZZEF1        5337.865927
Name: binomial_deviance, Length: 15215, dtype: float64

In [None]:
# Positions of the n_hvgs genes with the largest deviance (ascending sort, take the tail).
top_positions = binomial_deviance.argsort().to_numpy()[-n_hvgs:]
# Boolean selector over all genes; selecting through a mask keeps the genes in
# their original var order rather than in deviance order.
mask = np.zeros(adata_sce.n_vars, dtype=bool)
mask[top_positions] = True
hvgs_sce = adata_sce.var_names[mask].values

In [8]:
# Write the selected HVG names, one per line.
# NOTE(review): the prior section reads `{resource_dir}/hvgs.txt`, not this path — confirm.
np.savetxt( f'{work_dir}/benchmark/hvgs.txt',hvgs_sce, fmt='%s')

## Gene annotation

In [84]:
# Load the gene annotation, restrict it to the genes of the multiomics data and
# keep only the transcript types of interest, with human-readable labels.
from local_utils import annotation

multiomics_genes =  np.loadtxt(f'{work_dir}/benchmark/multiomics_genes.txt', dtype=str)
annot_database = annotation.ensembl_gene_annotation()

transcript_types = ['protein_coding', 'lncRNA', 'miRNA']
rename_map = {'protein_coding':'Protein coding', 'lncRNA':'LncRNA',  'miRNA':'MicroRNA'}

in_multiomics = annot_database.Gene.isin(multiomics_genes)
annot_database_f = (
    annot_database.loc[in_multiomics, ['Gene', 'Transcript_type']]
    # one row per distinct (gene, transcript type) pair
    .drop_duplicates()
    .loc[lambda frame: frame.Transcript_type.isin(transcript_types)]
    .assign(Transcript_type=lambda frame: frame.Transcript_type.map(rename_map))
    .reset_index(drop=True)
)
annot_database_f.head()

Unnamed: 0,Gene,Transcript_type
0,AC007325.4,Protein coding
1,AC107375.1,LncRNA
2,AC022306.3,LncRNA
3,ALDOC,Protein coding
4,HEMK1,Protein coding


In [85]:
 #only keep genes with one annotation
annot_size = annot_database_f.groupby('Gene').size()
kept_genes = annot_size[~(annot_size>1)].index
annot_database_f = annot_database_f[annot_database_f.Gene.isin(kept_genes)].reset_index(drop=True)
annot_database_f.shape

(22139, 2)

In [86]:
# Number of retained genes per transcript type.
annot_database_f.Transcript_type.value_counts()

Transcript_type
Protein coding    15282
LncRNA             6856
MicroRNA              1
Name: count, dtype: int64

In [87]:
#save 
# to_csv is called with its default settings here, so the row index is written as the first column.
annot_database_f.to_csv(f'{work_dir}/benchmark/gene_annotation.csv')

## Peak annotation

In [13]:
# Reload the ATAC object from the scATAC directory; this rebinds `adata_atac`.
# NOTE(review): `work_dir` is not defined in this notebook — confirm it matches `data_dir`.
adata_atac = ad.read_h5ad(f'{work_dir}/scATAC/adata_atac.h5ad')




In [14]:

# Normalise the ATAC peak names to the `chr:start-end` format
import re

def format_peak(peaks):
    """Normalise peak names to the ``chrom:start-end`` format.

    Each peak is expected to look like ``<chrom><sep><start><sep><end>`` where
    every ``<sep>`` is one of ``:``, ``-`` or ``_``. The last two fields are
    taken as start and end and everything before them as the chromosome name,
    so contig names that themselves contain a separator (for example
    ``chrUn_KI270742v1``) are handled. The previous three-way unpack of
    ``re.split`` raised ValueError for such names.

    Parameters
    ----------
    peaks : iterable of str
        Peak names to normalise.

    Returns
    -------
    list of str
        One ``chrom:start-end`` string per input peak, in input order.

    Raises
    ------
    ValueError
        If a peak name does not contain at least two separators.
    """
    # Greedy first group + separator-free last two groups => split from the right.
    peak_pattern = re.compile(r'^(.*)[:\-_]([^:\-_]*)[:\-_]([^:\-_]*)$')
    formatted_peaks = []
    for peak in peaks:
        match = peak_pattern.match(peak)
        if match is None:
            raise ValueError(
                f"cannot parse peak name {peak!r}: expected <chrom><sep><start><sep><end>"
            )
        chr_, start, end = match.groups()
        formatted_peaks.append(f"{chr_}:{start}-{end}")
    return formatted_peaks


# Peak names of the ATAC data, rewritten by format_peak as "chr:start-end" strings.
atac_peaks = format_peak(adata_atac.var_names)



In [16]:
# Split every "chr:start-end" name once, then build the two-column table
# (chromosome, "start-end" range) that is handed to the R cell.
split_peaks = [peak.split(':') for peak in atac_peaks]
peaks = pd.DataFrame({
    'chr': [parts[0] for parts in split_peaks],
    'range': [parts[1] for parts in split_peaks],
})
peaks

Unnamed: 0,chr,range
0,chr10,100001032-100001800
1,chr10,100006075-100006963
2,chr10,100009475-100010367
3,chr10,100013993-100014884
4,chr10,100020278-100021136
...,...,...
135353,chrY,7765105-7765991
135354,chrY,7814158-7815060
135355,chrY,7818681-7819599
135356,chrY,8535565-8536421


In [17]:
%%R -i peaks -o peaks_annotated_df
# Annotate the ATAC peaks with ChIPseeker against the hg38 knownGene TxDb.
# `peaks` (columns chr, range) comes from the Python kernel via -i;
# `peaks_annotated_df` is returned to Python via -o.
options(digits=5, max.print=100)  # Adjust numbers as needed
set.seed(123)

# install.packages("IRanges")
# install.packages("GenomicRanges")
# install.packages("ggplot2")
# install.packages("TxDb.Hsapiens.UCSC.hg38.knownGene")

library(IRanges)
library(GenomicRanges)
library(ggplot2)
library(TxDb.Hsapiens.UCSC.hg38.knownGene)

txdb <- TxDb.Hsapiens.UCSC.hg38.knownGene


# Build a GRanges object from the chromosome column and the "start-end" range strings.
peaks = GRanges(peaks$chr, IRanges(peaks$range))
peaks_annotated = suppressMessages(ChIPseeker::annotatePeak(
    peaks,
    tssRegion = c(-1000, 1000), # TSS region: 1 kb upstream to 1 kb downstream
    TxDb = txdb,
    level = "transcript", 
    assignGenomicAnnotation = TRUE,  # the default
    genomicAnnotationPriority = c("Promoter", "5UTR", "3UTR", "Exon", "Intron",
                                "Downstream", "Intergenic"),  # the default
    annoDb = NULL,
    sameStrand = FALSE, # the default
    ignoreOverlap = FALSE, # the default
    ignoreUpstream = FALSE, # the default
    ignoreDownstream = FALSE, # the default
    overlap = "TSS", # the default
    verbose = TRUE # the default
))
# Convert to a plain data frame so it can be transferred back to Python.
peaks_annotated_df = as.data.frame(peaks_annotated)
# write.table(peaks_annotated_df, paste0(temp_dir, name, '_annot.txt'), sep=',', row.names = FALSE)



>> preparing features information...		 2024-06-18 14:17:45 
>> identifying nearest features...		 2024-06-18 14:17:46 
>> calculating distance from peak to TSS...	 2024-06-18 14:17:48 
>> assigning genomic annotation...		 2024-06-18 14:17:48 
>> assigning chromosome lengths			 2024-06-18 14:18:10 
>> done...					 2024-06-18 14:18:10 


Loading required package: GenomicFeatures
Loading required package: AnnotationDbi


In [19]:
# Preview the annotation table returned by the R cell.
peaks_annotated_df.head()

Unnamed: 0,seqnames,start,end,width,strand,annotation,geneChr,geneStart,geneEnd,geneLength,geneStrand,geneId,transcriptId,distanceToTSS
1,chr10,100001032,100001800,769,*,"Intron (ENST00000324109.9/23268, intron 1 of 16)",10,99875577,100009947,134371,2,23268,ENST00000324109.9,8147.0
2,chr10,100006075,100006963,889,*,"Intron (ENST00000324109.9/23268, intron 1 of 16)",10,99875577,100009947,134371,2,23268,ENST00000324109.9,2984.0
3,chr10,100009475,100010367,893,*,Promoter,10,99875577,100009947,134371,2,23268,ENST00000324109.9,0.0
4,chr10,100013993,100014884,892,*,Distal Intergenic,10,99875577,100009947,134371,2,23268,ENST00000324109.9,-4046.0
5,chr10,100020278,100021136,859,*,Distal Intergenic,10,99875577,100009947,134371,2,23268,ENST00000324109.9,-10331.0


In [29]:
# Map the first word of each annotation string to its category label.
map_={'Intron':'Intron', 'Exon':'Exon', 'Promoter':'Promoter', 'Distal':'Distal Intergenic', "3'":"3' UTR", 'Downstream':'Downstream (<=300)', "5'":"5' UTR"}

# The first whitespace-separated token identifies the category,
# e.g. "Intron (ENST..., intron 1 of 16)" -> "Intron".
first_token = peaks_annotated_df.annotation.str.split(' ').str[0]
ann = first_token.map(map_)

# Rebuild the "chr:start-end" peak name from the coordinate columns.
seqnames_str = peaks_annotated_df['seqnames'].astype(str)
start_str = peaks_annotated_df['start'].astype(str)
end_str = peaks_annotated_df['end'].astype(str)
peaks = seqnames_str + ':' + start_str + '-' + end_str

df = pd.DataFrame({'annotation':ann, 'peak':peaks})
df.to_csv(f'{resource_dir}/benchmark/peak_annotation.csv')
