In [1]:
import os

import matplotlib as mpl

from scipy import sparse
from matplotlib import rcParams
rcParams['pdf.fonttype'] = 42 # enables correct plotting of text
rcParams['figure.figsize'] = (5,5)
import seaborn as sns

In [2]:
import anndata
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc
import scipy
import scvi
import muon as mu
from muon import atac as ac
import episcanpy as epi

  warn(f"Failed to load image Python extension: {e}")
Global seed set to 0


## Generate GENE ACTIVITY MATRIX FOR ATAC DATA 

In [4]:
# Load the ATAC AnnData object stored at objects/atac_clusters.h5ad.
atac = sc.read('objects/atac_clusters.h5ad')

In [5]:
# Keep a copy of the current X under layers['tfidf'] before X is replaced by
# the 'counts' layer in the next cell.
# NOTE(review): presumably X holds TF-IDF-normalised values at this point — confirm.
atac.layers['tfidf'] = atac.X.copy()

In [6]:
# Make X a copy of the stored 'counts' layer; the gene-activity sums below read from X.
# .copy() keeps X independent of the layer.
atac.X = atac.layers['counts'].copy()

In [7]:
# Rewrite peak names so that ':' and '-' both become '_'
# (e.g. 'chr1_181019_181792'); raw_adata_features() later splits on '_'.
atac.var_names = [
    peak_name.replace(':', '_').replace('-', '_')
    for peak_name in atac.var_names
]

In [8]:
# Sanity check: show the first five renamed peaks.
atac.var_names[:5].tolist()


['chr1_181019_181792',
 'chr1_191047_191897',
 'chr1_629498_630394',
 'chr1_631150_632220',
 'chr1_633506_634827']

In [None]:
def gtf(gtf_file,
        upstream=2000,
        feature_type='gene',
        gene_biotype='protein_coding',
        annotation='ensembl_havana',):
    """Collect promoter-extended gene intervals from a GTF file, grouped by chromosome.

    A record is kept when its line contains the tab-delimited ``feature_type``
    and ``annotation`` fields and, unless ``gene_biotype`` is ``None``, the
    attribute ``gene_biotype "<gene_biotype>"``.

    Parameters
    ----------
    gtf_file : str
        Path of the GTF file to read.
    upstream : int
        Number of bases added on the 5' side of each record: subtracted from
        the start on the '+' strand, added to the end on the '-' strand.
    feature_type : str
        Value required in the feature column (e.g. 'gene', 'transcript').
    gene_biotype : str or None
        Required ``gene_biotype`` attribute value; ``None`` disables the filter.
    annotation : str
        Value required in the source column.

    Returns
    -------
    dict
        ``{'chr' + seqname: [[start, end, attributes], ...]}`` where
        ``attributes`` is the last GTF column split on ';' (trailing empty
        piece dropped; entries keep their leading space).
    """
    feature_tag = '\t' + feature_type + '\t'
    annotation_tag = '\t' + annotation + '\t'
    # The original condition was the bare (always truthy) string
    # "gene_biotype " + gene_biotype, so no biotype filtering ever happened.
    biotype_tag = None if gene_biotype is None else 'gene_biotype "' + gene_biotype + '"'

    regions = {}
    with open(gtf_file) as f:
        for line in f:
            if line.startswith('#'):  # header / comment lines ('#!...')
                continue
            if feature_tag not in line or annotation_tag not in line:
                continue
            if biotype_tag is not None and biotype_tag not in line:
                continue

            fields = line.rstrip('\n').split('\t')
            start, end = int(fields[3]), int(fields[4])
            if fields[6] == '-':
                # minus strand: the upstream region lies beyond the end coordinate
                end += upstream
            else:
                start -= upstream
            attributes = fields[-1].split(';')[:-1]
            regions.setdefault('chr' + fields[0], []).append([start, end, attributes])
    return regions
def raw_adata_features(raw_adata):
    """Group the features of ``raw_adata`` by chromosome.

    Feature names are expected to look like ``<chrom>_<start>_<end>``. The
    name is split from the right so that chromosome/contig names which
    themselves contain underscores (e.g. ``chrUn_KI270742v1``) are kept intact
    instead of failing on ``int()``.

    Parameters
    ----------
    raw_adata : object exposing ``var_names.tolist()``
        Source of the feature names; their order defines the column index.

    Returns
    -------
    dict
        ``{chrom: [[start, end, column_index], ...]}`` in ``var_names`` order.
    """
    features_by_chrom = {}
    for feature_index, name in enumerate(raw_adata.var_names.tolist()):
        chrom, start, end = name.rsplit('_', 2)
        features_by_chrom.setdefault(chrom, []).append([int(start), int(end), feature_index])
    return features_by_chrom

def gene_activity_X_(raw_adata, raw_adata_features, gtf):
    """Loop-based gene activity: add up the columns of the peaks overlapping each gene.

    Parameters
    ----------
    raw_adata : object with a sparse ``X`` (cells x peaks)
    raw_adata_features : dict
        ``{chrom: [[start, end, column_index], ...]}``.
    gtf : dict
        ``{chrom: [[start, end, attributes], ...]}``.

    Returns
    -------
    (list, list)
        One summed column per gene that overlaps at least one peak, and the
        matching gene attribute entries (last element of each gene record).

    Notes
    -----
    The inner scan stops at the first peak starting at or after the gene end,
    so it relies on the peaks of a chromosome being ordered by position.
    Touching intervals (peak end == gene start) do not count as overlap.
    """
    summed_columns = []
    kept_genes = []
    for chrom in gtf:
        print(chrom)
        if chrom not in raw_adata_features:
            continue
        chrom_peaks = raw_adata_features[chrom]
        for gene in gtf[chrom]:
            lower, upper = gene[0], gene[1]
            overlapping = []
            for peak in chrom_peaks:
                if peak[1] <= lower:
                    # peak lies entirely before the gene: try the next peak
                    continue
                if upper <= peak[0]:
                    # peak lies entirely after the gene: move on to the next gene
                    break
                overlapping.append(raw_adata.X[:, peak[2]].todense())
            if overlapping:
                summed_columns.append(np.sum(overlapping, axis=0))
                kept_genes.append(gene[-1])
    return summed_columns, kept_genes



# Parse the reference annotation once and index the ATAC peaks by chromosome.
GTF_PATH = '../../../data/reference/Homo_sapiens.GRCh38.109.gtf'
gtf_ = gtf(GTF_PATH)
raw_f = raw_adata_features(atac)




In [10]:
from tqdm import tqdm

In [11]:
def gene_activity_X_vectorized_(raw_adata, raw_adata_features, gtf):
    """Sum, per gene, the counts of every peak whose interval overlaps the gene.

    Overlap is inclusive on both ends
    (``peak_start <= gene_end and gene_start <= peak_end``).

    Parameters
    ----------
    raw_adata : object with ``X`` of shape (cells, peaks)
        ``X`` may be a SciPy sparse matrix or a dense array.
    raw_adata_features : dict
        ``{chrom: [[start, end, column_index], ...]}``.
    gtf : dict
        ``{chrom: [[start, end, attributes], ...]}``.

    Returns
    -------
    (list, list)
        ``gene_activity_X``: one ``(n_cells, 1)`` ndarray per gene with at
        least one overlapping peak (concatenate along the last axis to get a
        cells x genes matrix). ``gene_index``: the matching attribute lists,
        returned exactly as stored in ``gtf``.
    """
    from scipy import sparse

    counts = raw_adata.X
    if sparse.issparse(counts):
        # Column slicing is what the loop does thousands of times; convert to
        # CSC once instead of densifying a slice of a row-major matrix per gene.
        counts = counts.tocsc()
    else:
        counts = np.asarray(counts)

    gene_index = []
    gene_activity_X = []

    for chrom in gtf.keys():
        print('Chrom: ', chrom)
        if chrom not in raw_adata_features:
            continue

        genes = gtf[chrom]
        chrom_features = raw_adata_features[chrom]
        feature_start = np.array([feature[0] for feature in chrom_features])
        feature_end = np.array([feature[1] for feature in chrom_features])
        feature_indices = np.array([feature[2] for feature in chrom_features])

        # Iterate over the gene records directly: wrapping the attribute lists
        # in np.array() builds a ragged array (ValueError on NumPy >= 1.24)
        # whenever genes carry different numbers of attributes.
        for gene in tqdm(genes, total=len(genes), disable=False):
            start, end, name = gene[0], gene[1], gene[-1]
            overlapping = np.logical_and(feature_start <= end, start <= feature_end)
            if not overlapping.any():
                continue
            summed = counts[:, feature_indices[overlapping]].sum(axis=1)
            gene_activity_X.append(np.asarray(summed).reshape(-1, 1))
            gene_index.append(name)

    return gene_activity_X, gene_index

In [12]:
# Compute one summed-count column per gene. The recorded run (progress output
# below) took several minutes per chromosome at roughly 1.3 genes/s.
gene_activity_M, gene_index = gene_activity_X_vectorized_(atac, raw_f, gtf_)



Chrom:  chr1


100%|███████████████████████████████████████| 2013/2013 [25:12<00:00,  1.33it/s]


Chrom:  chr2


100%|███████████████████████████████████████| 1226/1226 [15:08<00:00,  1.35it/s]


Chrom:  chr3


100%|███████████████████████████████████████| 1042/1042 [13:08<00:00,  1.32it/s]


Chrom:  chr4


100%|█████████████████████████████████████████| 737/737 [09:34<00:00,  1.28it/s]


Chrom:  chr5


100%|█████████████████████████████████████████| 839/839 [10:43<00:00,  1.30it/s]


Chrom:  chr6


100%|███████████████████████████████████████| 1001/1001 [12:23<00:00,  1.35it/s]


Chrom:  chr7


100%|█████████████████████████████████████████| 879/879 [11:21<00:00,  1.29it/s]


Chrom:  chrX


100%|█████████████████████████████████████████| 812/812 [08:25<00:00,  1.61it/s]


Chrom:  chr8


100%|█████████████████████████████████████████| 662/662 [08:39<00:00,  1.27it/s]


Chrom:  chr9


100%|█████████████████████████████████████████| 760/760 [09:34<00:00,  1.32it/s]


Chrom:  chr11


100%|███████████████████████████████████████| 1266/1266 [14:28<00:00,  1.46it/s]


Chrom:  chr10


100%|█████████████████████████████████████████| 728/728 [09:32<00:00,  1.27it/s]


Chrom:  chr12


100%|█████████████████████████████████████████| 999/999 [12:51<00:00,  1.30it/s]


Chrom:  chr13


100%|█████████████████████████████████████████| 320/320 [04:13<00:00,  1.26it/s]


Chrom:  chr14


100%|█████████████████████████████████████████| 616/616 [07:53<00:00,  1.30it/s]


Chrom:  chr15


100%|█████████████████████████████████████████| 580/580 [07:37<00:00,  1.27it/s]


Chrom:  chr16


100%|█████████████████████████████████████████| 830/830 [10:29<00:00,  1.32it/s]


Chrom:  chr17


100%|███████████████████████████████████████| 1143/1143 [14:06<00:00,  1.35it/s]


Chrom:  chr18


100%|█████████████████████████████████████████| 265/265 [03:52<00:00,  1.14it/s]


Chrom:  chr20


100%|█████████████████████████████████████████| 538/538 [07:40<00:00,  1.17it/s]


Chrom:  chr19


100%|███████████████████████████████████████| 1366/1366 [18:34<00:00,  1.23it/s]


Chrom:  chrY


100%|███████████████████████████████████████████| 64/64 [00:11<00:00,  5.60it/s]


Chrom:  chr22


100%|█████████████████████████████████████████| 457/457 [05:57<00:00,  1.28it/s]


Chrom:  chr21


100%|█████████████████████████████████████████| 222/222 [02:37<00:00,  1.41it/s]


In [13]:
# Join the per-gene columns side by side into one cells x genes matrix.
# NOTE: this rebinds gene_activity_M, so the cell must not be run twice.
gene_activity_M = np.concatenate(gene_activity_M, axis=-1)

In [14]:
# Checkpoint the results of the slow step above to .npy files in the working directory.
np.save('./gene_activity.npy', gene_activity_M)
# gene_index holds one sequence of GTF attribute strings per gene; np.save converts it to an array.
np.save('./gene_index.npy', gene_index)



In [None]:


# Build the gene-activity AnnData: X = cells x genes sums, var = GTF attributes.
feature_type = 'gene'  # the original trailing comma made this the tuple ('gene',)
raw_adata = atac

# Columns written to `.var`; 'NA' marks attributes missing from a record.
_METADATA_KEYS = ('gene_id', 'transcript_id', 'gene_type', 'gene_name',
                  'transcript_type', 'transcript_name', 'protein_id')
# gtf() filters on the Ensembl-style `gene_biotype` attribute, so records from
# that file have no `gene_type` key; fall back to the *_biotype spelling.
_BIOTYPE_ALIASES = {'gene_type': 'gene_biotype',
                    'transcript_type': 'transcript_biotype'}


def _parse_gtf_attributes(fields):
    """Turn [' gene_id "X"', ' gene_name "Y"', ...] into {'gene_id': 'X', 'gene_name': 'Y'}."""
    parsed = {}
    for element in fields:
        key, sep, value = str(element).strip().partition(' "')
        if sep:
            parsed[key] = value.rstrip('"')
    return parsed


parsed_records = [_parse_gtf_attributes(record) for record in gene_index]

# Index the genes by name. Look the name up by attribute key rather than by
# position + lstrip(): lstrip() strips a *set of characters* (not a prefix) and
# position 2 / 7 only holds the name when no attribute is missing.
name_key = 'transcript_name' if feature_type == 'transcript' else 'gene_name'
id_key = 'transcript_id' if feature_type == 'transcript' else 'gene_id'
gene_name = [record.get(name_key) or record.get(id_key, 'NA')
             for record in parsed_records]

metadata_genes = {
    key: [record[key] if key in record
          else record.get(_BIOTYPE_ALIASES.get(key), 'NA')
          for record in parsed_records]
    for key in _METADATA_KEYS
}

dataframe_genes = pd.DataFrame(metadata_genes, index=gene_name)

gene_adata = sc.AnnData(gene_activity_M, var=dataframe_genes, obs=raw_adata.obs)
# Carry over the embeddings / graphs / unstructured data computed on the peaks.
gene_adata.uns = atac.uns.copy()
gene_adata.obsm = atac.obsm.copy()
gene_adata.obsp = atac.obsp.copy()
# NOTE(review): the same object is written twice; 'gene_activity_from_RNA' looks
# misnamed (the matrix is derived from ATAC) but is kept in case other
# notebooks read it — confirm and drop one of the two.
gene_adata.write('objects/gene_activity_from_RNA.h5ad')
gene_adata.write('objects/GA_atac.h5ad')

