## for now only accounting for mitochondrial genes 

In [2]:
import anndata as ad
import os
import rpy2
import numpy as np
import pandas as pd
import rapids_singlecell as rsc
import sys
import scipy.sparse
import scanpy as sc



In [3]:
data_path = '/data/hadjantalab/lucas/atlas/data'

In [4]:
file_path = os.path.join(data_path, 'adata.h5ad')
adata = ad.read_h5ad(file_path)

In [5]:
adata

AnnData object with n_obs × n_vars = 535935 × 8342
    obs: 'batch', 'stage', 'sample', 'covariate_composite', 'celltype_nowotschin', 'celltype_rosshandler', 'celltype_pijuansala', 'nowotschin_cluster', 'rosshandler_somite_count', 'rosshandler_anatomy', 'rosshandler_phase', 'rosshandler_S_score', 'rosshandler_G2M_score', 'rosshandler_louvain', 'rosshandler_leiden', 'annotation', 'mt-Atp6', 'mt-Atp8', 'mt-Co1', 'mt-Co2', 'mt-Co3', 'mt-Cytb', 'mt-Nd1', 'mt-Nd2', 'mt-Nd3', 'mt-Nd4', 'mt-Nd4l', 'mt-Nd5', 'mt-Nd6', 'S_score', 'G2M_score', 'phase', '_scvi_batch', '_scvi_labels'
    var: 'highly_variable'
    uns: '_scvi_manager_uuid', '_scvi_uuid', 'colormap_annotation', 'colormap_batch', 'colormap_rosshandler', 'colormap_stage', 'log1p', 'neighbors_scVI', 'phase_colors', 'symbolmap_batch', 'symbolmap_plotly_batch', 'symbolmap_plotly_stage', 'symbolmap_stage', 'umap'
    obsm: 'X_scVI', 'X_umap', 'X_umap_scVI', '_scvi_extra_categorical_covs', '_scvi_extra_continuous_covs'
    layers: 'log_no

In [6]:
adata.obs['annotation_old'] = adata.obs['annotation'].copy()
del adata.obs['annotation']

In [9]:
# getting some more information on memory requirement

def get_total_size_of_anndata(adata):
    total_size = sys.getsizeof(adata)  # Base size of the AnnData object
    
    # Add sizes of the attributes
    total_size += sys.getsizeof(adata.X)         # Expression matrix
    total_size += sys.getsizeof(adata.obs)       # Observations
    total_size += sys.getsizeof(adata.var)       # Variables (genes)
    total_size += sys.getsizeof(adata.obsm)      # Multi-dimensional observations
    total_size += sys.getsizeof(adata.varm)      # Multi-dimensional variables
    total_size += sys.getsizeof(adata.uns)       # Unstructured annotations
    total_size += sys.getsizeof(adata.layers)    # Layers of data
    
    # Include the size of adata.raw, if it exists
    if adata.raw is not None:
        total_size += sys.getsizeof(adata.raw)
        total_size += sys.getsizeof(adata.raw.X)
        total_size += sys.getsizeof(adata.raw.var)
    
    return total_size / (1024 ** 3)  # Convert bytes to GB

# Example usage:
print(f"Total size of AnnData object (in RAM): {get_total_size_of_anndata(adata):.2f} GB")

Total size of AnnData object (in RAM): 16.98 GB


## Cell cycle genes

In [13]:
## loading R to get list from satija lab

## setting R_HOME and R_USER
os.environ['R_HOME'] = '/home/schroel1/miniconda3/envs/gpu2/lib/R'
os.environ['R_USER'] = '/home/schroel1/miniconda3/envs/gpu2/lib/R'

# Import the rpy2 interface
import rpy2.robjects as ro
from rpy2.robjects import r


# Load the Seurat package in R
r('library(Seurat)')

R[write to console]: Loading required package: SeuratObject

R[write to console]: Loading required package: sp

R[write to console]: 
Attaching package: ‘SeuratObject’


R[write to console]: The following objects are masked from ‘package:base’:

    intersect, t





    an issue that caused a segfault when used with rpy2:
    https://github.com/rstudio/reticulate/pull/1188
    Make sure that you use a version of that package that includes
    the fix.
    

0,1,2,3,4,5,6
'Seurat','SeuratOb...,'sp',...,'datasets','methods','base'


In [14]:
## list from https://satijalab.org/seurat/archive/v3.0/cell_cycle_vignette.html 

# Define and execute the R code to access the gene sets
r('s_genes_satija <- cc.genes$s.genes')
r('g2m_genes_satija <- cc.genes$g2m.genes')

# Access the variables in Python
s_genes_satija = ro.globalenv['s_genes_satija']
g2m_genes_satija = ro.globalenv['g2m_genes_satija']

# convert them to Python objects
s_genes_satija = list(s_genes_satija)
g2m_genes_satija = list(g2m_genes_satija)

## combine the different cell cycle genes
cell_cycle_satija = s_genes_satija + g2m_genes_satija
# Convert both adata.var_names and cell_cycle_genes to lowercase for case-insensitive comparison
lowercase_mgi_symbols = adata.var_names.str.lower()
lowercase_cell_cycle_satija = [gene.lower() for gene in cell_cycle_satija]

# Perform the case-insensitive isin check
mask_cell_cycle_satija = lowercase_mgi_symbols.isin(lowercase_cell_cycle_satija)
filtered_cell_cycle_satija = list(adata.var_names[mask_cell_cycle_satija])
print(f'Genes in the list: {len(cell_cycle_satija)}, genes found in data: {len(filtered_cell_cycle_satija)}')


Genes in the list: 97, genes found in data: 45


In [12]:
# list from https://yoseflab.github.io/scvi-tools-reproducibility/scvi_covariates/ 

cell_cycle_yosef = ['PCNA', 'dnk', 'RnrS', 'RnrL', 'Claspin', 'Mcm5', 'Caf1-180',
       'RPA2', 'HipHop', 'stg', 'Mcm6', 'dup', 'WRNexo', 'Mcm7', 'dpa',
       'CG10336', 'Mcm3', 'Mcm2', 'RpA-70', 'Chrac-14', 'CG13690', 'RPA3',
       'asf1', 'DNApol-alpha73', 'CycE', 'DNApol-alpha50',
       'Kmn1', 'Lam', 'Nph', 'msd5', 'msd1', 'ctp', 'Set', 'scra',
       'Chrac-16', 'ncd', 'Ote', 'pzg', 'HDAC1', 'nesd', 'tum', 'CG8173',
       'aurB', 'feo', 'pav', 'CG6767', 'sip2', 'Det', 'Cks30A', 'CycB',
       'B52']

# Convert both adata.var_names and cell_cycle_yosef to lowercase for case-insensitive comparison
lowercase_mgi_symbols = adata.var_names.str.lower()
lowercase_cell_cycle_yosef = [gene.lower() for gene in cell_cycle_yosef]

# Perform the case-insensitive isin check
filtered_cell_cycle_yosef = adata.var_names[lowercase_mgi_symbols.isin(lowercase_cell_cycle_yosef)]
## check the genes found
print(f'Genes in the list: {len(cell_cycle_yosef)}, genes found in data: {len(filtered_cell_cycle_yosef)}')
filtered_cell_cycle_yosef


## converting to a python list
filtered_cell_cycle_yosef = list(filtered_cell_cycle_yosef)


Genes in the list: 51, genes found in data: 1


In [13]:
# Get the indices of all the genes in `filtered_cell_cycle_satija`
#gene_indices = adata.var.index.get_indexer(filtered_cell_cycle_satija)

# Extract all columns corresponding to these gene indices from the sparse matrix at once
#genes_expression = adata.layers['log_normalized'][:, gene_indices]

# Convert to dense matrix only if necessary (e.g., for adding to adata.obs)
# Instead of converting one column at a time, convert the entire block at once
#genes_expression_dense = genes_expression.toarray()

# Add each gene's expression to adata.obs, all at once
#for i, gene in enumerate(filtered_cell_cycle_satija):
#    adata.obs[gene] = genes_expression_dense[:, i]


In [14]:
#adata._inplace_subset_var(~mask_cell_cycle_satija)

## Mitochondrial genes

In [10]:
lowercase_mgi_symbols = adata.var_names.str.lower()
mask_mito = lowercase_mgi_symbols.str.startswith('mt-')
filtered_mito = list(adata.var_names[mask_mito])
filtered_mito

['mt-Atp6',
 'mt-Atp8',
 'mt-Co1',
 'mt-Co2',
 'mt-Co3',
 'mt-Cytb',
 'mt-Nd1',
 'mt-Nd2',
 'mt-Nd3',
 'mt-Nd4',
 'mt-Nd4l',
 'mt-Nd5',
 'mt-Nd6']

In [11]:
# Get the indices of all the genes in `filtered_cell_cycle_satija`
gene_indices = adata.var.index.get_indexer(filtered_mito)

# Extract all columns corresponding to these gene indices from the sparse matrix at once
genes_expression = adata.layers['log_normalized'][:, gene_indices]

# Convert to dense matrix only if necessary (e.g., for adding to adata.obs)
# Instead of converting one column at a time, convert the entire block at once
genes_expression_dense = genes_expression.toarray()

# Add each gene's expression to adata.obs, all at once
for i, gene in enumerate(filtered_mito):
    adata.obs[gene] = genes_expression_dense[:, i]


In [12]:
adata._inplace_subset_var(~mask_mito)

## Ribosomal genes

In [18]:
lowercase_mgi_symbols = adata.var_names.str.lower()
mask_ribo = lowercase_mgi_symbols.str.startswith('rps') | lowercase_mgi_symbols.str.startswith('rpl')
filtered_ribo = list(adata.var_names[mask_ribo])
filtered_ribo

['Rpl10-ps3',
 'Rpl10l',
 'Rpl12',
 'Rpl13-ps3',
 'Rpl13a',
 'Rpl14',
 'Rpl15',
 'Rpl23a',
 'Rpl27',
 'Rpl28',
 'Rpl29',
 'Rpl30',
 'Rpl36-ps4',
 'Rpl39l',
 'Rpl4',
 'Rpl6',
 'Rplp0',
 'Rplp1',
 'Rps19',
 'Rps2',
 'Rps20',
 'Rps27rt',
 'Rps28',
 'Rps29',
 'Rps6',
 'Rps6ka3',
 'Rps6ka6',
 'Rpsa']

## Pseudogenes

In [19]:
lowercase_mgi_symbols = adata.var_names.str.lower()
mask_pseudogenes = lowercase_mgi_symbols.str.match(r'^gm\d+$')
filtered_pseudogenes = list(adata.var_names[mask_pseudogenes])
filtered_pseudogenes

['Gm10053',
 'Gm10076',
 'Gm10116',
 'Gm10184',
 'Gm10220',
 'Gm10283',
 'Gm10287',
 'Gm10320',
 'Gm10324',
 'Gm10334',
 'Gm10354',
 'Gm10390',
 'Gm10415',
 'Gm10419',
 'Gm10425',
 'Gm10435',
 'Gm10445',
 'Gm10482',
 'Gm10522',
 'Gm10558',
 'Gm10564',
 'Gm10603',
 'Gm10604',
 'Gm10605',
 'Gm10612',
 'Gm10629',
 'Gm10684',
 'Gm10687',
 'Gm10710',
 'Gm10715',
 'Gm10827',
 'Gm10837',
 'Gm10857',
 'Gm10912',
 'Gm1110',
 'Gm11128',
 'Gm11211',
 'Gm11228',
 'Gm11232',
 'Gm11264',
 'Gm11377',
 'Gm11379',
 'Gm11426',
 'Gm11437',
 'Gm11444',
 'Gm11464',
 'Gm11479',
 'Gm11496',
 'Gm11497',
 'Gm11521',
 'Gm11559',
 'Gm11563',
 'Gm11567',
 'Gm11627',
 'Gm11639',
 'Gm11657',
 'Gm11665',
 'Gm11681',
 'Gm11690',
 'Gm11728',
 'Gm11734',
 'Gm11747',
 'Gm11789',
 'Gm11808',
 'Gm11884',
 'Gm11992',
 'Gm12050',
 'Gm12052',
 'Gm12056',
 'Gm12107',
 'Gm12144',
 'Gm12169',
 'Gm12171',
 'Gm12239',
 'Gm12249',
 'Gm12280',
 'Gm12295',
 'Gm12326',
 'Gm12405',
 'Gm12415',
 'Gm12426',
 'Gm12436',
 'Gm12446',
 'Gm1

In [20]:
#adata._inplace_subset_var(~mask_pseudogenes)

## Sex related genes

In [25]:
## import r line magic
#%load_ext rpy2.ipython
#%R library(biomaRt)

In [22]:
#%R print(listAttributes(mart)$name)

In [26]:
## filter the list on the 
#%R mart <- useMart(biomart="ensembl", dataset="mmusculus_gene_ensembl")
#%R genes_sex_male <- getBM(attributes = c("mgi_symbol"), filters = "chromosome_name", values = "Y", mart = mart)
#%R genes_sex_male

In [None]:
#genes_sex_male = ro.globalenv['genes_sex_male']
#genes_sex_male = list(genes_sex_male[0])
#genes_sex_female = ['Xist', 'Tsix']
#genes_sex = genes_sex_male + genes_sex_female

In [None]:
#lowercase_mgi_symbols = adata.var_names.str.lower()
#mask_sex = lowercase_mgi_symbols.isin(str.lower(gene) for gene in genes_sex)
#filtered_sex = list(adata.var_names[mask_sex])
#filtered_sex

In [None]:
# Get the indices of all the genes in `filtered_cell_cycle_satija`
#gene_indices = adata.var.index.get_indexer(filtered_sex)

# Extract all columns corresponding to these gene indices from the sparse matrix at once
#genes_expression = adata.layers['log_normalized'][:, gene_indices]

# Convert to dense matrix only if necessary (e.g., for adding to adata.obs)
# Instead of converting one column at a time, convert the entire block at once
#genes_expression_dense = genes_expression.toarray()

# Add each gene's expression to adata.obs, all at once
#for i, gene in enumerate(filtered_sex):
#    adata.obs[gene] = genes_expression_dense[:, i]


In [None]:
#adata._inplace_subset_var(~mask_sex)

## Chemistry

In [None]:
## adding chemistry did not change a thing

#adata.obs['chemistry'] = 'to_be_determined'
#adata.obs.loc[adata.obs['batch'] == 'pijuan-sala', 'chemistry'] = 'v1'
#adata.obs.loc[adata.obs['batch'] == 'nowotschin', 'chemistry'] = 'v2'
#adata.obs.loc[adata.obs['batch'] == 'rosshandler', 'chemistry'] = 'v3'
#adata.obs.loc[adata.obs['batch'] == 'pulse_chase_labeled', 'chemistry'] = 'v3'
#adata.obs['chemistry'].value_counts()

## Cell cycle score

In [16]:
s_genes_satija = [gene.capitalize() for gene in s_genes_satija]
g2m_genes_satija = [gene.capitalize() for gene in g2m_genes_satija]

In [18]:
print(sum(adata.var_names.isin(s_genes_satija)))
print(sum(adata.var_names.isin(g2m_genes_satija)))

15
30


In [17]:
## USING RAW INSTEAD TO GET MORE INFORMATION
print(sum(adata.raw.var_names.isin(s_genes_satija)))
print(sum(adata.raw.var_names.isin(g2m_genes_satija)))

42
54


In [37]:
s_genes = adata.raw.var_names[adata.raw.var_names.isin(s_genes_satija)]
g2m_genes = adata.raw.var_names[adata.raw.var_names.isin(g2m_genes_satija)]


In [38]:
%%time
sc.tl.score_genes_cell_cycle(adata, s_genes = s_genes, g2m_genes = g2m_genes)

CPU times: user 1min 39s, sys: 21.2 s, total: 2min
Wall time: 2min 1s


## Saving adata object

In [15]:
print(f"Total size of AnnData object (in RAM): {get_total_size_of_anndata(adata):.2f} GB")

Total size of AnnData object (in RAM): 16.90 GB


In [41]:
data_path = '/data/hadjantalab/lucas/atlas/data'

In [42]:
%%time
## saving the complete object, it ain't that big
file_path = os.path.join(data_path, "adata.h5ad")
adata.write_h5ad(file_path)

CPU times: user 9.4 s, sys: 9.15 s, total: 18.6 s
Wall time: 31.7 s
