# Preprocess h5ad files for MAST

Last updated: 06/14/2022

- This script could ultimately become a function that is incorporated into sc-utils.

- This function loads an AnnData (h5ad) file and checks whether it has a "counts" layer; if so, it sets adata.X to that layer.
- If there is no "counts" layer, it should throw an error.
- [Optional] We could compute np.sum(adata.X.todense(), 1) and check whether the row sums are integers; if they are not, throw an error.
- [Optional] We could also check whether adata.X is log-normalized by using np.expm1.

In [17]:
import pandas as pd
import scanpy as sc
import anndata as ad
import numpy as np

# define the function
def preprocess_h5ad_to_raw_counts(filepath, filename):
    """Load an h5ad file, set ``X`` to its "counts" layer, and save the result.

    Reads ``filepath + filename + '.h5ad'``, replaces ``adata.X`` with a copy
    of ``adata.layers["counts"]``, and writes the modified object to
    ``filepath + filename + '_rawCounts.h5ad'``.

    Parameters
    ----------
    filepath : str
        Directory prefix. It is concatenated directly with ``filename``, so it
        must already end with a path separator.
    filename : str
        Base name of the input file, without the ``.h5ad`` extension.

    Returns
    -------
    AnnData
        The loaded object with ``X`` replaced by the "counts" layer.

    Raises
    ------
    KeyError
        If the loaded object has no "counts" layer. Nothing is written to
        disk in that case.
    """
    input_path = filepath + filename + '.h5ad'
    adata = sc.read_h5ad(input_path)

    # Fail loudly instead of printing and returning None: callers go on to
    # use the returned object and would otherwise hit an unrelated
    # AttributeError far from the real cause.
    if "counts" not in adata.layers:
        raise KeyError(
            'counts layer does not exist in ' + input_path
            + '. Check your AnnData. Available layers: '
            + str(list(adata.layers))
        )

    # Copy so that X and the "counts" layer are separate objects; later
    # in-place changes to X then cannot alter the stored counts.
    adata.X = adata.layers["counts"].copy()

    adata.write_h5ad(filepath + filename + '_rawCounts.h5ad')
    return adata
        




## Convert all four h5ad objects: Blood and Biopsies from XAUT1 and XAUT2, respectively

In [22]:
# XAUT1 - Blood: directory prefix and base name (no ".h5ad" extension) of the input
filepath = '/mnt/ibm_lg/yangjoon.kim/UC_UCSF_Multiome/XAUT1/RNA_Seq_data/merged_SCG11_14_Blood/Cellxgene/Blood MK version/'
filename = '22_6_13_blood_RNA_V6'

# Switch X to the "counts" layer; the helper also writes "<filename>_rawCounts.h5ad"
adata_test = preprocess_h5ad_to_raw_counts(filepath=filepath, filename=filename)

In [23]:
# Sanity-check the conversion: row sums of X should be whole numbers for raw counts.
# Sum on the sparse matrix directly instead of building a dense copy of X first.
adata_test.X.sum(axis=1)

matrix([[2446.],
        [ 179.],
        [2798.],
        ...,
        [1753.],
        [2598.],
        [3074.]])

In [24]:
# XAUT1 - Biopsies: directory prefix and base name (no ".h5ad" extension) of the input
filepath = '/mnt/ibm_lg/yangjoon.kim/UC_UCSF_Multiome/XAUT1/RNA_Seq_data/merged_SCG1_10_Biopsies/Cellxgene/MK_version/'
filename = '22_6_13_RNA'

# Switch X to the "counts" layer; the helper also writes "<filename>_rawCounts.h5ad"
adata_test = preprocess_h5ad_to_raw_counts(filepath=filepath, filename=filename)

In [26]:
# Sanity-check the conversion: row sums of X should be whole numbers for raw counts.
# Sum on the sparse matrix directly instead of building a dense copy of X first.
adata_test.X.sum(axis=1)

matrix([[ 700.],
        [1974.],
        [ 270.],
        ...,
        [1995.],
        [6917.],
        [2073.]], dtype=float32)

In [27]:
# XAUT2 - Blood: directory prefix and base name (no ".h5ad" extension) of the input
filepath = '/mnt/ibm_lg/yangjoon.kim/UC_UCSF_Multiome/XAUT2/RNA_Seq_data/merged_SCG9_12_Blood/Cellxgene/'
filename = '220613_blood_RNA'

# Switch X to the "counts" layer; the helper also writes "<filename>_rawCounts.h5ad"
adata_test = preprocess_h5ad_to_raw_counts(filepath=filepath, filename=filename)

In [28]:
# Sanity-check the conversion: row sums of X should be whole numbers for raw counts.
# Sum on the sparse matrix directly instead of building a dense copy of X first.
adata_test.X.sum(axis=1)

matrix([[3106.],
        [7262.],
        [3688.],
        ...,
        [2495.],
        [3016.],
        [1724.]], dtype=float32)

In [29]:
# XAUT2 - Biopsies: directory prefix and base name (no ".h5ad" extension) of the input
filepath = '/mnt/ibm_lg/yangjoon.kim/UC_UCSF_Multiome/XAUT2/RNA_Seq_data/merged_SCG1_8_Biopsies/Cellxgene/'
filename = '220613_biopsy_RNA'

# Switch X to the "counts" layer; the helper also writes "<filename>_rawCounts.h5ad"
adata_test = preprocess_h5ad_to_raw_counts(filepath=filepath, filename=filename)

In [30]:
# Sanity-check the conversion: row sums of X should be whole numbers for raw counts.
# Sum on the sparse matrix directly instead of building a dense copy of X first.
adata_test.X.sum(axis=1)

matrix([[ 2191.],
        [12651.],
        [ 3286.],
        ...,
        [19421.],
        [16452.],
        [25064.]], dtype=float32)