# Generate Exon Gene h5ad and Select Highly Variable Genes

Ensure that the exon gene count table is ready. This section will process the exon gene count table and select highly variable genes for downstream analysis.


In [1]:
import os
import pandas as pd
import numpy as np
import anndata
from tqdm import tqdm
import sys

In [4]:
# Root folder holding the inputs and outputs of the data-generation steps
main_folder = "./00_data_generation/"

# Location of the exon gene count tables
exongene_folder = os.path.join(main_folder, "03_exon_star")

# Study label embedded in the output file names (change it for your own study)
study_name = "STUDY"

# Destination folder for the generated h5ad files; created if it is missing
output_directory = os.path.join(main_folder, "final_data")
os.makedirs(output_directory, exist_ok=True)

# Output paths: the full matrix and its highly-variable-gene subset
output = os.path.join(output_directory, f"ExonGene_{study_name}.h5ad")
output_hvg = os.path.join(output_directory, f"ExonGene_hvg_{study_name}.h5ad")

In [3]:
# Path to the per-cell metadata table. NOTE: it is read as tab-separated text
# below despite the ".csv" extension, and it must contain a "CB" column.
metadata = "your_metaData.csv"
# Path to the GTF annotation used to translate gene IDs into gene names
gtf_path = "./Homo_sapiens.GRCh38.107.gtf"

In [None]:
# Load the tab-delimited metadata table and display it
pd_gt = pd.read_csv(metadata, delimiter="\t")
pd_gt

In [None]:
# Number of metadata columns
len(pd_gt.columns)

### Read exon gene count tables

In [12]:
### Check if Cells in Metadata Have Corresponding Count Tables
# Build the barcode lookup once so every membership test below is a hash lookup
# instead of a scan over a freshly built list.
known_barcodes = set(pd_gt["CB"])

# Keep the files ending in "count.txt" whose barcode (the text before the first
# dot of the file name) appears in the metadata. Sorting makes the file order
# independent of the arbitrary order returned by os.listdir.
cnt_files = sorted(
    f
    for f in os.listdir(exongene_folder)
    if f.endswith("count.txt") and f.split(".")[0] in known_barcodes
)

# The number of matched count tables must equal the number of metadata rows.
if len(cnt_files) != pd_gt.shape[0]:
    sys.exit("There is a mismatch between the metadata and the FeatureCounts results. Please check.")

In [None]:
# Assemble one gene-by-cell count table: a "Geneid" column plus one column per
# cell barcode.
pd_count = pd.DataFrame([])
for i, f in enumerate(tqdm(cnt_files)):
    # The barcode is the part of the file name before the first dot
    _cb = f.split(".")[0]
    # os.path.join supplies the path separator; plain string concatenation
    # glued the file name onto the folder name because exongene_folder has no
    # trailing slash.
    # skiprows=1 drops the first line of the file (presumably the
    # featureCounts command-line comment -- confirm for your files).
    pd_cb = pd.read_csv(os.path.join(exongene_folder, f), sep="\t", skiprows=1)
    # Rename the last column to the cell barcode, then keep only
    # the gene ID and that column
    pd_cb.columns = [*pd_cb.columns[:-1], _cb]
    pd_cb = pd_cb[["Geneid", _cb]]
    if i == 0:
        pd_count = pd_cb
    else:
        # Outer join keeps genes that are missing from some of the tables
        pd_count = pd.merge(pd_count, pd_cb, on="Geneid", how="outer")

In [15]:
# Index the gene table by gene ID while keeping the "Geneid" column; the index
# name is cleared (presumably to avoid ambiguity with the column of the same name).
pd_count = pd_count.set_index("Geneid", drop=False).rename_axis(None)

# Transpose to cells-as-rows and attach the metadata columns in front, matching
# each metadata row to its count row through the "CB" barcode.
pd_count_t = pd_gt.merge(
    pd_count.drop(columns="Geneid").T,
    left_on="CB",
    right_index=True,
)

# Index the combined table by barcode, again keeping the column and
# clearing the index name.
pd_count_t = pd_count_t.set_index("CB", drop=False).rename_axis(None)

### Get gene names

In [22]:
#### Complete function; may need slight modification depending on your GTF format ####
def get_ens_dict(file_path):
    """Build a gene_id -> gene_name dictionary from a GTF file.

    The file is streamed line by line, so it is never held in memory as a
    whole. Lines starting with '#' and lines that lack either the
    ``gene_id "..."`` or the ``gene_name "..."`` attribute are skipped. When a
    gene ID appears with more than one name, the first name encountered is
    kept, which makes the result deterministic.

    Args:
        file_path: Path to the GTF annotation file.

    Returns:
        dict mapping each gene ID to its gene name. It is empty when no line
        carries both attributes, in which case a hint is printed.
    """
    id_tag = 'gene_id "'
    name_tag = 'gene_name "'
    ens_to_name = {}

    with open(file_path) as f:
        for line in f:
            if line.startswith('#'):
                continue
            if id_tag not in line or name_tag not in line:
                continue
            # The attribute value is the text between the tag and the next quote
            gene_id = line.split(id_tag)[1].split('"')[0]
            gene_name = line.split(name_tag)[1].split('"')[0]
            # setdefault keeps the first name seen for a given ID
            ens_to_name.setdefault(gene_id, gene_name)

    if not ens_to_name:
        print('you need to change gene_id " and gene_name " formats')

    return ens_to_name

# Build the gene ID -> gene name lookup from the GTF annotation
gtf_dict = get_ens_dict(gtf_path)

In [23]:
# Translate gene IDs into gene names. Series.map performs one dictionary lookup
# per row, which is far faster than DataFrame.replace with a mapping of this
# size; IDs that are absent from the GTF lookup keep their ID as the name.
pd_count["GeneName"] = pd_count["Geneid"].map(gtf_dict).fillna(pd_count["Geneid"])

# Index the gene table by gene name while keeping the column; clear the index
# name afterwards.
pd_count = pd_count.set_index("GeneName", drop=False)
pd_count.index.name = None

In [None]:
# Convert to an h5ad file
## Observation annotations: one row per cell, carrying the metadata columns
obs = pd_count_t[pd_gt.columns]

## Variable annotations: one row per gene (gene ID and gene name)
var = pd_count[["Geneid", "GeneName"]]

## Data matrix: every column after the leading metadata columns. It is cast to
## float32 here because the `dtype=` keyword of AnnData is deprecated; casting
## up front gives the same matrix on old and new anndata versions.
X = pd_count_t.iloc[:, pd_gt.shape[1]:].to_numpy(dtype=np.float32)
adata = anndata.AnnData(X, obs=obs, var=var)

adata.write(output)
adata

### Highly Variable Genes

In [20]:
import scanpy as sc

In [None]:
# Load the h5ad file stored at `output` and derive the highly-variable-gene subset.
adata = sc.read_h5ad(output)
# Make the variable names unique before they are used for indexing
adata.var_names_make_unique()
# Drop genes detected in fewer than 3 cells
sc.pp.filter_genes(adata, min_cells=3)
adata.var['mt'] = adata.var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
# Compute QC metrics in place, including those for the 'mt' gene group
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)
# Normalise the per-cell totals, then apply log1p; the order of these steps matters
sc.pp.normalize_total(adata)
sc.pp.log1p(adata)
# Flag the 5000 most variable genes in adata.var.highly_variable
sc.pp.highly_variable_genes(adata, n_top_genes=5000)
# Keep the normalised, log-transformed data for all remaining genes in .raw
# before restricting the object to the highly variable genes
adata.raw = adata
adata = adata[:, adata.var.highly_variable]
adata.write(output_hvg)