# Exon-Level Differential Gene Analysis Using Feature Matrix

In this section, we perform exon-level differential expression analysis using the exon feature matrix. Exon-level test results are then combined per gene to identify genes whose expression differs significantly between conditions or cell types, enabling a more granular understanding of gene regulation.


In [1]:
import pandas as pd
import numpy as np
import statsmodels.stats.multitest as smm

In [2]:
def create_df_seurat(temp_name, leiden_res, num):
    """Load one exon-level marker table and relabel its columns by test name.

    Reads ``./<temp_name>/AdjacencyComp_<temp_name>_<leiden_res>_cluster_<num>.csv``
    (presumably a Seurat marker table, judging by the ``p_val`` / ``avg_log2FC``
    / ``pct.1`` / ``pct.2`` column names -- confirm against the producer).

    Parameters:
    - temp_name: name of the test; used as directory name, file-name part and
      prefix of the renamed statistic columns.
    - leiden_res: clustering label embedded in the file name.
    - num: cluster identifier embedded in the file name (a string).

    Returns:
    - The loaded DataFrame with the exon identifier in ``Cancer_names``, the
      derived gene name in ``Cancer_gene_names`` and the statistic columns
      prefixed with ``temp_name``.

    Side effects: prints ``temp_name`` and the number of distinct genes having
    at least one exon with adjusted p-value < 0.05 and |log2FC| > 1.
    """
    print(temp_name)

    csv_path = "".join([
        "./", temp_name, "/AdjacencyComp_", temp_name, "_",
        leiden_res, "_cluster_", num, ".csv",
    ])
    table = pd.read_csv(csv_path)

    # The unnamed first CSV column holds the exon identifiers.
    table = table.rename(columns={"Unnamed: 0": "Exon_names"})

    def _gene_of(exon_name):
        # Gene name is everything before the LAST '-', so gene names that
        # themselves contain '-' survive intact.
        if '-' in exon_name:
            return exon_name.rpartition('-')[0]
        return exon_name

    table['Gene_names'] = table['Exon_names'].apply(_gene_of)

    column_labels = {
        "Gene_names": "Cancer_gene_names",
        "Exon_names": "Cancer_names",
        "p_val": temp_name + "_pvals",
        "p_val_adj": temp_name + "_pvals_adj",
        "avg_log2FC": temp_name + "_logfoldchanges",
        "pct.1": temp_name + "_pct.1",
        "pct.2": temp_name + "_pct.2",
    }
    table = table.rename(columns=column_labels)

    # Report how many genes have at least one individually significant exon.
    is_significant = table[temp_name + "_pvals_adj"] < 0.05
    is_large_change = table[temp_name + "_logfoldchanges"].abs() > 1
    hit_genes = table.loc[is_significant & is_large_change, "Cancer_gene_names"]
    print(len(set(hit_genes)))

    return table

In [3]:
# Load the "MAST" exon-level table for cluster "2" of the "leiden_0_4_8" clustering.
pd_combine = create_df_seurat(temp_name="MAST", leiden_res="leiden_0_4_8", num="2")

MAST
234


## Stouffer's method (gene-level combined p-value) and average absolute log2FC

In [4]:
# Gene-level effect size: for every exon row, store the mean of |log2FC| over
# all exons of the same gene (NaN fold changes are ignored).
# The loop variable is `test_name` rather than `type` so the builtin `type`
# is not rebound in the notebook's global namespace.
for test_name in ["MAST"]:
    pd_combine[test_name+'_abs_avg_log2FC'] = pd_combine.groupby('Cancer_gene_names')[test_name+'_logfoldchanges'].transform(lambda x: x.dropna().abs().mean())

In [7]:
from scipy.stats import norm

def stouffer_method(pvals):
    """
    Combine p-values using the unweighted Stouffer's method.

    Each non-NaN p-value is converted to a z-score with the inverse survival
    function of the standard normal; the z-scores are summed and divided by
    sqrt(k), where k is the number of non-NaN p-values; the combined z-score
    is converted back to a p-value with the survival function.

    Parameters:
    - pvals: list or array of p-values. NaN entries are ignored.

    Returns:
    - combined_p_value: The combined p-value after applying Stouffer's method,
      or NaN when there is no non-NaN p-value to combine.
    """
    # Coerce to an ndarray first: boolean-mask indexing below does not work on
    # a plain Python list, which the documented contract allows.
    pvals = np.asarray(pvals, dtype=float)
    non_nan_pvals = pvals[~np.isnan(pvals)]

    if non_nan_pvals.size == 0:
        return np.nan

    # Convert p-values to z-scores (inverse survival function of N(0, 1)).
    z_scores = norm.isf(non_nan_pvals)

    # Unweighted combination: sum of z-scores scaled by sqrt(k).
    combined_z = np.sum(z_scores) / np.sqrt(non_nan_pvals.size)

    # Convert the combined z-score back to a p-value (survival function).
    combined_p_value = norm.sf(combined_z)

    return combined_p_value

In [8]:
# Gene-level significance: combine the exon-level p-values of each gene with
# Stouffer's method, then correct the gene-level p-values for multiple testing.
# The loop variable is `test_name` rather than `type` so the builtin `type`
# is not rebound in the notebook's global namespace.
for test_name in ["MAST"]:
    stouffer_col = test_name + '_stouffer_pval'

    # One combined p-value per gene, indexed by gene name. Selecting the
    # p-value column before `apply` keeps the grouping column out of the
    # applied function's input.
    gene_pvals = pd_combine.groupby('Cancer_gene_names')[test_name + '_pvals'].apply(
        lambda exon_pvals: stouffer_method(exon_pvals.values)
    )

    # Broadcast the gene-level value back onto every exon row of that gene.
    pd_combine[stouffer_col] = pd_combine['Cancer_gene_names'].map(gene_pvals)

    ## Multiple-testing adjustment of the gene-level Stouffer p-values.
    # Each gene enters the correction exactly once; genes without a combined
    # p-value (all exon p-values NaN) are excluded and stay NaN below.
    tested_pvals = gene_pvals.dropna()
    _, adjusted_pvals, _, _ = smm.multipletests(tested_pvals, method='fdr_bh')
    _, adjusted_pvals_bonf, _, _ = smm.multipletests(tested_pvals, method='bonferroni')

    gene_p_adj = pd.Series(adjusted_pvals, index=tested_pvals.index)
    gene_p_adj_bonf = pd.Series(adjusted_pvals_bonf, index=tested_pvals.index)

    pd_combine[stouffer_col + "_adj"] = pd_combine["Cancer_gene_names"].map(gene_p_adj)
    pd_combine[stouffer_col + "_adj_bonf"] = pd_combine["Cancer_gene_names"].map(gene_p_adj_bonf)

In [13]:
# Keep the exon rows whose gene passes both gene-level cut-offs:
# mean |log2FC| above 1 and Bonferroni-adjusted Stouffer p-value below 0.05.
has_large_effect = pd_combine["MAST_abs_avg_log2FC"].abs() > 1
is_bonf_significant = pd_combine["MAST_stouffer_pval_adj_bonf"] < 0.05
pd_combine_sub = pd_combine[has_large_effect & is_bonf_significant]

In [15]:
# Number of distinct gene names among the rows of pd_combine_sub.
len(set(pd_combine_sub["Cancer_gene_names"]))

742

In [21]:
# Write the full annotated exon table; the file name carries the number of
# distinct genes in pd_combine_sub.
n_selected_genes = len(set(pd_combine_sub["Cancer_gene_names"]))
full_table_path = "./MAST/AdjacencyComp_leiden_0_4_8_cluster_2_MAST_v2_" + str(n_selected_genes) + ".csv"
pd_combine.to_csv(full_table_path, index=False)

In [20]:
# Write the de-duplicated gene names of pd_combine_sub, one per line, without
# header or index (the "Topp" in the file name presumably refers to a
# downstream enrichment tool -- confirm).
selected_genes = pd_combine_sub["Cancer_gene_names"].drop_duplicates()
gene_list_path = "./MAST/AdjacencyComp_leiden_0_4_8_cluster_2_MAST_v2_Topp_" + str(len(set(pd_combine_sub["Cancer_gene_names"]))) + ".csv"
selected_genes.to_csv(gene_list_path, index=False, header=False)