In [None]:
import pandas as pd
from scipy.stats import fisher_exact
import os








In [None]:
def _percent_of(count, total):
    """Return ``count`` as a percentage of ``total``, rounded to one decimal.

    Returns NaN when ``total`` is 0 so that a row with no genes does not
    abort the whole run with a division-by-zero error.
    """
    if total == 0:
        return float('nan')
    return round(100 * count / total, 1)


def run_enrichment(folder, condition_label):
    """Run a one-sided Fisher's exact enrichment test for every KE row.

    For each row of ``<folder>/KE-gene-stats.csv`` a 2x2 contingency table is
    built against the background gene table of the given condition and tested
    with ``fisher_exact(..., alternative='greater')``.

    Parameters
    ----------
    folder : str
        Directory that contains ``KE-gene-stats.csv``. The file must provide
        the columns 'KE title', 'number of genes',
        'number of significant genes', 'number of significant genes up' and
        'number of significant genes down'.
    condition_label : str
        Used to locate the background file
        ``../data/processed/<condition_label>_condition.csv`` and copied into
        the 'Condition' column of the result.

    Returns
    -------
    pandas.DataFrame
        One row per KE with the input counts, the percentages of significant /
        up / down genes (NaN when the KE has 0 genes), the four contingency
        cells, the p-value (string with 4 decimals) and the odds ratio.

    Raises
    ------
    ValueError
        If any contingency cell is negative, i.e. the KE counts are
        inconsistent with each other or exceed the background totals.
    """
    # Per-KE gene counts.
    ke_df = pd.read_csv(os.path.join(folder, 'KE-gene-stats.csv'))

    # Background gene table; its path depends only on the condition label.
    # NOTE(review): read as tab-separated despite the .csv extension — confirm.
    bg_df = pd.read_csv(f'../data/processed/{condition_label}_condition.csv', sep='\t')

    # Background totals.
    total_genes = bg_df.shape[0]
    # assumes 'Significance' holds 0/1 (or boolean) flags, so the sum is a count — TODO confirm
    sig_genes = bg_df['Significance'].sum()
    non_sig_genes = total_genes - sig_genes

    results = []

    for _, row in ke_df.iterrows():
        ke_title = row['KE title'].strip()
        n_ke_genes = row['number of genes']
        n_sig_in_ke = row['number of significant genes']
        n_sig_up = row['number of significant genes up']
        n_sig_down = row['number of significant genes down']

        # Contingency table cells.
        a = n_sig_in_ke                  # significant & in KE
        b = sig_genes - a                # significant & not in KE
        c = n_ke_genes - n_sig_in_ke     # not significant & in KE
        d = non_sig_genes - c            # not significant & not in KE

        # Fail with a message that names the offending KE instead of the
        # generic error fisher_exact raises for negative cells.
        if min(a, b, c, d) < 0:
            raise ValueError(
                f"Negative contingency cell for KE {ke_title!r} "
                f"(condition {condition_label!r}): a={a}, b={b}, c={c}, d={d}"
            )

        # One-sided test: is the KE over-represented among significant genes?
        oddsratio, p_value = fisher_exact([[a, b], [c, d]], alternative='greater')

        results.append({
            'Condition': condition_label,
            'KE title': ke_title,
            'number of genes': n_ke_genes,
            'number of significant genes': n_sig_in_ke,
            'number of significant genes down': n_sig_down,
            'number of significant genes up': n_sig_up,
            'Percentage significant': _percent_of(n_sig_in_ke, n_ke_genes),
            'Percentage upregulated': _percent_of(n_sig_up, n_ke_genes),
            'Percentage downregulated': _percent_of(n_sig_down, n_ke_genes),

            'a (sig & in KE)': a,
            'b (sig & not in KE)': b,
            'c (non-sig & in KE)': c,
            'd (non-sig & not in KE)': d,
            # Kept as a 4-decimal string to preserve the existing output
            # format; p-values below 0.00005 therefore appear as '0.0000'.
            'Enrichment p-value': f"{p_value:.4f}",
            'Odds ratio': round(oddsratio, 4)
        })

    return pd.DataFrame(results)

In [None]:

# Enrichment tables for the two conditions; each call reads the KE stats from
# its results folder and the background table selected by the condition label.
hypo_results = run_enrichment(folder='../data/results/hypo', condition_label='hypothyroid')
hyper_results = run_enrichment(folder='../data/results/hyper', condition_label='hyperthyroid')

# Stack both conditions into a single table with a fresh 0..n-1 index.
combined = pd.concat((hypo_results, hyper_results), ignore_index=True)

# Order rows by condition first, then by enrichment p-value within a condition.
combined_sorted = combined.sort_values(by=['Condition', 'Enrichment p-value'])

# Print the sorted table and write it to disk without the index column.
print(combined_sorted)
combined_sorted.to_csv(
    '../data/results/combined_enrichment_results.csv',
    index=False,
)