<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#IV-day-3" data-toc-modified-id="IV-day-3-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>IV day 3</a></span></li><li><span><a href="#IV-day-8" data-toc-modified-id="IV-day-8-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>IV day 8</a></span></li><li><span><a href="#IV-day-both" data-toc-modified-id="IV-day-both-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>IV day both</a></span></li></ul></div>

In [1]:
import pandas as pd
import os
import numpy as np
from scipy import stats
from scipy.stats.mstats import gmean  # For geometric mean
from statsmodels.stats.multitest import multipletests # for fdr correction

In [2]:
# Directory containing the per-cohort dosed-vs-placebo result CSVs.
# The trailing slash is required: later cells build file names by direct
# string concatenation (f"{filepath}{prefix}{cohort}_{contrast}.csv").
filepath = "/Users/nehamurad/Desktop/Ph1/Ph1_Epistem/counts/dosedvsplacebo/"

# IV day 3

In [3]:
# Meta-analysis of the IV "Day 3 (48 hr) vs placebo" contrast across cohorts.
#
# For every cohort/contrast/prefix combination one result CSV is read (gene
# names in the first column, with at least `pvalue` and `log2FoldChange`
# columns). Per gene, the p-values are combined with Fisher's method, the
# log2 fold changes are averaged, and the combined p-values are FDR-corrected
# (Benjamini-Hochberg). The table is written to the metanalysis_results folder.

# Define cohorts, contrasts, and prefixes for file names
cohorts = ["A", "B", "D", "F"]

contrasts = ["Day_3_48hr_vs_placebo"]

prefixes = ["IVcombined_cohort_"]
# prefixes = ["IVcombined_cohort_", "IV_cohort_"]

# Per-input significance threshold used for `significance_count`
alpha = 0.05

# One DataFrame per input file that was found, plus a record of the ones
# that were not, so a silently reduced meta-analysis cannot go unnoticed.
all_results = []
missing_files = []

# Loop over cohorts, contrasts, and prefixes
for cohort in cohorts:
    for contrast in contrasts:
        for prefix in prefixes:
            filename = f"{filepath}{prefix}{cohort}_{contrast}.csv"
            if not os.path.exists(filename):
                missing_files.append(filename)
                continue

            df = pd.read_csv(filename, index_col=0)

            # Tag every row with its provenance
            df["cohort"] = cohort
            df["contrast"] = contrast
            df["prefix"] = prefix

            # Genes without a p-value cannot contribute to the combination
            df = df.dropna(subset=['pvalue'])

            all_results.append(df)

if missing_files:
    print(f"WARNING: {len(missing_files)} expected input file(s) not found and skipped:")
    for missing in missing_files:
        print(f"  {missing}")

if not all_results:
    # pd.concat([]) would fail with an unhelpful "No objects to concatenate"
    raise FileNotFoundError(
        f"No input files found under {filepath!r} for cohorts={cohorts}, "
        f"contrasts={contrasts}, prefixes={prefixes}"
    )

# Concatenate all DataFrames into one (gene names stay in the index)
merged_df = pd.concat(all_results)

# Group by gene name and apply the meta-analysis
per_gene = merged_df.groupby(merged_df.index)
combined_results = per_gene.agg({
    # Fisher's method; [1] selects the combined p-value
    "pvalue": lambda x: stats.combine_pvalues(x, method='fisher')[1],
    # Geometric mean of the linear fold changes, expressed back in log2 units
    # (algebraically the arithmetic mean of the log2 fold changes)
    "log2FoldChange": lambda x: np.log2(gmean(2**x)),
})

# Rename columns and count significant results per gene
combined_results = combined_results.rename(
    columns={"pvalue": "combined_pvalue", "log2FoldChange": "combined_log2FoldChange"}
)
# Number of input rows per gene with pvalue < alpha (vectorised boolean sum)
combined_results["significance_count"] = (
    merged_df["pvalue"].lt(alpha).groupby(level=0).sum()
)
# Benjamini-Hochberg FDR across genes; [1] selects the adjusted p-values
combined_results["combined_padj"] = multipletests(
    combined_results["combined_pvalue"], method='fdr_bh'
)[1]
combined_results["-log10(pvalue_adj)"] = -np.log10(combined_results["combined_padj"])

# Sort the results: most frequently significant genes first, ties broken by
# combined p-value. Sorting does not change the adjusted p-values.
combined_results = combined_results.sort_values(
    by=["significance_count", "combined_pvalue"], ascending=[False, True]
)

fp = "/Users/nehamurad/Desktop/Ph1/Ph1_Epistem/counts/metanalysis_results/"

fn = "dosed_placebo_IV_day_3.csv"

# Make sure the output folder exists before writing
os.makedirs(fp, exist_ok=True)
combined_results.to_csv(os.path.join(fp, fn))

# IV day 8

In [4]:
# Meta-analysis of the IV "Day 8 vs placebo" contrast across cohorts.
#
# For every cohort/contrast/prefix combination one result CSV is read (gene
# names in the first column, with at least `pvalue` and `log2FoldChange`
# columns). Per gene, the p-values are combined with Fisher's method, the
# log2 fold changes are averaged, and the combined p-values are FDR-corrected
# (Benjamini-Hochberg). The table is written to the metanalysis_results folder.

# Define cohorts, contrasts, and prefixes for file names
cohorts = ["A", "B", "D", "F"]

contrasts = ["Day_8__vs_placebo"]

prefixes = ["IVcombined_cohort_"]
# prefixes = ["IVcombined_cohort_", "IV_cohort_"]

# Per-input significance threshold used for `significance_count`
alpha = 0.05

# One DataFrame per input file that was found, plus a record of the ones
# that were not, so a silently reduced meta-analysis cannot go unnoticed.
all_results = []
missing_files = []

# Loop over cohorts, contrasts, and prefixes
for cohort in cohorts:
    for contrast in contrasts:
        for prefix in prefixes:
            filename = f"{filepath}{prefix}{cohort}_{contrast}.csv"
            if not os.path.exists(filename):
                missing_files.append(filename)
                continue

            df = pd.read_csv(filename, index_col=0)

            # Tag every row with its provenance
            df["cohort"] = cohort
            df["contrast"] = contrast
            df["prefix"] = prefix

            # Genes without a p-value cannot contribute to the combination
            df = df.dropna(subset=['pvalue'])

            all_results.append(df)

if missing_files:
    print(f"WARNING: {len(missing_files)} expected input file(s) not found and skipped:")
    for missing in missing_files:
        print(f"  {missing}")

if not all_results:
    # pd.concat([]) would fail with an unhelpful "No objects to concatenate"
    raise FileNotFoundError(
        f"No input files found under {filepath!r} for cohorts={cohorts}, "
        f"contrasts={contrasts}, prefixes={prefixes}"
    )

# Concatenate all DataFrames into one (gene names stay in the index)
merged_df = pd.concat(all_results)

# Group by gene name and apply the meta-analysis
per_gene = merged_df.groupby(merged_df.index)
combined_results = per_gene.agg({
    # Fisher's method; [1] selects the combined p-value
    "pvalue": lambda x: stats.combine_pvalues(x, method='fisher')[1],
    # Geometric mean of the linear fold changes, expressed back in log2 units
    # (algebraically the arithmetic mean of the log2 fold changes)
    "log2FoldChange": lambda x: np.log2(gmean(2**x)),
})

# Rename columns and count significant results per gene
combined_results = combined_results.rename(
    columns={"pvalue": "combined_pvalue", "log2FoldChange": "combined_log2FoldChange"}
)
# Number of input rows per gene with pvalue < alpha (vectorised boolean sum)
combined_results["significance_count"] = (
    merged_df["pvalue"].lt(alpha).groupby(level=0).sum()
)
# Benjamini-Hochberg FDR across genes; [1] selects the adjusted p-values
combined_results["combined_padj"] = multipletests(
    combined_results["combined_pvalue"], method='fdr_bh'
)[1]
combined_results["-log10(pvalue_adj)"] = -np.log10(combined_results["combined_padj"])

# Sort the results: most frequently significant genes first, ties broken by
# combined p-value. Sorting does not change the adjusted p-values.
combined_results = combined_results.sort_values(
    by=["significance_count", "combined_pvalue"], ascending=[False, True]
)

fp = "/Users/nehamurad/Desktop/Ph1/Ph1_Epistem/counts/metanalysis_results/"

fn = "dosed_placebo_IV_day_8.csv"

# Make sure the output folder exists before writing
os.makedirs(fp, exist_ok=True)
combined_results.to_csv(os.path.join(fp, fn))

# IV day both

In [5]:
# Meta-analysis pooling the IV "Day 3 (48 hr)" and "Day 8" vs-placebo
# contrasts across cohorts.
#
# For every cohort/contrast/prefix combination one result CSV is read (gene
# names in the first column, with at least `pvalue` and `log2FoldChange`
# columns). Per gene, the p-values are combined with Fisher's method, the
# log2 fold changes are averaged, and the combined p-values are FDR-corrected
# (Benjamini-Hochberg). The table is written to the metanalysis_results folder.
#
# NOTE(review): Fisher's method assumes the combined p-values are independent.
# Here each cohort contributes both a Day 3 and a Day 8 contrast, which
# presumably share subjects/placebo samples - confirm that treating them as
# independent tests is acceptable for this analysis.

# Define cohorts, contrasts, and prefixes for file names
cohorts = ["A", "B", "D", "F"]

contrasts = ["Day_3_48hr_vs_placebo", "Day_8__vs_placebo"]

prefixes = ["IVcombined_cohort_"]
# prefixes = ["IVcombined_cohort_", "IV_cohort_"]

# Per-input significance threshold used for `significance_count`
alpha = 0.05

# One DataFrame per input file that was found, plus a record of the ones
# that were not, so a silently reduced meta-analysis cannot go unnoticed.
all_results = []
missing_files = []

# Loop over cohorts, contrasts, and prefixes
for cohort in cohorts:
    for contrast in contrasts:
        for prefix in prefixes:
            filename = f"{filepath}{prefix}{cohort}_{contrast}.csv"
            if not os.path.exists(filename):
                missing_files.append(filename)
                continue

            df = pd.read_csv(filename, index_col=0)

            # Tag every row with its provenance
            df["cohort"] = cohort
            df["contrast"] = contrast
            df["prefix"] = prefix

            # Genes without a p-value cannot contribute to the combination
            df = df.dropna(subset=['pvalue'])

            all_results.append(df)

if missing_files:
    print(f"WARNING: {len(missing_files)} expected input file(s) not found and skipped:")
    for missing in missing_files:
        print(f"  {missing}")

if not all_results:
    # pd.concat([]) would fail with an unhelpful "No objects to concatenate"
    raise FileNotFoundError(
        f"No input files found under {filepath!r} for cohorts={cohorts}, "
        f"contrasts={contrasts}, prefixes={prefixes}"
    )

# Concatenate all DataFrames into one (gene names stay in the index)
merged_df = pd.concat(all_results)

# Group by gene name and apply the meta-analysis
per_gene = merged_df.groupby(merged_df.index)
combined_results = per_gene.agg({
    # Fisher's method; [1] selects the combined p-value
    "pvalue": lambda x: stats.combine_pvalues(x, method='fisher')[1],
    # Geometric mean of the linear fold changes, expressed back in log2 units
    # (algebraically the arithmetic mean of the log2 fold changes)
    "log2FoldChange": lambda x: np.log2(gmean(2**x)),
})

# Rename columns and count significant results per gene
combined_results = combined_results.rename(
    columns={"pvalue": "combined_pvalue", "log2FoldChange": "combined_log2FoldChange"}
)
# Number of input rows per gene with pvalue < alpha (vectorised boolean sum);
# with 4 cohorts x 2 contrasts x 1 prefix the maximum is 8.
combined_results["significance_count"] = (
    merged_df["pvalue"].lt(alpha).groupby(level=0).sum()
)
# Benjamini-Hochberg FDR across genes; [1] selects the adjusted p-values
combined_results["combined_padj"] = multipletests(
    combined_results["combined_pvalue"], method='fdr_bh'
)[1]
combined_results["-log10(pvalue_adj)"] = -np.log10(combined_results["combined_padj"])

# Sort the results: most frequently significant genes first, ties broken by
# combined p-value. Sorting does not change the adjusted p-values.
combined_results = combined_results.sort_values(
    by=["significance_count", "combined_pvalue"], ascending=[False, True]
)

fp = "/Users/nehamurad/Desktop/Ph1/Ph1_Epistem/counts/metanalysis_results/"

fn = "dosed_placebo_IV_day_both.csv"

# Make sure the output folder exists before writing
os.makedirs(fp, exist_ok=True)
combined_results.to_csv(os.path.join(fp, fn))

In [6]:

# Top 20 genes of the most recently computed `combined_results`.
# Left as the cell's last expression so the notebook renders a full table;
# print() wraps and truncates wide frames into plain text.
combined_results.head(20)

         combined_pvalue  combined_log2FoldChange  significance_count  \
KCNN3       2.708053e-13                -1.184576                   8   
NCAPD2      7.611390e-10                -0.300057                   7   
KLHDC7A     1.022576e-09                 0.675473                   7   
CHMP4B      4.622217e-09                 0.259551                   7   
ART1        6.446867e-09                -2.175427                   7   
MFAP5       6.813834e-20                 2.438428                   6   
ZNF586      2.457277e-14                 0.537264                   6   
RSL24D1     1.077596e-13                 0.461550                   6   
CFAP45      1.403974e-13                 1.366786                   6   
GCHFR       2.086242e-13                 0.742548                   6   
FAT2        7.228526e-13                -0.426260                   6   
NACA2       1.386802e-12                 0.448244                   6   
RPL34       1.413112e-12                 0.541825  

In [7]:
# Keep genes whose per-input significance count exceeds 4 and whose
# combined p-value (unadjusted) is below 0.05.
combined_results.loc[
    combined_results["significance_count"].gt(4)
    & combined_results["combined_pvalue"].lt(0.05)
]

Unnamed: 0,combined_pvalue,combined_log2FoldChange,significance_count,combined_padj,-log10(pvalue_adj)
KCNN3,2.708053e-13,-1.184576,8,5.015855e-10,9.299655
NCAPD2,7.611390e-10,-0.300057,7,1.480636e-07,6.829552
KLHDC7A,1.022576e-09,0.675473,7,1.856877e-07,6.731217
CHMP4B,4.622217e-09,0.259551,7,5.595601e-07,6.252153
ART1,6.446867e-09,-2.175427,7,7.287855e-07,6.137400
...,...,...,...,...,...
LSR,4.712631e-04,0.087962,5,2.734314e-03,2.563152
PIGT,5.000389e-04,-0.069324,5,2.860321e-03,2.543585
KIF22,6.970805e-04,-0.071388,5,3.664444e-03,2.435992
LONP1,7.627203e-04,0.055558,5,3.929654e-03,2.405646
