<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#IV-day-3" data-toc-modified-id="IV-day-3-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>IV day 3</a></span></li><li><span><a href="#IV-day-8" data-toc-modified-id="IV-day-8-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>IV day 8</a></span></li><li><span><a href="#IV-day-both" data-toc-modified-id="IV-day-both-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>IV day both</a></span></li></ul></div>

In [1]:
import pandas as pd
import os
import numpy as np
from scipy import stats
from scipy.stats.mstats import gmean  # For geometric mean
from statsmodels.stats.multitest import multipletests # for fdr correction

In [2]:
# Directory that holds the per-cohort result tables read by the cells below
# (files are named cohort_<cohort>_<contrast>.csv).
# Keep the trailing slash: file names are appended with plain string formatting,
# not os.path.join.
filepath = "/Users/nehamurad/Desktop/Ph1/Ph1_Epistem/counts/postvspre/patient_specific/"

# IV day 3

In [3]:
# Meta-analysis of the day-3 vs pre-dose contrast across cohorts.
# Reads one result table per cohort, combines the per-cohort raw p-values of
# every gene with Fisher's method, averages the log2 fold changes, applies a
# Benjamini-Hochberg correction to the combined p-values and writes a CSV.

# Cohorts and contrasts that make up the input file names
cohorts = ["A", "B", "D", "F"]

contrasts = ["day3_vs_predose"]

# Raw p-value threshold used to count in how many input tables a gene is significant
alpha = 0.05

# One DataFrame per input file found on disk, plus the paths that were not found
all_results = []
missing_files = []

for cohort in cohorts:
    for contrast in contrasts:
        filename = f"{filepath}cohort_{cohort}_{contrast}.csv"
        if not os.path.exists(filename):
            # Missing inputs are still skipped, but recorded so the gap is visible
            missing_files.append(filename)
            continue

        df = pd.read_csv(filename, index_col=0)

        # Tag every row with its origin so it stays traceable after the merge
        df["cohort"] = cohort
        df["contrast"] = contrast

        # Rows without a p-value cannot contribute to the combined test
        df = df.dropna(subset=['pvalue'])

        all_results.append(df)

if missing_files:
    print(f"Skipped {len(missing_files)} missing input file(s):", *missing_files, sep="\n  ")

# pd.concat fails with an uninformative "No objects to concatenate" on an empty
# list, so stop early with a message that names what was searched for.
if not all_results:
    raise FileNotFoundError(
        f"No input files found under {filepath!r} for cohorts {cohorts} and contrasts {contrasts}"
    )

# Stack all tables; the index (gene name) repeats once per input table
merged_df = pd.concat(all_results)

# Group rows by gene name (the index) once and reuse the grouping
per_gene = merged_df.groupby(merged_df.index)

combined_results = per_gene.agg({
    # Fisher's method on the raw (unadjusted) p-values. It is direction-agnostic:
    # effects of opposite sign across cohorts still yield a small combined p-value.
    "pvalue": lambda x: stats.combine_pvalues(x, method='fisher')[1],
    # The mean of log2 fold changes equals log2 of the geometric mean of the fold
    # changes, without the round-trip through 2**x. skipna=False keeps NaN
    # propagating as the gmean-based formula did.
    "log2FoldChange": lambda x: x.mean(skipna=False),
})

# Rename columns and count, per gene, the input tables with raw p-value < alpha
combined_results = combined_results.rename(columns={"pvalue": "combined_pvalue", "log2FoldChange": "combined_log2FoldChange"})
combined_results["significance_count"] = merged_df["pvalue"].lt(alpha).groupby(merged_df.index).sum()

# Benjamini-Hochberg FDR correction across genes, applied to the combined p-values
combined_results["combined_padj"] = multipletests(combined_results["combined_pvalue"], method='fdr_bh')[1]
# A combined_padj of exactly 0 becomes inf here
combined_results["-log10(pvalue_adj)"] = -np.log10(combined_results["combined_padj"])

# Most frequently significant genes first, ties broken by smallest combined p-value
combined_results = combined_results.sort_values(
    by=["significance_count", "combined_pvalue"], ascending=[False, True]
)

fp = "/Users/nehamurad/Desktop/Ph1/Ph1_Epistem/counts/metanalysis_results/"

fn = "post_pre_IV_day_3.csv"

combined_results.to_csv(fp+fn)

# IV day 8

In [4]:
# Meta-analysis of the day-8 vs pre-dose contrast across cohorts.
# Reads one result table per cohort, combines the per-cohort raw p-values of
# every gene with Fisher's method, averages the log2 fold changes, applies a
# Benjamini-Hochberg correction to the combined p-values and writes a CSV.

# Cohorts and contrasts that make up the input file names
cohorts = ["A", "B", "D", "F"]

contrasts = ["day8_vs_predose"]

# Raw p-value threshold used to count in how many input tables a gene is significant
alpha = 0.05

# One DataFrame per input file found on disk, plus the paths that were not found
all_results = []
missing_files = []

for cohort in cohorts:
    for contrast in contrasts:
        filename = f"{filepath}cohort_{cohort}_{contrast}.csv"
        if not os.path.exists(filename):
            # Missing inputs are still skipped, but recorded so the gap is visible
            missing_files.append(filename)
            continue

        df = pd.read_csv(filename, index_col=0)

        # Tag every row with its origin so it stays traceable after the merge
        df["cohort"] = cohort
        df["contrast"] = contrast

        # Rows without a p-value cannot contribute to the combined test
        df = df.dropna(subset=['pvalue'])

        all_results.append(df)

if missing_files:
    print(f"Skipped {len(missing_files)} missing input file(s):", *missing_files, sep="\n  ")

# pd.concat fails with an uninformative "No objects to concatenate" on an empty
# list, so stop early with a message that names what was searched for.
if not all_results:
    raise FileNotFoundError(
        f"No input files found under {filepath!r} for cohorts {cohorts} and contrasts {contrasts}"
    )

# Stack all tables; the index (gene name) repeats once per input table
merged_df = pd.concat(all_results)

# Group rows by gene name (the index) once and reuse the grouping
per_gene = merged_df.groupby(merged_df.index)

combined_results = per_gene.agg({
    # Fisher's method on the raw (unadjusted) p-values. It is direction-agnostic:
    # effects of opposite sign across cohorts still yield a small combined p-value.
    "pvalue": lambda x: stats.combine_pvalues(x, method='fisher')[1],
    # The mean of log2 fold changes equals log2 of the geometric mean of the fold
    # changes, without the round-trip through 2**x. skipna=False keeps NaN
    # propagating as the gmean-based formula did.
    "log2FoldChange": lambda x: x.mean(skipna=False),
})

# Rename columns and count, per gene, the input tables with raw p-value < alpha
combined_results = combined_results.rename(columns={"pvalue": "combined_pvalue", "log2FoldChange": "combined_log2FoldChange"})
combined_results["significance_count"] = merged_df["pvalue"].lt(alpha).groupby(merged_df.index).sum()

# Benjamini-Hochberg FDR correction across genes, applied to the combined p-values
combined_results["combined_padj"] = multipletests(combined_results["combined_pvalue"], method='fdr_bh')[1]
# A combined_padj of exactly 0 becomes inf here
combined_results["-log10(pvalue_adj)"] = -np.log10(combined_results["combined_padj"])

# Most frequently significant genes first, ties broken by smallest combined p-value
combined_results = combined_results.sort_values(
    by=["significance_count", "combined_pvalue"], ascending=[False, True]
)

fp = "/Users/nehamurad/Desktop/Ph1/Ph1_Epistem/counts/metanalysis_results/"

fn = "post_pre_IV_day_8.csv"

combined_results.to_csv(fp+fn)

# IV day both

In [5]:
# Meta-analysis pooling the day-3 and day-8 vs pre-dose contrasts across cohorts.
# Reads one result table per (cohort, contrast), combines the raw p-values of
# every gene with Fisher's method, averages the log2 fold changes, applies a
# Benjamini-Hochberg correction to the combined p-values and writes a CSV.
#
# NOTE(review): Fisher's method assumes independent p-values. The two contrasts
# of one cohort are both tested against pre-dose, so they presumably share
# samples and are not independent - confirm that pooling them here is intended.

# Cohorts and contrasts that make up the input file names
cohorts = ["A", "B", "D", "F"]

contrasts = ["day3_vs_predose", "day8_vs_predose"]

# Raw p-value threshold used to count in how many input tables a gene is significant
alpha = 0.05

# One DataFrame per input file found on disk, plus the paths that were not found
all_results = []
missing_files = []

for cohort in cohorts:
    for contrast in contrasts:
        filename = f"{filepath}cohort_{cohort}_{contrast}.csv"
        if not os.path.exists(filename):
            # Missing inputs are still skipped, but recorded so the gap is visible
            missing_files.append(filename)
            continue

        df = pd.read_csv(filename, index_col=0)

        # Tag every row with its origin so it stays traceable after the merge
        df["cohort"] = cohort
        df["contrast"] = contrast

        # Rows without a p-value cannot contribute to the combined test
        df = df.dropna(subset=['pvalue'])

        all_results.append(df)

if missing_files:
    print(f"Skipped {len(missing_files)} missing input file(s):", *missing_files, sep="\n  ")

# pd.concat fails with an uninformative "No objects to concatenate" on an empty
# list, so stop early with a message that names what was searched for.
if not all_results:
    raise FileNotFoundError(
        f"No input files found under {filepath!r} for cohorts {cohorts} and contrasts {contrasts}"
    )

# Stack all tables; the index (gene name) repeats once per input table
merged_df = pd.concat(all_results)

# Group rows by gene name (the index) once and reuse the grouping
per_gene = merged_df.groupby(merged_df.index)

combined_results = per_gene.agg({
    # Fisher's method on the raw (unadjusted) p-values. It is direction-agnostic:
    # effects of opposite sign across tables still yield a small combined p-value.
    "pvalue": lambda x: stats.combine_pvalues(x, method='fisher')[1],
    # The mean of log2 fold changes equals log2 of the geometric mean of the fold
    # changes, without the round-trip through 2**x. skipna=False keeps NaN
    # propagating as the gmean-based formula did.
    "log2FoldChange": lambda x: x.mean(skipna=False),
})

# Rename columns and count, per gene, the input tables with raw p-value < alpha
combined_results = combined_results.rename(columns={"pvalue": "combined_pvalue", "log2FoldChange": "combined_log2FoldChange"})
combined_results["significance_count"] = merged_df["pvalue"].lt(alpha).groupby(merged_df.index).sum()

# Benjamini-Hochberg FDR correction across genes, applied to the combined p-values
combined_results["combined_padj"] = multipletests(combined_results["combined_pvalue"], method='fdr_bh')[1]
# A combined_padj of exactly 0 becomes inf here
combined_results["-log10(pvalue_adj)"] = -np.log10(combined_results["combined_padj"])

# Most frequently significant genes first, ties broken by smallest combined p-value
combined_results = combined_results.sort_values(
    by=["significance_count", "combined_pvalue"], ascending=[False, True]
)

fp = "/Users/nehamurad/Desktop/Ph1/Ph1_Epistem/counts/metanalysis_results/"

fn = "post_pre_IV_day_both.csv"

combined_results.to_csv(fp+fn)

In [None]:
# Show genes counted as significant in more than two input tables whose
# combined p-value is also below 0.05
in_many_tables = combined_results["significance_count"] > 2
low_combined_p = combined_results["combined_pvalue"] < 0.05
combined_results[in_many_tables & low_combined_p]

In [None]:
# Stricter hit list: significant in more than three input tables and a
# combined p-value below 0.01
strict_mask = (combined_results["significance_count"] > 3) & (combined_results["combined_pvalue"] < 0.01)
sig_res = combined_results.loc[strict_mask]
sig_res

In [None]:
# Row labels (gene names) of the stricter hit list
sig_res.index

In [None]:
# Combined statistics for PDGFRB; raises KeyError if the gene is not in sig_res
sig_res.loc["PDGFRB"]

In [None]:
# Per-table rows for COL3A1 from the merged input tables.
# NOTE(review): adjusted p-values normally cannot exceed 1, so `padj < 2`
# presumably only drops rows whose padj is NaN - confirm the intended cutoff.
col3a1_rows = merged_df.loc["COL3A1"]
col3a1_rows[col3a1_rows["padj"] < 2]

In [None]:
# Combined statistics for PDGFRB (same lookup as an earlier cell); raises
# KeyError if the gene is not in sig_res
sig_res.loc["PDGFRB"]

In [None]:
# Per-table rows for PDGFRB from the merged input tables.
# NOTE(review): adjusted p-values normally cannot exceed 1, so `padj < 2`
# presumably only drops rows whose padj is NaN - confirm the intended cutoff.
pdgfrb_rows = merged_df.loc["PDGFRB"]
pdgfrb_rows[pdgfrb_rows["padj"] < 2]

**Inhibition of cell growth and proliferation:** PDGFRB signaling is essential for the growth and proliferation of many cell types, so its downregulation can lead to decreased cell proliferation and survival.