In [1]:
import os,sys
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from plotnine import *
from statsmodels.stats.multitest import multipletests
from pathlib import Path
from tqdm.auto import tqdm
import re
import glob
from scipy import stats

In [2]:
# Table describing the raw h5ad inputs: one row per experiment, one "lane*" column per lane.
dir_list = pd.read_csv('/Users/chandrima.modak/Gladstone Dropbox/Chandrima Modak/gw-CRISPRa_from_cluster/h5ad_raw_files_info.csv')
lane_cols_for_eff = [c for c in dir_list.columns if c.startswith("lane")]

# `run` holds, per experiment row, the lane directory names "<cell value>_<lane column>".
# Blank / missing cells are skipped, and rows without any lane are dropped entirely.
run = []
for _, lane_row in dir_list[lane_cols_for_eff].iterrows():
    lane_dirs = [
        f"{cell}_{lane_col}"
        for lane_col, cell in lane_row.items()
        if pd.notna(cell) and str(cell).strip() != ""
    ]
    if lane_dirs:
        run.append(lane_dirs)

In [3]:
def summarize_exp_stats(lane_path, lane_list):
    """Sum the per-guide count statistics over all lanes of one experiment.

    For each lane directory, one ``*_guide_count_info.csv`` file is read and
    its numeric columns are added up per ``guide_id`` across lanes.

    Parameters
    ----------
    lane_path : str or os.PathLike
        Directory containing one sub-directory per lane.
    lane_list : iterable of str
        Names of the lane sub-directories that belong to one experiment.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``guide_id`` with the columns ``n_cells``, ``sum_guide``,
        ``sumsq_guide``, ``ntc_cells``, ``sum_ntc`` and ``sumsq_ntc``, each
        summed across the given lanes.

    Raises
    ------
    ValueError
        If ``lane_list`` is empty.
    FileNotFoundError
        If a lane directory contains no ``*_guide_count_info.csv`` file.
    """
    num_cols = ['n_cells', 'sum_guide', 'sumsq_guide', 'ntc_cells', 'sum_ntc', 'sumsq_ntc']

    lane_list = list(lane_list)
    if not lane_list:
        raise ValueError("lane_list is empty; there are no lanes to summarize")

    lane_frames = []
    for lane in lane_list:
        pattern = os.path.join(lane_path, lane, "*_guide_count_info.csv")
        # glob yields matches in arbitrary filesystem order; sort so that the
        # file picked is reproducible when a lane holds more than one match.
        matches = sorted(glob.glob(pattern))
        if not matches:
            raise FileNotFoundError(f"No guide count file matching {pattern!r}")
        lane_df = pd.read_csv(matches[0]).set_index('guide_id')
        lane_frames.append(lane_df[num_cols])

    # Stack the lanes and add the statistics of identical guides together.
    return pd.concat(lane_frames, axis=0).groupby('guide_id').sum()

In [4]:
def compute_stats(dict_key, dict_val, pseudocount=5e-2, min_std=0.01):
    """Compare each guide against the non-targeting controls (NTC).

    From per-guide sufficient statistics (cell counts, sums and sums of
    squares) this derives means, sample variances, a fold change, a one-sided
    Welch t-test, Benjamini-Hochberg adjusted p-values and Cohen's d.

    Parameters
    ----------
    dict_key : str
        Experiment name. The second ``_``-separated token selects the
        direction of the test: a token containing ``"i"`` is handled as
        CRISPRi (guide expected below NTC), otherwise a token containing
        ``"a"`` as CRISPRa (guide expected above NTC). Anything else,
        including a key without a second token, is an unknown mode.
        NOTE(review): this is a plain substring test, so a token containing
        both letters is treated as CRISPRi - confirm the key naming scheme.
    dict_val : pandas.DataFrame
        One row per guide with the columns ``n_cells``, ``sum_guide``,
        ``sumsq_guide``, ``ntc_cells``, ``sum_ntc`` and ``sumsq_ntc``.
        The frame is not modified.
    pseudocount : float, optional
        Added to both means before the fold-change ratio is formed.
    min_std : float, optional
        Replacement for standard deviations that are zero or undefined, so
        that the t-test denominator never collapses to zero.

    Returns
    -------
    pandas.DataFrame
        Copy of ``dict_val`` with the added columns ``guide_mean``,
        ``ntc_mean``, ``guide_var``, ``ntc_var``, ``guide_std``, ``ntc_std``,
        ``fc``, ``t_statistic``, ``p_value``, ``adj_pvals`` and
        ``effect_size``. For an unknown mode ``fc``, ``p_value`` and
        ``adj_pvals`` are NaN.
    """
    df = dict_val.copy()

    n_guide = df["n_cells"].astype(float)
    n_ntc = df["ntc_cells"].astype(float)

    # Means
    df["guide_mean"] = df["sum_guide"] / df["n_cells"]
    df["ntc_mean"] = df["sum_ntc"] / df["ntc_cells"]

    # Unbiased sample variances from the sufficient statistics (NaN for n < 2).
    # The shortcut formula can produce tiny negative values through floating
    # point cancellation; a variance cannot be negative, so clip those to 0.
    df["guide_var"] = (
        (df["sumsq_guide"] - (df["sum_guide"] ** 2) / df["n_cells"]) / (df["n_cells"] - 1)
    ).clip(lower=0)
    df["ntc_var"] = (
        (df["sumsq_ntc"] - (df["sum_ntc"] ** 2) / df["ntc_cells"]) / (df["ntc_cells"] - 1)
    ).clip(lower=0)

    # Standard deviations; zero or undefined values fall back to `min_std`.
    for group in ("guide", "ntc"):
        std = np.sqrt(df[f"{group}_var"])
        df[f"{group}_std"] = std.where(std > 0, min_std)

    # Decide direction
    # CRISPRi: expect guide < NTC
    # CRISPRa: expect guide > NTC
    key_parts = dict_key.split("_")
    mode_token = key_parts[1] if len(key_parts) > 1 else ""
    is_crispri = "i" in mode_token
    is_crispra = "a" in mode_token

    # Fold change, oriented so that > 1 is the expected effect.
    if is_crispri:
        df["fc"] = (df["ntc_mean"] + pseudocount) / (df["guide_mean"] + pseudocount)  # >1 means knockdown
    elif is_crispra:
        df["fc"] = (df["guide_mean"] + pseudocount) / (df["ntc_mean"] + pseudocount)  # >1 means activation
    else:
        df["fc"] = np.nan  # unknown mode

    # Welch t-test for all guides at once (ttest_ind_from_stats broadcasts).
    t_stat, p_two = stats.ttest_ind_from_stats(
        df["guide_mean"].to_numpy(dtype=float),
        df["guide_std"].to_numpy(dtype=float),
        n_guide.to_numpy(),
        df["ntc_mean"].to_numpy(dtype=float),
        df["ntc_std"].to_numpy(dtype=float),
        n_ntc.to_numpy(),
        equal_var=False,
    )
    t_stat = np.atleast_1d(np.asarray(t_stat, dtype=float))
    p_two = np.atleast_1d(np.asarray(p_two, dtype=float))

    # One-sided p-value: half the two-sided value when the effect points in
    # the expected direction, otherwise 1.0. An undefined (NaN) t-statistic
    # also ends up at 1.0 because the comparison is False.
    with np.errstate(invalid="ignore"):
        if is_crispri:
            p_one = np.where(t_stat < 0, p_two / 2, 1.0)
        elif is_crispra:
            p_one = np.where(t_stat > 0, p_two / 2, 1.0)
        else:
            p_one = np.full(len(df), np.nan)

    df["t_statistic"] = t_stat
    df["p_value"] = p_one

    # FDR (Benjamini-Hochberg) over the defined p-values only. multipletests
    # fails on an empty input, so skip it when there is nothing to correct.
    mask = df["p_value"].notna()
    df["adj_pvals"] = np.nan
    if mask.any():
        df.loc[mask, "adj_pvals"] = multipletests(df.loc[mask, "p_value"], method="fdr_bh")[1]

    # Cohen's d (pooled SD); defined only with >= 2 cells per group and sp > 0.
    sp2 = ((n_guide - 1) * df["guide_var"] + (n_ntc - 1) * df["ntc_var"]) / (n_guide + n_ntc - 2)
    sp = np.sqrt(sp2)
    valid = (n_guide >= 2) & (n_ntc >= 2) & (sp > 0)
    df["effect_size"] = ((df["guide_mean"] - df["ntc_mean"]) / sp).where(valid)

    return df

In [5]:
lane_path = '/Users/chandrima.modak/Gladstone Dropbox/Chandrima Modak/gw-CRISPRa_from_cluster'

# Step 1: sum the lane-level count statistics of every experiment. The experiment
# name is the first lane directory name with its trailing "_lane..." part removed.
experiment = {}
for lane_dirs in run:
    summed_ = summarize_exp_stats(lane_path, lane_dirs)
    prefix = lane_dirs[0].rsplit('_lane', 1)[0]
    experiment[prefix] = summed_

# Step 2: replace each summed table by its per-guide statistics.
experiment = {
    exp_name: compute_stats(exp_name, exp_counts)
    for exp_name, exp_counts in experiment.items()
}



In [6]:
# Write one CSV per experiment into its own sub-directory of qc_stats_welch.
qc_root = '/Users/chandrima.modak/Gladstone Dropbox/Chandrima Modak/gw-CRISPRa_from_cluster/qc_stats_welch'
for exp_name, exp_stats in experiment.items():
    exp_dir = os.path.join(qc_root, exp_name)
    os.makedirs(exp_dir, exist_ok=True)
    exp_stats.to_csv(os.path.join(exp_dir, f"{exp_name}_guide_efficency_welch.csv"))