In [None]:
# A couple of QC checks

1. F-statistic distribution in each cohort
2. Sensitivity analysis
 a) heterogeneity statistics
 b) horizontal pleiotropy (MR-Egger intercept)
 c) leave-one-out analysis

input dir: /mnt/f/10_osteo_MR/MR_ready/
output dir: /mnt/f/10_osteo_MR/MR_ready/sensitivity_analysis/



In [4]:
# ---- Setup
library(data.table)
library(dplyr)
library(TwoSampleMR)
library(ggplot2)

# Input folder with the MR-ready tables, and the folder that receives all results.
in_dir  <- "/mnt/f/10_osteo_MR/MR_ready/"
out_dir <- "/mnt/f/10_osteo_MR/MR_ready/sensitivity_analysis/"

# Make sure the results folder and its plots/ subfolder exist (silent when present).
invisible(lapply(
  c(out_dir, file.path(out_dir, "plots")),
  dir.create,
  showWarnings = FALSE, recursive = TRUE
))

# ---------- 1) F-statistics distribution ----------
# Per-SNP F-statistics, a per-gene summary and two diagnostic plots for one
# exposure cohort. F is computed per SNP as (beta / se)^2 after dropping rows
# with missing beta/se or a non-positive se.
#
# Files written below the global `out_dir`:
#   Fstats_<cohort_tag>.tsv                       per-SNP table
#   Fstats_summary_<cohort_tag>.tsv               per-gene summary
#   plots/Fstats_<cohort_tag>_hist.png            histogram, F capped at 100
#   plots/Fstats_<cohort_tag>_violin_by_gene.png  violins for the genes with most SNPs
#
# Args:
#   exposure_dt           data.table or data.frame with columns SNP, <gene_col>,
#                         beta and se. The input is never modified.
#   cohort_tag            string used in file names and plot titles.
#   gene_col              name of the gene column.
#   max_genes_for_violin  upper bound on the number of genes in the violin plot.
#
# Returns (invisibly) list(per_snp = <filtered table with F_stat>,
#                          per_gene = <per-gene summary>).
compute_Fstats <- function(exposure_dt, cohort_tag, gene_col = "gene",
                           max_genes_for_violin = 100L) {
  stopifnot(all(c("SNP", gene_col, "beta", "se") %in% names(exposure_dt)))

  # Work on a private data.table so `:=` below cannot touch the caller's object.
  exp_dt <- if (data.table::is.data.table(exposure_dt)) {
    data.table::copy(exposure_dt)
  } else {
    data.table::as.data.table(exposure_dt)
  }
  exp_dt <- exp_dt[!is.na(beta) & !is.na(se) & se > 0]
  exp_dt[, F_stat := (beta / se)^2]

  # Select columns by name so the gene column keeps its real header in the TSV
  # (selecting it via get() inside .() would write it as "V2").
  per_snp_cols <- unique(c("SNP", gene_col, "beta", "se", "F_stat"))
  data.table::fwrite(exp_dt[, per_snp_cols, with = FALSE],
                     file.path(out_dir, paste0("Fstats_", cohort_tag, ".tsv")),
                     sep = "\t")

  # per-gene summary
  gene_sum <- exp_dt[, .(
    n_snps = .N,
    F_mean = mean(F_stat),
    F_median = median(F_stat),
    F_min = min(F_stat),
    pct_Fgt10 = mean(F_stat > 10) * 100
  ), by = gene_col]
  data.table::fwrite(gene_sum,
                     file.path(out_dir, paste0("Fstats_summary_", cohort_tag, ".tsv")),
                     sep = "\t")

  # Nothing to draw when every row was filtered out; the (empty) tables above
  # are still written so downstream steps find the expected files.
  if (nrow(exp_dt) == 0L) {
    message("compute_Fstats: no usable rows for ", cohort_tag, "; plots skipped")
    return(invisible(list(per_snp = exp_dt, per_gene = gene_sum)))
  }

  # Global histogram
  p_hist <- ggplot(exp_dt, aes(x = pmin(F_stat, 100))) +  # cap for readability
    geom_histogram(bins = 80) +
    labs(title = paste0("F-statistics (capped at 100): ", cohort_tag),
         x = "F-stat", y = "Count") +
    theme_minimal()
  ggsave(file.path(out_dir, "plots", paste0("Fstats_", cohort_tag, "_hist.png")),
         p_hist, width = 7, height = 5, dpi = 150)

  # Optional violin by gene (top genes by n_snps). head() is safe for a limit
  # of 0, unlike 1:min(...), which would count down as c(1, 0).
  n_top <- max(0L, as.integer(max_genes_for_violin))
  top_genes <- head(gene_sum[order(-n_snps)][[gene_col]], n_top)
  v_dt <- exp_dt[exp_dt[[gene_col]] %in% top_genes]
  if (nrow(v_dt) > 0) {
    p_vio <- ggplot(v_dt, aes(x = .data[[gene_col]], y = pmin(F_stat, 100))) +
      geom_violin(trim = TRUE) + coord_flip() +
      labs(title = paste0("F-statistics by gene (top ", length(top_genes), " by #SNPs): ", cohort_tag),
           x = "Gene", y = "F-stat (capped at 100)") +
      theme_minimal()
    ggsave(file.path(out_dir, "plots", paste0("Fstats_", cohort_tag, "_violin_by_gene.png")),
           p_vio, width = 7, height = 10, dpi = 150)
  }

  invisible(list(per_snp = exp_dt, per_gene = gene_sum))
}


# --- safe helpers ---
# TRUE only for a data.frame (or subclass) that holds one or more rows;
# NULL, lists, vectors and zero-row tables all give FALSE.
is_nonempty_df <- function(x) {
  if (!is.data.frame(x)) {
    return(FALSE)
  }
  nrow(x) > 0
}

# Evaluate `expr` and return its value. If it raises an error, report the
# error text with message() and return NULL so the caller can carry on.
# (`expr` is evaluated lazily, i.e. inside the tryCatch.)
safe_try <- function(expr) {
  tryCatch(
    expr,
    error = function(e) {
      message("safe_try: ", conditionMessage(e))
      NULL
    }
  )
}

# Tag a result table with the gene it belongs to.
# Returns `df` with a `gene` column set to `gene_id`, or NULL when `df` is not
# a data.frame with at least one row.
add_gene_if_nonempty <- function(df, gene_id) {
  if (!is_nonempty_df(df)) {
    return(NULL)
  }
  df$gene <- gene_id
  df
}

# ---------- sensitivity per gene (robust) ----------
# Run the TwoSampleMR sensitivity analyses for one gene's harmonised data and
# save the forest / leave-one-out plots under <out_dir>/plots.
#
# Args:
#   dat         harmonised data for a single gene (output of harmonise_data()).
#   gene_id     value stored in the `gene` column of every result and used in
#               the plot file names.
#   cohort_tag  string used in the plot file names.
#   min_snps_*  minimum number of rows in `dat` required to attempt each analysis.
#               NOTE(review): nrow(dat) counts every harmonised row; if `dat`
#               flags rows as excluded (mr_keep), the thresholds may want to
#               count only the kept rows - TODO confirm.
#
# Returns list(heterogeneity, pleiotropy, singlesnp, loo); an element is NULL
# when its analysis was skipped, failed or produced no rows.
run_sensitivity_for_gene <- function(dat, gene_id, cohort_tag,
                                     min_snps_hetero = 2,   # IVW heterogeneity usually needs >=2
                                     min_snps_egger  = 3,   # Egger intercept needs >=3
                                     min_snps_loo    = 3,   # LOO needs >=3
                                     min_snps_singlesnp = 1 # single SNP can run with >=1
                                     ) {
  nothing <- list(heterogeneity = NULL, pleiotropy = NULL, singlesnp = NULL, loo = NULL)
  if (!is_nonempty_df(dat)) return(nothing)
  n_rows <- nrow(dat)
  if (n_rows < min(min_snps_hetero, min_snps_singlesnp)) return(nothing)

  # Rows of a single-SNP / leave-one-out table that refer to an individual SNP
  # (summary rows are labelled "All ...").
  n_snp_rows <- function(res) sum(!grepl("All", res$SNP))

  # The TwoSampleMR plot functions return a list of ggplot objects whose names
  # are built from the exposure/outcome ids, so the plots are picked by class
  # rather than by a fixed name. A failed save is reported, never fatal.
  save_plots <- function(plots, prefix, height) {
    if (inherits(plots, "ggplot")) plots <- list(plots)
    plots <- Filter(function(p) inherits(p, "ggplot"), as.list(plots))
    for (i in seq_along(plots)) {
      suffix <- if (i == 1L) "" else paste0("_", i)
      png_path <- file.path(
        out_dir, "plots",
        paste0(prefix, "_", cohort_tag, "_", gene_id, suffix, ".png")
      )
      safe_try(ggplot2::ggsave(png_path, plots[[i]],
                               width = 7, height = height, dpi = 150))
    }
    invisible(NULL)
  }

  # Heterogeneity
  het <- NULL
  if (n_rows >= min_snps_hetero) {
    het <- add_gene_if_nonempty(safe_try(TwoSampleMR::mr_heterogeneity(dat)), gene_id)
  }

  # Egger intercept (horizontal pleiotropy)
  pleio <- NULL
  if (n_rows >= min_snps_egger) {
    pleio <- add_gene_if_nonempty(safe_try(TwoSampleMR::mr_pleiotropy_test(dat)), gene_id)
  }

  # Single-SNP (Wald ratios) + forest plot
  ss <- NULL
  if (n_rows >= min_snps_singlesnp) {
    ss <- add_gene_if_nonempty(safe_try(TwoSampleMR::mr_singlesnp(dat)), gene_id)
    # A forest plot of fewer than two SNP estimates adds nothing to the table,
    # so it is only drawn when at least two are present.
    if (is_nonempty_df(ss) && n_snp_rows(ss) >= 2) {
      save_plots(safe_try(TwoSampleMR::mr_forest_plot(ss)), "singleSNP", height = 9)
    }
  }

  # Leave-one-out
  loo <- NULL
  if (n_rows >= min_snps_loo) {
    loo <- add_gene_if_nonempty(safe_try(TwoSampleMR::mr_leaveoneout(dat)), gene_id)
    if (is_nonempty_df(loo) && n_snp_rows(loo) >= min_snps_loo) {
      save_plots(safe_try(TwoSampleMR::mr_leaveoneout_plot(loo)), "leaveoneout", height = 6)
    }
  }

  list(heterogeneity = het, pleiotropy = pleio, singlesnp = ss, loo = loo)
}

# ---------- bulk runner (filters empties before writing) ----------
# Run the per-gene sensitivity analyses for every gene of one exposure cohort
# and write one combined TSV per analysis type into the global `out_dir`.
#
# Args:
#   exposure    table with columns SNP, <gene_col>, beta, se, effect_allele,
#               other_allele and pval (the column names handed to format_data()).
#   outcome     table with columns SNP, beta, se, effect_allele, other_allele, pval.
#   gene_col    name of the gene column in `exposure`.
#   cohort_tag  string used in every output file name.
#
# Output files (each written only when at least one gene produced rows):
#   heterogeneity_<tag>.tsv, pleiotropy_intercept_<tag>.tsv,
#   leaveoneout_<tag>.tsv, singleSNP_<tag>.tsv
#
# Returns invisible(TRUE).
run_sensitivity_bulk <- function(exposure, outcome, gene_col = "gene",
                                 cohort_tag = "eqtlgen_osteo") {
  # The subsetting below uses data.table syntax; accept plain data.frames too.
  if (!data.table::is.data.table(exposure)) exposure <- data.table::as.data.table(exposure)
  if (!data.table::is.data.table(outcome))  outcome  <- data.table::as.data.table(outcome)
  stopifnot("SNP" %in% names(exposure),
            "SNP" %in% names(outcome),
            gene_col %in% names(exposure))

  outcome <- outcome[SNP %in% unique(exposure$SNP)]

  genes <- unique(exposure[[gene_col]])
  if (is.factor(genes)) genes <- as.character(genes)
  genes <- genes[!is.na(genes)]

  # One slot per gene and analysis. Results are stored by position, so gene ids
  # that are numeric or otherwise unusual can never be mistaken for list indices.
  parts <- c("heterogeneity", "pleiotropy", "loo", "singlesnp")
  results <- lapply(setNames(parts, parts),
                    function(p) vector("list", length(genes)))

  for (i in seq_along(genes)) {
    g <- genes[[i]]
    exp_sub <- exposure[which(exposure[[gene_col]] == g)]
    out_sub <- outcome[SNP %in% exp_sub$SNP]
    if (nrow(exp_sub) < 1 || nrow(out_sub) < 1) next

    # A formatting/harmonisation error for one gene must not abort the cohort
    # (results are only written after the loop), so report it and move on.
    dat <- tryCatch({
      exp_dat <- suppressWarnings(TwoSampleMR::format_data(
        as.data.frame(exp_sub), type = "exposure", snp_col = "SNP",
        beta_col = "beta", se_col = "se",
        effect_allele_col = "effect_allele", other_allele_col = "other_allele",
        phenotype_col = gene_col, pval_col = "pval"
      ))
      out_dat <- suppressWarnings(TwoSampleMR::format_data(
        as.data.frame(out_sub), type = "outcome", snp_col = "SNP",
        beta_col = "beta", se_col = "se",
        effect_allele_col = "effect_allele", other_allele_col = "other_allele",
        pval_col = "pval"
      ))
      suppressWarnings(TwoSampleMR::harmonise_data(exp_dat, out_dat, action = 1))
    }, error = function(e) {
      message("run_sensitivity_bulk: skipping ", cohort_tag, " / ", g, ": ",
              conditionMessage(e))
      NULL
    })
    if (!is_nonempty_df(dat)) next

    sens <- run_sensitivity_for_gene(dat, gene_id = g, cohort_tag = cohort_tag)
    for (part in parts) {
      if (is_nonempty_df(sens[[part]])) results[[part]][[i]] <- sens[[part]]
    }
  }

  # write only when there is at least one non-empty df
  file_prefix <- c(heterogeneity = "heterogeneity_",
                   pleiotropy    = "pleiotropy_intercept_",
                   loo           = "leaveoneout_",
                   singlesnp     = "singleSNP_")
  for (part in parts) {
    tables <- Filter(is_nonempty_df, results[[part]])
    if (length(tables) > 0) {
      data.table::fwrite(
        dplyr::bind_rows(tables),
        file.path(out_dir, paste0(file_prefix[[part]], cohort_tag, ".tsv")),
        sep = "\t"
      )
    }
  }

  invisible(TRUE)
}

                                    

                                    
# All file names below are relative to the MR-ready folder.
setwd("/mnt/f/10_osteo_MR/MR_ready/")

# Outcome summary statistics: read once, shared by every cohort.
out_osteo <- fread("outcome_osteo_within_wb_DHS.tsv")

# 1) F-statistics: read each exposure cohort, then write its tables and plots.
exp_eqtlgen <- fread("exposure_eqtlgen_dhs_index.tsv")
compute_Fstats(exp_eqtlgen, cohort_tag = "eqtlgen_osteo")

exp_gtex <- fread("exposure_gtex_whole_blood_eqtl_dhs_index.tsv")
compute_Fstats(exp_gtex, cohort_tag = "gtexwb_osteo")

exp_ukbppp_pqtl <- fread("exposure_ukbppp_pqtl_dhs_index.tsv")
compute_Fstats(exp_ukbppp_pqtl, cohort_tag = "ukbppp_pqtl_osteo")

exp_decode_pqtl <- fread("exposure_pqtl_decode_dhs_index.tsv")
compute_Fstats(exp_decode_pqtl, cohort_tag = "decode_pqtl_osteo")

# 2) Per-gene sensitivity analyses, cohort by cohort in the same order as above.
invisible(Map(
  function(exposure, tag) {
    run_sensitivity_bulk(exposure, out_osteo, gene_col = "gene", cohort_tag = tag)
  },
  list(exp_eqtlgen, exp_gtex, exp_ukbppp_pqtl, exp_decode_pqtl),
  c("eqtlgen_osteo", "gtexwb_osteo", "ukbppp_pqtl_osteo", "decode_pqtl_osteo")
))


                                    