# Meta Analyze Full Haplogroups
- **Author(s)** - Frank Grenn
- **Date Started** - December  2021
- **Quick Description:** Combine the per-cohort logistic regression results for full haplogroups, then meta-analyze the haplogroups found in 50 or more samples, repeating the analysis for each haplogroup-calling tool (snappy, yhaplo, ltrack).

In [None]:
library(metafor)
library(dplyr)
library(data.table)

In [None]:
# Root working directory for the chrY analysis ("$PATH" is a literal placeholder
# in this string, it is not expanded by R) and the folder for forest plots.
WRKDIR <- "$PATH/chrY"
PLOTDIR <- sprintf("%s/meta_analysis_plots/full_haplogroups_covs_adjusted_cohort_count", WRKDIR)

## 1) Combine Dataset Results

In [None]:
# Empty accumulator; each cohort cell below appends its rows to it
combine_df <- data.frame()

In [None]:
# AMP-PD, PD case/control: load the per-haplogroup logistic regression results,
# tag them with their cohort label and append them to combine_df.
amp_pd_cc_path <- paste0(WRKDIR, "/output_male_hemizygous_only_het_filter_run/haplotype_full_pd_case_control_no_gc_cutoff_50_new.csv")
amp_pd_cc_full <- fread(amp_pd_cc_path)
print(dim(amp_pd_cc_full))
amp_pd_cc_full$cohort <- "amp_pd_cc"

# Same seven statistics for every tool, listed tool by tool
# (yhaplo, then snappy, then ltrack).
amp_pd_cc_stats <- c("controls", "control_freq", "cases", "case_freq",
                     "p_logreg", "beta_logreg", "se_logreg")
amp_pd_cc_cols <- c("cohort", "haplo",
                    paste0("yhaplo_", amp_pd_cc_stats),
                    paste0("snappy_", amp_pd_cc_stats),
                    paste0("ltrack_", amp_pd_cc_stats))
amp_pd_cc_full <- select(amp_pd_cc_full, all_of(amp_pd_cc_cols))
print(colnames(amp_pd_cc_full))

combine_df <- rbind(combine_df, amp_pd_cc_full)
print(dim(combine_df))

In [None]:
# UK Biobank, PD case/control: load the per-haplogroup logistic regression
# results, tag them with their cohort label and append them to combine_df.
ukbb_pd_cc_path <- paste0(WRKDIR, "/output_ukbb/haplotype_full_pd_case_control_cutoff_50_new.csv")
ukbb_pd_cc_full <- fread(ukbb_pd_cc_path)
print(dim(ukbb_pd_cc_full))
ukbb_pd_cc_full$cohort <- "ukbb_pd_cc"

# Same seven statistics for every tool, listed tool by tool
# (yhaplo, then snappy, then ltrack).
ukbb_pd_cc_stats <- c("controls", "control_freq", "cases", "case_freq",
                      "p_logreg", "beta_logreg", "se_logreg")
ukbb_pd_cc_cols <- c("cohort", "haplo",
                     paste0("yhaplo_", ukbb_pd_cc_stats),
                     paste0("snappy_", ukbb_pd_cc_stats),
                     paste0("ltrack_", ukbb_pd_cc_stats))
ukbb_pd_cc_full <- select(ukbb_pd_cc_full, all_of(ukbb_pd_cc_cols))
print(colnames(ukbb_pd_cc_full))

combine_df <- rbind(combine_df, ukbb_pd_cc_full)
print(dim(combine_df))

In [None]:
# UK Biobank, PD proxy/control: load the per-haplogroup logistic regression
# results, relabel the proxy columns with the case column names used by the
# other cohorts, and append the rows to combine_df.
ukbb_pd_pc_path <- paste0(WRKDIR, "/output_ukbb/haplotype_full_pd_proxy_control_cutoff_50_new.csv")
ukbb_pd_pc_full <- fread(ukbb_pd_pc_path)
print(dim(ukbb_pd_pc_full))
ukbb_pd_pc_full$cohort <- "ukbb_pd_pc"

# Statistic suffixes as they appear in the proxy file ...
ukbb_pd_pc_stats_in <- c("controls", "control_freq", "proxies", "proxy_freq",
                         "p_logreg", "beta_logreg", "se_logreg")
# ... and the names they are given so the columns line up with the
# case/control cohorts (same positions, proxies -> cases).
ukbb_pd_pc_stats_out <- c("controls", "control_freq", "cases", "case_freq",
                          "p_logreg", "beta_logreg", "se_logreg")

ukbb_pd_pc_cols_in <- c("cohort", "haplo",
                        paste0("yhaplo_", ukbb_pd_pc_stats_in),
                        paste0("snappy_", ukbb_pd_pc_stats_in),
                        paste0("ltrack_", ukbb_pd_pc_stats_in))
ukbb_pd_pc_cols_out <- c("cohort", "haplo",
                         paste0("yhaplo_", ukbb_pd_pc_stats_out),
                         paste0("snappy_", ukbb_pd_pc_stats_out),
                         paste0("ltrack_", ukbb_pd_pc_stats_out))

ukbb_pd_pc_full <- select(ukbb_pd_pc_full, all_of(ukbb_pd_pc_cols_in))
colnames(ukbb_pd_pc_full) <- ukbb_pd_pc_cols_out
print(colnames(ukbb_pd_pc_full))

combine_df <- rbind(combine_df, ukbb_pd_pc_full)
print(dim(combine_df))

In [None]:
# NeuroX, PD case/control: load the per-haplogroup logistic regression results,
# tag them with their cohort label and append them to combine_df.
neurox_pd_cc_path <- paste0(WRKDIR, "/output_neurox/haplotype_full_pd_case_control_no_gc_cutoff_50_new.csv")
neurox_pd_cc_full <- fread(neurox_pd_cc_path)
print(dim(neurox_pd_cc_full))
neurox_pd_cc_full$cohort <- "neurox_pd_cc"

# Same seven statistics for every tool, listed tool by tool
# (yhaplo, then snappy, then ltrack).
neurox_pd_cc_stats <- c("controls", "control_freq", "cases", "case_freq",
                        "p_logreg", "beta_logreg", "se_logreg")
neurox_pd_cc_cols <- c("cohort", "haplo",
                       paste0("yhaplo_", neurox_pd_cc_stats),
                       paste0("snappy_", neurox_pd_cc_stats),
                       paste0("ltrack_", neurox_pd_cc_stats))
neurox_pd_cc_full <- select(neurox_pd_cc_full, all_of(neurox_pd_cc_cols))
print(colnames(neurox_pd_cc_full))

combine_df <- rbind(combine_df, neurox_pd_cc_full)
print(dim(combine_df))

In [None]:
# Save the stacked cohort results, ordered by haplogroup
combined_sorted <- arrange(combine_df, haplo)
write.csv(combined_sorted,
          paste0(WRKDIR, "/full_haplos_50.csv"),
          quote = FALSE,
          row.names = FALSE)

## 2) Meta Analyze

In [None]:
# Reload the combined table as a plain data.frame and rename the haplogroup
# column to haplo_full for the meta-analysis step.
combined_table <- fread(paste0(WRKDIR, "/full_haplos_50.csv"))
data <- rename(as.data.frame(combined_table), "haplo_full" = "haplo")
print(dim(data))
print(head(data))

In [None]:
# Derive a short cohort label per row (both UKBB analyses collapse to "ukbb").
# A named lookup is used instead of `data[cond, ]$cohort_short <- value`,
# because that form stops with "replacement has 1 row, data has 0" whenever a
# cohort is absent from `data`.
cohort_short_map <- c(
    ukbb_pd_cc   = "ukbb",
    ukbb_pd_pc   = "ukbb",
    neurox_pd_cc = "neurox",
    amp_pd_cc    = "amppd"
)
cohort_labels <- as.character(data$cohort)
cohort_mapped <- unname(cohort_short_map[cohort_labels])
# Cohorts without an entry in the map keep their original label
data$cohort_short <- ifelse(is.na(cohort_mapped), cohort_labels, cohort_mapped)
# Placeholder column, initialised to 0 for every row (rep() keeps this valid
# for an empty table as well)
data$cohort_count <- rep(0, nrow(data))
print(head(data))

In [None]:
# Haplogroup-calling tools whose results are meta-analyzed separately
tools <- c("snappy", "yhaplo", "ltrack")

In [None]:
# Meta-analyze every full haplogroup separately for each tool.
# For each (tool, haplogroup) pair this fits rma() on the per-cohort
# log-regression betas and standard errors, writes a forest plot to PLOTDIR and
# stores one summary row; all rows end up in results_df.

# png() cannot open a file in a folder that does not exist, so create it first
dir.create(PLOTDIR, recursive = TRUE, showWarnings = FALSE)

# Rows are collected in a list and bound once at the end instead of growing a
# data.frame with rbind() on every iteration
result_rows <- list()
for (t in tools){
    print(t)

    #get the tool specific columns
    grepstring = paste0(t,"|cohort|haplo_full")
    tool_data = data[,grepl(grepstring,names(data))]
    # drop cohort rows where this tool has any missing value
    tool_data = tool_data[complete.cases(tool_data),]

    # Pick effect size and standard error by name, not by column position, so
    # a change in column order cannot silently feed the wrong columns to rma()
    beta_col <- paste0(t, "_beta_logreg")
    se_col <- paste0(t, "_se_logreg")
    stopifnot(all(c(beta_col, se_col) %in% names(tool_data)))

    #run meta analysis for each full haplogroup
    for (haplo in unique(tool_data$haplo_full)){

        haplo_data <- tool_data[which(tool_data$haplo_full == haplo & tool_data$cohort!='amp_lbd_cc'),]

        print(haplo)

        # Nothing left to pool once the excluded cohort is removed
        if (nrow(haplo_data) == 0) next

        # NOTE(review): rma() is called without `method`, so metafor's default
        # estimator applies (REML random-effects according to the metafor
        # docs), while the forest plot labels the summary "Fixed Effects".
        # Confirm which model is intended.
        meta_analysis_reg <- rma(yi = haplo_data[[beta_col]], sei = haplo_data[[se_col]])

        png(file = paste0(PLOTDIR,"/",haplo,"_",t,"_forest.png"),width = 480, height = 480)
        # Close the PNG device even when forest() fails, so no device is leaked
        tryCatch(
            forest(meta_analysis_reg, slab = haplo_data$cohort, main=paste0("Meta Analysis of Full Haplogroup ", haplo, " from ", t),atransf=exp,  mlab = "Fixed Effects", xlab = "Odds Ratio (95% CI) for full haplogroup", col="red", border = "red", cex=.9),
            finally = dev.off()
        )

        # One summary row per (tool, haplogroup)
        row = data.frame(beta=meta_analysis_reg$`beta`[,1], se = meta_analysis_reg$`se`, zval = meta_analysis_reg$`zval`, pval = meta_analysis_reg$pval, ci_lb = meta_analysis_reg$`ci.lb`, ci_ub = meta_analysis_reg$`ci.ub`)
        row$haplo_full <- haplo
        row$tool <- t
        # number of input rows pooled, and number of distinct short cohort labels
        row$n_datasets <- nrow(haplo_data)
        row$n_cohorts <- length(unique(haplo_data$cohort_short))
        result_rows[[length(result_rows) + 1]] <- row

    }
}
results_df <- if (length(result_rows) > 0) do.call(rbind, result_rows) else data.frame()

In [None]:
# Display every pooled estimate (all tools, all haplogroups)
results_df

In [None]:
# Display only the results pooled from two or more distinct cohorts
in_multiple_cohorts <- results_df$n_cohorts >= 2
results_df[in_multiple_cohorts, ]

In [None]:
# Save all pooled results, then the subset seen in two or more cohorts;
# both files are ordered by haplogroup.
all_results_path <- paste0(WRKDIR, "/meta_analyzed_full_haplo.csv")
multi_cohort_path <- paste0(WRKDIR, "/meta_analyzed_full_haplo_cohort_count.csv")

write.csv(arrange(results_df, haplo_full), all_results_path,
          quote = FALSE, row.names = FALSE)

multi_cohort_results <- results_df[results_df$n_cohorts >= 2, ]
write.csv(arrange(multi_cohort_results, haplo_full), multi_cohort_path,
          quote = FALSE, row.names = FALSE)

In [None]:
# Continue with the cohort-count-filtered results read back from disk
filtered_results_path <- paste0(WRKDIR, "/meta_analyzed_full_haplo_cohort_count.csv")
results_df <- as.data.frame(fread(filtered_results_path))
print(dim(results_df))

In [None]:
# Number of distinct full haplogroups in the current results
distinct_haplogroups <- unique(results_df$haplo_full)
print(length(distinct_haplogroups))

In [None]:
# Dimensions of the table of results with p < 0.05
nominal_hits <- results_df[results_df$pval < 0.05, ]
dim(nominal_hits)

In [None]:
# Which results look significant at the nominal p < 0.05 level?
is_nominal <- results_df$pval < 0.05
results_df[is_nominal, ]

In [None]:
# Results passing the stricter threshold 0.05 / 12.
# NOTE(review): 12 is presumably the number of tests being corrected for;
# confirm it matches the number of haplogroups analysed.
corrected_alpha <- 0.05/12
results_df[results_df$pval < corrected_alpha, ]