# Build spreadsheet and summary statistics of top associations, implicated by peak window p-value method

In [None]:
library(data.table)

In [None]:
library(dplyr)

## List files with all results for each trait in the in planta regeneration GWAS

### Read

In [None]:
# Recursively collect the full paths of the stacked GMMAT/ART result files.
# NOTE(review): `pattern` is a regular expression, so "csv" matches any file
# name containing "csv", not only names ending in ".csv" — confirm intended.
in_planta_files <- list.files(
    path = "/mnt/data/NSF_GWAS/notebooks/InPlantaGWAS/11_Data_mining/Results_GMMAT_ART/MethodsStacked/",
    pattern = "csv",
    full.names = TRUE,
    recursive = TRUE
)

In [None]:
# Vector of every result file to load; currently only the in planta files.
all_files <- in_planta_files

### Merge

In [None]:
# Read the first result file to initialise the combined table.
files_combined <- data.table::fread(input = all_files[1])

In [None]:
# Append the remaining result files to the combined table.
# all_files[1] has already been read into files_combined, so it is skipped
# here; looping over the full vector would load that file a second time.
# All tables are bound in one call instead of regrowing files_combined on
# every iteration. bind_rows() fills columns missing from a file with NA.
remaining_tables <- lapply(all_files[-1], fread)
files_combined <- dplyr::bind_rows(c(list(files_combined), remaining_tables))

## Filter out results we didn't select due to not being optimal

### Problems fitting models for certain traits with GMMAT runs with MAF 0.01

In [None]:
# Raw trait names whose GMMAT results for the MAF 0.01 batches are discarded.
trait_w_crazy_gmmat_maf01 <- c(
    "callus_2w",
    "Shoot_PC2",
    "CallusShoot_PC1",
    "callus_3w",
    "Callus_PC1",
    "shoot_2w"
)

In [None]:
# Remove results for the listed traits in the GMMAT MAF 0.01 batches.
#
# A row is dropped when its batch_i is one of the batches below AND its
# raw_trait_name is one of trait_w_crazy_gmmat_maf01. The mask is built with
# %in%, which never returns NA, and is applied as a plain logical index. This
# avoids the `x[!which(cond), ]` form, which only removes the intended rows
# because of data.table's special handling of a leading `!` and would return
# zero rows on a data.frame or tibble.
maf01_batches_to_drop <- c(
    "GMMAT_binarized_logitlink-batch2_maf01_geno10",
    "GMMAT_nothreshold_duplicates_binarized.binary_logitlink-batch4_maf01_geno10",
    "GMMAT_binarized_logitlink-batch4_maf01_geno10",
    "GMMAT_threshold-0.198412874212136.binary_logitlink-batch4_maf01_geno10",
    "GMMAT_threshold0.364036344426957.binary_logitlink-batch4_maf01_geno10",
    "GMMAT_threshold0.708565670955823.binary_logitlink-batch4_maf01_geno10"
)

is_bad_maf01_fit <-
    files_combined$batch_i %in% maf01_batches_to_drop &
    files_combined$raw_trait_name %in% trait_w_crazy_gmmat_maf01

files_combined <- files_combined[!is_bad_maf01_fit, ]

# Report the table size after filtering.
print(dim(files_combined))

In [None]:
# Load the heritability table; the h2 and raw_trait_name columns are used below.
h2_table <- data.table::fread(
    input = "/mnt/data/NSF_GWAS/notebooks/InPlantaGWAS/11_Data_mining/sorted_h2_table_with_raw_names_added.csv"
)

Find the maximum heritability for each raw trait. We're doing this because traits are sometimes studied in multiple ways, and heritability can be higher for one approach than for another. We'll keep a trait if its max h2 is at least 0.1; traits whose max h2 is below 0.1 are dropped.

In [None]:
# One row per raw trait name, holding the largest h2 observed for that trait.
max_h2_table <- aggregate(
    h2_table$h2 ~ h2_table$raw_trait_name,
    FUN = max
)

In [None]:
# Give the aggregated table plain column names: trait first, then its max h2.
names(max_h2_table) <- c("raw_trait_name", "max_h2")

In [None]:
# Traits whose best heritability estimate is below the 0.1 cutoff.
nonheritable <- with(max_h2_table, raw_trait_name[which(max_h2 < 0.1)])

In [None]:
# Remove every row belonging to a trait listed in `nonheritable`.
# A single %in% mask replaces the per-trait loop. It also avoids the
# `x[!which(cond), ]` form, which only works through data.table's special
# handling of a leading `!` and would return zero rows on a plain data.frame.
is_nonheritable <- files_combined$raw_trait_name %in% nonheritable
files_combined <- files_combined[!is_nonheritable, ]

# Report the table size after filtering.
print(dim(files_combined))

### Don't keep copied, duplicate GMMAT results

We can do this quickly and easily by making the batch names match and then removing duplicate rows, so that only unique rows remain.

In [None]:
# Relabel rows of the "GMMAT_binarized_logitlink-GMMAT" batch with the
# batch4 name, so that rows differing only in this label become identical.
files_combined$batch_i[
    files_combined$batch_i %in% "GMMAT_binarized_logitlink-GMMAT"
] <- "GMMAT_binarized_logitlink-batch4_maf01_geno10"

In [None]:
# Drop rows that are exact duplicates across all columns.
files_combined <- unique(files_combined)

In [None]:
# Show the current number of rows in files_combined.
nrow(files_combined)

## Merge in lincRNA and a priori info

In [None]:
# Show the working directory; the relative paths used below are resolved against it.
getwd()

In [None]:
# Load the a priori QTL table; the file has no header row.
apriori <- data.table::fread(
    input = "../../InVitroRegenGWAS/07_Data_mining/6-IN_Hu_Tuskan_a_priori_QTLs.csv",
    header = FALSE
)

In [None]:
# Load the lincRNA table; the file has no header row.
lincrna <- data.table::fread(
    input = "../../InVitroRegenGWAS/07_Data_mining/6_IN_GreeNC_lincRNA.csv",
    header = FALSE
)

In [None]:
# Name the two lincRNA columns; "latest_transcript" is the merge key used later.
data.table::setnames(lincrna, c("latest_transcript", "is_lincRNA"))

In [None]:
# Name the two a priori columns; "attributes.Name" is the merge key used later.
data.table::setnames(apriori, c("attributes.Name", "a_priori"))

In [None]:
# Left join: keep every row of files_combined and attach is_lincRNA where
# latest_transcript matches (unmatched rows get NA).
files_combined <- merge(
    x = files_combined,
    y = lincrna,
    by = "latest_transcript",
    all.x = TRUE
)

In [None]:
# Left join: keep every row of files_combined and attach a_priori where
# attributes.Name matches (unmatched rows get NA).
files_combined <- merge(
    x = files_combined,
    y = apriori,
    by = "attributes.Name",
    all.x = TRUE
)

In [None]:
# Drop exact duplicate rows; the left joins above can repeat a row when a key
# occurs more than once in the annotation table.
files_combined <- unique(files_combined)

## Merge with data on peak p-values for windows and restrict to top X windows for downstream summary stats

### Load ART results and get ready to merge

In [None]:
# Load the peak-p-value window table from the working directory.
peak_p <- data.table::fread(input = "6-OUT_Peak_P_a1_500bp_window.csv")

In [None]:
# Rename columns 1, 2, 7 and 8 by position to the key names used for merging.
colnames(peak_p)[c(1:2, 7:8)] <- c("CHR", "POS", "batch_i", "Method")

In [None]:
# Remove the file_path column (assigning NULL deletes a column).
peak_p$file_path <- NULL

In [None]:
# Start from an empty data frame named df.
df <- data.frame()

In [None]:
# Build an identifier for each QTL by joining the window peak and the batch with "_".
# NOTE(review): `peak_p$batch` is not one of the names assigned above; it presumably
# resolves to `batch_i` through `$` partial matching unless peak_p also has a column
# named `batch` — confirm.
peak_p$QTLID <- paste0(peak_p$window_peak, "_", peak_p$batch)

In [None]:
# For each QTLID keep only the row(s) whose n_SNPs equals the largest n_SNPs
# seen for that QTLID; ties are all kept and row order is preserved.
#
# ave() returns, for every row, the maximum n_SNPs within that row's QTLID
# group. One vectorised comparison then replaces the row-by-row loop, which
# rescanned the whole table and regrew `df` on each iteration, and which
# iterated over 1:0 when peak_p was empty.
max_n_SNPs_per_QTL <- ave(peak_p$n_SNPs, peak_p$QTLID, FUN = max)
rows_at_group_max <- which(peak_p$n_SNPs == max_n_SNPs_per_QTL)

# Coerce to a plain data frame, matching the class `df` had when rows were
# appended to an empty data.frame().
df <- as.data.frame(peak_p[rows_at_group_max, ])

In [None]:
# Drop exact duplicate rows from the retained peaks.
df <- unique(df)

In [None]:
# From here on peak_p refers to the filtered table held in df.
peak_p <- df

In [None]:
# Bind the combined association table to the name used for the merges below.
data_reloaded <- files_combined

In [None]:
# Inner join on position, batch, method and trait: only associations that have
# a matching row in peak_p are kept.
data_reloaded <- merge(
    x = data_reloaded,
    y = peak_p,
    by = c("CHR", "POS", "batch_i", "Method", "raw_trait_name")
)

In [None]:
library(foreach)
library(stringr)

In [None]:
# Keep the merged table under the name `data` as well.
# NOTE(review): `data` is not used again in the visible part of this file — confirm it is needed.
data <- data_reloaded

### Exclude shoot at week 2 because GEMMA results look strange and distribution is extremely sparse

### Add heritability data to table

Add heritability to our earlier output and save it out

In [None]:
# Left join: attach each trait's max_h2 while keeping every row of data_reloaded.
data_reloaded <- merge(
    x = data_reloaded,
    y = max_h2_table,
    by = "raw_trait_name",
    all.x = TRUE
)

In [None]:
# Write the final table to a CSV file in the working directory.
data.table::fwrite(
    x = data_reloaded,
    file = "8-OUT_QTLs_pass2_GMMAT_ART_only.csv"
)