# Build spreadsheet of all results from all methods

In [None]:
library(data.table)
library(tools)
library(stringr)
library(tidyr)

## Start with list of raw traits

In [None]:
# Remove every element of `paths` that matches at least one of the regular
# expressions in `patterns`, and return what is left.
#
# A logical mask (`!grepl()`) is used instead of negative indices on purpose:
# `x[-grep(p, x)]` returns an EMPTY vector whenever `p` matches nothing,
# because `x[-integer(0)]` selects zero elements. A single unmatched pattern
# would therefore silently discard every trait.
drop_matching <- function(paths, patterns) {
    for (pattern in patterns) {
        paths <- paths[!grepl(pattern, paths)]
    }
    paths
}

# All files below the phenotype folder (searched recursively) whose name
# matches "\\.header", returned with their relative paths
traits <- list.files("../05_Parsing_phenodata/pheno_files/",
                     pattern = "\\.header",
                     full.names = TRUE,
                     recursive = TRUE)

# Name fragments of files that must be left out of the raw-trait list
excluded_patterns <- c("copies",
                       "binarized",
                       "boxcox",
                       "rbinv",
                       "threshold",
                       "diameter",
                       "lntrans",
                       "wk2_wk5", # These are duplicates of 2w_5w versions
                       "PC5",
                       "PC6",
                       "PC11",
                       "PC12",
                       "PC13")

traits <- drop_matching(traits, excluded_patterns)

In [None]:
# One row per remaining trait file; the only column so far is the file path
data <- data.table(raw_trait_path = traits)

In [None]:
# Trait ID = file path with up to two extensions removed, then the directory dropped
path_without_extensions <- file_path_sans_ext(file_path_sans_ext(data$raw_trait_path))
data$raw_trait <- basename(path_without_extensions)

## Add traits from GEMMA

In [None]:
# Files directly inside the GEMMA output folder whose name matches "assoc"
gemma_list <- list.files(path = "../07_GEMMA/output/", pattern = "assoc", full.names = TRUE)

In [None]:
# Long table: one row per GEMMA result file
gemma_table <- as.data.frame(gemma_list)
names(gemma_table)[1] <- "gemma_path"

### Parse out batch and trait IDs from filenames

In [None]:
# Start from the path with up to two extensions removed; the batch suffix
# and directory are still attached at this point
path_minus_one_ext <- file_path_sans_ext(gemma_table$gemma_path)
gemma_table$raw_trait <- file_path_sans_ext(path_minus_one_ext)

In [None]:
# File-name suffixes that identify which GEMMA run ("batch") a result file
# belongs to. Each entry is later searched for in the file name and stripped
# from it, so every suffix only needs to be listed once.
# "_nothreshold_unique_keepoutliers_rbinv_outa3" used to appear twice; the
# second occurrence could never match and has been removed.
batches <- c("_nothreshold_unique_keepoutliers_rbinv_outa3",
             "_boxcox_outa1",
             "_nothreshold_nodupfilter_keepoutliers_boxcox_outa2",
             "_nothreshold_nodupfilter_keepoutliers_boxcox_outa3",
             "_nothreshold_unique_keepoutliers_boxcox_outa3",
             "_nothreshold_unique_rmoutliers_boxcox_outa3",
             "_untransformed_outa1",
             "_nothreshold_unique_keepoutliers_rbinv_outa2",
             "_nothreshold_unique_rmoutliers_boxcox_outa2",
             "_nothreshold_unique_keepoutliers_boxcox_outa2",
             "__unique_rmoutliers_boxcox_outa2",
             "__unique_rmoutliers_boxcox_outa3"
#              "_threshold-0.198412874212136__unique_rmoutliers_boxcox_outa3",
#              "_threshold0.708565670955823__unique_rmoutliers_boxcox_outa2",
#              "_threshold0.364036344426957__unique_rmoutliers_boxcox_outa2",
#              "_threshold-0.0145550368327804__unique_rmoutliers_boxcox_outa3",
#              "_threshold0.00047973016093833__unique_rmoutliers_boxcox_outa3",
#              "_threshold0.000918436893207641__unique_rmoutliers_boxcox_outa3"
             )

In [None]:
# The batch column has to exist before the loop below can fill it in
gemma_table$batch <- NA

In [None]:
# For every known suffix: tag the matching rows with that batch and reduce
# their raw_trait to the bare trait ID
for (batch in batches) {
    print(batch)

    # TRUE for rows whose name still carries this suffix
    in_batch <- grepl(batch, gemma_table$raw_trait)

    gemma_table$batch[in_batch] <- batch

    # Remove the suffix first, then the directory part of the path
    without_suffix <- gsub(batch, "", gemma_table$raw_trait[in_batch])
    gemma_table$raw_trait[in_batch] <- basename(without_suffix)
}

In [None]:
# Rows whose raw_trait contains a "_threshold" tag
lines_with_threshold <- which(grepl("_threshold", gemma_table$raw_trait))

In [None]:
# Split each affected path at its first "_threshold":
# column 1 = everything before the tag, column 2 = everything after it
parsed_threshold_names <- str_split_fixed(gemma_table$gemma_path[lines_with_threshold], "_threshold", 2)

In [None]:
# Batch = "_threshold" plus the rest of the file name, minus up to two extensions.
# The column is indexed directly (`df$col[rows] <-`) rather than through a row
# subset (`df[rows, ]$col <-`): when no row has a threshold tag, paste0() still
# returns the length-1 string "_threshold", and assigning that into a 0-row
# subset raises an error. Direct indexing makes the empty case a no-op.
gemma_table$batch[lines_with_threshold] <- 
file_path_sans_ext(
    file_path_sans_ext(
        paste0("_threshold", parsed_threshold_names[, 2])
        )
    )

In [None]:
# Trait ID = file name part that precedes the "_threshold" tag
gemma_table$raw_trait[lines_with_threshold] <- basename(parsed_threshold_names[, 1])

### Turn long data wide

In [None]:
# Prefix every batch label with the method name so the resulting column
# names show which method they came from
gemma_batch_labels <- paste0("GEMMA", gemma_table$batch)
gemma_table$batch <- gemma_batch_labels

In [None]:
# One row per raw_trait, one column per batch, cells hold the result file path
gemma_table_wide <- tidyr::spread(gemma_table, key = batch, value = gemma_path)

## Add traits from GMMAT

In [None]:
# Files anywhere below the GMMAT results folder whose name matches "glmm"
gmmat_list <- list.files(path = "../08_GMMAT/Results/",
                         pattern = "glmm",
                         recursive = TRUE,
                         full.names = TRUE)

In [None]:
# Long table: one row per GMMAT result file
gmmat_table <- as.data.frame(gmmat_list)
names(gmmat_table)[1] <- "gmmat_path"

### Parse out raw traits from file names/paths

In [None]:
# Path with up to two extensions removed; suffix and directory are still attached
gmmat_minus_one_ext <- file_path_sans_ext(gmmat_table$gmmat_path)
gmmat_table$raw_trait <- file_path_sans_ext(gmmat_minus_one_ext)

In [None]:
head(gmmat_table$raw_trait, n = 6L)

In [None]:
# Cut each path at "/" into at most six pieces and keep the fifth one
# NOTE(review): presumably the sub-folder under Results/ - confirm against the paths
gmmat_path_pieces <- str_split_fixed(gmmat_table$gmmat_path, pattern = "/", n = 6)
batch_criteria_A <- gmmat_path_pieces[, 5]

In [None]:
# File-name suffixes that identify which GMMAT run ("batch") a result file
# belongs to. Each suffix only needs to be listed once.
# NOTE(review): "_threshold-0.198412874212136.binary_logitlink" was listed twice
# and the repeat has been dropped. If the second entry was meant to be a
# different threshold, add that suffix here.
batches <- c("_binarized_logitlink",
             "_nothreshold_duplicates_binarized.binary_logitlink",
             "_threshold-0.198412874212136.binary_logitlink",
             "_threshold0.708565670955823.binary_logitlink",
             "_threshold0.364036344426957.binary_logitlink")

In [None]:
# The batch column has to exist before the loop below can fill it in
gmmat_table$batch <- NA

In [None]:
# For every known suffix: reduce the matching rows' raw_trait to the bare
# trait ID and label them with their batch.
# The suffixes contain "." (and "-"), so they are matched literally with
# fixed = TRUE; as regular expressions every "." would match any character.
for(batch in batches){

    # TRUE for rows whose name still carries this suffix
    in_batch <- grepl(batch, gmmat_table$raw_trait, fixed = TRUE)

    # Remove the suffix first, then the directory part of the path
    without_suffix <- gsub(batch, "", gmmat_table$raw_trait[in_batch], fixed = TRUE)
    gmmat_table$raw_trait[in_batch] <- basename(without_suffix)

    # Batch label = method name + suffix + "-" + the row's batch_criteria_A value
    gmmat_table$batch[in_batch] <- paste0("GMMAT", batch, "-", batch_criteria_A[in_batch])
}

### Store file prefix instead of filename

<div class="alert alert-block alert-warning"> We store the file prefix instead of the full file name because each run produces two files (score and Wald). Keeping both file names would give two rows per run and prevent us from having the one row per trait that we need.</div>


In [None]:
# Replace each file name by its prefix (up to two extensions removed)
gmmat_prefix_minus_one_ext <- file_path_sans_ext(gmmat_table$gmmat_path)
gmmat_table$gmmat_path <- file_path_sans_ext(gmmat_prefix_minus_one_ext)

In [None]:
# Keep the first copy of every row and drop exact repeats
gmmat_table <- gmmat_table[!duplicated(gmmat_table), , drop = FALSE]

In [None]:
# Save the long-format GMMAT table
data.table::fwrite(x = gmmat_table, file = "1_GMMAT_table.csv")

### Turn long data wide

In [None]:
# One row per raw_trait, one column per batch, cells hold the result file prefix
gmmat_table_wide <- tidyr::spread(gmmat_table, key = batch, value = gmmat_path)

In [None]:
# Swap every "." in the column names for "--"
gmmat_wide_names <- colnames(gmmat_table_wide)
colnames(gmmat_table_wide) <- gsub("\\.", "--", gmmat_wide_names)

## Add traits from MTMCSKAT

### Build list with one file for each scaffold and trait/batch combo

#### Make lists of all files

In [None]:
# Full paths of all files below `dir` (searched recursively) whose name matches "csv"
list_skat_csv <- function(dir) {
    list.files(dir, full.names = TRUE, recursive = TRUE, pattern = "csv")
}

skat_list.1 <- list_skat_csv("/mnt/data/NSF_GWAS/Results/SKAT/mtskat_with_PCs_added_over_callus_and_PCs")

skat_list.2 <- list_skat_csv("/mnt/data/NSF_GWAS/Results/SKAT/mtskat_noPC_over_callus_shoot_and_PCs")

In [None]:
skat_list.3 <- list.files("/mnt/data/NSF_GWAS/Results/SKAT/mtmcskat_SLURMS_6PC/",
                          pattern = "csv", recursive = TRUE, full.names = TRUE)

skat_list.4 <- list.files("/mnt/data/NSF_GWAS/Results/SKAT/mtmcskat_SLURMS_7K/",
                          pattern = "csv", recursive = TRUE, full.names = TRUE)

In [None]:
# Pool the four result folders into a single vector of file paths
skat_list <- c(skat_list.1,
               skat_list.2,
               skat_list.3,
               skat_list.4)

#### Parse trait and covariates out from files

In [None]:
# Long table: one row per SKAT result file
skat_table_by_scaff <- as.data.frame(skat_list)
names(skat_table_by_scaff)[1] <- "skat_path"

In [None]:
# Cut each file name at "-" into at most three pieces, keep the second one,
# and remove up to two extensions from it
skat_name_pieces <- str_split_fixed(basename(skat_table_by_scaff$skat_path), pattern = "-", n = 3)
skat_table_by_scaff$raw_trait <- file_path_sans_ext(file_path_sans_ext(skat_name_pieces[, 2]))

In [None]:
# Batch = seventh "/"-separated piece of the full path
skat_path_pieces <- str_split_fixed(skat_table_by_scaff$skat_path, pattern = "/", n = 8)
skat_table_by_scaff$batch <- skat_path_pieces[, 7]
head(skat_table_by_scaff)

In [None]:
# Text between the first "_Chr" of the path and the next "."
after_chr_tag <- str_split_fixed(skat_table_by_scaff$skat_path, pattern = "_Chr", n = 2)[, 2]
chr_list <- after_chr_tag
chr_list <- str_split_fixed(chr_list, pattern = "\\.", n = 2)[, 1]
length(chr_list)

### Collapse down to list for each trait/batch, aggregating over scaffolds

#### Produce long table

In [None]:
# File name up to (not including) the first "_Chr"
skat_file_names <- basename(skat_table_by_scaff$skat_path)
skat_table_by_scaff$skat_prefix <- str_split_fixed(skat_file_names, pattern = "_Chr", n = 2)[, 1]

In [None]:
# Put the directory back in front so the prefix is a usable path
skat_table_by_scaff$skat_prefix <- paste(dirname(skat_table_by_scaff$skat_path),
                                         skat_table_by_scaff$skat_prefix,
                                         sep = "/")

In [None]:
# Work on a copy so the per-scaffold table stays available
skat_table <- skat_table_by_scaff

In [None]:
# Without the per-file path, rows that share a prefix become identical
skat_table$skat_path <- NULL

In [None]:
# Keep the first copy of every row and drop exact repeats
skat_table <- skat_table[!duplicated(skat_table), , drop = FALSE]

#### Make wide

In [None]:
# One row per raw_trait, one column per batch, cells hold the file prefix
skat_table_wide <- tidyr::spread(skat_table, key = batch, value = skat_prefix)

## Merge all tables

In [None]:
# Join on raw_trait, keeping rows that appear in only one of the two tables
merged <- merge(x = data,
                y = gemma_table_wide,
                by = "raw_trait",
                all.x = TRUE,
                all.y = TRUE)

In [None]:
# Add the GMMAT columns the same way
merged <- merge(x = merged,
                y = gmmat_table_wide,
                by = "raw_trait",
                all.x = TRUE,
                all.y = TRUE)

In [None]:
# Add the SKAT columns the same way
merged <- merge(x = merged,
                y = skat_table_wide,
                by = "raw_trait",
                all.x = TRUE,
                all.y = TRUE)

Clean up the column names a little by stripping trailing file extensions

In [None]:
# Remove up to two trailing extensions from every column name
merged_names <- file_path_sans_ext(colnames(merged))
colnames(merged) <- file_path_sans_ext(merged_names)

In [None]:
# Save the combined results table
fwrite(x = merged, file = "1_Table_results_by_method_a2.1_keep_full_gmmat_batchname.csv")