# Build spreadsheet of all results from all methods

In [None]:
library(data.table)
library(tools)
library(stringr)
library(tidyr)

## Start with list of raw traits

In [None]:
# Remove every path that contains any of the given literal substrings.
#
# paths:    character vector of file paths.
# patterns: character vector of literal substrings (not regular expressions).
# Returns `paths` without the matching entries.
#
# A logical mask is used on purpose: `paths[-grep(p, paths)]` returns an EMPTY
# vector whenever `p` matches nothing (negating integer(0) selects nothing),
# which would silently discard every trait.
drop_paths_matching <- function(paths, patterns) {
    for (pattern in patterns) {
        paths <- paths[!grepl(pattern, paths, fixed = TRUE)]
    }
    paths
}

# All "*.header" files found (recursively) below the phenotype directory.
traits <- list.files("../05_Parsing_phenodata/pheno_files/",
                     pattern = "\\.header",
                     full.names = TRUE,
                     recursive = TRUE)

# Keep only the raw traits: drop every path whose name marks it as one of the
# variants listed below.
traits <- drop_paths_matching(
    traits,
    c("copies",
      "binarized",
      "boxcox",
      "rbinv",
      "threshold",
      "diameter",
      "lntrans",
      "wk2_wk5") # These are duplicates of 2w_5w versions
)

In [None]:
# One-column table of raw trait header paths; the single column is renamed
# by reference to "raw_trait_path".
data <- as.data.table(traits)
setnames(data, "raw_trait_path")

In [None]:
# Trait name = path with its last two extensions removed, then reduced to the
# file name (directory part dropped).
paths_without_ext <- file_path_sans_ext(file_path_sans_ext(data$raw_trait_path))
data$raw_trait <- basename(paths_without_ext)

## Add traits from GMMAT

In [None]:
# Every file below the GMMAT results directory whose name contains "glmm",
# returned with its full (relative) path.
gmmat_list <- list.files(
    path = "../08_GMMAT/Results/batch5_maf01_geno10_ART/",
    pattern = "glmm",
    recursive = TRUE,
    full.names = TRUE
)

In [None]:
# Long table of GMMAT result files: one row per file, path in "gmmat_path".
gmmat_table <- data.frame(gmmat_path = gmmat_list)

### Parse out raw traits from file names/paths

In [None]:
# Starting point for the trait name: the result path with its last two
# extensions removed (the directory part is still attached at this stage).
gmmat_stems <- file_path_sans_ext(gmmat_table$gmmat_path)
gmmat_table$raw_trait <- file_path_sans_ext(gmmat_stems)

In [None]:
# Split each path on "/" into at most 6 pieces and keep the 5th piece.
# NOTE(review): presumably this is the sub-directory directly below the batch
# folder - confirm against the actual directory layout.
gmmat_path_pieces <- str_split_fixed(gmmat_table$gmmat_path, "/", 6)
batch_criteria_A <- gmmat_path_pieces[, 5]

In [None]:
# Batch suffixes that may be embedded in GMMAT result file names. Each one is
# searched for in the file name, removed from it to recover the raw trait
# name, and reused as part of the batch label.
# Every suffix is listed exactly once (a repeated entry would only trigger a
# second pass that matches nothing).
batches <- c(
    "_binarized_logitlink",
    "_nothreshold_duplicates_binarized.binary_logitlink",
    "_threshold-0.198412874212136.binary_logitlink",
    "_threshold0.708565670955823.binary_logitlink",
    "_threshold0.364036344426957.binary_logitlink",
    "_threshold-0.198412874212136__duplicates_binarized.binary_logitlink",
    "_threshold0.000918436893207641__duplicates_binarized.binary_logitlink",
    "_threshold0.000918436893207641.binary_logitlink",
    "_threshold-0.0145550368327804.binary_logitlink",
    "_threshold-0.0145550368327804__duplicates_binarized.binary_logitlink"
)

In [None]:
# Batch label per result file; filled in below for rows whose name contains a
# known batch suffix. Initialised as a character NA so the column has the same
# type whether or not any suffix matches.
gmmat_table$batch <- NA_character_

In [None]:
# For every known batch suffix:
#   1. find the rows whose (extension-stripped) path contains the suffix,
#   2. remove the suffix and the directory part, leaving the raw trait name,
#   3. label those rows "GMMAT<suffix>-<5th path component>".
# Rows that contain none of the suffixes are left untouched (batch stays NA).
for (batch in batches) {

    # fixed = TRUE: the suffixes are literal text. Read as regular expressions,
    # the "." in e.g. "0.198..." or ".binary" would match any character.
    lines_this_batch <- grep(batch, gmmat_table$raw_trait, fixed = TRUE)

    trait_names <- gsub(batch,
                        "",
                        gmmat_table$raw_trait[lines_this_batch],
                        fixed = TRUE)

    gmmat_table$raw_trait[lines_this_batch] <- basename(trait_names)

    gmmat_table$batch[lines_this_batch] <- paste0("GMMAT", batch, "-", batch_criteria_A[lines_this_batch])
}

### Store file prefix instead of filename

<div class="alert alert-block alert-warning"> We need the file prefix instead of file name because there are two files (score and Wald) for each run... and this would get in the way of us having one row per trait as we need.</div>


In [None]:
# Replace each file path by its prefix: the path with its last two extensions
# removed.
gmmat_prefix <- file_path_sans_ext(gmmat_table$gmmat_path)
gmmat_table$gmmat_path <- file_path_sans_ext(gmmat_prefix)

In [None]:
# Keep only the first occurrence of each identical row.
gmmat_table <- gmmat_table[!duplicated(gmmat_table), , drop = FALSE]

### Turn long data wide

On the first attempt, `tidyr::spread` raised an error (the message was not preserved in this notebook).

Inspect data

In [None]:
# Write the long-format GMMAT table to disk so it can be inspected.
data.table::fwrite(
    x = gmmat_table,
    file = "5-OUT_GMMAT_table.csv"
)

In [None]:
# Long -> wide: one column per distinct value of `batch`, holding the matching
# `gmmat_path`; the remaining columns identify the rows.
gmmat_table_wide <- tidyr::spread(
    data = gmmat_table,
    key = batch,
    value = gmmat_path
)

## Merge all tables

In [None]:
# Full outer join on the trait name: traits found in only one of the two
# tables are kept.
merged <- merge(
    x = data,
    y = gmmat_table_wide,
    by = "raw_trait",
    all = TRUE
)

Clean up these column names a little

In [None]:
# Strip up to two trailing extensions from every column name.
cleaned_names <- file_path_sans_ext(colnames(merged))
cleaned_names <- file_path_sans_ext(cleaned_names)
colnames(merged) <- cleaned_names

In [None]:
# Write the merged per-trait results table to disk.
fwrite(
    x = merged,
    file = "5-OUT_Table_results_by_method.csv"
)