# Merge `fastStructure` results across replicates

In this notebook, I make two types of plots (distruct plots and likelihood plots) using 10 replicates for each K from 3 to 13, with the 72k SNP set (the `SNPs72k` files). These will help inform the ideal number of subpopulations for downstream analysis.

## Build list of files of interest with meanQ, produce mean matrices

In [None]:
# List everything under ~/proj/fastStructure/ whose name matches "SNPs72k",
# then keep only the paths that also contain "meanQ".
files <- grep("meanQ",
              list.files("~/proj/fastStructure/",
                         pattern = "SNPs72k",
                         full.names = TRUE),
              value = TRUE)

In [None]:
library(stringr)
# Split each path on literal dots into at most three fields (character matrix,
# one row per file). The second field is what later cells use as K.
file_df <- str_split_fixed(string = files, pattern = "\\.", n = 3)


In [None]:
# Pair every file path with the K parsed from its name (second dot-separated
# field of the path, see file_df).
# stringsAsFactors = FALSE keeps `filepath` a plain character vector on every
# R version; before R 4.0 the conversion would yield factors, which
# read.table() does not accept as a file argument.
filepaths_and_K <- data.frame(filepath = files,
                              K = file_df[, 2],
                              stringsAsFactors = FALSE)

In [None]:
# Create the output directory for the per-K mean matrices. Guarded so that
# re-running the cell does not warn about an already existing directory,
# while a genuine creation failure still produces dir.create()'s warning.
if (!dir.exists("fastStructure_mean_results")) {
    dir.create("fastStructure_mean_results")
}

In [None]:
library(data.table)

In [None]:
library(pophelper)

In [None]:
# For every K: read the meanQ matrix of each replicate, average the matrices
# element-wise, write the mean matrix to fastStructure_mean_results/, and draw
# a plot of it with pophelper (exported into the working directory).
#
# NOTE(review): the element-wise mean assumes that column c of every replicate
# refers to the same cluster. If replicate runs can number their clusters in a
# different order (label switching), the runs need to be aligned before
# averaging (pophelper::alignK is a candidate) -- TODO confirm.
for (i in levels(factor(filepaths_and_K$K))) {
    files_this_K <- filepaths_and_K[which(filepaths_and_K$K == i), ]

    # One data frame per replicate; as.character() also copes with a factor
    # `filepath` column.
    list_of_matrices <- lapply(as.character(files_this_K$filepath), read.table)

    # Fail early, naming the offending files, if the replicates do not all
    # share the shape of the first one.
    same_shape <- vapply(
        list_of_matrices,
        function(m) identical(dim(m), dim(list_of_matrices[[1]])),
        logical(1)
    )
    if (!all(same_shape)) {
        stop("Replicate meanQ matrices for K = ", i,
             " differ in dimensions: ",
             paste(files_this_K$filepath[!same_shape], collapse = ", "),
             call. = FALSE)
    }

    # Calculate elementwise mean matrix from a list of matrices
    # https://stackoverflow.com/questions/19218475/element-wise-mean-over-list-of-matrices
    mean_matrix <- Reduce("+", list_of_matrices) / length(list_of_matrices)

    outpath <- file.path("fastStructure_mean_results",
                         paste0("SNPs72kfiltered_mean.", i, ".meanQ"))

    print(paste("Writing to", outpath))

    # Tab-separated, no header and no row names.
    fwrite(mean_matrix,
           outpath,
           sep = "\t",
           row.names = FALSE,
           col.names = FALSE)

    # Re-read the mean matrix through pophelper and plot it, sorting
    # individuals with sortind = "all".
    plotQ(readQ(outpath),
          exportpath = getwd(),
          sortind = "all")
}

## Plot mean and stdev of likelihood across replicates and values of K

In [None]:
# List everything under ~/proj/fastStructure/ whose name matches "SNPs72k" and
# keep only the paths that contain "log".
files <- list.files("~/proj/fastStructure/", pattern = "SNPs72k", full.names = TRUE)
files <- files[grepl("log", files)]

# Parse K from the log file names themselves (second dot-separated field of
# the file name). Previously K was taken from `file_df`, which was built from
# the meanQ file list, so the K values were only right if both listings
# happened to have the same length and order.
# basename() keeps dots in directory names from shifting the fields.
filepaths_and_K <- data.frame(
    filepath = files,
    K = str_split_fixed(basename(files), "\\.", 3)[, 2],
    stringsAsFactors = FALSE
)

In [None]:
# Empty summary table with one row per distinct K and three NA-filled columns
# (K, L_mean, L_sd) that are filled in later.
df <- as.data.frame(
    matrix(NA,
           nrow = nlevels(factor(filepaths_and_K$K)),
           ncol = 3,
           dimnames = list(NULL, c("K", "L_mean", "L_sd")))
)

In [None]:
# Extract the marginal likelihood value(s) reported in one log file.
# Returns one number per line containing "Marginal Likelihood = ", or
# numeric(0) with a warning when the log has no such line.
read_marginal_likelihood <- function(log_path) {
    log_lines <- readLines(log_path)
    hits <- log_lines[grepl("Marginal Likelihood = ", log_lines)]

    if (length(hits) == 0) {
        warning("No 'Marginal Likelihood' line found in ", log_path,
                call. = FALSE)
        return(numeric(0))
    }

    # The value is taken as the 4th space-separated field, i.e. the line is
    # assumed to read "Marginal Likelihood = <value>".
    as.numeric(str_split_fixed(hits, " ", 4)[, 4])
}

# For every K, collect the likelihood of each replicate log and store the
# mean and standard deviation across replicates in row i of df.
K_levels <- levels(factor(filepaths_and_K$K))

for (i in seq_along(K_levels)) {
    files_this_K <- filepaths_and_K[which(filepaths_and_K$K == K_levels[i]), ]

    list_of_likelihoods <- lapply(as.character(files_this_K$filepath),
                                  read_marginal_likelihood)

    # Summary statistics over all replicates of this K; sd is NA when only a
    # single likelihood value is available.
    mean_likelihoods <- mean(as.numeric(unlist(list_of_likelihoods)))
    sd_likelihoods <- sd(as.numeric(unlist(list_of_likelihoods)))

    df$K[i] <- K_levels[i]
    df$L_mean[i] <- mean_likelihoods
    df$L_sd[i] <- sd_likelihoods
}

In [None]:
# Attach df so that its columns (K, L_mean, L_sd) can be referenced by bare
# name. Note: attach() places a copy of the columns on the search path, so
# changes made to df afterwards are not visible through the bare names, and a
# global variable with the same name takes precedence over an attached column.
attach(df)

In [None]:
# Plot the mean likelihood for each K with error bars of +/- one standard
# deviation across replicates.
# The columns are read from `df` via with(), so the plot always reflects the
# current contents of df and cannot be masked by a global variable named K.
with(df, {
    k_values <- as.numeric(as.character(K))
    lower <- L_mean - L_sd
    upper <- L_mean + L_sd

    plot(k_values, L_mean,
         # L_mean is included and NAs are dropped so that a K with an NA
         # standard deviation (single replicate) does not make ylim non-finite.
         ylim = range(c(lower, upper, L_mean), na.rm = TRUE),
         pch = 19, xlab = "K (# subpopulations)", ylab = "log(L(X|K))",
         main = "Log Marginal likelihood of data given K subpopulations"
    )

    # Vertical error bars with flat caps at both ends (angle = 90, code = 3).
    arrows(x0 = k_values,
           y0 = lower,
           x1 = k_values,
           y1 = upper,
           length = 0.05,
           angle = 90,
           code = 3)
})