# Inspect and summarize SLURM benchmarks

## Overview

## Load, merge and compare benchmark summary `csv`s

We need a function to compare our SLURM summary output files and highlight differences in a table.

In [1]:
library(dplyr)
library(tidyr)

#' Compare benchmark summary CSVs and tabulate the parameters that differ
#'
#' Reads every `*.csv` file located directly in `directory`, stacks them in
#' long form (one row per file/parameter pair) and returns a wide table
#' restricted to the parameters whose value is not identical in all files.
#'
#' @param directory Path to the directory holding the summary CSV files. Each
#'   file must provide `Parameter` and `Value` columns.
#' @param exclude_pattern Regular expression; files whose *name* matches it
#'   are skipped. Defaults to `"1000000"`, the filter that was previously
#'   hard-coded. Use `NULL` to keep every file.
#' @return A tibble with one row per differing parameter: a `Parameter`
#'   column followed by one column per source file, named by file basename.
#'   Zero rows are returned when all files agree on every parameter.
read_and_summarize_differences <- function(directory,
                                           exclude_pattern = "1000000") {
  files <- list.files(directory, pattern = "\\.csv$", full.names = TRUE)
  if (!is.null(exclude_pattern)) {
    # Match on the file name only: matching the full path would drop every
    # file whenever the directory path itself contains the pattern.
    files <- files[!grepl(exclude_pattern, basename(files))]
  }
  if (length(files) == 0) {
    stop("No summary CSV files found in '", directory, "'.", call. = FALSE)
  }

  # Read each file and tag its rows with the file they came from
  data_list <- lapply(files, function(file) {
    data <- read.csv(file, stringsAsFactors = FALSE)
    data$Source <- basename(file)
    data
  })
  combined_data <- bind_rows(data_list)

  # A parameter differs when it takes more than one value across files, or
  # when at least one file does not report it at all.
  n_sources <- length(files)
  differing <- combined_data %>%
    group_by(Parameter) %>%
    summarise(
      differs = n_distinct(Value) > 1 | n_distinct(Source) < n_sources,
      .groups = "drop"
    ) %>%
    filter(differs) %>%
    pull(Parameter)

  # One row per differing parameter, one column per source file
  combined_data %>%
    filter(Parameter %in% differing) %>%
    select(Parameter, Source, Value) %>%
    pivot_wider(names_from = Source, values_from = Value)
}


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union




In [2]:
# Directory holding the per-run "*-summary.csv" files of the 500k-window
# benchmarks (absolute, cluster-specific path).
directory <- "/expanse/lustre/projects/jhu152/naglemi/mwas/CpGWAS/output/500k_window/"
# Build the table of parameters that differ between runs, then display it.
summary_table <- read_and_summarize_differences(directory)
summary_table

Parameter,libd_chr1-chr1_all-libd_chr1-chr1_all-1-2000-dynamic-12corestotal-allcorepera-20240404-110552-summary.csv,libd_chr1-chr1_all-libd_chr1-chr1_all-1-2000-dynamic-1corestotal-allcorepera-20240404-110552-summary.csv,libd_chr1-chr1_all-libd_chr1-chr1_all-2001-4000-dynamic-12corestotal-allcorepera-20240404-110553-summary.csv,libd_chr1-chr1_all-libd_chr1-chr1_all-2001-4000-dynamic-1corestotal-allcorepera-20240404-110552-summary.csv,libd_chr1-chr1_all-libd_chr1-chr1_all-4001-6000-dynamic-12corestotal-allcorepera-20240404-110553-summary.csv,libd_chr1-chr1_all-libd_chr1-chr1_all-4001-6000-dynamic-1corestotal-allcorepera-20240404-110553-summary.csv,libd_chr1-chr1_all-libd_chr1-chr1_all-6001-8000-dynamic-12corestotal-allcorepera-20240404-110553-summary.csv,libd_chr1-chr1_all-libd_chr1-chr1_all-6001-8000-dynamic-1corestotal-allcorepera-20240404-110553-summary.csv,libd_chr1-chr1_all-libd_chr1-chr1_all-8001-10000-dynamic-12corestotal-allcorepera-20240404-110553-summary.csv,libd_chr1-chr1_all-libd_chr1-chr1_all-8001-10000-dynamic-1corestotal-allcorepera-20240404-110553-summary.csv
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
chunk1,1,1,2001,2001,4001,4001,6001,6001,8001,8001
chunk2,2000,2000,4000,4000,6000,6000,8000,8000,10000,10000
num_cores,12,1,12,1,12,1,12,1,12,1
tag,libd_chr1-chr1_all-1-2000-dynamic-12corestotal-allcorepera-20240404-110552,libd_chr1-chr1_all-1-2000-dynamic-1corestotal-allcorepera-20240404-110552,libd_chr1-chr1_all-2001-4000-dynamic-12corestotal-allcorepera-20240404-110553,libd_chr1-chr1_all-2001-4000-dynamic-1corestotal-allcorepera-20240404-110552,libd_chr1-chr1_all-4001-6000-dynamic-12corestotal-allcorepera-20240404-110553,libd_chr1-chr1_all-4001-6000-dynamic-1corestotal-allcorepera-20240404-110553,libd_chr1-chr1_all-6001-8000-dynamic-12corestotal-allcorepera-20240404-110553,libd_chr1-chr1_all-6001-8000-dynamic-1corestotal-allcorepera-20240404-110553,libd_chr1-chr1_all-8001-10000-dynamic-12corestotal-allcorepera-20240404-110553,libd_chr1-chr1_all-8001-10000-dynamic-1corestotal-allcorepera-20240404-110553
time_started,2024-04-04 15:38:53,2024-04-04 14:59:51,2024-04-04 15:41:08,2024-04-04 14:59:51,2024-04-04 15:41:08,2024-04-04 14:59:51,2024-04-04 15:43:22,2024-04-04 15:00:53,2024-04-04 15:44:54,2024-04-04 15:00:53
time_finished,2024-04-04 16:50:59,2024-04-04 15:57:41,2024-04-04 16:57:33,2024-04-04 15:58:22,2024-04-04 17:01:02,2024-04-04 15:58:32,2024-04-04 17:11:35,2024-04-04 15:54:02,2024-04-04 17:00:31,2024-04-04 16:00:21
runtime,01:12:06,00:57:49,01:16:24,00:58:30,01:19:53,00:58:40,01:28:13,00:53:09,01:15:37,00:59:27
amount_RAM,0.245569724589586,0.245569720864296,0.245569724589586,0.245569720864296,0.245569720864296,0.245569720864296,0.245569724589586,0.245569724589586,0.245569724589586,0.245569724589586
number_cores,12,1,12,1,12,1,12,1,12,1
scaffold_ID,libd_chr1-chr1_all-libd_chr1-chr1_all-1-2000-dynamic-12corestotal-allcorepera-20240404-110552,libd_chr1-chr1_all-libd_chr1-chr1_all-1-2000-dynamic-1corestotal-allcorepera-20240404-110552,libd_chr1-chr1_all-libd_chr1-chr1_all-2001-4000-dynamic-12corestotal-allcorepera-20240404-110553,libd_chr1-chr1_all-libd_chr1-chr1_all-2001-4000-dynamic-1corestotal-allcorepera-20240404-110552,libd_chr1-chr1_all-libd_chr1-chr1_all-4001-6000-dynamic-12corestotal-allcorepera-20240404-110553,libd_chr1-chr1_all-libd_chr1-chr1_all-4001-6000-dynamic-1corestotal-allcorepera-20240404-110553,libd_chr1-chr1_all-libd_chr1-chr1_all-6001-8000-dynamic-12corestotal-allcorepera-20240404-110553,libd_chr1-chr1_all-libd_chr1-chr1_all-6001-8000-dynamic-1corestotal-allcorepera-20240404-110553,libd_chr1-chr1_all-libd_chr1-chr1_all-8001-10000-dynamic-12corestotal-allcorepera-20240404-110553,libd_chr1-chr1_all-libd_chr1-chr1_all-8001-10000-dynamic-1corestotal-allcorepera-20240404-110553


## Compute core-hours per test

In [3]:
# Size of the benchmark grid: number of windows x number of alphas x chunks.
# NOTE(review): `windows` is 10000 here while the summaries are read from a
# "500k_window" directory -- confirm which window size this is meant to be.
windows <- c(10000)
n_windows <- length(windows)
# NOTE(review): seq(0.5) evaluates to 1 (it is 1:0.5), not 0.5. n_alphas is
# still 1, but if the alpha value itself is meant to be 0.5 this should
# probably be c(0.5) -- confirm intent.
alphas <- seq(0.5) # left out 0 on first run
n_alphas <- length(alphas)
chunks <- 10000
n_tests <- n_windows * n_alphas * chunks
n_tests

In [4]:
# Inspect the alpha values that entered the n_tests computation.
alphas

In [5]:
# Column names: `Parameter` followed by one column per summary file.
colnames(summary_table)

In [6]:
# Display the table: one row per differing parameter, one column per run.
summary_table

Parameter,libd_chr1-chr1_all-libd_chr1-chr1_all-1-2000-dynamic-12corestotal-allcorepera-20240404-110552-summary.csv,libd_chr1-chr1_all-libd_chr1-chr1_all-1-2000-dynamic-1corestotal-allcorepera-20240404-110552-summary.csv,libd_chr1-chr1_all-libd_chr1-chr1_all-2001-4000-dynamic-12corestotal-allcorepera-20240404-110553-summary.csv,libd_chr1-chr1_all-libd_chr1-chr1_all-2001-4000-dynamic-1corestotal-allcorepera-20240404-110552-summary.csv,libd_chr1-chr1_all-libd_chr1-chr1_all-4001-6000-dynamic-12corestotal-allcorepera-20240404-110553-summary.csv,libd_chr1-chr1_all-libd_chr1-chr1_all-4001-6000-dynamic-1corestotal-allcorepera-20240404-110553-summary.csv,libd_chr1-chr1_all-libd_chr1-chr1_all-6001-8000-dynamic-12corestotal-allcorepera-20240404-110553-summary.csv,libd_chr1-chr1_all-libd_chr1-chr1_all-6001-8000-dynamic-1corestotal-allcorepera-20240404-110553-summary.csv,libd_chr1-chr1_all-libd_chr1-chr1_all-8001-10000-dynamic-12corestotal-allcorepera-20240404-110553-summary.csv,libd_chr1-chr1_all-libd_chr1-chr1_all-8001-10000-dynamic-1corestotal-allcorepera-20240404-110553-summary.csv
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
chunk1,1,1,2001,2001,4001,4001,6001,6001,8001,8001
chunk2,2000,2000,4000,4000,6000,6000,8000,8000,10000,10000
num_cores,12,1,12,1,12,1,12,1,12,1
tag,libd_chr1-chr1_all-1-2000-dynamic-12corestotal-allcorepera-20240404-110552,libd_chr1-chr1_all-1-2000-dynamic-1corestotal-allcorepera-20240404-110552,libd_chr1-chr1_all-2001-4000-dynamic-12corestotal-allcorepera-20240404-110553,libd_chr1-chr1_all-2001-4000-dynamic-1corestotal-allcorepera-20240404-110552,libd_chr1-chr1_all-4001-6000-dynamic-12corestotal-allcorepera-20240404-110553,libd_chr1-chr1_all-4001-6000-dynamic-1corestotal-allcorepera-20240404-110553,libd_chr1-chr1_all-6001-8000-dynamic-12corestotal-allcorepera-20240404-110553,libd_chr1-chr1_all-6001-8000-dynamic-1corestotal-allcorepera-20240404-110553,libd_chr1-chr1_all-8001-10000-dynamic-12corestotal-allcorepera-20240404-110553,libd_chr1-chr1_all-8001-10000-dynamic-1corestotal-allcorepera-20240404-110553
time_started,2024-04-04 15:38:53,2024-04-04 14:59:51,2024-04-04 15:41:08,2024-04-04 14:59:51,2024-04-04 15:41:08,2024-04-04 14:59:51,2024-04-04 15:43:22,2024-04-04 15:00:53,2024-04-04 15:44:54,2024-04-04 15:00:53
time_finished,2024-04-04 16:50:59,2024-04-04 15:57:41,2024-04-04 16:57:33,2024-04-04 15:58:22,2024-04-04 17:01:02,2024-04-04 15:58:32,2024-04-04 17:11:35,2024-04-04 15:54:02,2024-04-04 17:00:31,2024-04-04 16:00:21
runtime,01:12:06,00:57:49,01:16:24,00:58:30,01:19:53,00:58:40,01:28:13,00:53:09,01:15:37,00:59:27
amount_RAM,0.245569724589586,0.245569720864296,0.245569724589586,0.245569720864296,0.245569720864296,0.245569720864296,0.245569724589586,0.245569724589586,0.245569724589586,0.245569724589586
number_cores,12,1,12,1,12,1,12,1,12,1
scaffold_ID,libd_chr1-chr1_all-libd_chr1-chr1_all-1-2000-dynamic-12corestotal-allcorepera-20240404-110552,libd_chr1-chr1_all-libd_chr1-chr1_all-1-2000-dynamic-1corestotal-allcorepera-20240404-110552,libd_chr1-chr1_all-libd_chr1-chr1_all-2001-4000-dynamic-12corestotal-allcorepera-20240404-110553,libd_chr1-chr1_all-libd_chr1-chr1_all-2001-4000-dynamic-1corestotal-allcorepera-20240404-110552,libd_chr1-chr1_all-libd_chr1-chr1_all-4001-6000-dynamic-12corestotal-allcorepera-20240404-110553,libd_chr1-chr1_all-libd_chr1-chr1_all-4001-6000-dynamic-1corestotal-allcorepera-20240404-110553,libd_chr1-chr1_all-libd_chr1-chr1_all-6001-8000-dynamic-12corestotal-allcorepera-20240404-110553,libd_chr1-chr1_all-libd_chr1-chr1_all-6001-8000-dynamic-1corestotal-allcorepera-20240404-110553,libd_chr1-chr1_all-libd_chr1-chr1_all-8001-10000-dynamic-12corestotal-allcorepera-20240404-110553,libd_chr1-chr1_all-libd_chr1-chr1_all-8001-10000-dynamic-1corestotal-allcorepera-20240404-110553


In [7]:
# Transpose so that each benchmark run (source file) becomes a row and each
# parameter becomes a column. After t(), the first row holds the parameter
# names; it is split off to serve as the header.
transposed <- t(summary_table)
parameter_names <- as.character(transposed[1, ])
# drop = FALSE keeps the matrix shape even when only one run or one parameter
# is present, so the result is always a data frame.
summary_table <- as.data.frame(transposed[-1, , drop = FALSE])
colnames(summary_table) <- parameter_names

In [8]:
# Store the core count as a number: `number_cores` is overwritten with the
# numeric form of `num_cores` (which itself stays a character column).
summary_table[["number_cores"]] <- as.numeric(
  as.character(summary_table[["num_cores"]])
)

In [9]:
# Convert "HH:MM:SS" runtime strings to seconds.
# vapply() guarantees a plain numeric vector; sapply() would return a list
# for a zero-row table and would name every element after its runtime string.
summary_table$runtime_seconds <- vapply(
  strsplit(as.character(summary_table$runtime), ":", fixed = TRUE),
  function(parts) {
    as.numeric(parts[1]) * 3600 +
      as.numeric(parts[2]) * 60 +
      as.numeric(parts[3])
  },
  numeric(1)
)

In [10]:
# Wall-clock runtime in hours (3600 seconds per hour).
summary_table[["runtime_hours"]] <- summary_table[["runtime_seconds"]] / 3600

In [11]:
# The chunk bounds come out of the transpose as text; make them numeric so
# they can be used in arithmetic.
summary_table[c("chunk1", "chunk2")] <- lapply(
  summary_table[c("chunk1", "chunk2")],
  function(bound) as.numeric(as.character(bound))
)

In [12]:
# Derive core-time totals and per-test costs for every run. The work is done
# on a local copy so that the helper variable does not leak into the session.
summary_table <- local({
  runs <- summary_table

  # Number of tests covered by a run: chunk1..chunk2 inclusive.
  tests_in_run <- runs$chunk2 - runs$chunk1 + 1

  # Core time = wall-clock time x number of cores used.
  runs$runtime_core_hours <- runs$runtime_hours * runs$number_cores
  runs$runtime_core_seconds <- runs$runtime_seconds * runs$number_cores

  runs$core_hours_per_test <- runs$runtime_core_hours / tests_in_run
  runs$core_seconds_per_test <- runs$runtime_core_seconds / tests_in_run

  runs
})

In [13]:
# Display the per-run table, now including the derived runtime and
# core-time columns.
summary_table

Unnamed: 0_level_0,chunk1,chunk2,num_cores,tag,time_started,time_finished,runtime,amount_RAM,number_cores,scaffold_ID,runtime_seconds,runtime_hours,runtime_core_hours,runtime_core_seconds,core_hours_per_test,core_seconds_per_test
Unnamed: 0_level_1,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
libd_chr1-chr1_all-libd_chr1-chr1_all-1-2000-dynamic-12corestotal-allcorepera-20240404-110552-summary.csv,1,2000,12,libd_chr1-chr1_all-1-2000-dynamic-12corestotal-allcorepera-20240404-110552,2024-04-04 15:38:53,2024-04-04 16:50:59,01:12:06,0.245569724589586,12,libd_chr1-chr1_all-libd_chr1-chr1_all-1-2000-dynamic-12corestotal-allcorepera-20240404-110552,4326,1.2016667,14.42,51912,0.00721,25.956
libd_chr1-chr1_all-libd_chr1-chr1_all-1-2000-dynamic-1corestotal-allcorepera-20240404-110552-summary.csv,1,2000,1,libd_chr1-chr1_all-1-2000-dynamic-1corestotal-allcorepera-20240404-110552,2024-04-04 14:59:51,2024-04-04 15:57:41,00:57:49,0.245569720864296,1,libd_chr1-chr1_all-libd_chr1-chr1_all-1-2000-dynamic-1corestotal-allcorepera-20240404-110552,3469,0.9636111,0.9636111,3469,0.0004818056,1.7345
libd_chr1-chr1_all-libd_chr1-chr1_all-2001-4000-dynamic-12corestotal-allcorepera-20240404-110553-summary.csv,2001,4000,12,libd_chr1-chr1_all-2001-4000-dynamic-12corestotal-allcorepera-20240404-110553,2024-04-04 15:41:08,2024-04-04 16:57:33,01:16:24,0.245569724589586,12,libd_chr1-chr1_all-libd_chr1-chr1_all-2001-4000-dynamic-12corestotal-allcorepera-20240404-110553,4584,1.2733333,15.28,55008,0.00764,27.504
libd_chr1-chr1_all-libd_chr1-chr1_all-2001-4000-dynamic-1corestotal-allcorepera-20240404-110552-summary.csv,2001,4000,1,libd_chr1-chr1_all-2001-4000-dynamic-1corestotal-allcorepera-20240404-110552,2024-04-04 14:59:51,2024-04-04 15:58:22,00:58:30,0.245569720864296,1,libd_chr1-chr1_all-libd_chr1-chr1_all-2001-4000-dynamic-1corestotal-allcorepera-20240404-110552,3510,0.975,0.975,3510,0.0004875,1.755
libd_chr1-chr1_all-libd_chr1-chr1_all-4001-6000-dynamic-12corestotal-allcorepera-20240404-110553-summary.csv,4001,6000,12,libd_chr1-chr1_all-4001-6000-dynamic-12corestotal-allcorepera-20240404-110553,2024-04-04 15:41:08,2024-04-04 17:01:02,01:19:53,0.245569720864296,12,libd_chr1-chr1_all-libd_chr1-chr1_all-4001-6000-dynamic-12corestotal-allcorepera-20240404-110553,4793,1.3313889,15.9766667,57516,0.0079883333,28.758
libd_chr1-chr1_all-libd_chr1-chr1_all-4001-6000-dynamic-1corestotal-allcorepera-20240404-110553-summary.csv,4001,6000,1,libd_chr1-chr1_all-4001-6000-dynamic-1corestotal-allcorepera-20240404-110553,2024-04-04 14:59:51,2024-04-04 15:58:32,00:58:40,0.245569720864296,1,libd_chr1-chr1_all-libd_chr1-chr1_all-4001-6000-dynamic-1corestotal-allcorepera-20240404-110553,3520,0.9777778,0.9777778,3520,0.0004888889,1.76
libd_chr1-chr1_all-libd_chr1-chr1_all-6001-8000-dynamic-12corestotal-allcorepera-20240404-110553-summary.csv,6001,8000,12,libd_chr1-chr1_all-6001-8000-dynamic-12corestotal-allcorepera-20240404-110553,2024-04-04 15:43:22,2024-04-04 17:11:35,01:28:13,0.245569724589586,12,libd_chr1-chr1_all-libd_chr1-chr1_all-6001-8000-dynamic-12corestotal-allcorepera-20240404-110553,5293,1.4702778,17.6433333,63516,0.0088216667,31.758
libd_chr1-chr1_all-libd_chr1-chr1_all-6001-8000-dynamic-1corestotal-allcorepera-20240404-110553-summary.csv,6001,8000,1,libd_chr1-chr1_all-6001-8000-dynamic-1corestotal-allcorepera-20240404-110553,2024-04-04 15:00:53,2024-04-04 15:54:02,00:53:09,0.245569724589586,1,libd_chr1-chr1_all-libd_chr1-chr1_all-6001-8000-dynamic-1corestotal-allcorepera-20240404-110553,3189,0.8858333,0.8858333,3189,0.0004429167,1.5945
libd_chr1-chr1_all-libd_chr1-chr1_all-8001-10000-dynamic-12corestotal-allcorepera-20240404-110553-summary.csv,8001,10000,12,libd_chr1-chr1_all-8001-10000-dynamic-12corestotal-allcorepera-20240404-110553,2024-04-04 15:44:54,2024-04-04 17:00:31,01:15:37,0.245569724589586,12,libd_chr1-chr1_all-libd_chr1-chr1_all-8001-10000-dynamic-12corestotal-allcorepera-20240404-110553,4537,1.2602778,15.1233333,54444,0.0075616667,27.222
libd_chr1-chr1_all-libd_chr1-chr1_all-8001-10000-dynamic-1corestotal-allcorepera-20240404-110553-summary.csv,8001,10000,1,libd_chr1-chr1_all-8001-10000-dynamic-1corestotal-allcorepera-20240404-110553,2024-04-04 15:00:53,2024-04-04 16:00:21,00:59:27,0.245569724589586,1,libd_chr1-chr1_all-libd_chr1-chr1_all-8001-10000-dynamic-1corestotal-allcorepera-20240404-110553,3567,0.9908333,0.9908333,3567,0.0004954167,1.7835


In [14]:
#summary_table[which.min(summary_table$core_hours_per_test), ]

In [15]:
# Save the per-run benchmark table to the working directory.
# fwrite() does not write row names unless row.names = TRUE, so in the file
# the runs are identified by their `tag` / `scaffold_ID` columns.
data.table::fwrite(
  x = summary_table,
  file = "05-OUT_SLURM_benchmarks_a3_500k_windows.csv"
)

## How long for whole dataset?

***This is for benchmarks with 500kb windows***

In [16]:
# Keep only the single-core runs for the estimates below. This overwrites
# summary_table, so the 12-core rows are no longer available afterwards.
summary_table <- subset(summary_table, num_cores == 1)

In [17]:
# Number of sites used to scale the per-test cost to a full run
# (presumably the methylation sites in the whole dataset -- confirm).
n_sites <- 26e6

In [18]:
# Per-test core hours of the remaining (single-core) runs.
summary_table$core_hours_per_test

In [19]:
# Estimated core hours for one pass over the methylation dataset (a single
# brain region and a single population dataset): mean per-test cost of the
# retained runs scaled by the number of sites.

mean(summary_table[["core_hours_per_test"]]) * n_sites

In [20]:
# Estimated core hours for every brain region x population combination
# (3 x 3 datasets): the single-run estimate times the number of combinations.

brain_regions <- 3
populations <- 3
mean(summary_table[["core_hours_per_test"]]) * n_sites *
  brain_regions * populations