# Inspect and summarize SLURM benchmarks

## Overview

## Load, merge and compare benchmark summary `csv`s

We need a function to compare our SLURM summary output files and highlight differences in a table.

In [1]:
library(dplyr)
library(tidyr)

#' Compare benchmark summary CSVs and tabulate the parameters that differ
#'
#' Reads every `*.csv` file in `directory`, stacks them in long format with a
#' `Source` column holding each file's base name, and keeps only the
#' parameters whose value is not identical across all files.
#'
#' @param directory Path to the directory containing the summary CSV files.
#'   Each file is used through its `Parameter` and `Value` columns.
#' @return A tibble with one row per differing `Parameter` and one column per
#'   source file (named after the file), holding that file's `Value`.
#' @details Stops with an explicit error when `directory` contains no CSV
#'   files, instead of failing later on a missing `Parameter` column.
read_and_summarize_differences <- function(directory) {
  # List all CSV files in the directory
  files <- list.files(directory, pattern = "\\.csv$", full.names = TRUE)
  #files <- files[grepl("alpha_05", files)]
  #files <- files[!grepl("1000000", files)]

  if (length(files) == 0) {
    stop("No CSV files found in directory: ", directory, call. = FALSE)
  }

  # Read each file and tag its rows with the file it came from
  data_list <- lapply(files, function(file) {
    data <- read.csv(file)
    data$Source <- basename(file)
    data
  })

  # Stack all files into one long table
  combined_data <- bind_rows(data_list)

  # One row per source file, one column per parameter. A parameter missing
  # from a file shows up as NA, which counts as a distinct value below.
  wide_data <- pivot_wider(
    combined_data,
    names_from = Parameter,
    values_from = Value
  )

  # A parameter "differs" when its column holds more than one distinct value
  candidate_columns <- setdiff(names(wide_data), "Source")
  varies <- vapply(
    wide_data[candidate_columns],
    function(column) n_distinct(column) > 1,
    logical(1)
  )
  differing_parameters <- names(varies)[varies]

  # Keep only the differing parameters, then lay them out with one column
  # per source file for side-by-side comparison
  combined_data %>%
    filter(Parameter %in% differing_parameters) %>%
    select(Parameter, Source, Value) %>%
    pivot_wider(names_from = Source, values_from = Value)
}


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union




In [2]:
# Directory holding the per-job benchmark summary CSVs.
directory <- "/dcs04/lieber/statsgen/mnagle/mwas/CpGWAS/output/"

# Build the table of parameters that differ between jobs, then display it.
summary_table <- read_and_summarize_differences(directory = directory)
summary_table

Parameter,libd_chr1-chr1_all-libd_chr1-chr1_all-1-2000-dynamic-1corestotal-1corepera-20240402-150130-summary.csv,libd_chr1-chr1_all-libd_chr1-chr1_all-2001-4000-dynamic-1corestotal-1corepera-20240402-150131-summary.csv,libd_chr1-chr1_all-libd_chr1-chr1_all-4001-6000-dynamic-1corestotal-1corepera-20240402-150131-summary.csv,libd_chr1-chr1_all-libd_chr1-chr1_all-6001-8000-dynamic-1corestotal-1corepera-20240402-150131-summary.csv,libd_chr1-chr1_all-libd_chr1-chr1_all-8001-10000-dynamic-1corestotal-1corepera-20240402-150131-summary.csv
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
chunk1,1,2001,4001,6001,8001
chunk2,2000,4000,6000,8000,10000
tag,libd_chr1-chr1_all-1-2000-dynamic-1corestotal-1corepera-20240402-150130,libd_chr1-chr1_all-2001-4000-dynamic-1corestotal-1corepera-20240402-150131,libd_chr1-chr1_all-4001-6000-dynamic-1corestotal-1corepera-20240402-150131,libd_chr1-chr1_all-6001-8000-dynamic-1corestotal-1corepera-20240402-150131,libd_chr1-chr1_all-8001-10000-dynamic-1corestotal-1corepera-20240402-150131
time_started,2024-04-02 15:02:36,2024-04-02 15:04:30,2024-04-02 15:05:35,2024-04-02 15:05:28,2024-04-02 15:23:03
time_finished,2024-04-02 17:38:10,2024-04-02 17:08:55,2024-04-02 16:53:03,2024-04-02 16:00:10,2024-04-02 17:53:51
runtime,02:35:33,02:04:25,01:47:28,00:54:42,02:30:48
type_CPU,AMD Opteron(TM) Processor 6234,AMD Opteron(tm) Processor 6380,Intel(R) Xeon(R) CPU E5-2650L v3 @ 1.80GHz,Intel(R) Xeon(R) Gold 6338 CPU @ 2.00GHz,AMD Opteron(tm) Processor 6380
amount_RAM,0.492048904299736,0.476668052375317,0.122735276818275,0.491538614034653,0.476668059825897
physical_cores,24,32,24,64,32
scaffold_ID,libd_chr1-chr1_all-libd_chr1-chr1_all-1-2000-dynamic-1corestotal-1corepera-20240402-150130,libd_chr1-chr1_all-libd_chr1-chr1_all-2001-4000-dynamic-1corestotal-1corepera-20240402-150131,libd_chr1-chr1_all-libd_chr1-chr1_all-4001-6000-dynamic-1corestotal-1corepera-20240402-150131,libd_chr1-chr1_all-libd_chr1-chr1_all-6001-8000-dynamic-1corestotal-1corepera-20240402-150131,libd_chr1-chr1_all-libd_chr1-chr1_all-8001-10000-dynamic-1corestotal-1corepera-20240402-150131


## Compute core-hours per test

In [3]:
# Benchmark grid: the window sizes and alpha values that were run.
windows <- 10000
alphas <- 0.5 # seq(0, 1, .25) # left out 0 on first run
chunks <- 10000

n_windows <- length(windows)
n_alphas <- length(alphas)

# Total number of tests = window sizes x alpha values x chunks.
# NOTE(review): the summary files compared above each span 2000 chunk
# indices (chunk1..chunk2) while `chunks` is 10000 -- confirm whether
# n_tests is meant per job or across all jobs.
n_tests <- prod(n_windows, n_alphas, chunks)
n_tests

In [4]:
# Show the alpha value(s) used for this benchmark.
alphas

In [5]:
# List the comparison table's columns (Parameter plus one per source file).
colnames(summary_table)

In [6]:
# Display the comparison table again before it is reshaped.
summary_table

Parameter,libd_chr1-chr1_all-libd_chr1-chr1_all-1-2000-dynamic-1corestotal-1corepera-20240402-150130-summary.csv,libd_chr1-chr1_all-libd_chr1-chr1_all-2001-4000-dynamic-1corestotal-1corepera-20240402-150131-summary.csv,libd_chr1-chr1_all-libd_chr1-chr1_all-4001-6000-dynamic-1corestotal-1corepera-20240402-150131-summary.csv,libd_chr1-chr1_all-libd_chr1-chr1_all-6001-8000-dynamic-1corestotal-1corepera-20240402-150131-summary.csv,libd_chr1-chr1_all-libd_chr1-chr1_all-8001-10000-dynamic-1corestotal-1corepera-20240402-150131-summary.csv
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
chunk1,1,2001,4001,6001,8001
chunk2,2000,4000,6000,8000,10000
tag,libd_chr1-chr1_all-1-2000-dynamic-1corestotal-1corepera-20240402-150130,libd_chr1-chr1_all-2001-4000-dynamic-1corestotal-1corepera-20240402-150131,libd_chr1-chr1_all-4001-6000-dynamic-1corestotal-1corepera-20240402-150131,libd_chr1-chr1_all-6001-8000-dynamic-1corestotal-1corepera-20240402-150131,libd_chr1-chr1_all-8001-10000-dynamic-1corestotal-1corepera-20240402-150131
time_started,2024-04-02 15:02:36,2024-04-02 15:04:30,2024-04-02 15:05:35,2024-04-02 15:05:28,2024-04-02 15:23:03
time_finished,2024-04-02 17:38:10,2024-04-02 17:08:55,2024-04-02 16:53:03,2024-04-02 16:00:10,2024-04-02 17:53:51
runtime,02:35:33,02:04:25,01:47:28,00:54:42,02:30:48
type_CPU,AMD Opteron(TM) Processor 6234,AMD Opteron(tm) Processor 6380,Intel(R) Xeon(R) CPU E5-2650L v3 @ 1.80GHz,Intel(R) Xeon(R) Gold 6338 CPU @ 2.00GHz,AMD Opteron(tm) Processor 6380
amount_RAM,0.492048904299736,0.476668052375317,0.122735276818275,0.491538614034653,0.476668059825897
physical_cores,24,32,24,64,32
scaffold_ID,libd_chr1-chr1_all-libd_chr1-chr1_all-1-2000-dynamic-1corestotal-1corepera-20240402-150130,libd_chr1-chr1_all-libd_chr1-chr1_all-2001-4000-dynamic-1corestotal-1corepera-20240402-150131,libd_chr1-chr1_all-libd_chr1-chr1_all-4001-6000-dynamic-1corestotal-1corepera-20240402-150131,libd_chr1-chr1_all-libd_chr1-chr1_all-6001-8000-dynamic-1corestotal-1corepera-20240402-150131,libd_chr1-chr1_all-libd_chr1-chr1_all-8001-10000-dynamic-1corestotal-1corepera-20240402-150131


In [7]:
# Transpose the comparison table so that each benchmark run (source file)
# becomes a row and each parameter becomes a column. All values end up as
# character strings because the transpose goes through a character matrix.
# NOTE: this cell is not idempotent -- running it twice transposes again.
summary_table <- local({
  transposed <- t(summary_table)
  # The first row of the transpose holds the parameter names; the remaining
  # rows are the runs. drop = FALSE keeps a data frame even when only one
  # parameter differs or only one run is present.
  runs <- as.data.frame(transposed[-1, , drop = FALSE])
  colnames(runs) <- transposed[1, ]
  runs
})

# Cores used per job; hard-coded to 1 here.
# NOTE(review): presumably matches "1corestotal" in the file names -- confirm.
summary_table$number_cores <- 1

In [8]:
# Convert each "HH:MM:SS" runtime string to a number of seconds.
# vapply() guarantees a plain numeric vector (numeric(0) for zero rows),
# whereas sapply() would return a list for empty input and attach the
# runtime strings as element names.
summary_table$runtime_seconds <- vapply(
  strsplit(as.character(summary_table$runtime), ":", fixed = TRUE),
  function(parts) {
    # hours, minutes, seconds -> seconds; missing fields yield NA
    sum(as.numeric(parts[1:3]) * c(3600, 60, 1))
  },
  numeric(1)
)

In [9]:
# Runtime in hours (3600 seconds per hour).
summary_table[["runtime_hours"]] <- summary_table[["runtime_seconds"]] / 3600

In [10]:
# Core-time per run: wall-clock runtime scaled by the number of cores used.
summary_table$runtime_core_hours <- with(
  summary_table,
  runtime_hours * number_cores
)
summary_table$runtime_core_seconds <- with(
  summary_table,
  runtime_seconds * number_cores
)

# Core-time per test: each run's core-time divided by the global n_tests.
summary_table$core_hours_per_test <- with(
  summary_table,
  runtime_core_hours / n_tests
)
summary_table$core_seconds_per_test <- with(
  summary_table,
  runtime_core_seconds / n_tests
)

In [11]:
# Display the per-run table with the derived runtime and per-test columns.
summary_table

Unnamed: 0_level_0,chunk1,chunk2,tag,time_started,time_finished,runtime,type_CPU,amount_RAM,physical_cores,scaffold_ID,number_cores,runtime_seconds,runtime_hours,runtime_core_hours,runtime_core_seconds,core_hours_per_test,core_seconds_per_test
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
libd_chr1-chr1_all-libd_chr1-chr1_all-1-2000-dynamic-1corestotal-1corepera-20240402-150130-summary.csv,1,2000,libd_chr1-chr1_all-1-2000-dynamic-1corestotal-1corepera-20240402-150130,2024-04-02 15:02:36,2024-04-02 17:38:10,02:35:33,AMD Opteron(TM) Processor 6234,0.492048904299736,24,libd_chr1-chr1_all-libd_chr1-chr1_all-1-2000-dynamic-1corestotal-1corepera-20240402-150130,1,9333,2.5925,2.5925,9333,0.00025925,0.9333
libd_chr1-chr1_all-libd_chr1-chr1_all-2001-4000-dynamic-1corestotal-1corepera-20240402-150131-summary.csv,2001,4000,libd_chr1-chr1_all-2001-4000-dynamic-1corestotal-1corepera-20240402-150131,2024-04-02 15:04:30,2024-04-02 17:08:55,02:04:25,AMD Opteron(tm) Processor 6380,0.476668052375317,32,libd_chr1-chr1_all-libd_chr1-chr1_all-2001-4000-dynamic-1corestotal-1corepera-20240402-150131,1,7465,2.0736111,2.0736111,7465,0.0002073611,0.7465
libd_chr1-chr1_all-libd_chr1-chr1_all-4001-6000-dynamic-1corestotal-1corepera-20240402-150131-summary.csv,4001,6000,libd_chr1-chr1_all-4001-6000-dynamic-1corestotal-1corepera-20240402-150131,2024-04-02 15:05:35,2024-04-02 16:53:03,01:47:28,Intel(R) Xeon(R) CPU E5-2650L v3 @ 1.80GHz,0.122735276818275,24,libd_chr1-chr1_all-libd_chr1-chr1_all-4001-6000-dynamic-1corestotal-1corepera-20240402-150131,1,6448,1.7911111,1.7911111,6448,0.0001791111,0.6448
libd_chr1-chr1_all-libd_chr1-chr1_all-6001-8000-dynamic-1corestotal-1corepera-20240402-150131-summary.csv,6001,8000,libd_chr1-chr1_all-6001-8000-dynamic-1corestotal-1corepera-20240402-150131,2024-04-02 15:05:28,2024-04-02 16:00:10,00:54:42,Intel(R) Xeon(R) Gold 6338 CPU @ 2.00GHz,0.491538614034653,64,libd_chr1-chr1_all-libd_chr1-chr1_all-6001-8000-dynamic-1corestotal-1corepera-20240402-150131,1,3282,0.9116667,0.9116667,3282,9.116667e-05,0.3282
libd_chr1-chr1_all-libd_chr1-chr1_all-8001-10000-dynamic-1corestotal-1corepera-20240402-150131-summary.csv,8001,10000,libd_chr1-chr1_all-8001-10000-dynamic-1corestotal-1corepera-20240402-150131,2024-04-02 15:23:03,2024-04-02 17:53:51,02:30:48,AMD Opteron(tm) Processor 6380,0.476668059825897,32,libd_chr1-chr1_all-libd_chr1-chr1_all-8001-10000-dynamic-1corestotal-1corepera-20240402-150131,1,9048,2.5133333,2.5133333,9048,0.0002513333,0.9048


In [12]:
#summary_table[which.min(summary_table$core_hours_per_test), ]

In [12]:
# Save the per-run benchmark table as CSV in the working directory.
data.table::fwrite(
  x = summary_table,
  file = "05-OUT_SLURM_benchmarks_a2.91.csv"
)

## How long would the whole dataset take?

In [13]:
# Number of sites assumed for the whole dataset (26 million).
n_sites <- 26e6

In [14]:
# Projected total for n_sites using a fixed per-test cost.
# NOTE(review): 0.0001107989 is presumably core-hours per test from an
# earlier benchmark over sites 1000000-1001000 -- confirm.
n_sites*0.0001107989 # From 1000000-1001000

In [15]:
# Projected total for n_sites using the mean core_hours_per_test of the
# runs in summary_table.
n_sites*mean(summary_table$core_hours_per_test)

In [17]:
# 1240