In [84]:
library(tidyverse)
library(jsonlite)
library(tidystringdist)
library(kableExtra)
library(janitor)


Attaching package: ‘janitor’


The following objects are masked from ‘package:stats’:

    chisq.test, fisher.test




In [2]:
# Load the summary.json of one run (mixtral-8x7b-instruct, template "full",
# temperature 0.0, run 360, project Complex.js) and display it as a data frame.
example_summary_path <- "/Users/jon/Documents/NEU/Projects/llm-mutation-testing/mixtral-8x7b-instruct/template-full-0.0/run360/projects/Complex.js/summary.json"
dat <- fromJSON(example_summary_path)
as.data.frame(dat)

nrPrompts,nrCandidates,nrSyntacticallyValid,nrSyntacticallyInvalid,nrIdentical,nrDuplicate,nrLocations,totalPromptTokens,totalCompletionTokens,totalTokens,⋯,metaInfo.temperature,metaInfo.maxTokens,metaInfo.maxNrPrompts,metaInfo.rateLimit,metaInfo.nrAttempts,metaInfo.template,metaInfo.systemPrompt,metaInfo.mutate,metaInfo.ignore,metaInfo.benchmark
<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,⋯,<int>,<int>,<int>,<int>,<int>,<chr>,<chr>,<chr>,<chr>,<lgl>
490,1272,962,310,0,15,398,960545,96072,1056617,⋯,0,250,2000,0,3,templates/template-full.hb,SystemPrompt-MutationTestingExpert.txt,complex.js,,True


In [3]:
BASE_DIR <- '/Users/jon/Documents/NEU/Projects/llm-mutation-testing/mutation-testing-data'

# Regex that recovers the run coordinates from a result file's path:
#   mutation-testing-data/<model>/template-<template>-<temperature>/run<run>/projects/<project>/
# A template name may itself contain hyphens (e.g. "full-genericsystemprompt"),
# so the template group is greedy and the temperature is the last hyphen-free
# token of the directory name. (Splitting at the first hyphen instead would
# yield template "full" and temperature "genericsystemprompt-0.0".)
RUN_PATH_REGEX <- "mutation-testing-data/([^/]+)/template-([^/]+)-([^/-]+)/run([^/]+)/projects/([^/]+)/"

# Read every JSON file in `files` into a single data frame and replace the file
# path with the columns model, template, temperature, run and project parsed
# from it. Rows whose path does not match RUN_PATH_REGEX are dropped.
#
# files: character vector of JSON file paths.
# Returns: one data frame with the JSON fields plus the five path columns.
load_run_json <- function(files) {
  if (length(files) == 0) {
    stop("No result files found under ", BASE_DIR, call. = FALSE)
  }
  map_df(files, ~ fromJSON(.x) %>%
    as.data.frame() %>%
    mutate(file = .x)) %>%
    tidyr::extract(
      file,
      c("model", "template", "temperature", "run", "project"),
      RUN_PATH_REGEX
    ) %>%
    filter(!is.na(project))
}

# find all summary.json files in BASE_DIR
# (`pattern` is a regex matched against the file name, hence the anchors and
# the escaped dot)
summary_files <- list.files(path = BASE_DIR, recursive = TRUE, pattern = "^summary\\.json$", full.names = TRUE)

# load in all summary.json files, adding columns derived from the file path
summary <- load_run_json(summary_files)

In [4]:
# Load in all StrykerInfo.json files
stryker_files <- list.files(path = BASE_DIR, recursive = TRUE, pattern = "^StrykerInfo\\.json$", full.names = TRUE)
stryker <- load_run_json(stryker_files)

In [159]:
# Per-run statistics: the candidate/mutant counts from `summary` joined with
# the Stryker results for the same model, template, temperature, run and
# project. Rows without a match on both sides are dropped by the inner join.
tableStats <- summary %>%
  select(
    model, template, temperature, run, project,
    Prompts = nrPrompts,
    Candidates = nrCandidates,
    Invalid = nrSyntacticallyInvalid,
    Identical = nrIdentical,
    Duplicate = nrDuplicate,
    mutants = nrSyntacticallyValid
  ) %>%
  inner_join(
    stryker,
    by = c("model", "template", "temperature", "run", "project")
  ) %>%
  rename(Killed = nrKilled, Survived = nrSurvived, Timeout = nrTimedOut) %>%
  select(-time) %>%
  mutate(
    across(c(Killed, Survived, Timeout), as.integer),
    mutationScore = as.double(mutationScore),
    # label identifying one configuration, e.g. "<model>-<template>-<temperature>"
    modelLabel = paste(model, template, temperature, sep = "-")
  )

# Format a numeric vector as the LaTeX math string "$<mean> \pm <sd>$".
# Both values are rounded to `digits` decimals; the mean additionally gets a
# thousands separator. sd() of a single value is NA, so a group with only one
# run renders as "\pm NA".
format_mean_sd <- function(x, digits = 0) {
  paste0(
    "$", format(round(mean(x), digits), big.mark = ","),
    " \\pm ", round(sd(x), digits), "$"
  )
}

# Mean and standard deviation over all runs, per configuration and project,
# already formatted as LaTeX strings. The result stays grouped by modelLabel
# (dplyr's default for a two-variable grouping, made explicit here).
summarizedStats <- tableStats %>%
  select(-model, -template, -temperature) %>%
  group_by(modelLabel, project) %>%
  summarise(
    across(
      c(
        Prompts, Candidates, Invalid, Identical, Duplicate,
        mutants, Killed, Survived, Timeout
      ),
      format_mean_sd
    ),
    mutationScore = format_mean_sd(mutationScore, digits = 2),
    .groups = "drop_last"
  )

# Format a single number as the LaTeX math string "$<value>$", rounded to
# `digits` decimals and with a thousands separator.
format_total <- function(x, digits = 0) {
  paste0("$", format(round(x, digits), big.mark = ","), "$")
}

# "Total" row per configuration: each metric is first averaged over the runs
# of a project, then the per-project averages are summed across projects.
# mutationScore is the exception: it is the mean of the per-project averages,
# not a sum. The `project` column carries the LaTeX for the row label.
totalStats <- tableStats %>%
  group_by(modelLabel, project) %>%
  summarise(
    across(
      c(
        Prompts, Candidates, Invalid, Identical, Duplicate,
        mutants, Killed, Survived, Timeout, mutationScore
      ),
      mean
    ),
    .groups = "drop"
  ) %>%
  group_by(modelLabel) %>%
  summarise(
    across(
      c(
        Prompts, Candidates, Invalid, Identical, Duplicate,
        mutants, Killed, Survived, Timeout
      ),
      ~ format_total(sum(.x))
    ),
    mutationScore = format_total(mean(mutationScore), digits = 2),
    project = "\\hline\\textit{Total}"
  )


# Build a LaTeX table that shows two configurations next to each other:
# one row per project plus a final "Total" row, and for each configuration the
# columns Candidates, Invalid, Identical, Duplicate, mutants, Killed, Survived
# and Timeout (named "<metric>_<modelLabel>").
#
# model1, model2: modelLabel values as found in summarizedStats / totalStats.
# Returns: the kable object containing the LaTeX source.
generateSideBySide <- function(model1, model2) {
  value_cols <- c(
    "Candidates", "Invalid", "Identical", "Duplicate",
    "mutants", "Killed", "Survived", "Timeout"
  )
  total_label <- "\\hline\\textit{Total}"

  # Spread the selected configurations into side-by-side column groups.
  # `id_cols = project` is required: Prompts and mutationScore are not part of
  # this table, and without it they would be treated as row identifiers and
  # split every project into one row per configuration.
  widen <- function(stats) {
    stats %>%
      ungroup() %>%
      filter(modelLabel %in% c(model1, model2)) %>%
      pivot_wider(
        id_cols = project,
        names_from = modelLabel,
        values_from = all_of(value_cols),
        names_vary = "slowest"
      )
  }

  bind_rows(widen(summarizedStats), widen(totalStats)) %>%
    mutate(
      # The Total label already contains its own markup (\hline + \textit);
      # wrapping it again would put \hline inside \textit{}.
      project = if_else(
        project == total_label,
        project,
        paste0("\\textit{", project, "}")
      )
    ) %>%
    kable(format = "latex", escape = FALSE, booktabs = TRUE, linesep = "")
}

# Number of rows (projects) available for each configuration
summarizedStats %>%
  ungroup() %>%
  count(modelLabel)

[1m[22m`summarise()` has grouped output by 'modelLabel'. You can override using the
`.groups` argument.
[1m[22m`summarise()` has grouped output by 'modelLabel'. You can override using the
`.groups` argument.


modelLabel,n
<chr>,<int>
codellama-13b-instruct-full-0.0,13
codellama-34b-instruct-basic-0.0,13
codellama-34b-instruct-full-0.0,13
codellama-34b-instruct-full-0.25,13
codellama-34b-instruct-full-0.5,13
codellama-34b-instruct-full-1.0,13
codellama-34b-instruct-full-genericsystemprompt-0.0,13
codellama-34b-instruct-noexplanation-0.0,13
codellama-34b-instruct-noinstructions-0.0,13
codellama-34b-instruct-onemutation-0.0,13


In [97]:
# One caption fragment per configuration, listing the runs present in
# tableStats: "<modelLabel> (runs <run>, <run>, ...)".
captionStrs <- tableStats %>%
  # collapse to one row per (modelLabel, run); the count itself is not used
  count(modelLabel, run) %>%
  group_by(modelLabel) %>%
  summarise(runs = paste0(run, collapse = ", ")) %>%
  mutate(runs = paste0(modelLabel, " (runs ", runs, ")"))

# Caption text for the given configurations: their entries from captionStrs,
# joined with ", ".
#
# labels: character vector of modelLabel values.
# Returns: a single string (empty if none of the labels is known).
captionStr <- function(labels) {
    wanted <- captionStrs$modelLabel %in% labels
    paste(captionStrs$runs[wanted], collapse = ", ")
}

[1m[22m`summarise()` has grouped output by 'modelLabel'. You can override using the
`.groups` argument.


In [189]:
# Write the per-project summary table (mean +/- sd over all runs, plus a Total
# row) for one configuration to
# tables-for-paper/summaryByConfig/<theModel>.tex.
#
# theModel: a modelLabel value, e.g. 'codellama-13b-instruct-full-0.0'.
# Returns: the LaTeX "\input{supplemental/summaryByConfig/<theModel>.tex}"
#          line that includes the written table.
variabilityTableForOneConfig <- function(theModel) {
    total_label <- "\\hline\\textit{Total}"
    out_dir <- "tables-for-paper/summaryByConfig"

    rows <- bind_rows(
        summarizedStats %>% ungroup() %>% filter(modelLabel == theModel),
        totalStats %>% filter(modelLabel == theModel)
    )

    latex_table <- rows %>%
        mutate(
            # the Total label already carries its own markup; italicise the rest
            project = case_when(
                project == total_label ~ project,
                TRUE ~ paste0("\\textit{", project, "}")
            )
        ) %>%
        select(-modelLabel) %>%
        rename(
            application = project,
            `\\#prompts` = Prompts,
            `\\#mutants` = mutants,
            `\\#killed` = Killed,
            `\\#survived` = Survived,
            `\\#timeout` = Timeout,
            `mut. score` = mutationScore
        ) %>%
        kable(
            format = "latex", escape = FALSE, booktabs = TRUE,
            linesep = "",
            caption = paste0("Summary of mutants for ", captionStr(theModel), ". Each column shows the average number of mutants from all runs, $\\pm$ the standard deviation."),
            label = paste0("appendix-variability-", theModel)
        ) %>%
        kable_styling(latex_options = "scale_down")

    # make sure the target directory exists before writing into it
    dir.create(out_dir, recursive = TRUE, showWarnings = FALSE)
    write(latex_table, paste0(out_dir, "/", theModel, ".tex"))

    paste0("\\input{supplemental/summaryByConfig/", theModel, ".tex}")
}

# Directory that receives the generated LaTeX; created up front so the writes
# below do not fail on a fresh checkout.
summary_by_config_dir <- "tables-for-paper/summaryByConfig"
dir.create(summary_by_config_dir, recursive = TRUE, showWarnings = FALSE)

# All configurations for which summary statistics exist
models <- summarizedStats %>%
  ungroup() %>%
  count(modelLabel) %>%
  select(modelLabel)

# Generate one table file per configuration and collect the returned \input
# lines (one per line) in all.tex.
# Single configuration: variabilityTableForOneConfig('codellama-13b-instruct-full-0.0')
models %>%
  pull(modelLabel) %>%
  map_chr(variabilityTableForOneConfig) %>%
  paste(collapse = "\n") %>%
  write(file.path(summary_by_config_dir, "all.tex"))

# create the itemize list that links to all tables by ref
ref_items <- summary %>%
  count(model, template, temperature) %>%
  mutate(text = paste0("\\item Model ", model, " with template ", template, " at temperature ", temperature, ": Table \\ref{tab:appendix-variability-", model, "-", template, "-", temperature, "}")) %>%
  pull(text)

# One \item per line. (The items used to be joined with ", ", which left a
# stray comma after every entry, and a `sep` argument was passed to paste0(),
# which has none and simply appended it as text.)
c("\\begin{itemize}", ref_items, "\\end{itemize}") %>%
  writeLines(file.path(summary_by_config_dir, "allRefs.tex"))


In [105]:
# Table 7, mutant stats for models with higher variability. These are too wide to fit on a page.
# The four configurations are shown as two side-by-side tables of two each.
first_pair <- c('codellama-13b-instruct-full-0.0', 'mixtral-8x7b-instruct-full-0.0')
second_pair <- c('llama-3.3-70b-instruct-full-0.0', 'gpt-4o-mini-full-0.0')

generateSideBySide(first_pair[1], first_pair[2])
generateSideBySide(second_pair[1], second_pair[2])

# Caption fragment naming the runs behind all four configurations
captionStr(c(first_pair, second_pair))



\begin{tabular}{lllllllllllllllll}
\toprule
project & Candidates_codellama-13b-instruct-full-0.0 & Invalid_codellama-13b-instruct-full-0.0 & Identical_codellama-13b-instruct-full-0.0 & Duplicate_codellama-13b-instruct-full-0.0 & mutants_codellama-13b-instruct-full-0.0 & Killed_codellama-13b-instruct-full-0.0 & Survived_codellama-13b-instruct-full-0.0 & Timeout_codellama-13b-instruct-full-0.0 & Candidates_mixtral-8x7b-instruct-full-0.0 & Invalid_mixtral-8x7b-instruct-full-0.0 & Identical_mixtral-8x7b-instruct-full-0.0 & Duplicate_mixtral-8x7b-instruct-full-0.0 & mutants_mixtral-8x7b-instruct-full-0.0 & Killed_mixtral-8x7b-instruct-full-0.0 & Survived_mixtral-8x7b-instruct-full-0.0 & Timeout_mixtral-8x7b-instruct-full-0.0\\
\midrule
\textit{Complex.js} & $1,411 \pm 0$ & $340 \pm 0$ & $116 \pm 0$ & $28 \pm 0$ & $955 \pm 0$ & $553 \pm 0$ & $401 \pm 0$ & $1 \pm 0$ & $1,284 \pm 11$ & $310 \pm 10$ & $0 \pm 0$ & $19 \pm 4$ & $974 \pm 11$ & $592 \pm 10$ & $382 \pm 6$ & $0 \pm 0$\\
\textit{coun


\begin{tabular}{lllllllllllllllll}
\toprule
project & Candidates_gpt-4o-mini-full-0.0 & Invalid_gpt-4o-mini-full-0.0 & Identical_gpt-4o-mini-full-0.0 & Duplicate_gpt-4o-mini-full-0.0 & mutants_gpt-4o-mini-full-0.0 & Killed_gpt-4o-mini-full-0.0 & Survived_gpt-4o-mini-full-0.0 & Timeout_gpt-4o-mini-full-0.0 & Candidates_llama-3.3-70b-instruct-full-0.0 & Invalid_llama-3.3-70b-instruct-full-0.0 & Identical_llama-3.3-70b-instruct-full-0.0 & Duplicate_llama-3.3-70b-instruct-full-0.0 & mutants_llama-3.3-70b-instruct-full-0.0 & Killed_llama-3.3-70b-instruct-full-0.0 & Survived_llama-3.3-70b-instruct-full-0.0 & Timeout_llama-3.3-70b-instruct-full-0.0\\
\midrule
\textit{Complex.js} & $1,433 \pm 3$ & $448 \pm 6$ & $0 \pm 0$ & $36 \pm 2$ & $986 \pm 4$ & $599 \pm 4$ & $387 \pm 2$ & $0 \pm 0$ & $1,413 \pm 3$ & $282 \pm 7$ & $0 \pm 0$ & $59 \pm 5$ & $1,130 \pm 6$ & $682 \pm 5$ & $448 \pm 3$ & $1 \pm 0$\\
\textit{countries-and-timezones} & $308 \pm 2$ & $101 \pm 3$ & $0 \pm 0$ & $10 \pm 2$ & $207 \pm