In [1]:
using SQLite
import SQLite.Stmt
import SQLite.DBInterface.execute
using Distributions

In [2]:
# Open the two gathered sweep databases to be compared: `db1` is read from the
# "before" output directory and `db2` from the "after" output directory.
db1 = SQLite.DB("output/before/sweep_db_gathered.sqlite")
db2 = SQLite.DB("output/after/sweep_db_gathered.sqlite")

SQLite.DB("output/after/sweep_db_gathered.sqlite")

In [3]:
import StatsAPI.pvalue
import StatsBase.mean
import StatsBase.mad
import HypothesisTests.ApproximateTwoSampleKSTest

# Functions to compare using K-S tests

"""
    compare(db1, db2, q1; q2 = q1)

Run a two-sample K-S test for every parameter combination found in `db1` and `db2`.

`q1` is a SQLite query that takes one parameter, `run_id`, and should return a single
value for the provided `run_id`. If values from `db2` need to be extracted with a
different query, provide it as `q2`.

`db1` and `db2` are assumed to have identical parameter combinations labeled the same
way, via the `combo_id` column in the `runs` table. An `ArgumentError` is thrown when
the two databases do not report the same `combo_id` values.

Returns a vector of `(combo_id, result)` tuples, where `result` is the value returned by
the per-combination method `compare(db1, db2, combo_id, q1; q2 = q2)`.
"""
function compare(db1, db2, q1; q2 = q1)
    combo_ids = get_combo_ids(db1)
    other_combo_ids = get_combo_ids(db2)

    # Compare the vectors as a whole: an elementwise `.==` would fail with a
    # broadcasting DimensionMismatch when the two databases hold a different number
    # of combinations, hiding the real problem.
    if !isequal(combo_ids, other_combo_ids)
        throw(ArgumentError(
            "combo_id values differ between databases: " *
            "$(combo_ids) vs. $(other_combo_ids)"
        ))
    end

    return [(combo_id, compare(db1, db2, combo_id, q1; q2 = q2)) for combo_id in combo_ids]
end

"""
    get_combo_ids(db)

Return the distinct `combo_id` values stored in the `runs` table of `db`, in the order
produced by `ORDER BY combo_id`.
"""
function get_combo_ids(db)
    rows = execute(db, "SELECT DISTINCT combo_id FROM runs ORDER BY combo_id")
    # Each result row has exactly one column; unpack it while iterating.
    return collect(combo_id for (combo_id,) in rows)
end

"""
    get_run_ids(db, combo_id)

Return the `run_id` of every row in the `runs` table of `db` whose `combo_id` column
equals `combo_id`.
"""
function get_run_ids(db, combo_id)
    rows = execute(db, "SELECT run_id FROM runs WHERE combo_id = ?", [combo_id])
    # Each result row has exactly one column; unpack it while iterating.
    return collect(run_id for (run_id,) in rows)
end

"""
    compare(db1, db2, combo_id, q1; q2 = q1)

Compare a single parameter combination between two databases.

The values for `combo_id` are collected from `db1` with query `q1` and from `db2` with
query `q2`, one value per run.

Returns `(test, (mean1, mad1), (mean2, mad2))`: the `ApproximateTwoSampleKSTest` built
from the two samples, followed by each sample's mean and its `mad` computed with that
mean as the center.
"""
function compare(db1, db2, combo_id, q1; q2 = q1)
    # Mean of a sample paired with its MAD taken around that mean.
    function center_and_spread(sample)
        center = mean(sample)
        return (center, mad(sample; center = center))
    end

    sample1 = query_values_for_combo_id(db1, combo_id, q1)
    stats1 = center_and_spread(sample1)

    sample2 = query_values_for_combo_id(db2, combo_id, q2)
    stats2 = center_and_spread(sample2)

    return (ApproximateTwoSampleKSTest(sample1, sample2), stats1, stats2)
end

"""
    query_values_for_combo_id(db, combo_id, q)

Evaluate query `q` once for every run belonging to `combo_id` and return the results
as a vector, one element per run.

`q` is prepared a single time and re-executed with each `run_id` as its parameter.
"""
function query_values_for_combo_id(db, combo_id, q)
    prepared = Stmt(db, q)
    return map(get_run_ids(db, combo_id)) do run_id
        query_value_for_run_id(prepared, run_id)
    end
end

"""
    query_value_for_run_id(stmt, run_id)

Execute the prepared statement `stmt` with `run_id` as its only parameter and return
the first column of the first result row.

Returns `nothing` when the statement yields no rows. Any rows after the first are not
read.
"""
function query_value_for_run_id(stmt, run_id)
    first_step = iterate(execute(stmt, (run_id,)))
    first_step === nothing && return nothing

    # `first_step` is `(row, state)`; only the row's single column is needed.
    (value,) = first(first_step)
    return value
end

query_value_for_run_id (generic function with 1 method)

In [4]:
# Compare the `elapsed_time` entry of `run_meta` between `db1` and `db2`, printing one
# K-S test summary per parameter combination.
#
# The SQL string literal is single-quoted: SQLite parses a double-quoted token as an
# identifier and only falls back to treating it as a string through a legacy
# compatibility behavior that can be disabled.
for (combo_id, (test, (mean1, mad1), (mean2, mad2))) in compare(
    db1, db2, "SELECT value FROM run_meta WHERE key = 'elapsed_time' AND run_id = ?"
)
    # Compute the p-value once; `local` keeps it from touching any notebook global.
    local p = pvalue(test)

    println("Testing elapsed time for parameter combination $(combo_id)...")
    println("    (mean1 $(mean1), mad1 $(mad1))")
    println("    (mean2 $(mean2), mad2 $(mad2))")
    println("    p-value: $(p)")
    if p < 0.01
        println("    DIFFERENT distributions w/ p < 0.01")
    else
        println("    undetectable difference between distributions w/ p < 0.01")
    end
    # Relative change of db2's mean with respect to db1's mean (negative when
    # mean2 is larger).
    println("    fractional difference: $((mean1 - mean2) / mean1)")
end

Testing elapsed time for parameter combination 1...
    (mean1 214.79646666666667, mad1 3.810979582594712)
    (mean2 513.8502666666666, mad2 10.681061742705813)
    p-value: 6.118046410036549e-7
    DIFFERENT distributions w/ p < 0.01
    fractional difference: -1.3922659187131254


## Mean # active infected hosts from 90 to 100 years

In [6]:
# Compare the per-run average of `summary.n_infected_active` between `db1` and `db2`,
# printing one K-S test summary per parameter combination.
#
# NOTE(review): the time window below runs from 9 * 360 to 100 * 360, while the section
# heading says "90 to 100 years" and the circulating-genes cell uses 9 * 360 to
# 10 * 360. Confirm which bounds are intended.
for (combo_id, (test, (mean1, mad1), (mean2, mad2))) in compare(
    db1, db2, "SELECT AVG(n_infected_active) FROM summary WHERE run_id = ? AND (time >= 9 * 360) AND (time <= 100 * 360)"
)
    local p = pvalue(test)
    local verdict = p < 0.01 ?
        "    DIFFERENT distributions w/ p < 0.01" :
        "    undetectable difference between distributions w/ p < 0.01"

    println("Testing mean infected for parameter combination $(combo_id)...")
    println("    (mean1 $(mean1), mad1 $(mad1))")
    println("    (mean2 $(mean2), mad2 $(mad2))")
    println("    p-value: $(p)")
    println(verdict)
    println("    fractional difference: $((mean1 - mean2) / mean1)")
end

Testing mean infected for parameter combination 1...
    (mean1 310.1076923076923, mad1 10.423834059339312)
    (mean2 271.18461538461537, mad2 12.613523489747662)
    p-value: 4.22910789740631e-6
    DIFFERENT distributions w/ p < 0.01
    fractional difference: 0.12551470953018798


## Mean # circulating genes from 90 to 100 years

In [7]:
# Compare the per-run average of `gene_strain_counts.n_circulating_genes_blood` between
# `db1` and `db2`, printing one K-S test summary per parameter combination.
#
# NOTE(review): the time window below runs from 9 * 360 to 10 * 360, while the section
# heading says "90 to 100 years". Confirm which bounds are intended.
for (combo_id, (test, (mean1, mad1), (mean2, mad2))) in compare(
    db1, db2, "SELECT AVG(n_circulating_genes_blood) FROM gene_strain_counts WHERE run_id = ? AND (time >= 9 * 360) AND (time <= 10 * 360)"
)
    # Compute the p-value once; `local` keeps it from touching any notebook global.
    local p = pvalue(test)

    # The label names the quantity this cell actually tests (it previously repeated
    # the "mean infected" label of the preceding cell).
    println("Testing mean circulating genes for parameter combination $(combo_id)...")
    println("    (mean1 $(mean1), mad1 $(mad1))")
    println("    (mean2 $(mean2), mad2 $(mad2))")
    println("    p-value: $(p)")
    if p < 0.01
        println("    DIFFERENT distributions w/ p < 0.01")
    else
        println("    undetectable difference between distributions w/ p < 0.01")
    end
    # Relative change of db2's mean with respect to db1's mean (negative when
    # mean2 is larger).
    println("    fractional difference: $((mean1 - mean2) / mean1)")
end

Testing mean infected for parameter combination 1...
    (mean1 1199.9333333333334, mad1 0.09884014790028356)
    (mean2 1199.7666666666667, mad2 0.34594051765132955)
    p-value: 0.9250856809941739
    undetectable difference between distributions w/ p < 0.01
    fractional difference: 0.000138896605367028
