# Significance

## Binomial test

A binomial test is a simple and valid way to test whether a system B performs significantly differently (two-tailed) or significantly better (one-tailed) than another system A (e.g. a baseline). Only the following variables are required:
* n = number of instances where B's prediction differs from A's
* k = number of instances where B is correct and A is wrong (relative successes)
* p = 0.5 (the null hypothesis assumes that system A and B perform equally well: the chance of success of one or the other is a coin flip)

In [3]:
import pandas
import scipy
from scipy import stats
from collections import Counter

def clean_name(name):
    """Turn a prediction column name into the label shown in the table.

    The name is split on "_" and the first two pieces and the last piece are
    discarded, e.g. "_svm_fs-n_hp-y_1" -> ["fs-n", "hp-y"].  A baseline column
    yields "baseline"; any other column must consist of exactly two
    "key-value" pieces and yields "FS: <value>, HO: <no|yes>".
    """
    fields = name.split("_")[2:-1]
    head = fields[0]
    if head == "baseline":
        return head

    # Keep only the value after the dash of every "key-value" piece.
    values = []
    for field in fields:
        values.append(field.split("-")[1])
    fs, hp = values

    fs_label = {"n": "none"}.get(fs, fs)       # unknown values are shown verbatim
    ho_label = {"n": "no", "y": "yes"}[hp]     # only "n" and "y" are accepted
    return "FS: %s, HO: %s" % (fs_label, ho_label)

def _binomial_p_value(k, n):
    """Return the two-sided binomial-test p-value for k successes in n fair-coin trials."""
    if n == 0:
        # Two systems that never disagree give no evidence of a difference;
        # scipy.stats.binomtest rejects n == 0, so answer directly.
        return 1.0
    if hasattr(scipy.stats, "binomtest"):
        # SciPy >= 1.7; binom_test was removed in SciPy 1.12.
        return scipy.stats.binomtest(k, n=n, p=0.5, alternative='two-sided').pvalue
    return scipy.stats.binom_test(k, n=n, p=0.5, alternative='two-sided')


def get_pairwise_significance(exp, systems=None):
    """Run a two-sided binomial test on every ordered pair of systems.

    For a pair (A, B):
      n = number of instances where B's prediction differs from A's
      k = number of those instances where B agrees with the gold standard
      p = 0.5 under the null hypothesis that A and B perform equally well

    Args:
        exp: experiment name; results are read from
            "data/results_<exp>_and_baseline.csv".
        systems: prediction column names to compare.  When None, every column
            whose name ends in "_1" is used (relies on the file's convention
            that predictions live in the "_1" columns).

    Returns:
        One row per system: the LaTeX-escaped label padded to 30 characters,
        followed by one 3-character marker per system (" - " on the diagonal,
        "***" for p < 0.0014, " * " for p < 0.05, blank otherwise).
    """
    fp = "data/results_%s_and_baseline.csv" % exp
    df = pandas.read_csv(fp)  # Gold standard is always in the _0 column, predictions in the _1 column
    gold = list(df["_svm_baseline_0"])

    if systems is None:
        systems = [name for name in df.columns if str(name).endswith("_1")]
    systems = list(systems)

    # Convert every column once instead of once per pair.
    predictions = {name: list(df[name]) for name in systems}

    res = []
    for compare_to in systems:
        base = predictions[compare_to]

        sig = [clean_name(compare_to).replace("_", "\\_").ljust(30)]
        for col in systems:
            if col == compare_to:
                sig.append(" - ")
                continue
            pred = predictions[col]

            # Binomial
            n = sum(1 for b, c in zip(base, pred) if b != c)  # Number of trials
            k = sum(1 for b, c, g in zip(base, pred, gold) if b != c and c == g)  # Number of successes
            p = _binomial_p_value(k, n)
            # NOTE(review): 0.0014 is presumably 0.05 / 36 (Bonferroni over the
            # unordered pairs of 9 systems), but the caption printed by
            # print_latex_table states "*** <= 0.0007" -- confirm which is intended.
            if p < 0.0014:
                sig.append("***")
            elif p < 0.05:
                sig.append(" * ")
            else:
                sig.append("   ")
        res.append(sig)
    return res

def zip_results(res1, res2):
    """Merge two result tables into one: res1 fills the diagonal and the upper
    right triangle, res2 fills the row label and the lower left triangle.

    Index 0 of every row is the label, so in row i the cells up to and
    including index i come from res2 and the remaining cells from res1.
    """
    return [
        lower[:cut] + upper[cut:]
        for cut, (upper, lower) in enumerate(zip(res1, res2), 1)
    ]

def latex_rotate(s):
    """Wrap s in a LaTeX \\rotatebox so it is typeset vertically (rotated 90 degrees).

    Underscores in s are escaped for LaTeX.
    """
    # "\\_" spells the backslash explicitly; the bare "\_" is an invalid escape
    # sequence that newer Python versions warn about.
    escaped = s.replace("_", "\\_")
    return "\\rotatebox[origin=l]{90}{%s}" % escaped

def print_latex_table(exp, res):
    """Print res as the LaTeX source of a significance table.

    Args:
        exp: experiment name; used in the caption, and its first three
            characters form the table label.
        res: table rows; each row is a list of strings whose first element is
            the (already LaTeX-escaped, possibly padded) system label and
            whose remaining elements are the cell markers.
    """
    # Build the rotated column headers from the row labels so the header always
    # matches the rows being printed, instead of reading a module-level
    # `systems` variable.
    rotated = ["\\rotatebox[origin=l]{90}{%s}" % row[0].strip() for row in res]
    # One centred column per system rather than a fixed count of nine.
    column_spec = "l|" + "c" * len(res)

    lines = [
        "\\begin{table*}[p]\n\\begin{small}\n\\begin{center}\n\\begin{tabular}{%s}" % column_spec,
        "& " + " & ".join(rotated) + " \\\\",
        "\\hline",
    ]
    for sig in res:
        lines.append(" & ".join(sig) + " \\\\")
    lines.append("\\end{tabular}")
    # NOTE(review): the caption states "* <= 0.05" and "*** <= 0.0007", while
    # get_pairwise_significance marks cells with p < 0.05 and p < 0.0014 --
    # confirm which thresholds are intended.
    lines.append("\\caption{Significance of pairwise difference between system outputs for %s, $* <= 0.05$, $*** <= 0.0007$ (Bonferroni-adjusted). Above diagonal: F1-optimized systems, below diagonal: F2-optimized systems.}" % exp)
    lines.append("\\label{tab:%s-significance}" % exp[:3])
    lines.append("\\end{center}\n\\end{small}\n\\end{table*}")
    # A single parenthesised string prints identically on Python 2 and 3.
    print("\n".join(lines))

def exp_to_latex(exp, systems):
    """Print one LaTeX significance table for an experiment (relevance/severity).

    Pairwise significance is computed for the experiment itself and for its
    "_f2" variant; the two result sets are merged into one table (first above
    the diagonal, second below) and printed.
    """
    f1_rows, f2_rows = [
        get_pairwise_significance(name, systems=systems)
        for name in (exp, exp + "_f2")
    ]
    print_latex_table(exp, zip_results(f1_rows, f2_rows))

# Experiment whose F1- and F2-optimized results are compared.
exp = "relevance"

# Prediction columns to compare, in the order they appear in the table.
systems = [
    "_svm_baseline_1",
    "_svm_fs-n_hp-n_1",
    "_svm_fs-n_hp-y_1",
    "_svm_fs-group_hp-n_1",
    "_svm_fs-group_hp-y_1",
    "_svm_fs-nbest_hp-n_1",
    "_svm_fs-nbest_hp-y_1",
    "_svm_fs-strata_hp-n_1",
    "_svm_fs-strata_hp-y_1",
]

exp_to_latex(exp, systems)

\begin{table*}[p]
\begin{small}
\begin{center}
\begin{tabular}{l|ccccccccc}
& \rotatebox[origin=l]{90}{baseline} & \rotatebox[origin=l]{90}{FS: none, HO: no} & \rotatebox[origin=l]{90}{FS: none, HO: yes} & \rotatebox[origin=l]{90}{FS: group, HO: no} & \rotatebox[origin=l]{90}{FS: group, HO: yes} & \rotatebox[origin=l]{90}{FS: nbest, HO: no} & \rotatebox[origin=l]{90}{FS: nbest, HO: yes} & \rotatebox[origin=l]{90}{FS: strata, HO: no} & \rotatebox[origin=l]{90}{FS: strata, HO: yes} \\
\hline
baseline                       &  -  & *** & *** & *** & *** & *** & *** & *** & *** \\
FS: none, HO: no               & *** &  -  &     &  *  &  *  &     & *** & *** & *** \\
FS: none, HO: yes              & *** &     &  -  &     &     &     &     &     & *** \\
FS: group, HO: no              & *** &  *  &     &  -  &     &     &     &  *  &  *  \\
FS: group, HO: yes             & *** &  *  &  *  &     &  -  &     &     &     &     \\
FS: nbest, HO: no              & *** &     &     &     &     &  -

## Binomial test on systems from different optimization objectives

In [7]:
import pandas
import scipy
from collections import Counter

def clean_name(name):
    """Strip the first two and the last "_"-separated pieces from a column name.

    Example: "_svm_fs-strata_hp-y_1" -> "fs-strata_hp-y".
    """
    pieces = name.split("_")
    middle = pieces[2:-1]
    return "_".join(middle)

# Compare the same system trained under two optimization objectives with a
# two-sided binomial test and print the p-value with a significance verdict.

# A bare `import scipy` does not guarantee that the stats submodule is loaded.
import scipy.stats

exp1 = "relevance"
fp1 = "data/results_%s_and_baseline.csv" % exp1
df1 = pandas.read_csv(fp1)  # Gold standard is always in the _0 column, predictions in the _1 column
exp2 = "relevance_f2"
fp2 = "data/results_%s_and_baseline.csv" % exp2
df2 = pandas.read_csv(fp2)  # Gold standard is always in the _0 column, predictions in the _1 column

assert fp1 != fp2

system1 = '_svm_fs-strata_hp-y_1'
system2 = '_svm_fs-strata_hp-y_1'
# The gold standard is taken from the first file only.
# NOTE(review): assumes both files list the same instances in the same order -- confirm.
gold = list(df1["_svm_baseline_0"])
pred1 = list(df1[system1])
pred2 = list(df2[system2])

if not (len(pred1) == len(pred2) == len(gold)):
    raise ValueError("prediction and gold columns differ in length: %d, %d, %d"
                     % (len(pred1), len(pred2), len(gold)))
# Checked before the test so a failure points at the actual cause.
assert pred1 != pred2  # Highly unlikely to be identical, but it is possible

# Binomial
n = sum(1 for a, b in zip(pred1, pred2) if a != b)  # Number of trials
k = sum(1 for a, b, g in zip(pred1, pred2, gold) if a != b and a == g)  # Number of successes
if hasattr(scipy.stats, "binomtest"):
    # SciPy >= 1.7; binom_test was removed in SciPy 1.12.
    p = scipy.stats.binomtest(k, n=n, p=0.5, alternative='two-sided').pvalue
else:
    p = scipy.stats.binom_test(k, n=n, p=0.5, alternative='two-sided')

if p <= 0.001:
    verdict = "SIGNIFICANT ***"
elif p <= 0.01:
    verdict = "SIGNIFICANT **"
elif p <= 0.05:
    verdict = "SIGNIFICANT *"
else:
    verdict = "NOT SIGNIFICANT"
# A single formatted string prints identically on Python 2 and 3.
print("%s , %s" % (p, verdict))

1.0 , NOT SIGNIFICANT
