### Memory Profile Helper

In [1]:
import tracemalloc
from functools import wraps

def profile(func):
    @wraps(func)
    def profile_wrapper(*args, **kwargs):
        tracemalloc.start()
        result = func(*args, **kwargs)
        current, peak = tracemalloc.get_traced_memory()
        print(f"Peak Memory Usage: {peak/1000} MB")
        tracemalloc.clear_traces()
        return result
    return profile_wrapper

### Original

[Source](https://github.com/chipkent/science_fair_unfairness/)

In [2]:
from typing import List, Dict
import pandas
import numpy as np
import scipy.stats
import math

def load_session(session) -> pandas.DataFrame:
    """Load data for a session."""
    file = f"data/{session.lower()}.csv"
    df = pandas.read_csv(file, sep="|", na_values=["-", chr(8722)])

    # there appear to be some spurious zero values
    for c in df.columns:
        if c != "Judge":
            df[c].replace(0, np.NaN)

    return df

def compute_means(df:pandas.DataFrame) -> pandas.DataFrame:
    """Compute mean scores and errors."""
    projects = []
    means = []
    std_errs = []

    for c in df.columns:
        if c == "Judge":
            continue

        x = df[c]
        x = x[~np.isnan(x)]
        x = np.sort(x)
        x_drop_high_low = x[1:-1]

        m = np.mean(x_drop_high_low)
        s = scipy.stats.sem(x_drop_high_low)

        projects.append(c)
        means.append(m)
        std_errs.append(s)

    df = pandas.DataFrame({"Project":projects, "Mean":means, "StdErr":std_errs})
    df = df.sort_values(by=['Mean'], ascending=False)
    return df

def analyze_pair_wise_means(means:pandas.DataFrame) -> pandas.DataFrame:
    """Compare the mean estimates in a pairwise way to determine the odds that one project should win over another."""
    project1 = []
    project2 = []
    mean1 = []
    stderr1 = []
    mean2 = []
    stderr2 = []
    mean_diff = []
    ste_diff = []
    z_diff = []
    p_diff = []

    for i in range(len(means)):
        p1 = means["Project"][i]
        m1 = means["Mean"][i]
        s1 = means["StdErr"][i]

        for j in range(i,len(means)):
            p2 = means["Project"][j]
            m2 = means["Mean"][j]
            s2 = means["StdErr"][j]

            if p1 == p2:
                continue

            d = m1-m2
            s = math.sqrt(s1*s1 + s2*s2)
            z = d/s
            p = scipy.stats.norm.sf(-z)

            project1.append(p1)
            project2.append(p2)
            mean1.append(m1)
            stderr1.append(s1)
            mean2.append(m2)
            stderr2.append(s2)
            mean_diff.append(d)
            ste_diff.append(s)
            z_diff.append(z)
            p_diff.append(p)

    return pandas.DataFrame({
        "Project1":project1,
        "Project2":project2,
        "Mean1":mean1,
        "StdErr1":stderr1,
        "Mean2":mean2,
        "StdErr2":stderr2,
        "MeanDiff":mean_diff,
        "StdErrDiff":ste_diff,
        "ZDiff":z_diff,
        "P1Wins":p_diff,
        })

In [3]:
@profile
def run_original():
    session = "JA"
    df = load_session(session)
    df = compute_means(df)
    df = analyze_pair_wise_means(df)
    return df

### "Maxified"

In [4]:
import scipy.stats as ss
import pandas as pd
import numpy as np

def load_session_max(session: str) -> pd.DataFrame:
    """Load data for a session."""
    df = pd.read_csv(f"data/{session.lower()}.csv", sep="|", na_values=["-", chr(8722)])
    df = df.replace(0, np.NaN)
    return df

def compute_means_max(df: pd.DataFrame) -> pd.DataFrame:
    """Compute mean scores and errors."""
    # drop highest and lowest scores
    for col in df.columns[1:]:
        df[col] = df[col].apply(lambda x: np.NaN if (x == df.max()[col]) | (x == df.min()[col]) else x)
    df = df[df.columns[1:]].agg(['mean','sem']).T.reset_index()
    df = df.rename(columns={"index": "project"})
    return df

def analyze_pair_wise_means_max(means: pd.DataFrame) -> pd.DataFrame:
    """Compare the mean estimates in a pairwise way to determine the odds that one project should win over another."""
    df = means.merge(means, how="cross", suffixes=('_a', '_b'))
    df = df[df["project_a"] != df["project_b"]]
    # calculate project A odds of winning
    df = df.assign(**{
        "z": lambda d: (d["mean_a"] - d["mean_b"]) / np.sqrt((d["sem_a"]**2 + d["sem_b"]**2)),
        "project_a_wins": lambda d: np.round(ss.norm.sf(-d["z"]), 2)
    })
    df = df[["project_a", "project_b", "project_a_wins"]]
    df = df[df["project_a_wins"] >= 0.5].reset_index(drop=True)
    return df

In [5]:
@profile
def run_max():
    session = "JA"
    df = load_session_max(session)
    df = compute_means_max(df)
    df = analyze_pair_wise_means_max(df)
    return df

### Comparisons

In [6]:
run_max()

Peak Memory Usage: 479.797 MB


Unnamed: 0,project_a,project_b,project_a_wins
0,JA2,JA1,0.71
1,JA2,JA6,0.87
2,JA2,JA7,1.0
3,JA2,JA4,1.0
4,JA2,JA3,1.0
5,JA1,JA6,0.69
6,JA1,JA7,1.0
7,JA1,JA4,1.0
8,JA1,JA3,1.0
9,JA6,JA7,1.0


In [7]:
run_original()

Peak Memory Usage: 509.44 MB


Unnamed: 0,Project1,Project2,Mean1,StdErr1,Mean2,StdErr2,MeanDiff,StdErrDiff,ZDiff,P1Wins
0,JA2,JA1,193.0,10.893423,184.375,10.766909,8.625,15.316429,0.563121,0.713324
1,JA2,JA6,193.0,10.893423,170.166667,10.084366,22.833333,14.844565,1.538161,0.937995
2,JA2,JA7,193.0,10.893423,124.555556,5.467355,68.444444,12.188463,5.61551,1.0
3,JA2,JA4,193.0,10.893423,91.166667,7.972522,101.833333,13.499177,7.54367,1.0
4,JA2,JA3,193.0,10.893423,45.571429,21.594595,147.428571,24.186633,6.095457,1.0
5,JA1,JA6,184.375,10.766909,170.166667,10.084366,14.208333,14.751976,0.963148,0.832263
6,JA1,JA7,184.375,10.766909,124.555556,5.467355,59.819444,12.075525,4.953776,1.0
7,JA1,JA4,184.375,10.766909,91.166667,7.972522,93.208333,13.397293,6.957251,1.0
8,JA1,JA3,184.375,10.766909,45.571429,21.594595,138.803571,24.129916,5.752344,1.0
9,JA6,JA7,170.166667,10.084366,124.555556,5.467355,45.611111,11.471112,3.976172,0.999965
