# Compare two splits

This notebook can be run on two folders that should contain the same split (to test that the seeded randomizer produces replicable results), or on two different sets of splits to compare them to each other.

In [6]:
""" Define which split directories to compare. """

# Directories holding the two splits to compare. The comparison cells join
# each file name onto these paths and use the last four characters of each
# path as the label shown in the printed report.
# These should contain identical splits
base_a = "/data/splits/sub-all_hem-A_samp-glasser_prob-fornito/batch-train00508"
base_b = "/data/splits/sub-all_hem-A_samp-glasser_prob-fornito/batch-train00508b"


In [2]:
import pickle
import pandas as pd

def getdf(p):
    """ Load and return a dataframe whether it's from a pickled df or a csv or tsv.

    :param p: path to the file, as a str, bytes, or any os.PathLike object
        (e.g. pathlib.Path). The loader is picked from the file name suffix:
        ".df" is unpickled, ".csv" is read as comma-separated text and
        ".tsv" as tab-separated text.
    :returns: the loaded object (a DataFrame for csv/tsv; whatever was
        pickled for ".df"), or None when the suffix is not recognized.
    """

    # Imported locally so this cell does not depend on a later `import os`.
    from os import fsdecode

    # Normalize to str so Path objects work; slicing a Path would raise TypeError.
    path = fsdecode(p)

    if path.endswith(".df"):
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        with open(path, "rb") as f:
            return pickle.load(f)
    if path.endswith(".csv"):
        return pd.read_csv(path)
    if path.endswith(".tsv"):
        return pd.read_csv(path, sep="\t")
    return None


In [3]:
from math import sqrt

def report_comparison(a_df=None, b_df=None, a_label="a", b_label="b", common_label="_", tolerance=0.01):
    """ Compare two dataframes and return a description of their differences.

    :param a_df: first dataframe
    :param b_df: second dataframe
    :param a_label: short label printed next to a_df's shape
    :param b_label: short label printed next to b_df's shape
    :param common_label: label for the pair (e.g. the shared file name)
    :param tolerance: rms error below which the frames are reported as "=="
    :returns: a two-line string; the first line holds the label, "==" or "!=",
        the count of shared column ids and (when computed) the rms error,
        the second line holds both shapes.

    The rms error is only computed when the column ids all match, the shapes
    are equal, there is at least one row, and the row/column labels line up.
    Two empty frames with matching column ids are reported as "==".
    """

    n_match = len(set(a_df.columns).intersection(b_df.columns))
    n_possible = max(len(a_df.columns), len(b_df.columns))
    ids_match = n_match == n_possible
    match_string = "{:,} / {:,} ids match".format(n_match, n_possible)

    rms_error = None
    if ids_match and (a_df.shape == b_df.shape) and (len(a_df) > 0):
        # Subtraction aligns on labels. If the labels differ, the result grows
        # to their union and the unmatched cells become NaN, which mean() skips.
        # Only trust the rms when alignment added nothing; otherwise frames
        # that merely overlap could be reported as equal.
        diff = a_df - b_df
        if diff.shape == a_df.shape:
            rms_error = sqrt((diff ** 2).mean().mean())

    equality_string = "!="
    if rms_error is not None and rms_error < tolerance:
        equality_string = "=="
    elif ids_match and len(a_df) == 0 and len(b_df) == 0:
        equality_string = "=="

    rms_string = ""
    if rms_error is not None:
        rms_string = "; rms error {:5.2f}".format(rms_error)

    return "\n".join([
        "{:<40} : {} : {}{}".format(common_label, equality_string, match_string, rms_string),
        "{}{}: {:<12};  {}: {:<12}".format(" " * 48, a_label, str(a_df.shape), b_label, str(b_df.shape)),
    ])


In [7]:
""" Compare each of the 12 files per folder against its counterpart and print the results. """

import os

# Visit every (split-by, parcel-by, file template) combination in one flat
# loop; split-by varies slowest and the file template varies fastest.
for S, P, F in (
    (split_by, parcel_by, template)
    for split_by in ('wellid', 'glasser')
    for parcel_by in ('wellid', 'glasser')
    for template in (
        "{}s_splitby-{}.csv",
        "parcelby-{}_splitby-{}.raw.df",
        "parcelby-{}_splitby-{}.srs.df",
    )
):
    filename = F.format(P, S)

    # Load the same-named file from each of the two split folders.
    df_a = getdf(os.path.join(base_a, filename))
    df_b = getdf(os.path.join(base_b, filename))

    # Label each side with the last four characters of its folder path.
    print(report_comparison(
        common_label=filename,
        a_df=df_a, b_df=df_b,
        a_label=base_a[-4:], b_label=base_b[-4:],
    ))
            

wellids_splitby-wellid.csv               : == : 320 / 320 ids match
                                                0508: (0, 320)    ;  508b: (0, 320)    
parcelby-wellid_splitby-wellid.raw.df    : == : 320 / 320 ids match; rms error  0.00
                                                0508: (15745, 320);  508b: (15745, 320)
parcelby-wellid_splitby-wellid.srs.df    : == : 320 / 320 ids match; rms error  0.00
                                                0508: (15745, 320);  508b: (15745, 320)
glassers_splitby-wellid.csv              : == : 132 / 132 ids match
                                                0508: (0, 132)    ;  508b: (0, 132)    
parcelby-glasser_splitby-wellid.raw.df   : == : 132 / 132 ids match; rms error  0.00
                                                0508: (15745, 132);  508b: (15745, 132)
parcelby-glasser_splitby-wellid.srs.df   : == : 132 / 132 ids match; rms error  0.00
                                                0508: (15745, 132);  508b: (15745, 1

In [8]:
""" Compare each of the raw/srs pairs against each other and print the results. """

# For every split-by / parcel-by pairing, and for each of the two folders,
# compare the raw expression frame against its srs counterpart.
for S, P, D in (
    (split_by, parcel_by, folder)
    for split_by in ('wellid', 'glasser')
    for parcel_by in ('wellid', 'glasser')
    for folder in (base_a, base_b)
):
    df_raw = getdf(os.path.join(D, "parcelby-{}_splitby-{}.raw.df".format(P, S)))
    df_srs = getdf(os.path.join(D, "parcelby-{}_splitby-{}.srs.df".format(P, S)))

    print(report_comparison(
        common_label="parcelby-{}_splitby-{}".format(P, S),
        a_df=df_raw, b_df=df_srs,
        a_label="raw", b_label="srs",
    ))


parcelby-wellid_splitby-wellid           : != : 320 / 320 ids match; rms error  7.38
                                                raw: (15745, 320);  srs: (15745, 320)
parcelby-wellid_splitby-wellid           : != : 320 / 320 ids match; rms error  7.38
                                                raw: (15745, 320);  srs: (15745, 320)
parcelby-glasser_splitby-wellid          : != : 132 / 132 ids match; rms error  7.38
                                                raw: (15745, 132);  srs: (15745, 132)
parcelby-glasser_splitby-wellid          : != : 132 / 132 ids match; rms error  7.38
                                                raw: (15745, 132);  srs: (15745, 132)
parcelby-wellid_splitby-glasser          : != : 330 / 330 ids match; rms error  7.38
                                                raw: (15745, 330);  srs: (15745, 330)
parcelby-wellid_splitby-glasser          : != : 330 / 330 ids match; rms error  7.38
                                                raw: (15745,

If the splits are actually the same, everything from the first loop should be equal in every way. Everything in the second loop should have matching ids, but the values should not be equal: the 'raw' and 'srs' versions of the expression data have the same wellid and probe labels, but the values have been adjusted. From spot-checking, the root-mean-squared error between raw and srs appears to be in the ballpark of 7 or 8.