In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)
pd.options.mode.chained_assignment = None
import numpy as np

import scipy.stats

import os

In [2]:
# Instance-difficulty metrics that are aggregated and compared throughout this notebook.
comp_cols = [ 'boundary_prox', 'losses', 'pvi', 'times_forgotten', 'instance_hardness', 'irt_difficulty', 'tok_len' ]

# The input CSVs live in the `data` folder of the parent directory.  Build the
# paths explicitly instead of os.chdir('..') and back: a failed read can then
# never leave the kernel in the wrong working directory, and re-running the
# cell always resolves the same files.
this_dir = os.getcwd()  # kept so code that refers to `this_dir` keeps working
data_dir = os.path.join( os.pardir, 'data' )

id_counts = pd.read_csv( os.path.join( data_dir, 'dataset_id_counts.csv' ) )
model_diff = pd.read_csv( os.path.join( data_dir, 'model_diff.csv' ) )
# the strategy literally named 'None' (a string) is loaded as a missing value;
# restore the label so it can be used as a group / index key
model_diff['strat'] = model_diff['strat'].fillna('None')

def create_join_gdf( in_df, id_counts ):
    """Aggregate the comparison metrics per (score_var, strat) and add 95% CIs.

    Parameters
    ----------
    in_df : pandas.DataFrame
        Must contain the columns 'score_var', 'strat' and every metric listed
        in the module-level ``comp_cols``.
    id_counts : pandas.DataFrame
        Must contain 'score_var', 'split_full' and 'n_unique_IDs'.  The
        strategy name is taken as the text after the last '-' in
        'split_full'.  This frame is NOT modified.

    Returns
    -------
    pandas.DataFrame
        Indexed by (score_var, strat).  For every metric ``m`` in
        ``comp_cols`` it holds ``m_mean``, ``m_var``, ``m_cilow`` and
        ``m_cihigh``, plus the remaining ``id_counts`` columns
        (including 'n_unique_IDs').
    """
    full_gdf = in_df.groupby(['score_var', 'strat'])[ comp_cols ].agg(['mean', 'var'])
    # flatten the (metric, statistic) column MultiIndex into 'metric_statistic'
    full_gdf.columns = [ '_'.join(pair) for pair in full_gdf.columns ]

    # need to consider N from the number of original data points / sources of variation
    # NOTE; we do this in "id_counts" since "sub_ledger" is too large
    # id_counts = sub_ledger.groupby(['score_var', 'split_full'])[['ID']].nunique().rename(
    #     columns={'ID': 'n_unique_IDs'}).reset_index()

    # work on a copy so that the caller's DataFrame does not silently gain a
    # 'strat' column as a side effect of calling this function
    id_counts = id_counts.copy()
    id_counts['strat'] = id_counts['split_full'].str.split('-').str[-1]
    id_counts = id_counts.drop(columns='split_full')

    # turn the group keys into ordinary columns so the merge keys are explicit
    # on both sides, then restore them as the index
    join_gdf = pd.merge( full_gdf.reset_index(), id_counts, how='left', on=['score_var', 'strat'] )
    join_gdf = join_gdf.set_index(['score_var', 'strat'])

    # add confidence intervals according to these Ns
    z = 1.96  # two-sided 95% normal quantile
    root_n = np.sqrt(join_gdf['n_unique_IDs'])
    for c in comp_cols:
        # standard error of the mean: sample SD / sqrt(N)
        this_sem = (join_gdf[c+'_var'])**0.5 / root_n

        join_gdf[c+'_cilow'] = join_gdf[c+'_mean'] - z*this_sem
        join_gdf[c+'_cihigh'] = join_gdf[c+'_mean'] + z*this_sem

    return join_gdf

In [3]:
# Per-(score_var, strat) metric summary with confidence intervals.
join_gdf_wer = create_join_gdf(in_df=model_diff, id_counts=id_counts)

In [8]:
def calculate_zscore(rand_row, hard_row, met):
    """Pooled-variance z statistic comparing the hard and random samples.

    Parameters
    ----------
    rand_row, hard_row : mapping (e.g. pandas.Series)
        Summary of the random / hard sample.  Each must provide
        ``met + '_mean'``, ``met + '_var'`` and ``'n_unique_IDs'``.
    met : str
        Metric name used as the key prefix.

    Returns
    -------
    float
        Difference in means divided by the pooled standard error.  The sign
        is oriented so that a positive value matches the expected direction:
        random > hard for 'boundary_prox' and 'pvi', hard > random for every
        other metric.
    """
    rand_mean, rand_var = rand_row[met+'_mean'], rand_row[met+'_var']
    hard_mean, hard_var = hard_row[met+'_mean'], hard_row[met+'_var']

    n_rand, n_hard = rand_row['n_unique_IDs'], hard_row['n_unique_IDs']
    # pooled variance: the two sample variances weighted by their degrees of freedom
    pooled_var = ( ((n_rand-1)*rand_var) + ((n_hard-1)*hard_var) ) / ( n_rand+n_hard-2 )
    # standard error of the difference between the two means
    pooled_sem = ( (pooled_var/n_rand) + (pooled_var/n_hard) )**0.5

    # expect rand sample to have higher inv metrics
    if met in [ 'boundary_prox', 'pvi' ]:
        this_zscore = (rand_mean - hard_mean) / pooled_sem
    # ...hard sample to have higher prop metrics
    else:
        this_zscore = (hard_mean - rand_mean) / pooled_sem

    return this_zscore



# Display label for each score variable (dependent variable) key.
sv_d = dict(
    Anxiety='Anxiety',
    Numeracy='Numeracy',
    SubjectiveLit='Literacy',
    TrustPhys='Trust',
    wer='Depr.',
)

# Short table label for each comparison metric.
ic_d = dict(
    boundary_prox='BP',
    losses='Loss',
    times_forgotten='TF',
    irt_difficulty='IRT',
    instance_hardness='IH',
    pvi='PVI',
    tok_len='SL',
)


def create_table2(this_join_gdf):
    r"""Build the metric-by-dependent-variable table of mean differences.

    For every score variable found in the first index level of
    ``this_join_gdf`` the 'Constant' strategy row (treated as the hard
    sample) is compared with the 'None' strategy row (treated as the random
    sample) on each metric in the module-level ``comp_cols``.

    Parameters
    ----------
    this_join_gdf : pandas.DataFrame
        Indexed by (score_var, strat) and holding ``<metric>_mean``,
        ``<metric>_var`` and ``n_unique_IDs`` columns, as produced by
        ``create_join_gdf``.

    Returns
    -------
    pandas.DataFrame
        One row per metric, labelled ``\red{..}`` for BP and PVI and
        ``\blue{..}`` for the others; one column per dependent-variable
        display name.  Each cell is the string ``hard_mean - rand_mean``
        with three decimals, followed by '*' when the one-sided z-test is
        significant at the Bonferroni-corrected 0.05 level.
    """
    # apply bonferroni correction: one test per metric
    alpha = 0.05 / len(ic_d)

    # collect plain records and build the frame once, instead of growing a
    # DataFrame with pd.concat on every iteration
    records = []
    for svar in np.unique( this_join_gdf.index.get_level_values(0) ):

        rand_row = this_join_gdf.loc[(svar, 'None'), :]
        hard_row = this_join_gdf.loc[(svar, 'Constant'), :]

        for met in comp_cols:
            # standard deviation is the square root of the variance
            rand_mean, rand_std = rand_row[met+'_mean'], rand_row[met+'_var']**0.5
            hard_mean, hard_std = hard_row[met+'_mean'], hard_row[met+'_var']**0.5

            # note we consider the random sample to be "true"
            this_zscore = calculate_zscore(rand_row, hard_row, met)

            total_prob = scipy.stats.norm.cdf(-this_zscore)  # one-sided
            this_sig = '*' if total_prob < alpha else ''

            # root of the average of the two variances
            pool_std = ((hard_std**2 + rand_std**2) / 2)**0.5
            records.append({
                'Dep. Var.': sv_d[svar], 'Metric': ic_d[met],
                'Mean Diff.': f'{hard_mean-rand_mean:.3f}{this_sig}',
                'Pool. SD': f'{pool_std:.3f}',
            })

    mean_diff_df = pd.DataFrame( records, columns=['Dep. Var.', 'Metric', 'Mean Diff.', 'Pool. SD'] )

    mean_diff_df = mean_diff_df.set_index(['Dep. Var.', 'Metric'])
    mean_diff_df.index.names = [ None, None ]
    # wrap the metric label in a LaTeX colour macro: red for BP / PVI, blue otherwise
    mean_diff_df.index = pd.MultiIndex.from_tuples( [ (t[0], ( '\\red{' + t[1] +'}' if t[1] in ['BP', 'PVI'] else
                                                            '\\blue{' + t[1] +'}' ) ) for t in mean_diff_df.index ] )

    # only 'Mean Diff.' is reported; pivot so metrics are rows and dependent
    # variables are columns ('Pool. SD' is dropped here)
    mean_diff_df = mean_diff_df[['Mean Diff.']].unstack(level=-1).T
    mean_diff_df.index = [ t[1] for t in mean_diff_df.index ]
    return mean_diff_df

In [9]:
# Mean-difference table (hard vs. random sample) for every dependent variable.
table_wer = create_table2(this_join_gdf=join_gdf_wer)

In [10]:
# Show the table with the dependent variables in presentation order.
table_wer.loc[:, ['Anxiety', 'Literacy', 'Numeracy', 'Trust', 'Depr.']]

Unnamed: 0,Anxiety,Literacy,Numeracy,Trust,Depr.
\blue{IH},0.000,0.034*,0.032*,0.083*,0.000
\blue{IRT},0.043,1.200*,0.530,1.577*,0.358
\blue{Loss},0.001,0.063*,0.027*,0.093*,0.006
\blue{SL},0.996,0.867,0.907,-2.187,0.795
\blue{TF},0.159*,0.599*,0.361*,0.688*,0.409*
\red{BP},-0.004,-0.051*,-0.103*,-0.101*,-0.033*
\red{PVI},0.022,0.087,0.104,0.153,-0.007


In [11]:
# The row labels are LaTeX macros (\red{..} / \blue{..}).  The default for
# `escape` is read from pandas configuration, so pass it explicitly to make
# sure the backslashes and braces are written verbatim rather than escaped.
table_wer.to_latex( 'comp_mean_diff_inc_wer.tex', escape=False )