In [1]:
import plotly.graph_objects as go
import plotly.express as px
import plotly
import pandas as pd
import os
import glob
import json
import numpy as np
import scipy.stats
import sys
from plotly.subplots import make_subplots
sys.path.append('./src')
os.chdir('../')

import re

In [14]:
from src.utils.get_concept_subsets import SUBSETS, READABLE
from scipy.stats import percentileofscore

# Functions

In [5]:
def compile_indiv_stats(df_filepath: str, setstop:int = None) -> pd.DataFrame:
    """
    Collates first-quarter, last-quarter and overall accuracy
    for all human subjects across all concepts.

    Parameters
    ----------
    df_filepath : str
        Path to a CSV whose first column is the index and which has the
        columns 'concept', 'subject', 'set', 'item_num', 'answer', 'response'.
    setstop : int, optional
        If given, only rows with 'set' <= setstop are used.

    Returns
    -------
    pd.DataFrame
        One row per retained (concept, subject) pair with the columns
        'concept', 'subject', 'first_quarter', 'last_quarter', 'overall'.

    Notes
    -----
    * A response scores 1 when it equals the answer, else 0.
    * Quarter boundaries are derived from the largest 'item_num' seen for the
      concept (after the setstop filter), not per subject.
    * A subject is dropped from a concept if their highest set is below 5 or
      if they have no items in the last quarter.
    * Concepts with no rows left after the setstop filter are skipped.
    """
    columns = ['concept', 'subject', 'first_quarter', 'last_quarter', 'overall']

    human_df = pd.read_csv(df_filepath, index_col=0)
    human_df['score'] = (human_df['answer'] == human_df['response']).astype(int)

    records = []
    for concept in human_df['concept'].unique():
        concept_df = human_df[human_df['concept'] == concept]
        if setstop is not None:
            concept_df = concept_df[concept_df['set'].astype(int) <= setstop]

        # Nothing left for this concept: max() below would raise on an empty column
        if concept_df.empty:
            continue

        total_items = max(concept_df['item_num'])
        first_quarter_end = total_items / 4
        last_quarter_start = 3 * (total_items / 4)

        for subject in concept_df['subject'].unique():
            subject_df = concept_df[concept_df['subject'] == subject]

            # remove subject from concept if they attempted less than 5 sets;
            # compare numerically, consistent with the setstop filter above
            if pd.to_numeric(subject_df['set']).max() < 5:
                continue

            last_quarter = subject_df[subject_df['item_num'] >= last_quarter_start]['score'].mean()
            # mean() of an empty selection is NaN: subject never reached the last quarter
            if np.isnan(last_quarter):
                continue

            records.append({
                'concept': concept,
                'subject': subject,
                'first_quarter': subject_df[subject_df['item_num'] <= first_quarter_end]['score'].mean(),
                'last_quarter': last_quarter,
                'overall': subject_df['score'].mean(),
            })

    return pd.DataFrame(records, columns=columns)


In [6]:
def compile_concept_human_stats(compiled_indiv_df):
    """
    Collates the mean and standard deviation of last quarter/overall
    accuracy for each concept given the distribution of human subjects
    that attempted each concept.

    Parameters
    ----------
    compiled_indiv_df : pd.DataFrame
        Per-subject statistics with the columns 'concept', 'overall' and
        'last_quarter'.

    Returns
    -------
    pd.DataFrame
        One row per concept with the columns 'concept', 'overall_mean',
        'overall_sd', 'last_quarter_mean', 'last_quarter_sd'. The standard
        deviations are population SDs (ddof=0).
    """
    columns = [
        'concept', 'overall_mean', 'overall_sd',
        'last_quarter_mean', "last_quarter_sd",
    ]

    records = []
    # Group by a scalar key so the group name is the concept itself on every
    # pandas version (a one-element list key yields a 1-tuple only on pandas >= 2).
    for concept, concept_df in compiled_indiv_df.groupby('concept'):
        records.append({
            'concept': concept,
            'overall_mean': concept_df['overall'].mean(),
            'overall_sd': concept_df['overall'].std(ddof=0),
            'last_quarter_mean': concept_df['last_quarter'].mean(),
            'last_quarter_sd': concept_df['last_quarter'].std(ddof=0),
        })

    return pd.DataFrame(records, columns=columns)

In [356]:
def format_shape(text: str):
    """
    Turn a "shape,color,size" triple into "<size word> <color> <shape>".

    The size field is a 1-based index into small / medium / large.
    """
    size_words = ('small', 'medium', 'large')
    shape, color, size = text.split(",")
    size_word = size_words[int(size) - 1]
    return f"{size_word} {color} {shape}"


def process_one_reply(obj, reply):
    """
    Extract the answer word a model gave for one object from its free-text reply.

    The object is rendered with format_shape and the reply is searched for
    that description followed by "->" or ":" and a word; that word is returned.

    Parameters
    ----------
    obj : str
        Object specification accepted by format_shape ("shape,color,size").
    reply : str
        Raw model reply. Anything that is not a string (e.g. a NaN read from
        a CSV) is treated as "no answer".

    Returns
    -------
    str
        The captured word, or the string "None" when nothing matches.
    """
    objstring = format_shape(obj)
    if not isinstance(reply, str):
        return "None"

    # The first two characters of the description are dropped before matching,
    # as in the original code -- presumably to tolerate a differently-cased
    # first letter in the reply; TODO confirm.
    # re.escape keeps the description literal; raw strings avoid invalid
    # "\s" / "\w" escape sequences.
    pattern = re.escape(objstring[2:]) + r"\s*(?:->|:)\s*(\w+)\b"
    match = re.search(pattern, reply)
    if match is None:
        return "None"
    return match.group(1)

def compile_model_stats(model_output_folder: str=None, 
                        model_name: str = None, answer_col:str='answers', 
                        check_for_L2:bool=True,
                        setstop:int = None,
                        verbose:bool=False):
    """
    Compiles overall, last and first quarter accuracies across all concepts
    for a model output directory.

    Parameters
    ----------
    model_output_folder : str
        Directory holding one CSV per concept. Each CSV needs 'concept',
        'item_num', the column named by `answer_col`, and either 'top_ans'
        or 'model_reply' + 'obj' (in which case 'top_ans' is parsed from the
        reply with process_one_reply). 'set' is needed when `setstop` is given.
    model_name : str, optional
        If given, a constant 'subject' column with this value is added.
    answer_col : str
        Column holding the gold true/false label.
    check_for_L2 : bool
        If True, only files whose name contains "L2" are read.
    setstop : int, optional
        If given, only rows with 'set' <= setstop are used.
    verbose : bool
        Print the number of unparsable answers and the quarter sizes.

    Returns
    -------
    pd.DataFrame
        One row per CSV that still has scorable rows, with the columns
        'concept', 'overall', 'last_quarter', 'first_quarter'
        (plus 'subject' when `model_name` is given).

    Notes
    -----
    Rows whose answer is not "true"/"false" (case and surrounding whitespace
    ignored) are dropped and counted. Files with no rows left after the
    setstop filter or after dropping unparsable answers are skipped instead
    of raising. Files are processed in sorted name order.
    """
    df_dict = {k: [] for k in ['concept', 'overall', 'last_quarter', 'first_quarter']}
    total_nans = 0
    quarter = []

    def _normalise(column):
        # Compare answers as lower-cased, stripped strings
        return column.map(lambda value: str(value).strip().lower())

    for file in sorted(os.listdir(model_output_folder)):
        if not file.endswith('csv') or (check_for_L2 and "L2" not in file):
            continue

        mdf = pd.read_csv(os.path.join(model_output_folder, file))
        if setstop is not None:
            # copy so the column assignments below never target a slice view
            mdf = mdf[mdf['set'] <= setstop].copy()
        if mdf.empty:
            if verbose:
                print(file, " has no rows to score; skipped")
            continue

        # Quarter boundaries use all items, including ones with unparsable answers
        total_items = max(mdf['item_num'])
        if 'model_reply' in mdf.columns:
            mdf['top_ans'] = mdf.apply(lambda x: process_one_reply(x['obj'], x['model_reply']), axis=1)

        with_ans = _normalise(mdf['top_ans']).isin(["true", "false"])
        total_nans += int((~with_ans).sum())
        mdf = mdf[with_ans].copy()
        if mdf.empty:
            if verbose:
                print(file, " has no parsable answers; skipped")
            continue

        mdf['score'] = (_normalise(mdf['top_ans']) == _normalise(mdf[answer_col])).astype(int)

        df_dict['concept'].append(mdf['concept'].iloc[0])
        df_dict['first_quarter'].append(mdf[mdf['item_num'] <= (total_items/4)]['score'].mean())
        df_dict['last_quarter'].append(mdf[mdf['item_num'] >= 3*(total_items/4)]['score'].mean())
        quarter.append(total_items/4)
        df_dict['overall'].append(mdf['score'].mean())

    if model_name is not None:
        df_dict['subject'] = [model_name] * len(df_dict['concept'])

        if verbose:
            print(model_name, " NaNs ", total_nans)

    if verbose:
        print("Size of a quarter: ", "mu=", np.mean(quarter), " std=", np.std(quarter))

    return pd.DataFrame.from_dict(df_dict)

In [109]:
def annotate_model_stats(model_df, human_indiv_stats):
    """
    Add human-relative columns to `model_df` (in place) and return it.

    For every row, the human subjects of the same concept are looked up in
    `human_indiv_stats` and four columns are added:

    * 'overall_percentile'        - weak percentile of the row's 'overall'
    * 'last_quarter_percentile'   - weak percentile of the row's 'last_quarter'
    * 'last_quarter_difference'   - 'last_quarter' minus the human median
    * 'human_last_quarter_median' - the human median 'last_quarter' itself
    """
    def humans_for(concept, column):
        # Scores of all human subjects on one concept
        same_concept = human_indiv_stats['concept'] == concept
        return human_indiv_stats[same_concept][column]

    def percentile_vs_humans(column):
        def _inner(scores):
            return percentileofscore(humans_for(scores.name, column).to_numpy(), scores, 'weak')
        return _inner

    def minus_human_median(scores):
        return scores - humans_for(scores.name, 'last_quarter').median()

    def human_median(scores):
        return humans_for(scores.name, 'last_quarter').median()

    per_concept = model_df.groupby(['concept'])

    model_df['overall_percentile'] = per_concept['overall'].transform(percentile_vs_humans('overall'))
    model_df['last_quarter_percentile'] = per_concept['last_quarter'].transform(percentile_vs_humans('last_quarter'))
    model_df['last_quarter_difference'] = per_concept['last_quarter'].transform(minus_human_median)
    model_df['human_last_quarter_median'] = per_concept['last_quarter'].transform(human_median)

    return model_df

In [272]:
def make_aggregate_table(human_agg_df, model_indiv_df):
    """
    Build the summary tables of accuracy by rule type for models and humans.

    Parameters
    ----------
    human_agg_df : pd.DataFrame
        Per-concept human statistics with the columns 'concept',
        'overall_mean', 'overall_sd', 'last_quarter_mean', 'last_quarter_sd'.
    model_indiv_df : pd.DataFrame
        Per-concept model statistics with the columns 'concept', 'subject',
        'overall', 'last_quarter'.

    Returns
    -------
    (model_table, human_row)
        model_table : mean 'overall' / 'last_quarter' per subject, once over
            all concepts (rule_type 'All') and once per rule type.
        human_row : per rule type ('All', then each rule type) the mean of the
            per-concept means and a combined SD computed as
            sqrt(sum(sd_i ** 2)) / n over the n per-concept SDs.

    Side effects
    ------------
    A 'rule_type' column ('Propositional' for concepts in SUBSETS['boolean'],
    otherwise 'FOL') is added to BOTH input frames in place.
    """
    def rule_type_of(concepts):
        return np.where(concepts.isin(SUBSETS['boolean']), 'Propositional', 'FOL')

    def combined_sd(sds):
        # sqrt of the summed variances, divided by the number of concepts
        return (1/len(sds)) * np.sqrt((sds ** 2).sum())

    model_indiv_df['rule_type'] = rule_type_of(model_indiv_df['concept'])
    human_agg_df['rule_type'] = rule_type_of(human_agg_df['concept'])

    # Select the score columns explicitly: GroupBy.mean's first positional
    # parameter is `numeric_only`, not a column list.
    score_columns = ['overall', 'last_quarter']
    model_split_rule = model_indiv_df.groupby(['subject', 'rule_type'])[score_columns].mean().reset_index()

    model_all_rule = model_indiv_df.groupby(['subject'])[score_columns].mean().reset_index()
    model_all_rule['rule_type'] = 'All'
    model_table = pd.concat([model_all_rule, model_split_rule])

    human_aggregations = {
        'overall_sd': combined_sd,
        'overall_mean': 'mean',
        'last_quarter_sd': combined_sd,
        'last_quarter_mean': 'mean',
    }

    human_split_rule = human_agg_df.groupby(['rule_type']).agg(human_aggregations).reset_index()

    # Aggregating the whole frame yields a Series; reshape it into a one-row
    # frame whose columns are the statistic names.
    human_all_rule = human_agg_df.agg(human_aggregations).reset_index().transpose()
    human_all_rule.columns = human_all_rule.iloc[0]
    human_all_rule = human_all_rule.iloc[1:]

    human_all_rule['rule_type'] = 'All'
    human_row = pd.concat([human_all_rule, human_split_rule]).reset_index(drop=True)

    return model_table, human_row

# Making DataFrames

In [389]:
# Only sets up to and including SETSTOP are analysed.
SETSTOP = 14
# Seed for the random human subsample so that re-running the notebook
# reproduces the same "human" pseudo-model.
RANDOM_SEED = 0

# Per-subject and per-concept human statistics
indiv_compiled_df = compile_indiv_stats('./data/compiled_individual_humans.csv', setstop=SETSTOP)
indiv_compiled_df.to_csv(f'./results/experiment_1/human_indiv_stats_to_set{SETSTOP}.csv')
concept_agg_df = compile_concept_human_stats(indiv_compiled_df)
concept_agg_df.to_csv(f'./results/experiment_1/human_concept_stats_to_set{SETSTOP}.csv')

# Per-concept statistics for every model: one "<model_id>_results" directory each
mdfs = []
model_directory = "./results/experiment_1/raw_results"
for file in sorted(os.listdir(model_directory)):
    model_output_folder = os.path.join(model_directory, file)
    # skip stray files (compile_model_stats lists the directory's contents)
    if not os.path.isdir(model_output_folder):
        continue
    model_id = file.split("_results")[0]
    mdfs.append(compile_model_stats(model_output_folder, model_name=model_id, answer_col='label', setstop=SETSTOP))

# One randomly chosen human subject per concept, treated as an extra "model"
random_human_df = indiv_compiled_df.sample(frac=1, random_state=RANDOM_SEED).drop_duplicates(subset=['concept'], keep='first')
random_human_df['subject'] = 'human'
model_df = pd.concat(mdfs + [random_human_df])
model_df = annotate_model_stats(model_df, indiv_compiled_df)


model_df.to_csv(f'./results/experiment_1/model_indiv_stats_to_set{SETSTOP}.csv')

# Make Stats Table

In [375]:
# Aggregate accuracies by rule type; note that make_aggregate_table also adds
# a 'rule_type' column to concept_agg_df and model_df in place.
model_table, human_row = make_aggregate_table(concept_agg_df, model_df)

In [376]:
# Display mean overall / last-quarter accuracy per subject and rule type
model_table

Unnamed: 0,subject,overall,last_quarter,rule_type
0,gemma_2b,0.736474,0.767718,All
1,gemma_2b-it,0.57959,0.659823,All
2,gemma_7b,0.762316,0.804085,All
3,gemma_7b-it,0.713022,0.798324,All
4,gpt2,0.678565,0.707071,All
5,gpt2_xl,0.679986,0.70153,All
6,gpt35,0.649088,0.704285,All
7,gpt4,0.731455,0.79792,All
8,human,0.724744,0.790469,All
9,llama2_70b,0.695684,0.738178,All


In [377]:
# Display the human means and combined SDs per rule type
human_row

Unnamed: 0,overall_sd,overall_mean,last_quarter_sd,last_quarter_mean,rule_type
0,0.011017,0.726063,0.016957,0.797623,All
1,0.013244,0.694758,0.020982,0.752982,FOL
2,0.019848,0.79788,0.028338,0.900036,Propositional


# Make delta graph

In [378]:
def make_delta_figure(models, model_all_stats_df, cutoffs, model_name_dict):
    """
    Build the "delta graph": per model, last-quarter accuracy against the
    human median, with concepts ordered by how the model compares to humans.

    The figure has one row per entry of `models` and two columns: concepts in
    SUBSETS['boolean'] on the left and concepts in SUBSETS['fol'] on the right.
    Within each panel the concepts are sorted (descending) by the percentile
    cutoff flags and then by 'last_quarter_difference'; the area between the
    model line and the human-median line is filled blue where the model is at
    or above the human median and grey where it is below. For every cutoff a
    dotted vertical line and a shaded band mark where the model's
    'last_quarter_percentile' first drops below that cutoff.

    Parameters
    ----------
    models : list
        Values of the 'subject' column to plot, one subplot row each.
    model_all_stats_df : pd.DataFrame
        Needs the columns 'subject', 'concept', 'last_quarter',
        'last_quarter_difference', 'last_quarter_percentile' and
        'human_last_quarter_median'.
    cutoffs : iterable of numbers
        Percentile thresholds; at most len(shades) (5) are supported because
        each cutoff indexes into `shades`.
    model_name_dict : dict
        Maps each entry of `models` to the label drawn next to its row.

    Returns
    -------
    plotly Figure

    Side effects
    ------------
    Sets pd.options.mode.chained_assignment = None and does not restore it.
    """
        # === Delta graph with relative ordering, using quartile and median
    # Silence pandas' chained-assignment warning: columns are assigned below on
    # frames obtained by boolean filtering.
    pd.options.mode.chained_assignment = None 

    human_color='rgba(10,10,10,0.3)'
    # Fill colours of the cutoff bands, indexed by the cutoff's rank after sorting
    shades = ['rgba(20,20,20,0.2)', "rgba(20,20,20,0.2)", "rgba(179,100,115,0.2)", "rgba(179,100,120,0.2)", "rgba(179,100,125,0.1)"]
    
    def fillcol(label):
        # label is 1 where the model is at/above the human median -> blue fill
        if label >= 1:
            return 'rgba(12,123,220, 0.4)'
        else:
            return human_color

    fig = make_subplots(rows=len(models), cols=2, 
                    # subplot_titles=["Propositional Rules", "Quantificational Rules"], 
                column_widths=[40/120, 80/120], 
                horizontal_spacing=0.02, 
                vertical_spacing=0.05, shared_yaxes=True, shared_xaxes=False)
    
    for i, model in enumerate(models):
        model_df = model_all_stats_df[(model_all_stats_df['subject'] == model)]
        # 1 = model last-quarter accuracy >= human median, 0 = below
        model_df['label'] = np.where(model_df['last_quarter_difference']>=0, 1, 0)

        # j == 0 -> left column (boolean concepts), j == 1 -> right column (FOL concepts)
        for j, concept_list in enumerate([SUBSETS['boolean'], SUBSETS['fol']]):

            rulegroupdf = model_df[model_df['concept'].isin(concept_list)]
            # (re-sorted on every iteration; only the first sort changes anything)
            cutoffs = sorted(cutoffs)
            for cutoff in cutoffs:
                rulegroupdf[f'cutoff_{cutoff}'] = rulegroupdf['last_quarter_percentile'] >= cutoff

            # Concepts passing more/lower cutoffs come first; ties broken by the difference to humans
            rulegroupdf = rulegroupdf.sort_values([f"cutoff_{c}" for c in cutoffs] + ["last_quarter_difference"], ascending=False)
            # Consecutive concepts with the same label share a group id (one fill region each)
            rulegroupdf['group'] = rulegroupdf['label'].ne(rulegroupdf['label'].shift()).cumsum()
            rulegroupdf['concept_name'] = rulegroupdf['concept'].apply(lambda x: READABLE[x])
            # x position of each concept after sorting
            rulegroupdf['concept_id'] = list(range(len(rulegroupdf)))

            # NOTE(review): presumably targets the column-1 x axis of row i+1 and
            # 33 is the number of boolean concepts -- confirm. This runs for both
            # values of j but always addresses the same axis.
            if i == 0:
                fig['layout']['xaxis']['range'] =  [0, 33]
            else:
                fig['layout'][f'xaxis{(i*2)+1}']['range'] =  [0, 33]

            # --- FILL BETWEEN MODEL AND HUMAN LINES (one region per run of equal labels)
            df = rulegroupdf.groupby('group')
            dfs = []
            for _, data in df:
                dfs.append(data)

            for df in dfs:
                # invisible model trace, then invisible human trace filled back to it
                fig.add_trace(go.Scatter(x=df['concept_id'], y = df['last_quarter'],
                                        line = dict(color='rgba(0,0,0,0)'), showlegend=False), row=i+1, col=j+1)
                
                fig.add_trace(go.Scatter(x=df['concept_id'], y = df['human_last_quarter_median'],
                                        line = dict(color='rgba(0,0,0,0)'),
                                        fill='tonexty', 
                                        fillcolor = fillcol(df['label'].iloc[0]), showlegend=False), row=i+1, col=j+1)


            # --- LINE PLOTS (legend entries only for the very first panel)
            fig.add_trace(go.Scatter(x=rulegroupdf['concept_id'], y=rulegroupdf['last_quarter'], mode="lines", 
                            marker={'color': 'rgba(12,123,220,1)'}, name="Model",
                            hoverinfo='text', text=rulegroupdf['concept_name'],
                            line_width=1.5, showlegend=j+i==0), 
                            row=i+1, col=j+1)
            fig.add_trace(go.Scatter(x=rulegroupdf['concept_id'], y=rulegroupdf['human_last_quarter_median'],mode="lines", 
                            marker={'color': human_color}, name="Human", 
                            hoverinfo='text', text=rulegroupdf['concept_name'],
                            line_width=1.5, showlegend=j+i==0),
                            row=i+1, col=j+1)
            
            # --- PERCENTILE CUTOFF MARKERS AND SHADED BANDS
            previous_starts = set()
            for k, cutoff in enumerate(cutoffs):
                # position of the first concept whose percentile is below this cutoff
                # (0 when no concept is below it, which the `> 0` test then skips)
                cutoff_start = rulegroupdf[f"cutoff_{cutoff}"].reset_index()[f"cutoff_{cutoff}"].ne(1).idxmax()

                if (cutoff_start < len(rulegroupdf) - 1) and (cutoff_start > 0):
                    # label only the first cutoff that starts at a given position
                    annot = "" if cutoff_start in previous_starts else f"P{cutoff}"
                    previous_starts.add(cutoff_start)

                    fig.add_vline(x=cutoff_start, line_width=2, line_dash="dot", line_color='black', row=i+1, col=j+1,
                                annotation={'text': annot, 'textangle': -90, 
                                            'font_size': 8, 'font_color':'rgba(179,80,115,1)'}, annotation_position='bottom right')
                    fig.add_vrect(
                            x0=rulegroupdf['concept_id'].iloc[cutoff_start], x1=rulegroupdf['concept_id'].iloc[-1],
                            fillcolor=shades[k], opacity=0.8,
                            layer="above", line_width=0, row=i+1, col=j+1)
                
        #  === PER-ROW LABELS: y-axis title left of column 1, model name right of column 2
        fig.add_annotation(xref='x domain',
                    yref='y domain',
                    x=-0.18,
                    y=0.5,
                    text='Last Quarter Accuracy', 
                    font_size=14,
                    showarrow=False,
                    textangle=-90,
                    row=i+1, col=1)
        
        fig.add_annotation(xref='x domain',
                    yref='y domain',
                    x=1.05,
                    y=0.5,
                    text=model_name_dict[model], 
                    font_size=15,
                    showarrow=False,
                    textangle=90,
                    row=i+1, col=2)
        
    # === COLUMN TITLES, placed under the last row: `i` still holds the final
    # loop index here (so `models` must not be empty).
    fig.add_annotation(xref='x domain',
                    yref='y domain',
                    x=0.5,
                    xanchor='center',
                    y=-0.2,
                    text='Propositional Rules', 
                    font_size=15,
                    showarrow=False,
                    row=i+1, col=1)
        
    fig.add_annotation(xref='x domain',
                    yref='y domain',
                    x=0.5,
                    y=-0.2,
                    text='Quantified Rules', 
                    font_size=15,
                    showarrow=False,
                    row=i+1, col=2)

    fig.update_layout(template='ggplot2', height=200*len(models), width=1000)
    fig.update_xaxes(showticklabels=False, dtick=1, tick0=0)

    # The right margin set here is set again (r=30) by the margin=dict(...) call below
    fig.update_layout(margin_r=240)
    fig.update_layout(font=dict(family='Times New Roman', size=12), legend=dict(
        orientation="h",
        entrywidth=40,
        yanchor="bottom",
        y=1,
        xanchor="left",
        x=0
    ))
    fig.update_layout(
        margin=dict(l=70, r=30, t=20, b=40),
    )

    fig.update_yaxes({'range': [0, 1.1], 'dtick': 0.2})

    # NOTE(review): only xaxis2 and xaxis4 are overridden, whatever len(models)
    # is, and xaxis4 is anchored to 'y2' just like xaxis2 -- confirm this is
    # intended. 77 is presumably the number of FOL concepts; the domain looks
    # like a copy of the value make_subplots generated -- TODO confirm.
    fig['layout'].xaxis2 = {'anchor': 'y2', 'domain': [0.3466666666666667, 1.0], 
                            'dtick': 1, 'showticklabels': False, 'tick0': 0, 'range': [0, 77]}
    
    fig['layout'].xaxis4 = {'anchor': 'y2', 'domain': [0.3466666666666667, 1.0], 
                            'dtick': 1, 'showticklabels': False, 'tick0': 0, 'range': [0, 77]}
    
    return fig

In [390]:
# Reload the annotated per-concept model statistics written by the "Making DataFrames" cell
model_df = pd.read_csv(f'./results/experiment_1/model_indiv_stats_to_set{SETSTOP}.csv')
# Display names for the row labels; keys must match the 'subject' values
# (i.e. the "<model_id>" part of the "<model_id>_results" directories).
model_to_pretty = {"gpt4": "GPT4", "gemma_7b": "Gemma-7b", "mixtralinst": "Mixtral 8x7B Inst", "llama2_70b": "Llama2-70b", 'gpt2': "GPT2", "human": "Human Subsample"}
models = [
        "gpt4", 
        "gemma_7b", "mixtralinst", 
        # "llama2_70b",
        #  "Gemma-2b",
        "human"
        # "gpt2"
        ]
# Cutoffs are human percentiles of last-quarter accuracy
fig = make_delta_figure(models, model_df, [1, 10, 20, 25], model_to_pretty)
fig.write_image("./analysis/e1_delta_graph.pdf")
fig.show()

In [348]:
def calculate_distance(concept_agg_df, model_df):
    """
    Sum, over concepts, of the model's standardised distance from the human mean.

    For each concept the distance is
    |model last_quarter - human last_quarter_mean| / human last_quarter_sd,
    with an SD of exactly 0 replaced by 1e-6.

    Parameters
    ----------
    concept_agg_df : pd.DataFrame
        Per-concept human statistics with the columns 'concept',
        'last_quarter_mean' and 'last_quarter_sd' (one row per concept).
    model_df : pd.DataFrame
        Results of a single model with the columns 'concept' and
        'last_quarter' (exactly one row per concept).

    Returns
    -------
    float
        The summed distance. Concepts that have no row in `concept_agg_df`
        are skipped; 0.0 is returned if nothing is left.

    Raises
    ------
    AssertionError
        If `model_df` has more than one row for a concept.
    """
    distances = []
    # A scalar group key gives a scalar concept name on every pandas version
    for concept, concept_df in model_df.groupby('concept'):
        assert len(concept_df) == 1, "model_df must contain results only from one model"

        agg_data = concept_agg_df[concept_agg_df['concept'] == concept]
        # Must be checked before .item(), which raises on an empty selection
        if len(agg_data) == 0:
            continue

        concept_mean = agg_data['last_quarter_mean'].item()
        concept_std = agg_data['last_quarter_sd'].item()
        if concept_std == 0:
            # avoid division by zero when all humans scored identically
            concept_std = 1e-6

        distances.append(np.abs(concept_df['last_quarter'].item() - concept_mean) / concept_std)

    return np.sum(distances)

def permutation_test(concept_agg_df, indiv_compiled_df, num_permutations: int = 100, seed: int = None):
    """
    Build a reference distribution of summed human distances by resampling.

    Each human subject's distance on a concept is
    |last_quarter - human last_quarter_mean| / human last_quarter_sd
    (an SD of exactly 0 is replaced by 1e-6), i.e. a 1-D Mahalanobis distance.
    Every draw picks one subject distance at random for each concept and sums
    them across concepts.

    Parameters
    ----------
    concept_agg_df : pd.DataFrame
        Per-concept human statistics with the columns 'concept',
        'last_quarter_mean' and 'last_quarter_sd' (one row per concept).
    indiv_compiled_df : pd.DataFrame
        Per-subject statistics with the columns 'concept', 'subject' and
        'last_quarter' (one row per concept/subject pair).
    num_permutations : int
        Number of random draws.
    seed : int, optional
        If given, draws come from a private generator seeded with this value
        and are reproducible; if None, numpy's global random state is used.

    Returns
    -------
    list of np.ndarray
        `num_permutations` arrays of shape (1,), each holding one summed
        distance (use `.item()` to get the float). Concepts without a row in
        `concept_agg_df` are skipped.
    """
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Per concept: the distance of every subject from the concept's human mean
    concept_distances = {}
    for concept, concept_df in indiv_compiled_df.groupby('concept'):
        agg_data = concept_agg_df[concept_agg_df['concept'] == concept]
        if len(agg_data) == 0:
            continue

        concept_mean = agg_data['last_quarter_mean'].item()
        concept_std = agg_data['last_quarter_sd'].item()
        if concept_std == 0:
            # avoid division by zero when all humans scored identically
            concept_std = 1e-6

        concept_distances[concept] = [
            np.abs(subject_df['last_quarter'].item() - concept_mean) / concept_std
            for _, subject_df in concept_df.groupby('subject')
        ]

    distances = []
    for _ in range(num_permutations):
        distance = np.zeros(1)
        for subject_distances in concept_distances.values():
            distance = distance + rng.choice(subject_distances, 1)
        distances.append(distance)

    return distances
        

In [358]:
# Reference distribution: 1000 random draws of summed human distances
# (each element of `distances` is a length-1 array).
distances = permutation_test(concept_agg_df, indiv_compiled_df, 1000)
px.histogram(distances)

In [365]:
# Percentile of a model's summed distance within the human reference distribution.
# NOTE(review): in the visible notebook `distance` is only assigned by the
# model loop in a later cell, so this cell fails on a fresh top-to-bottom run.
percentileofscore([x.item() for x in distances], distance, 'weak')

100.0

In [366]:
mdf = compile_model_stats('./results/experiment_1/raw_results/gpt4_results', check_for_L2=False,answer_col="label")

model_directory = './results/experiment_1/raw_results/'
# The human reference distribution does not change inside the loop: unpack it once
human_distance_values = [x.item() for x in distances]
for file in sorted(os.listdir(model_directory)):
    model_output_folder = os.path.join(model_directory, file)
    # skip stray files (compile_model_stats lists the directory's contents)
    if not os.path.isdir(model_output_folder):
        continue
    model_id = file.split("_results")[0]
    model_df = compile_model_stats(model_output_folder, model_name=model_id, answer_col='label', setstop=SETSTOP)
    distance = calculate_distance(concept_agg_df, model_df)

    # model id, summed distance to the human means, percentile among human draws
    print(model_id, distance, percentileofscore(human_distance_values, distance, 'weak'))

# model_df = annotate_model_stats(pd.concat(mdfs), indiv_compiled_df)
# model_df.to_csv(f'./results/experiment_1/model_indiv_stats_to_set{SETSTOP}.csv')

# calculate_distance(concept_agg_df, mdf)

gemma_7b-it 71.76963496212225 1.8
gemma_2b-it 111240.96756610135 100.0


gpt2_xl 124.09517643924596 100.0
llama2_70b 99.77732913976206 97.7
gpt4 73.75093999185881 4.5
human 0.0 0.0
gpt35 106.35216937170816 99.5
gemma_7b 58.658865772765594 0.0
gpt2 121.08014387986486 100.0
mixtralinst 71.43929943695346 1.7000000000000002
gemma_2b 90.44409757677707 78.30000000000001
llama2_7b 85.36762344201045 52.6
llama2_70bchat 111.39138390454428 100.0


In [None]:
# Distance of one GPT-4 run (L2 files only, no setstop) from the human means.
# NOTE(review): absolute, user-specific path -- this cell will not run on
# another machine; consider a path relative to the project root.
mdf = compile_model_stats('/users/aloo1/thesis/rq1_success/gpt4_results', check_for_L2=True,answer_col="label")
calculate_distance(concept_agg_df, mdf)

63.947490345959125