In [2]:
import plotly.graph_objects as go
import plotly.express as px
import plotly
import pandas as pd
import os
import glob
import json
import numpy as np
import scipy.stats
import sys
from plotly.subplots import make_subplots
sys.path.append('./src')
os.chdir('../')

import re

In [3]:
from src.utils.get_concept_subsets import SUBSETS, READABLE
from scipy.stats import percentileofscore

# Functions

In [5]:
def compile_indiv_stats(df_filepath: str, setstop:int = None) -> pd.DataFrame:
    """
    Collates last-quarter, first quarter and overall accuracy 
    for all human subjects across all concepts
    """
    human_df = pd.read_csv(df_filepath, index_col=0)
    human_df['score'] = (human_df['answer'] == human_df['response']).astype(int)
    
    concepts = human_df['concept'].unique()

    stats_df = {k: [] for k in ['concept', 'subject', 'first_quarter', 'last_quarter', 'overall']}

    for concept in concepts:
        concept_df = human_df[human_df['concept'] == concept]
        if setstop is not None:
            concept_df = concept_df[concept_df['set'].astype(int) <= setstop]

        total_items = max(concept_df['item_num'])

        for id in concept_df['subject'].unique():
            id_concept_df = concept_df[concept_df['subject'] == id]
            
            # remove subject from concept if they attempted less than 5 sets
            subject_max = id_concept_df['set'].max()
            if subject_max < 5:
                continue

            overall = id_concept_df['score'].mean()
            firstquarter = id_concept_df[id_concept_df['item_num'] <= total_items/4]['score'].mean()
            lastquarter = id_concept_df[id_concept_df['item_num'] >= (3 * (total_items/4))]['score'].mean()

            if np.isnan(lastquarter):
                continue

            stats_df['overall'].append(overall)
            stats_df['first_quarter'].append(firstquarter)
            stats_df['last_quarter'].append(lastquarter)
            stats_df['subject'].append(id)
            stats_df['concept'].append(concept)
    
    indiv_df = pd.DataFrame.from_dict(stats_df)
    # rows_to_remove = []
    # for group in indiv_df.loc[:, ['subject', 'concept','overall']].groupby(['subject', 'concept']):
    #     # remove subject from concept if beyond 2 sd
    #     subject_mean = group[1]['overall'].iloc[0]
    #     concept_mean = indiv_df[indiv_df['concept'] == group[0][1]]['overall'].mean()
    #     concept_sd = indiv_df[indiv_df['concept'] == group[0][1]]['overall'].std()

    #     if subject_mean > (concept_mean + 2*concept_sd) or\
    #         (subject_mean < (concept_mean - 2*concept_sd)): 
    #         rows_to_remove.extend(group[1].index)

    # exclude = np.zeros(len(indiv_df))
    # np.put(exclude, rows_to_remove, 1)
    # indiv_df['exclude'] = exclude.astype(bool)


    return indiv_df


In [6]:
def compile_concept_human_stats(compiled_indiv_df):
    """
    Collates the mean and standard deviation of last quarter/overall
    accuracy for each concept given the distribution of human subjects
    that attempted each concept
    """
    concept_agg_df = {k: [] for k in [
        'concept', 'overall_mean', 'overall_sd', 'overall_scores',
        'last_quarter_mean', "last_quarter_sd", 'last_quarter_scores'
    ]}

    for group in compiled_indiv_df.groupby(['concept']):
        concept_agg_df['concept'].append(group[0][0])
        
        concept_agg_df['overall_sd'].append(np.std(group[1]['overall']))
        concept_agg_df['overall_mean'].append(group[1]['overall'].mean())
        concept_agg_df['overall_scores'].append(group[1]['overall'].tolist())
        
        concept_agg_df['last_quarter_sd'].append(np.std(group[1]['last_quarter']))
        concept_agg_df['last_quarter_mean'].append(group[1]['last_quarter'].mean())
        concept_agg_df['last_quarter_scores'].append(group[1]['last_quarter'].tolist())

    return pd.DataFrame.from_dict(concept_agg_df)

In [7]:
def format_shape(text: str):
    shape, color, size = text.split(",")
    return " ".join([['small', 'medium', 'large'][int(size) - 1], color, shape])


def process_one_reply(obj, reply):
    objstring = format_shape(obj)
    regex = f"{objstring[2:]}" + "\s*(?:->|:)\s*(\w+)" + r"\b"
    match = re.search(regex, reply)
    if match is not None:
        return match.group(1)
    return "None"

def compile_model_stats(model_output_folder: str=None, 
                        model_name: str = None, 
                        answer_col:str='answers', 
                        check_for_L2:bool=True,
                        setstop:int = None,
                        verbose:bool=False):
    """
    Compiles overall, last and first quarter accuracies across all concepts
    for a model output directory
    """
    df_dict = {k: [] for k in ['concept', 'overall', 'last_quarter', 'first_quarter']}
    total_nans = 0
    total_all_items = 0
    quarter = []
    for file in os.listdir(model_output_folder): 
        if ((check_for_L2 and "L2" in file) or (not check_for_L2)) and file.endswith('csv'):
            mdf = pd.read_csv(os.path.join(model_output_folder, file))

            if 'item_num' not in mdf.columns:
                mdf['item_num'] = np.arange(len(mdf))

            if setstop is not None:
                mdf = mdf[mdf['set'] <= setstop]
            
            total_items = max(mdf['item_num'])
            total_all_items += total_items
            if 'model_reply' in mdf.columns:
                mdf['top_ans'] = mdf.apply(lambda x: process_one_reply(x['obj'], x['model_reply']), axis=1)
        
            with_ans = mdf['top_ans'].apply(lambda x: str(x).strip().lower() in ["true", "false"])
            if (len(mdf) - sum(with_ans)) != 0:
                total_nans += len(mdf) - sum(with_ans)
                mdf = mdf[with_ans]
                mdf = mdf.dropna(axis=0, subset=['top_ans'])
                
            mdf['score'] = np.array((mdf['top_ans'].astype(str).apply(
                lambda x: str(x.strip().lower())) ==\
                    mdf[answer_col].astype(str).apply(lambda x: str(x.strip().lower()))
                    ).astype(int).tolist())

            df_dict['concept'].append(mdf['concept'].iloc[0])
            df_dict['first_quarter'].append(mdf[mdf['item_num'] <= (total_items/4)]['score'].mean())
            df_dict['last_quarter'].append(mdf[mdf['item_num'] >= 3*(total_items/4)]['score'].mean())
            quarter.append(total_items/4)
            df_dict['overall'].append(mdf['score'].mean())
    
    if model_name is not None:
        df_dict['subject'] = [model_name] * len(df_dict['concept'])
        discard_rate = (model_name, total_nans / total_all_items if total_all_items > 0 else -1)

        if verbose and total_all_items > 0:
            print(model_name, " ## NaNs ", total_nans, "Percentage NaNs", discard_rate)
    
    if verbose:    
        print("## Size of a quarter: ", "mu=", np.mean(quarter), " std=", np.std(quarter))

    # print(model_output_folder, df_dict)
    return pd.DataFrame.from_dict(df_dict), discard_rate

In [8]:
def annotate_model_stats(model_df, human_indiv_stats):
    model_gdf = model_df.groupby(['concept'])
    model_df['overall_percentile'] = model_gdf['overall'].transform(
        lambda x: percentileofscore(human_indiv_stats[human_indiv_stats['concept'] == x.name]['overall'].to_numpy(), x, 'weak'))
    model_df['last_quarter_percentile'] = model_gdf['last_quarter'].transform(
        lambda x: percentileofscore(human_indiv_stats[human_indiv_stats['concept'] == x.name]['last_quarter'].to_numpy(), x, 'weak'))
    model_df['last_quarter_difference'] = model_gdf['last_quarter'].transform(
        lambda x: x - human_indiv_stats[human_indiv_stats['concept'] == x.name]['last_quarter'].median())
    model_df['human_last_quarter_median'] = model_gdf['last_quarter'].transform(
        lambda x: human_indiv_stats[human_indiv_stats['concept'] == x.name]['last_quarter'].median())
    
    return model_df

In [9]:
def make_aggregate_table(human_agg_df, model_indiv_df):
    
    model_indiv_df['rule_type'] = model_indiv_df['concept'].isin(SUBSETS['boolean']).astype(str).replace({'True': 'Propositional', 'False': 'FOL'})
    human_agg_df['rule_type'] = human_agg_df['concept'].isin(SUBSETS['boolean']).astype(str).replace({'True': 'Propositional', 'False': 'FOL'})

    model_split_rule = model_indiv_df.groupby(['subject', 'rule_type']).mean(
        ['overall', 'last_quarter']).reset_index().loc[:,['subject', 'rule_type', 'overall', 'last_quarter']]

    model_all_rule = model_indiv_df.groupby(['subject']).mean(
        ['overall', 'last_quarter']).reset_index().loc[:,['subject', 'overall', 'last_quarter']]
    model_all_rule['rule_type'] = 'All'
    model_table = pd.concat([model_all_rule, model_split_rule])

    human_split_rule = human_agg_df.groupby(['rule_type']).agg({
        'overall_sd': lambda x: (1/len(x)) * np.sqrt((x ** 2).sum()),
        'overall_mean': 'mean',
        'last_quarter_sd': lambda x: (1/len(x)) * np.sqrt((x ** 2).sum()),
        'last_quarter_mean': 'mean',
        }).reset_index()

    
    human_all_rule = human_agg_df.agg({
        'overall_sd': lambda x: (1/len(x)) * np.sqrt((x ** 2).sum()),
        'overall_mean': 'mean',
        'last_quarter_sd': lambda x: (1/len(x)) * np.sqrt((x ** 2).sum()),
        'last_quarter_mean': 'mean',
        }).reset_index().transpose()
    human_all_rule.columns = human_all_rule.iloc[0]
    human_all_rule = human_all_rule.iloc[1:]

    human_all_rule['rule_type'] = 'All'
    human_row = pd.concat([human_all_rule, human_split_rule]).reset_index(drop=True)
    
    return model_table, human_row

# Making DataFrames

In [29]:
SETSTOP = 25
indiv_compiled_df = compile_indiv_stats('./data/compiled_individual_humans.csv', setstop=SETSTOP)
indiv_compiled_df.to_csv(f'./results/experiment_1/human_indiv_stats_to_set{SETSTOP}.csv')
concept_agg_df = compile_concept_human_stats(indiv_compiled_df)
concept_agg_df.to_csv(f'./results/experiment_1/human_concept_stats_to_set{SETSTOP}.csv')

mdfs = []
discards = []
model_directory = "./results/experiment_1/raw_results"
for file in os.listdir(model_directory):
    model_id = file.split("_results")[0]
    mdf, discard_rate = compile_model_stats(os.path.join(model_directory, file), model_name=model_id, answer_col='label', setstop=SETSTOP, verbose=False)
    mdfs.append(mdf)
    if discard_rate[1] >= 0:
        discards.append(discard_rate)
# model_directory = "./results/experiment_3/raw_results"
# for file in os.listdir(model_directory):
#     model_id = file
#     mdfs.append(compile_model_stats(os.path.join(model_directory, file), check_for_L2=False, model_name=model_id, answer_col='answers', setstop=SETSTOP))

random_human_df = indiv_compiled_df.sample(frac=1).drop_duplicates(subset=['concept'], keep='first')
random_human_df['subject'] = 'human'
model_df = pd.concat(mdfs + [random_human_df])
model_df = annotate_model_stats(model_df, indiv_compiled_df)

model_df.to_csv(f'./results/experiment_1/model_indiv_stats_to_set{SETSTOP}.csv')

In [30]:
discards

[('gemma_7b-it', 0.0),
 ('gemma_2b-it', 0.0),
 ('gpt2_xl', 0.0398131220800325),
 ('llama2_70b', 0.0),
 ('gpt4', 0.002515572592237662),
 ('gpt35', 0.0015572592237661715),
 ('gemma_7b', 0.0),
 ('gpt2', 0.08470444850700792),
 ('mixtralinst', 0.0),
 ('gemma_2b', 0.0),
 ('llama2_7b', 0.0),
 ('llama2_70bchat', 0.0005989458552946814)]

In [36]:
print("Discard rate", "mu=", np.mean([x[1] for x in discards if ('gpt2' not in x[0]) and '-it' not in x[0]]), "std", np.std([x[1] for x in discards if ('gpt2' not in x[0]) and '-it' not in x[0]]))

Discard rate mu= 0.0005839722089123143 std 0.0008932881371750512


# Make Stats Table

In [11]:
model_table, human_row = make_aggregate_table(concept_agg_df, model_df)

In [12]:
model_table

Unnamed: 0,subject,overall,last_quarter,rule_type
0,gemma_2b,0.750972,0.771737,All
1,gemma_2b-it,0.630866,0.702876,All
2,gemma_7b,0.795173,0.840041,All
3,gemma_7b-it,0.757565,0.82233,All
4,gpt2,0.678565,0.707071,All
5,gpt2_xl,0.679986,0.70153,All
6,gpt35,0.686341,0.746131,All
7,gpt4,0.765288,0.812597,All
8,human,0.757702,0.818339,All
9,llama2_70b,0.734426,0.793641,All


In [13]:
human_row

Unnamed: 0,overall_sd,overall_mean,last_quarter_sd,last_quarter_mean,rule_type
0,0.010919,0.772877,0.014938,0.831726,All
1,0.013181,0.736237,0.018716,0.788821,FOL
2,0.019474,0.856932,0.024036,0.930156,Propositional


In [479]:
color_and_dash = {
                "SparseOr-2B": ('teal', 'solid'),
                "Human": ('rgb(105,105,105)', 'solid'),
                "Tuned112-2B": ('royalblue', 'solid'),
                "Tuned112-7B": ('royalblue', 'solid'),
                "Tuned112-7B-AnswerLoss": ('purple', 'solid'),
                "GPT2-Tuned112": ('purple', 'solid'),
                "Tuned92-7B": ('teal', 'solid'),
                "Tuned92-7B-Special": ('rgb(127, 201, 170)', 'solid'),
                "Pretrained-2B": ('skyblue', 'dashdot'),
                "Pretrained-7B": ('skyblue', 'dashdot'),
                "PTG16": ('coral', 'solid'),
                }

name_to_id = {'Tuned112-7B': 'gemma7b-tuned112', 
                'Tuned112-7B-AnswerLoss': 'gemma7b-tuned112-answerloss',
                'Pretrained-7B': 'gemma7b-pretrained'}

pretty_name = {'Tuned112-7B': 'Tuned on Human Responses', 
                'Tuned112-7B-AnswerLoss': 'Tuned on Answers',
                'Pretrained-7B': 'Pretrained'}


def make_bar(model_table, models, human_row):
    fig = go.Figure()
    
    hscore = human_row[human_row['rule_type'] == 'All']['last_quarter_mean'].item()
    hsd = human_row[human_row['rule_type'] == 'All']['last_quarter_sd'].item()
    fig.add_trace(go.Bar(x= ['Human'], 
                    y=[hscore], showlegend=False, marker_color=color_and_dash['Human'][0],
                    error_y=dict(type='data', symmetric=True, array=[hsd])))
    fig.add_annotation(x="Human", y=hscore+0.06, showarrow=False, text=str(hscore)[:5])
    
    for model in models:
        mid = name_to_id[model]
        score = model_table[(model_table['subject'] == mid) & 
                    (model_table['rule_type'] == "All")]['last_quarter'].item()
        fig.add_trace(go.Bar(x= [pretty_name[model]], 
                    y=[score], showlegend=False, marker_color=color_and_dash[model][0]))

        fig.add_annotation(x=pretty_name[model], y=score+0.04, showarrow=False, text=str(score)[:5])
    fig.update_layout(width=500, height=400, template='ggplot2')
    fig.update_layout(
        margin=dict(l=70, r=50, t=40, b=60),
    )

    fig.update_yaxes(title='Last Quarter Accuracy', range=[0,1])
    fig.update_layout(font_size=14, font_family='times new roman', 
            )
    fig.update_layout(width=350)

    return fig

fig = make_bar(model_table, ['Tuned112-7B', 'Tuned112-7B-AnswerLoss', 'Pretrained-7B'], human_row)
fig.show()
# fig.write_image('./analysis/fig/e3_lastquarteraccuracy.pdf')

# Make delta graph

In [246]:
def make_delta_figure(models, model_all_stats_df, cutoffs, model_name_dict):
        # === Delta graph with relative ordering, using quartile and median
    pd.options.mode.chained_assignment = None 

    human_color='rgba(10,10,10,0.3)'
    subsample_color = 'rgba(15,153,52,0.3)'
    shades = ['rgba(20,20,20,0.2)', "rgba(20,20,20,0.2)", "rgba(179,100,115,0.2)", "rgba(179,100,120,0.2)", "rgba(179,100,125,0.1)"]
    
    def fillcol(label, model):
        if label >= 1:
            if model == 'human':
                return subsample_color
            else:
                return 'rgba(12,123,220, 0.4)'
        else:
            return human_color

    fig = make_subplots(rows=len(models), cols=2, 
                    # subplot_titles=["Propositional Rules", "Quantificational Rules"], 
                column_widths=[40/120, 80/120], 
                horizontal_spacing=0.02, 
                vertical_spacing=0.05, shared_yaxes=True, shared_xaxes=False)
    
    for i, model in enumerate(models):
        if model == 'gpt2': # hacky to account for the fact that GPT2 is up to 14 sets
            model_df = pd.read_csv('./results/experiment_1/model_indiv_stats_to_set14.csv')
            model_df = model_df[model_df['subject'] == model]
        else:
            model_df = model_all_stats_df[(model_all_stats_df['subject'] == model)]
        
        model_df['label'] = np.where(model_df['last_quarter_difference']>=0, 1, 0)

        for j, concept_list in enumerate([SUBSETS['boolean'], SUBSETS['fol']]):

            rulegroupdf = model_df[model_df['concept'].isin(concept_list)]
            cutoffs = sorted(cutoffs)
            for cutoff in cutoffs:
                rulegroupdf[f'cutoff_{cutoff}'] = rulegroupdf['last_quarter_percentile'] >= cutoff
            rulegroupdf = rulegroupdf.sort_values([f"cutoff_{c}" for c in cutoffs] + ["last_quarter_difference"], ascending=False)
            rulegroupdf['group'] = rulegroupdf['label'].ne(rulegroupdf['label'].shift()).cumsum()
            rulegroupdf['concept_name'] = rulegroupdf['concept'].apply(lambda x: READABLE[x])
            rulegroupdf['concept_id'] = list(range(len(rulegroupdf)))

            if i == 0:
                fig['layout']['xaxis']['range'] =  [0, 33]
            else:
                fig['layout'][f'xaxis{(i*2)+1}']['range'] =  [0, 33]

            # --- INTERGRAPH SHADING FOR BOOL CONCEPTS 
            df = rulegroupdf.groupby('group')
            dfs = []
            for _, data in df:
                dfs.append(data)

            for df in dfs:
                fig.add_trace(go.Scatter(x=df['concept_id'], y = df['last_quarter'],
                                        line = dict(color='rgba(0,0,0,0)'), showlegend=False), row=i+1, col=j+1)
                
                fig.add_trace(go.Scatter(x=df['concept_id'], y = df['human_last_quarter_median'],
                                        line = dict(color='rgba(0,0,0,0)'),
                                        fill='tonexty', 
                                        fillcolor = fillcol(df['label'].iloc[0], model), showlegend=False), row=i+1, col=j+1)


            # --- LINE PLOTS FOR BOOL CONCEPTS    
            fig.add_trace(go.Scatter(x=rulegroupdf['concept_id'], y=rulegroupdf['last_quarter'], mode="lines", 
                            marker={'color': subsample_color if model == 'human' else 'rgba(12,123,220,1)'}, name="Human Subsample" if model == 'human' else "Model",
                            hoverinfo='text', text=rulegroupdf['concept_name'],
                            line_width=1.5, showlegend = (model =='human' and j ==0) or (j+i==0)), 
                            row=i+1, col=j+1)
            fig.add_trace(go.Scatter(x=rulegroupdf['concept_id'], y=rulegroupdf['human_last_quarter_median'],mode="lines", 
                            marker={'color': human_color}, name="Human Median", 
                            hoverinfo='text', text=rulegroupdf['concept_name'],
                            line_width=1.5, showlegend=j+i==0),
                            row=i+1, col=j+1)
            
            # --- RED SHADING FOR BOOLEAN
            previous_starts = set()
            for k, cutoff in enumerate(cutoffs):
                cutoff_start = rulegroupdf[f"cutoff_{cutoff}"].reset_index()[f"cutoff_{cutoff}"].ne(1).idxmax()

                if (cutoff_start < len(rulegroupdf) - 1) and (cutoff_start > 0):
                    annot = "" if cutoff_start in previous_starts else f"P{cutoff}"
                    previous_starts.add(cutoff_start)

                    fig.add_vline(x=cutoff_start, line_width=2, line_dash="dot", line_color='black', row=i+1, col=j+1,
                                annotation={'text': annot, 'textangle': -90, 
                                            'font_size': 8, 'font_color':'rgba(179,80,115,1)'}, annotation_position='bottom right')
                    fig.add_vrect(
                            x0=rulegroupdf['concept_id'].iloc[cutoff_start], x1=rulegroupdf['concept_id'].iloc[-1],
                            fillcolor=shades[k], opacity=0.8,
                            layer="above", line_width=0, row=i+1, col=j+1)
                
        #  === AXIS LEGENDS
        fig.add_annotation(xref='x domain',
                    yref='y domain',
                    x=-0.18,
                    y=0.5,
                    text='Last Quarter Accuracy', 
                    font_size=14,
                    showarrow=False,
                    textangle=-90,
                    row=i+1, col=1)
        
        fig.add_annotation(xref='x domain',
                    yref='y domain',
                    x=1.05,
                    y=0.5,
                    text=model_name_dict[model], 
                    font_size=15,
                    showarrow=False,
                    textangle=90,
                    row=i+1, col=2)
        
    # === AXIS TITLES
    fig.add_annotation(xref='x domain',
                    yref='y domain',
                    x=0.5,
                    xanchor='center',
                    y=-0.2,
                    text='Propositional Rules', 
                    font_size=15,
                    showarrow=False,
                    row=i+1, col=1)
        
    fig.add_annotation(xref='x domain',
                    yref='y domain',
                    x=0.5,
                    y=-0.2,
                    text='Quantified Rules', 
                    font_size=15,
                    showarrow=False,
                    row=i+1, col=2)

    fig.update_layout(template='ggplot2', height=200*len(models), width=1000)
    fig.update_xaxes(showticklabels=False, dtick=1, tick0=0)

    fig.update_layout(margin_r=240)
    fig.update_layout(font=dict(family='Times New Roman', size=12), legend=dict(
        orientation="h",
        entrywidth=100,
        yanchor="bottom",
        y=1,
        xanchor="left",
        x=0
    ))
    fig.update_layout(
        margin=dict(l=70, r=30, t=20, b=40),
    )

    fig.update_yaxes({'range': [0, 1.1], 'dtick': 0.2})


    fig.update_xaxes(showticklabels=False)
    fig['layout'].xaxis2 = {'anchor': 'y2', 'domain': [0.3466666666666667, 1.0], 
                            'dtick': 1, 'showticklabels': False, 'tick0': 0, 'range': [0, 77]}
    
    
    fig['layout'].xaxis4 = {'anchor': 'y4', 'domain': [0.3466666666666667, 1.0], 
                            'dtick': 1, 'showticklabels': False, 'tick0': 0, 'range': [0, 77]}
    fig['layout'].xaxis6 = {'anchor': 'y6', 'domain': [0.3466666666666667, 1.0], 
                            'dtick': 1, 'showticklabels': False, 'tick0': 0, 'range': [0, 77]}
    fig['layout'].xaxis8 = {'anchor': 'y8', 'domain': [0.3466666666666667, 1.0], 
                            'dtick': 1, 'showticklabels': False, 'tick0': 0, 'range': [0, 77]}
    
    return fig

In [248]:
SETSTOP=25
model_df = pd.read_csv(f'./results/experiment_1/model_indiv_stats_to_set{SETSTOP}.csv')
model_to_pretty = {"gpt4": "GPT4", "gemma_7b": "Gemma-7b", "mixtralinst": "Mixstral 8x7B Inst", "llama2_7b": "Llama2-7b", 'gpt2': "GPT2", "human": "Human Subsample"}
models = [
        "gpt4", 
        "gemma_7b", 
        # "mixtralinst", 
        # "llama2_7b",
        #  "Gemma-2b",
        "gpt2",
        "human"
        ]
for model in models:
    print(f"{model} | Proportion of propositional rules within top 75%", (model_df[(model_df['subject'] == model) & (model_df['concept'].isin(SUBSETS['boolean']))]['last_quarter_percentile'] >= 25).mean())
    print(f"{model} | Proportion of FOL rules within top 75%", (model_df[(model_df['subject'] == model) & (model_df['concept'].isin(SUBSETS['fol']))]['last_quarter_percentile'] >= 25).mean())
    print(f"{model} |Proportion of propositional rules in bottom 25%", (model_df[(model_df['subject'] == model)]['last_quarter_percentile'] < 25).mean())

fig = make_delta_figure(models, model_df, [1, 10, 20, 25], model_to_pretty)
fig.write_image("./analysis/fig/e1_delta_graph.pdf")
fig.show()

gpt4 | Proportion of propositional rules within top 75% 1.0
gpt4 | Proportion of FOL rules within top 75% 0.7564102564102564
gpt4 |Proportion of propositional rules in bottom 25% 0.16964285714285715
gemma_7b | Proportion of propositional rules within top 75% 0.8823529411764706
gemma_7b | Proportion of FOL rules within top 75% 0.8717948717948718
gemma_7b |Proportion of propositional rules in bottom 25% 0.125
gpt2 | Proportion of propositional rules within top 75% 0.11764705882352941
gpt2 | Proportion of FOL rules within top 75% 0.5512820512820513
gpt2 |Proportion of propositional rules in bottom 25% 0.5803571428571429
human | Proportion of propositional rules within top 75% 0.8235294117647058
human | Proportion of FOL rules within top 75% 0.8076923076923077
human |Proportion of propositional rules in bottom 25% 0.1875


In [454]:
model_df = pd.read_csv(f'./results/experiment_1/model_indiv_stats_with_exp3.csv')
model_to_pretty = {"gemma7b-tuned112": "Tuned: KL Loss", "gemma7b-pretrained": "Pretrained", "gemma7b-tuned112-answerloss": "Tuned: CE Loss", "gpt2-tuned112": "GPT2 Tuned", 'gpt2': "GPT2", "human": "Human Subsample"}
models = [
        "gemma7b-pretrained", 
        "gemma7b-tuned112",
        "gemma7b-tuned112-answerloss",
        "gpt2-tuned112"
        ]
fig = make_delta_figure(models, model_df, [1, 10, 20, 25], model_to_pretty)
# fig.write_image("./analysis/e1_delta_graph.pdf")
fig.show()

# Bootstrap/Permutation Testing (Scrapped)

In [235]:
def calculate_distance(concept_agg_df, model_df):

    distances = []
    for concept, concept_df in model_df.groupby(['concept']):
        assert len(concept_df) == 1, "model_df must contain results only from one model"

        concept = concept[0]
        agg_data = concept_agg_df[concept_agg_df['concept'] == concept]
        concept_mean, concept_std = agg_data['last_quarter_mean'], agg_data['last_quarter_sd']
        if concept_std.item() == 0:
            concept_std += 1e-6
        if len(agg_data) == 0:
            continue

        distance = np.abs(concept_df['last_quarter'].item() - concept_mean.item()) / concept_std.item()
        distances.append(distance)
        
    return np.sum(distances)

from collections import defaultdict
from tqdm import tqdm

def permutation_test(concept_agg_df, indiv_compiled_df, num_permutations: int = 100):
    """
    Runs bootstrapping tests to compare model scores against subsamples from human distribution
    """
    concept_distances = {}
    concept_percentiles = {}
    concept_scores = {}
    model_df = pd.read_csv(f'./results/experiment_1/model_indiv_stats_to_set25.csv')

    # human_indiv_stats = pd.read_csv('./results/experiment_1/human_indiv_stats_to_set25.csv')
    for concept, concept_df in indiv_compiled_df.groupby(['concept']):
        concept = concept[0]
        concept_distances[concept] = []
        concept_percentiles[concept] = []
        concept_scores[concept] = []

        agg_data = concept_agg_df[concept_agg_df['concept'] == concept]
        concept_mean, concept_std = agg_data['last_quarter_mean'], agg_data['last_quarter_sd']

        if concept_std.item() == 0:
            concept_std += 1e-6
            
        for subject, subject_df in concept_df.groupby(['subject']):
            subject = subject[0]
            
            percentile = percentileofscore(indiv_compiled_df[indiv_compiled_df['concept'] == concept]['last_quarter'].to_numpy(), subject_df['last_quarter'].item(), 'weak')
            concept_percentiles[concept].append((subject, percentile))

            distance = np.abs(subject_df['last_quarter'].item() - concept_mean.item()) / concept_std.item()
            concept_distances[concept].append((subject, distance))

            concept_score = subject_df['last_quarter'].item()
            concept_scores[concept].append((subject, concept_score))

    num_in_bottom_quartile = []
    distances = []
    pvalues = defaultdict(list)
    for _ in tqdm(range(num_permutations)):
        distance = 0
        percentiles = []
        bootstrap_dist = []
        for concept in concept_distances:
            choice = np.random.choice([x[1] for x in concept_distances[concept]], 1)
            distance += choice
            percentiles.append(np.random.choice([x[1] for x in concept_percentiles[concept]], 1))
            bootstrap_dist.append((concept, np.random.choice([x[1] for x in concept_scores[concept]], 1)[0]))

        for model in model_df['subject'].unique():
            pvalues[model].append(stats.ttest_rel([x[1] for x in sorted(bootstrap_dist, key=lambda x: x[0])], model_df[model_df['subject'] == model].sort_values('concept')['last_quarter'], 
                            alternative='greater').pvalue)
        
        distances.append(distance)
        num_in_bottom_quartile.append((np.array(percentiles) <= 25).mean())

# from scipy import stats
# model_df = pd.read_csv(f'./results/experiment_1/model_indiv_stats_to_set25.csv')
# human_df = pd.read_csv('./results/experiment_1/human_concept_stats_to_set25.csv')
# human_df = human_df[human_df['concept'].isin(SUBSETS['fol'])].sort_values('concept')
# gdf = model_df[(model_df['subject'] == 'gpt4') & (model_df['concept'].isin(SUBSETS['fol']))].sort_values('concept')
# hdf = model_df[(model_df['subject'] == 'human') & (model_df['concept'].isin(SUBSETS['fol']))].sort_values('concept')

# # pvalue = stats.ttest_rel(human_df['last_quarter_mean'], gdf['last_quarter'], 
# #                         #  equal_var=False, permutations=100000, 
# #                          alternative='greater')
# pvalue = stats.ttest_rel(hdf['last_quarter'], gdf['last_quarter'], 
#                         #  equal_var=False, permutations=100000, 
#                          alternative='greater')

# pvalue

    return distances, num_in_bottom_quartile, pvalues
        

In [236]:
concept_agg_df = pd.read_csv('./results/experiment_1/human_concept_stats_to_set25.csv')
indiv_compiled_df = pd.read_csv('./results/experiment_1/human_indiv_stats_to_set25.csv')
distances , num_in_bottom_quartile, pvalues= permutation_test(concept_agg_df, indiv_compiled_df, 10000)
# print("Proportion in bottom quartile", "mu=", np.mean(num_in_bottom_quartile), "s td=", np.std(num_in_bottom_quartile))

100%|██████████| 10000/10000 [01:54<00:00, 87.00it/s]


In [211]:
pvalues.keys()

dict_keys(['gemma_7b-it', 'gemma_2b-it', 'gpt2_xl', 'llama2_70b', 'gpt4', 'gpt35', 'gemma_7b', 'gpt2', 'mixtralinst', 'gemma_2b', 'llama2_7b', 'llama2_70bchat', 'human', 'gemma-7b'])

In [237]:
for model in pvalues.keys():
    print(model, np.mean(pvalues[model]))

gemma_7b-it 0.34167120461043954
gemma_2b-it 1.220811437563747e-06
gpt2_xl 5.095446892366393e-06
llama2_70b 0.05749308284821489
gpt4 0.2115331483609992
gpt35 0.00039850007924966573
gemma_7b 0.6332297022182314
gpt2 1.3637625905538206e-05
mixtralinst 0.23399607525884625
gemma_2b 0.009593570515677012
llama2_7b 0.08866499881992634
llama2_70bchat 4.6495595225383215e-06
human 0.2786906049581042


In [366]:
mdf = compile_model_stats('./results/experiment_1/raw_results/gpt4_results', check_for_L2=False,answer_col="label")

model_directory = './results/experiment_1/raw_results/'
for file in os.listdir(model_directory):
    model_id = file.split("_results")[0]
    model_df = compile_model_stats(os.path.join(model_directory, file), model_name=model_id, answer_col='label', setstop=SETSTOP)
    distance = calculate_distance(concept_agg_df, model_df)
    
    print(model_id, distance, percentileofscore([x.item() for x in distances], distance, 'weak'))

# model_df = annotate_model_stats(pd.concat(mdfs), indiv_compiled_df)
# model_df.to_csv(f'./results/experiment_1/model_indiv_stats_to_set{SETSTOP}.csv')

# calculate_distance(concept_agg_df, mdf)

gemma_7b-it 71.76963496212225 1.8
gemma_2b-it 111240.96756610135 100.0


gpt2_xl 124.09517643924596 100.0
llama2_70b 99.77732913976206 97.7
gpt4 73.75093999185881 4.5
human 0.0 0.0
gpt35 106.35216937170816 99.5
gemma_7b 58.658865772765594 0.0
gpt2 121.08014387986486 100.0
mixtralinst 71.43929943695346 1.7000000000000002
gemma_2b 90.44409757677707 78.30000000000001
llama2_7b 85.36762344201045 52.6
llama2_70bchat 111.39138390454428 100.0


In [None]:
mdf = compile_model_stats('/users/aloo1/thesis/rq1_success/gpt4_results', check_for_L2=True,answer_col="label")
calculate_distance(concept_agg_df, mdf)

63.947490345959125