In [2]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
import statsmodels.api as sm
from scipy.optimize import curve_fit
from scipy import stats
import re
import json
from typing import Dict, List, Tuple
import os
os.chdir('../')

import sys
from src.utils.get_concept_subsets import SUBSETS

In [3]:
from src.utils.get_concept_subsets import READABLE

In [4]:
data_files = {
              "Human": "./data/compiled_humans.csv",
              "PTG16": "./results/experiment_3/compiled_ptg16.csv",
              "Tuned112-2B": "./results/experiment_3/compiled_gemma2b-tuned112.csv",
              "Tuned112-7B": "./results/experiment_3/compiled_gemma7b-tuned112.csv",
              # "Tuned112-7B-AnswerLoss": "./results/experiment_3/compiled_gemma7b-tuned112-answerloss.csv",
              "Tuned92-7B": "./results/experiment_4/compiled_gemma7b-tuned92.csv",
              "Tuned92-7B-Special": "./results/experiment_4/compiled_gemma7b-tuned92-and-special.csv",
              "SparseOr-2B": './results/experiment_3/compiled_gemma2b-tuned112-sparsed_primitivesor.csv', #1.2% of model parameters
              'GPT2-Tuned112': "./results/experiment_3/compiled_gpt2-tuned112.csv",
              "Pretrained-2B": "./results/experiment_3/compiled_gemma2b-pretrained.csv",
              "Pretrained-7B": "./results/experiment_3/compiled_gemma7b-pretrained.csv",
            }

color_and_dash = {
                "SparseOr-2B": ('teal', 'solid'),
                "Human": ('rgb(105,105,105)', 'solid'),
                "Tuned112-2B": ('royalblue', 'solid'),
                "Tuned112-7B": ('royalblue', 'solid'),
                "Tuned112-7B-AnswerLoss": ('purple', 'solid'),
                "GPT2-Tuned112": ('purple', 'solid'),
                "Tuned92-7B": ('teal', 'solid'),
                "Tuned92-7B-Special": ('rgb(127, 201, 170)', 'solid'),
                "Pretrained-2B": ('skyblue', 'dashdot'),
                "Pretrained-7B": ('skyblue', 'dashdot'),
                "PTG16": ('coral', 'solid'),
                }
pretty_name = {'PTG16': 'PLoT Bayesian Model', 
                    'Ellis23': 'LLM-Augmented<br>Bayesian Model (Ellis23)', 
                    'Tuned112-7B': 'Tuned on Human Responses', 
                    'Tuned112-7B-AnswerLoss': 'Tuned on Answers',
                    'Tuned92-7B': 'Tuned on 92 Rules', 
                    'Pretrained-7B': 'Pretrained'}

primitives_or = [
    'hg03', 'hg04', 'hg18', 'hg19', 'hg20', 'hg24', 'hg09', 'hg25', 'hg06'
    ]

## P(Human) v. P(Model) Correlation Graph

In [12]:
def make_r2_graph(title: str, mdf: pd.DataFrame, bins: int = 15, show_raw_scatter=False):
    """
    :param str title: Title of heatmap graph
    :param int bins: Number of bins for x-y values heatmap
    :param pd.DataFrame mdf: Accepts Pandas DataFrames as generated from `process_results.py`
    :returns:
        - figure
    """

    def func(x, b, c, d):
        return  b*(x**2) + c *x + d

    all_hscores = []
    all_mscores = []

    for i, row in mdf.iterrows():
        all_hscores += [float(x) for x in re.sub(",", " ", row['hyes'][1:-1]).split()]
        all_mscores += [float(x) for x in re.sub(",", " ", row['myes'][1:-1]).split()]

    all_mscores = np.array(all_mscores)
    all_hscores = np.array(all_hscores)
    
    x =  (np.array(list(range(0, bins+1)))) * 1/bins 
    x[0] += 0.00001
    mscores_binned = np.digitize(all_mscores, x)

    center_mean = []
    center_sd = []
    valid_xs = []

    plot_array_mean = np.ones(len(x)) * -100
    plot_array_sd = np.zeros(len(x))

    for i in np.array(list(range(0, bins+1))):
        if np.sum(mscores_binned == i) > 0:
            std = np.std(all_hscores[mscores_binned == i]) / np.sqrt(np.sum(mscores_binned == i))
            mean = np.mean(all_hscores[mscores_binned == i])

            valid_xs.append(i*1/bins)
            center_mean.append(mean)
            center_sd.append(std)
            plot_array_mean[i] = mean
            plot_array_sd[i] = std

    if show_raw_scatter:
        fig = px.scatter(x=all_mscores, y=all_hscores,
                    labels={'x': 'Model P(Yes)', 'y': "Human P(Yes)"} ,
        )
    else:
        fig = px.density_heatmap(x=all_mscores, y=all_hscores, 
                        labels={'x': 'Model P(Yes)', 'y': "Human P(Yes)"} ,
                        height=500, width=500, 
                        histnorm='percent', color_continuous_scale=px.colors.sequential.dense, 
                        nbinsx=bins, nbinsy=bins,
                        title=title)
        
    popt, pcov = curve_fit(func, valid_xs, center_mean)

    fig.add_trace(go.Scatter(x=[0, 1], y=[0,1], showlegend=False, mode='lines', marker_color='white'))
    
    fig.add_trace(go.Scatter(x=x, y=func(x, *popt), 
                    mode='lines', 
                    line_width=1,
                    showlegend=False,
                    marker_color='slategray'
                    ))

    fig.add_annotation(x=0.9, y=0.05,
            text=f"R<sup>2</sup>={str(np.corrcoef(all_mscores, all_hscores)[0,1] ** 2)[:4]}",
            showarrow=False)

    fig.add_trace(go.Scatter(x=x, y=plot_array_mean, 
                    mode='markers', 
                    showlegend=False,
                    marker_color='slategray'
                    ),)

    fig.add_trace(go.Scatter(x=x, y=plot_array_mean, 
                    error_y=dict(type='data', array=plot_array_sd, visible=True, thickness=1, color='lightslategray'),  
                    marker_color='rgba(255,255,255,0)',
                    mode='markers', 
                    showlegend=False,
                    ))

    fig.update_layout(width=500, height=500)
    fig.update_yaxes(range=[-0.01,1.01], dtick=0.1)
    fig.update_xaxes(range=[-0.01,1.01], dtick=0.1)
    fig.update_layout(template='ggplot2', coloraxis_showscale=False, font_family='times new roman', font_size=14)
    fig.update_layout(
        margin=dict(l=10, r=10, t=30, b=10),
    )

    return fig

In [162]:
plot_item = [
            # "Tuned112-7B-AnswerLoss",
            # "Tuned112-2B-SparseOr",
            #  'PTG16',
            # 'Tuned112-2B',
            # 'Tuned112-7B',
            'GPT2-Tuned112'
            # 'Pretrained-7B',
            # 'Pretrained-2B'
            ][0]
make_r2_graph(plot_item, pd.read_csv(data_files[plot_item]))

## Trajectory Graph

In [17]:
def plot_trajectory_graphs(concepts: list[str], 
                            sources: Dict[str, Tuple[str, str]], 
                            color_dash_dict,
                            pretty_name_dict,
                            end_early=1000,
                            num_cols=3,
                            height_per_row=150,
                            width_per_col=400,
                            readable_names=True):
    hdf = pd.read_csv("./data/compiled_humans.csv")                 
    with open('./data/labels_to_readable.json', 'r') as f:
        labels_to_readable = json.load(f)

    num_concepts = len(concepts)
    num_rows = num_concepts//num_cols + 1 if num_concepts % num_cols != 0 else num_concepts//num_cols
    num_cols = min(num_cols, num_concepts)

    fig = make_subplots(rows=num_rows, cols=num_cols, 
                    subplot_titles=['<b>' + labels_to_readable[x] + '</b>' for x in concepts] if readable_names else [x for x in concepts], 
                    # subplot_titles = concepts,
                    shared_yaxes=True)

    fig.update_annotations(font_size=12)
    for i, concept in enumerate(concepts):
        pearsons = {}
        row = (i // num_cols) + 1
        col = (i % num_cols) + 1

        hyes, hno = np.array(hdf[hdf['concepts'] == concept]['hyes']).astype(int), np.array(hdf[hdf['concepts'] == concept]['hno']).astype(int)
        htotals = hyes + hno
        hyes, hno = hyes / htotals, hno / htotals

        answer_idx = (np.array(hdf[hdf['concepts'] == concept]['answers'])).astype(int)
        yes_rate = np.mean(answer_idx)
        hscore = np.vstack((hno, hyes))[answer_idx, np.arange(len(hyes))][:end_early+1]

        fig.add_trace(go.Scatter(y=hscore, 
                                    x=np.arange(len(hscore)), 
                                    name='Human', 
                                    line_color=color_dash_dict['Human'][0], 
                                    showlegend=i==0, 
                                    # legendgroup="Human L2"
                                    legendgroup=1,
                                    ),
                        row=row, col=col)

        fig.add_hline(y=(yes_rate ** 2) + ((1-yes_rate) ** 2), line_color='black', 
                       line_width=2, 
                        line_dash="dot", 
                        row=row, col=col)

        exmax = len(hscore)

        for name, fp in sources.items():
            
            mdf = pd.read_csv(fp)
            mscore = np.array([float(x) for x in re.sub(",", " ", mdf[mdf['concept'] == concept]['mscores'].iloc[0][1:-1]).split()][:end_early+1])
            myes = np.array([float(x) for x in re.sub(",", " ", mdf[mdf['concept'] == concept]['myes'].iloc[0][1:-1]).split()][:end_early+1])
            fig.add_trace(go.Scatter(y=mscore, 
                                x=np.arange(len(mscore)), 
                                name=pretty_name_dict[name], 
                                line_color=color_dash_dict[name][0], 
                                line_dash=color_dash_dict[name][1],
                                line_width=1.5,
                                showlegend=i==0, 
                                legendgroup=name
                                # legendgroup=1,
                                ),
                        row=row, col=col)

            corrmax = min(len(hyes), len(myes))
            exmax = min(exmax, len(mscore))
            pearson = np.corrcoef(hyes[:corrmax], myes[:corrmax])[0, 1]
            pearsons[name] = pearson ** 2

        annotation = "R²: " + " | ".join([f"{pretty_name_dict[k]}={str(v)[:4]}" for k, v in pearsons.items()])
        fig.add_annotation(x=exmax, y=0, text=annotation, xref=f"x{i+1}", yref=f"y{i+1}", xanchor='right',
                                showarrow=False, font_size=12)
        fig.add_annotation(x=exmax/2, y=-1, text="Accuracy", xref=f"x{i+1}", yref=f"y{i+1}", xanchor='right',
                                showarrow=False, font_size=12)

        # fig.layout[f'xaxis{"" if i==0 else i}'] = {'range': (0, exmax)}
        
    fig.update_yaxes(showticklabels=True, dtick=0.25, range=[-0.1, 1.1], tickfont={'family': 'times new roman'})
    fig.update_xaxes(tickfont={'family': 'times new roman'})
    fig.update_layout(template='ggplot2', width=width_per_col*num_cols, height=height_per_row*num_rows)
    fig.update_layout(legend=dict(orientation="h", itemwidth=30, yanchor="top", y=-0.08,
        xanchor="right",
        x=1, font={'family': 'times_new_roman'}))
    fig.update_layout(
        margin=dict(l=50, r=30, t=30, b=50),
    )
    fig.update_annotations(font_family='times new roman')
    
    return fig

In [18]:
def make_rule_threes(file_path: str, metric=lambda x: x['pyes_corr'] ** 2, remove_special=True, search_within_concepts=None):
    df = pd.read_csv(file_path)
    df['sort'] = df.apply(metric, axis=1)
    
    if remove_special:
        df = df[~df['concept'].isin(['hg201', 'hg200'])]
    
    if search_within_concepts is not None:
        df = df[df['concept'].isin(search_within_concepts)]

    sorted_concepts = df.sort_values('sort', ascending=False)['concept']

    half = len(sorted_concepts) // 2
    return sorted_concepts.iloc[[0, 1, 2, half-1, half, half+1, -3, -2, -1]]

In [46]:
def find_biggest_diff(df1_name: str, df2_name: str, metric=lambda x: x['pyes_corr'] ** 2, search_within_concepts=None, num_per_segment=3):
    df1 = pd.read_csv(df1_name)
    df2 = pd.read_csv(df2_name)
    
    df1['metric'] = df1.apply(metric, axis=1)
    df2['metric'] = df2.apply(metric, axis=1)

    if search_within_concepts is not None:
        df2 = df2[df2['concept'].isin(search_within_concepts)]
    
    df2 = df2[df2['concept'].isin(df1['concept'])]
    df1 = df1[df1['concept'].isin(df2['concept'])]
    
    df1['diff'] = df1['metric'] - df2['metric']
    sorted_diff = df1.sort_values('diff')['concept']
    # print(sorted_diff.loc[:, ['concept', 'diff']])
    
    # fig = px.scatter(x=df1['metric'], y=df2['metric'], labels={'x': df1_name, 'y': df2_name})
    # fig.show()

    if (num_per_segment * 3) < len(sorted_diff): 
        half = len(sorted_diff) // 2
        median_split = num_per_segment // 2
        if num_per_segment % 2 == 0:
            median_segment =  [half - i for i in range(1, median_split)] + [half] + [half + i for i in range(1, median_split+1)]
        else:
            median_segment =  [half - i for i in range(1, median_split+1)] + [half] + [half + i for i in range(1, median_split+1)]
        indexes = list(range(num_per_segment)) + median_segment + list(range(-num_per_segment, 0, 1))
    else:
        indexes = list(range(len(sorted_diff)))
    print(median_segment)
    return sorted_diff.iloc[indexes]

In [91]:
pretty_name_E4 = {
                # 'Tuned112-7B': 'Tuned on 112 Rules', 
                  'Tuned112-7B': 'Tuned LLM', 
                    'Tuned92-7B': 'Tuned LLM', 
                    'Tuned92-7B-Special': 'Tuned with Minority Rule', 
                    'Pretrained-7B': 'Pretrained LLM'}

fig = plot_trajectory_graphs(concepts=[
                            ['hg201', 'hg200'],
                            # primitives_or,
                            # SUBSETS['tuned92-heldout']
                            # SUBSETS['fol']
                            # ['hg96', 'hg63']
                            # ['hg45', 'hg68']
                            # ['hg03', 'hg04', 'hg18', 'hg19', 'hg20', 'hg24', 'hg09']
                            # ['hg84', 'hg89', 'hg08', 'hg103', 'hg75']
                            # make_rule_threes(data_files['Pretrained-7B'], 
                            #     search_within_concepts=SUBSETS['tuned112']
                            # )
                            # find_biggest_diff(data_files['PTG16'], data_files['Tuned112-7B'], 
                            #     search_within_concepts=SUBSETS['boolean'])
                            # find_biggest_diff(data_files['Pretrained-7B'], data_files['Tuned112-7B'], 
                            #     metric = lambda x: x['macc'],
                                # search_within_concepts=SUBSETS['fol']
                                # )
                            # find_biggest_diff(data_files['Pretrained-7B'], data_files['Tuned112-7B'], 
                            #     search_within_concepts=SUBSETS['fol'])
                            ][0],
                        sources={k: data_files[k] for k in 
                            ['Tuned92-7B', 'Tuned92-7B-Special', 'Pretrained-7B']
                        },
                        num_cols=1,
                        width_per_col=300,
                        color_dash_dict=color_and_dash,
                        pretty_name_dict=pretty_name_E4,
                        # pretty_name_dict=pretty_name,
                        height_per_row=180,
                        readable_names=True,
                        end_early=1000)

fig.add_annotation(text='Response Number', xref='x domain', yref='paper', x=0.5, y=0.5, showarrow=False, font_size=10, font_family='times new roman')
fig.add_annotation(text='Accuracy', xref='x domain', yref='paper', x=-0.12, y=0.9, showarrow=False, font_size=10, font_family='times new roman', textangle=-90)
fig.add_annotation(text='Response Number', xref='x domain', yref='paper', x=0.5, y=-0.15, showarrow=False, font_size=10, font_family='times new roman')
fig.add_annotation(text='Accuracy', xref='x domain', yref='paper', x=-0.12, y=0.1, showarrow=False, font_size=10, font_family='times new roman', textangle=-90)
fig.update_layout(legend=dict(orientation="h", itemwidth=30, yanchor="top", y=-0.13,
        xanchor="right",
        x=1, font={'family': 'times_new_roman'}))

fig.update_traces({'line': {'width': 0.8, 'dash': 'dashdot'}}, selector=lambda x: x['legendgroup'] == 'PTG16')
fig.update_traces({'line': {'width': 1.2}}, selector=lambda x: x['legendgroup'] == 'Pretrained-7B')
fig.update_layout(width=800, showlegend=True)
fig.show()
fig.write_image('./analysis/fig/e4_tuned_minorityrule.pdf')

In [128]:
# concepts = make_rule_threes(data['TunFOL'][0])
pretty_name = {'PTG16': 'Bayesian pLoT Model', 
                    'Tuned112-7B': 'Tuned', 
                    'Tuned112-7B-AnswerLoss': 'Tuned-CE', 
                    'Pretrained-7B': 'Pretrained',
                    'Tuned92-7B': 'Pretrained',
                    'GPT2-Tuned112': "GPT2-Tuned"}

fig = plot_trajectory_graphs(concepts=[
                            # ['hg201', 'hg200'],
                            # primitives_or,
                            # SUBSETS['tuned92-heldout']
                            # SUBSETS['fol']
                            ['hg96', 'hg63']
                            # ['hg45', 'hg68']
                            # ['hg03', 'hg04', 'hg18', 'hg19', 'hg20', 'hg24', 'hg09']
                            # ['hg84', 'hg89', 'hg08', 'hg103', 'hg75']
                            # make_rule_threes(data_files['Pretrained-7B'], 
                            #     search_within_concepts=SUBSETS['tuned112']
                            # )
                            # find_biggest_diff(data_files['PTG16'], data_files['Tuned112-7B'], 
                            #     search_within_concepts=SUBSETS['boolean'])
                            # find_biggest_diff(data_files['Pretrained-7B'], data_files['Tuned112-7B'], 
                            #     metric = lambda x: x['macc'],
                                # search_within_concepts=SUBSETS['fol']
                                # )
                            # find_biggest_diff(data_files['Pretrained-7B'], data_files['Tuned112-7B'], 
                            #     search_within_concepts=SUBSETS['fol'])
                            ][0],
                        sources={k: data_files[k] for k in 
                            ['Tuned112-7B', 'PTG16']
                        },
                        num_cols=1,
                        width_per_col=300,
                        color_dash_dict=color_and_dash,
                        pretty_name_dict=pretty_name,
                        height_per_row=180,
                        readable_names=True,
                        end_early=1000)

fig.add_annotation(text='Response Number', xref='x domain', yref='paper', x=0.5, y=0.5, showarrow=False, font_size=10, font_family='times new roman')
fig.add_annotation(text='Accuracy', xref='x domain', yref='paper', x=-0.12, y=0.9, showarrow=False, font_size=10, font_family='times new roman', textangle=-90)
fig.add_annotation(text='Response Number', xref='x domain', yref='paper', x=0.5, y=-0.15, showarrow=False, font_size=10, font_family='times new roman')
fig.add_annotation(text='Accuracy', xref='x domain', yref='paper', x=-0.12, y=0.1, showarrow=False, font_size=10, font_family='times new roman', textangle=-90)
fig.update_layout(legend=dict(orientation="h", itemwidth=30, yanchor="top", y=-0.13,
        xanchor="right",
        x=1, font={'family': 'times_new_roman'}))

fig.update_traces({'line': {'width': 0.8, 'dash': 'dashdot'}}, selector=lambda x: x['legendgroup'] == 'PTG16')
fig.update_traces({'line': {'width': 1.2}}, selector=lambda x: x['legendgroup'] == 'Pretrained-7B')
fig.update_layout(width=500, showlegend=True)
fig.show()
fig.write_image('./analysis/fig/e3_comparison_vert.pdf')

In [148]:
# concepts = make_rule_threes(data['TunFOL'][0])
pretty_name = {'PTG16': 'Bayesian pLoT Model', 
                    'Tuned112-7B': 'Tuned LLM', 
                    'Tuned112-7B-AnswerLoss': 'Tuned-CE', 
                    'Pretrained-7B': 'Pretrained LLM',
                    'Tuned92-7B': 'Pretrained',
                    'GPT2-Tuned112': "GPT2-Tuned"}

fig = plot_trajectory_graphs(concepts=[
                            # ['hg201', 'hg200'],
                            # primitives_or,
                            # SUBSETS['tuned92-heldout']
                            # SUBSETS['fol']
                            ['hg96', 'hg63']
                            # ['hg45', 'hg68']
                            # ['hg03', 'hg04', 'hg18', 'hg19', 'hg20', 'hg24', 'hg09']
                            # ['hg84', 'hg89', 'hg08', 'hg103', 'hg75']
                            # make_rule_threes(data_files['Pretrained-7B'], 
                            #     search_within_concepts=SUBSETS['tuned112']
                            # )
                            # find_biggest_diff(data_files['PTG16'], data_files['Tuned112-7B'], 
                            #     search_within_concepts=SUBSETS['boolean'])
                            # find_biggest_diff(data_files['Pretrained-7B'], data_files['Tuned112-7B'], 
                            #     metric = lambda x: x['macc'],
                                # search_within_concepts=SUBSETS['fol']
                                # )
                            # find_biggest_diff(data_files['Pretrained-7B'], data_files['Tuned112-7B'], 
                            #     search_within_concepts=SUBSETS['fol'])
                            ][0],
                        sources={k: data_files[k] for k in 
                            ['Tuned112-7B', 'PTG16']
                        },
                        num_cols=2,
                        width_per_col=300,
                        color_dash_dict=color_and_dash,
                        pretty_name_dict=pretty_name,
                        height_per_row=200,
                        readable_names=True,
                        end_early=1000)

fig.add_annotation(text='Response Number', xref='x domain', yref='paper', x=1.85, y=-0.4, showarrow=False, font_size=10, font_family='times new roman')
fig.add_annotation(text='Accuracy', xref='x domain', yref='paper', x=-0.12, y=0.5, showarrow=False, font_size=10, font_family='times new roman', textangle=-90)
fig.add_annotation(text='Accuracy', xref='x domain', yref='paper', x=1.13, y=0.5, showarrow=False, font_size=10, font_family='times new roman', textangle=-90)
fig.add_annotation(text='Response Number', xref='x domain', yref='paper', x=0.5, y=-0.4, showarrow=False, font_size=10, font_family='times new roman')
# fig.add_annotation(text='Accuracy', xref='x domain', yref='paper', x=-0.12, y=0.1, showarrow=False, font_size=10, font_family='times new roman', textangle=-90)
fig.update_layout(legend=dict(orientation="h", itemwidth=30, yanchor="top", y=-0.4,
        xanchor="right",
        x=1, font={'family': 'times_new_roman'}))

fig.update_traces({'line': {'width': 0.8, 'dash': 'dashdot'}}, selector=lambda x: x['legendgroup'] == 'PTG16')
fig.update_traces({'line': {'width': 1.2}}, selector=lambda x: x['legendgroup'] == 'Pretrained-7B')
fig.update_layout(width=950, showlegend=True)
fig.show()
fig.write_image('./analysis/fig/e3_comparison_horz.pdf')

In [63]:
# concepts = make_rule_threes(data['TunFOL'][0])
pretty_name = {'PTG16': 'Bayesian pLoT Model', 
                    'Tuned112-7B': 'Tuned LLM', 
                    'Tuned112-7B-AnswerLoss': 'Tuned-CE', 
                    'Pretrained-7B': 'Pretrained',
                    'Tuned92-7B': 'Pretrained',
                    'GPT2-Tuned112': "GPT2-Tuned"}

fig = plot_trajectory_graphs(concepts=[
                            # ['hg201', 'hg200'],
                            # primitives_or,
                            # SUBSETS['tuned92-heldout']
                            # SUBSETS['fol']
                            # ['hg96', 'hg63']
                            # ['hg45', 'hg68']
                            # ['hg03', 'hg04', 'hg18', 'hg19', 'hg20', 'hg24', 'hg09']
                            # ['hg84', 'hg89', 'hg08', 'hg103', 'hg75']
                            # make_rule_threes(data_files['Pretrained-7B'], 
                            #     search_within_concepts=SUBSETS['tuned112']
                            # )
                            find_biggest_diff(data_files['PTG16'], data_files['Tuned112-7B'], num_per_segment=4)
                            # find_biggest_diff(data_files['Pretrained-7B'], data_files['Tuned112-7B'], 
                            #     metric = lambda x: x['macc'],
                                # search_within_concepts=SUBSETS['fol']
                                # )
                            # find_biggest_diff(data_files['Pretrained-7B'], data_files['Tuned112-7B'], 
                            #     search_within_concepts=SUBSETS['fol'])
                            ][0],
                        sources={k: data_files[k] for k in 
                            ['Tuned112-7B', 'PTG16']
                        },
                        num_cols=2,
                        width_per_col=300,
                        color_dash_dict=color_and_dash,
                        pretty_name_dict=pretty_name,
                        height_per_row=180,
                        readable_names=True,
                        end_early=1000)

# fig.add_annotation(text='Response Number', xref='x domain', yref='paper', x=0.5, y=0.5, showarrow=False, font_size=10, font_family='times new roman')
# fig.add_annotation(text='Accuracy', xref='x domain', yref='paper', x=-0.12, y=0.9, showarrow=False, font_size=10, font_family='times new roman', textangle=-90)
# fig.add_annotation(text='Response Number', xref='x domain', yref='paper', x=0.5, y=-0.15, showarrow=False, font_size=10, font_family='times new roman')
# fig.add_annotation(text='Accuracy', xref='x domain', yref='paper', x=-0.12, y=0.1, showarrow=False, font_size=10, font_family='times new roman', textangle=-90)
# fig.update_layout(legend=dict(orientation="h", itemwidth=30, yanchor="top", y=-0.13,
#         xanchor="right",
#         x=1, font={'family': 'times_new_roman'}))

fig.update_traces({'line': {'width': 0.8, 'dash': 'dashdot'}}, selector=lambda x: x['legendgroup'] == 'PTG16')
fig.update_traces({'line': {'width': 1.2}}, selector=lambda x: x['legendgroup'] == 'Pretrained-7B')
fig.update_layout(width=900, showlegend=True)
fig.show()
fig.write_image('./analysis/fig/e3_appendixgraphs.pdf')

[55, 56, 57, 58]


In [49]:
# concepts = make_rule_threes(data['TunFOL'][0])
pretty_name = {'PTG16': 'PLoT Bayesian Model', 
                    'Tuned112-7B': 'LLM Tuned on 112 Rules', 
                    'Tuned112-7B-AnswerLoss': 'Tuned-CE', 
                    'Pretrained-7B': 'Pretrained',
                    'Tuned92-7B': 'LLM Tuned on 92 Rules',
                    'GPT2-Tuned112': "GPT2-Tuned"}

fig = plot_trajectory_graphs(concepts=[
                            find_biggest_diff(data_files['Tuned112-7B'], data_files['Tuned92-7B'], 
                                search_within_concepts=SUBSETS['tuned92'], num_per_segment=2)
                            # find_biggest_diff(data_files['Pretrained-7B'], data_files['Tuned112-7B'], 
                            #     metric = lambda x: x['macc'],
                                # search_within_concepts=SUBSETS['fol']
                                # )
                            # find_biggest_diff(data_files['Pretrained-7B'], data_files['Tuned112-7B'], 
                            #     search_within_concepts=SUBSETS['fol'])
                            ][0],
                        sources={k: data_files[k] for k in 
                            ['Tuned112-7B', 'Tuned92-7B']
                        },
                        num_cols=2,
                        width_per_col=260,
                        color_dash_dict=color_and_dash,
                        pretty_name_dict=pretty_name,
                        height_per_row=180,
                        readable_names=True,
                        end_early=1000)

# fig.add_annotation(text='Response Number', xref='x domain', yref='paper', x=0.5, y=0.5, showarrow=False, font_size=10, font_family='times new roman')
# fig.add_annotation(text='Accuracy', xref='x domain', yref='paper', x=-0.12, y=0.9, showarrow=False, font_size=10, font_family='times new roman', textangle=-90)
# fig.add_annotation(text='Response Number', xref='x domain', yref='paper', x=0.5, y=-0.15, showarrow=False, font_size=10, font_family='times new roman')
# fig.add_annotation(text='Accuracy', xref='x domain', yref='paper', x=-0.12, y=0.1, showarrow=False, font_size=10, font_family='times new roman', textangle=-90)
# fig.update_layout(legend=dict(orientation="h", itemwidth=30, yanchor="top", y=-0.13,
#         xanchor="right",
#         x=1, font={'family': 'times_new_roman'}))

# fig.update_traces({'line': {'width': 0.8, 'dash': 'dashdot'}}, selector=lambda x: x['legendgroup'] == 'PTG16')
# fig.update_traces({'line': {'width': 1.2}}, selector=lambda x: x['legendgroup'] == 'Pretrained-7B')
fig.update_layout(width=1200, showlegend=True)
fig.show()
fig.write_image('./analysis/fig/e4_tuned92_comparison.pdf')

[48, 49]


In [248]:
px.colors.diverging.curl

['rgb(20, 29, 67)',
 'rgb(28, 72, 93)',
 'rgb(18, 115, 117)',
 'rgb(63, 156, 129)',
 'rgb(153, 189, 156)',
 'rgb(223, 225, 211)',
 'rgb(241, 218, 206)',
 'rgb(224, 160, 137)',
 'rgb(203, 101, 99)',
 'rgb(164, 54, 96)',
 'rgb(111, 23, 91)',
 'rgb(51, 13, 53)']

# Heatmap Plot

In [451]:
from scipy.stats import percentileofscore
import re
import statsmodels.formula.api as smf

# # Construct the columns for the different powers of x
# def get_r2_statsmodels(x, y, k=1):
#     xpoly = np.column_stack([x**i for i in range(k+1)])    
#     return sm.OLS(y, xpoly).fit().rsquared

# Use the formula API and construct a formula describing the polynomial
def get_r2_statsmodels_formula(x, y, k=1):
    formula = 'y ~ 1 + ' + ' + '.join('I(x**{})'.format(i) for i in range(1, k+1))
    data = {'x': x, 'y': y}
    return smf.ols(formula, data).fit() # or rsquared_adj

def find_acc_to_r2_corr(df_name, folder_name):
    
    def func(x, b, c, d):
        return  b*(x**2) + c *x + d

    fig = make_subplots(rows=1, cols=2, subplot_titles=['Tuned LLM', 'Bayesian pLoT Model'], horizontal_spacing=0.15,
            x_title='Model Last Quarter Accuracy', y_title='R<sup>2</sup> with Human Responses')

    mdf = pd.read_csv(data_files[df_name])
    mdf = mdf[~mdf['concept'].isin(['hg200', 'hg201'])]
    bdf =  pd.read_csv(data_files['PTG16'])
    hdf = pd.read_csv("./data/compiled_individual_humans.csv")    
    model_stats = pd.read_csv("./results/experiment_1/model_indiv_stats_with_exp3.csv")

    # mdf = mdf[~mdf['concept'].isin(SUBSETS['boolean'])]
    maccs = []
    baccs = []
    mcorrs = []
    bcorrs = []
    concepts = []
    for concept in mdf['concept'].unique():
        mscore = np.array([float(x) for x in re.sub(",", " ", mdf[mdf['concept'] == concept]['mscores'].iloc[0][1:-1]).split()])
        bscore = np.array([float(x) for x in re.sub(",", " ", bdf[bdf['concept'] == concept]['mscores'].iloc[0][1:-1]).split()])
        hscore = np.array([float(x) for x in re.sub(",", " ", mdf[mdf['concept'] == concept]['hscores'].iloc[0][1:-1]).split()])
        
        # answer_idx = (np.array(hdf[hdf['concepts'] == concept]['answers'])).astype(int)
        # yes_rate = np.mean(answer_idx)

        # hlq_accuracy = hdf_stats[hdf_stats['concept'] == concept]['last_quarter_mean'].item()
        # hlq_sd = hdf_stats[hdf_stats['concept'] == concept]['last_quarter_sd'].item()

        # percentile = model_stats[(model_stats['concept'] == concept) & (model_stats['subject'] == folder_name)]['last_quarter_percentile'].item()
        # hdf_scores = [float(x) for x in re.sub(",", " ", hdf_stats[hdf_stats['concept'] == concept]['last_quarter_scores'].item()[1: -1]).split()]
        # percentile = percentileofscore(hdf_scores, mlq_accuracy, 'weak')
        # print(hlq_accuracy, hdf_scores)
        # print(concept, mlq_accuracy, percentile)
        # hlq_accuracy = hscore[-(len(mscore)//4):].mean()
        corr = (mdf[mdf['concept'] == concept]['pyes_corr'] ** 2).item()
        bcorr = (bdf[bdf['concept'] == concept]['pyes_corr'] ** 2).item()
        # accs.append(mlq_accuracy - (hlq_accuracy - hlq_sd))
        
        blq_accuracy = bscore[3*(len(bscore)//4):].mean()
        baccs.append(blq_accuracy)
        bcorrs.append(bcorr)

        mlq_accuracy = mscore[3*(len(mscore)//4):].mean()
        maccs.append(mlq_accuracy)
        mcorrs.append(corr)

        # corrs.append(corr-bcorr)
        concepts.append(concept)

    fig.add_trace(go.Histogram2d(
        x=maccs,
        y=mcorrs,
        colorscale='viridis',
        nbinsx=15,
        nbinsy=15,
        zmax=15,
        zmin=0,
        histnorm='',
        coloraxis='coloraxis'
    ), row=1,col=1)

    fig.add_trace(go.Histogram2d(
        x=baccs,
        y=bcorrs,
        zmax=15,
        zmin=0,
        nbinsx=15,
        nbinsy=15,
        coloraxis='coloraxis',
        colorscale='viridis',
        histnorm=''
    ), row=1,col=2)

    fig.add_trace(go.Scatter(x=maccs, y=mcorrs,
                customdata = [[c for c in concepts], [READABLE[c] for c in concepts]],
                hoverinfo='text', text=[READABLE[c] for c in concepts],
                # marker_color=["darkblue" if c in SUBSETS['boolean'] else "coral" for c in concepts],
                marker_color='lightgray',
                marker_size=2.5,
                showlegend=False,
                mode='markers'),
                row=1, col=1)
    
    fig.add_trace(go.Scatter(x=baccs, y=bcorrs,
                customdata = [[c for c in concepts], [READABLE[c] for c in concepts]],
                hoverinfo='text', text=[READABLE[c] for c in concepts],
                # marker_color=["darkblue" if c in SUBSETS['boolean'] else "coral" for c in concepts],
                marker_color='lightgray',
                marker_size=2.5,
                showlegend=False,
                mode='markers'),
                row=1, col=2)
    

    # delta = (np.array(mcorrs) - np.array(bcorrs))
    # delta = (delta - np.mean(delta)) / np.std(delta)
    
    # fig.add_trace(go.Scatter(x=maccs, y=mcorrs,
    #             customdata = [[c for c in concepts], [READABLE[c] for c in concepts]],
    #             hoverinfo='text', text=[READABLE[c] for c in concepts],
    #             # marker_color=["darkblue" if c in SUBSETS['boolean'] else "coral" for c in concepts]
    #             marker_color=delta,
    #             marker=dict(colorscale="balance_r",  
    #                 colorbar=dict(title='R<sup>2</sup>', 
    #                 tickmode='array',
    #                 tickvals=[2.4, 0, -2.4],
    #                 ticktext = ['Tuned<br>Model<br>Higher', 'Same', 'Bayesian<br>Model<br>Higher'],
    #                 outlinewidth=0,
    #                 thickness=15
    #                 )),
    #             showlegend=False,
    #             mode='markers'),
    #             row=1, col=1)
    
    # fig.add_trace(go.Scatter(x=baccs, y=bcorrs, 
    #             hoverinfo='text', text=[READABLE[c] for c in concepts],
    #             # marker_color=["darkblue" if c in SUBSETS['boolean'] else "coral" for c in concepts],
    #             marker_color=delta,
    #             marker=dict(colorscale="balance_r", 
    #                 # colorbar=dict(showticklabels=False)
    #                 ),
    #             showlegend=False,
    #             mode='markers'),
    #             row=1, col=2)
    
    # mfit = get_r2_statsmodels_formula(np.array(maccs), np.array(mcorrs), k=2)
    # bfit = get_r2_statsmodels_formula(np.array(baccs), np.array(bcorrs), k=2)
    
    # data = {'accuracy': maccs + baccs, 'r2': mcorrs + bcorrs, 'is_llm': [1] * len(maccs) + [0] * len(baccs), 'concept': list(mdf['concept'].unique()) + list(mdf['concept'].unique())} 

    # fig.add_trace(go.Scatter(x=sorted(maccs), y=mfit.get_prediction({'x': np.array(sorted(maccs)).reshape((-1, 1))}).predicted_mean, 
    #                 mode='lines', 
    #                 line_width=1,
    #                 showlegend=False,
    #                 marker_color='slategray'
    #                 ), row=1, col=1)
    
    # fig.add_annotation(x=0.84, y=0.48, text=f'R<sup>2</sup>={str(bfit.rsquared)[:4]}', showarrow=False, row=1, col=2)
    # fig.add_annotation(x=0.84, y=0.72, text=f'R<sup>2</sup>={str(mfit.rsquared)[:4]}', showarrow=False, row=1, col=1)

    # fig.add_trace(go.Scatter(x=sorted(baccs), y=bfit.get_prediction({'x': np.array(sorted(baccs)).reshape((-1, 1))}).predicted_mean, 
    #                 mode='lines', 
    #                 line_width=1,
    #                 showlegend=False,
    #                 marker_color='slategray'
    #                 ), row=1, col=2)

    fig['layout'].xaxis= {'anchor': 'y', 'domain': [0.0, 0.45], 
                            'dtick': 0.1, 'showticklabels': True, 'tick0': 0.5, 'range': [0.5, 1]}
    
    fig['layout'].yaxis= {'anchor': 'x', 'domain': [0.0, 1.0], 
                            'dtick': 0.2, 'showticklabels': True, 'tick0': 0.2, 'range': [0.1, 1]}
    fig['layout'].xaxis2= {'anchor': 'y2', 'domain': [0.55, 1.0], 
                            'dtick': 0.1, 'showticklabels': True, 'tick0': 0.5, 'range': [0.5, 1]}
    
    fig['layout'].yaxis2= {'anchor': 'x2', 'domain': [0.0, 1.0], 
                            'dtick': 0.2, 'showticklabels': True, 'tick0': 0.2, 'range': [0.1, 1]}
    
    fig.add_shape(type='rect', xref='x', yref='y', x0=0.75, y0=0, x1=1, y1=0.5, line=dict(color='white', width=2), fillcolor='rgba(0,0,0,0)')
    fig.add_shape(type='rect', xref='x', yref='y', x0=0.5, y0=0.5, x1=0.75, y1=1, line=dict(color='white', width=2), fillcolor='rgba(0,0,0,0)')
    fig.add_shape(type='rect', xref='x2', yref='y2', x0=0.75, y0=0, x1=1, y1=0.5, line=dict(color='white', width=2), fillcolor='rgba(0,0,0,0)')
    fig.add_shape(type='rect', xref='x2', yref='y2', x0=0.5, y0=0.5, x1=0.75, y1=1, line=dict(color='white', width=2), fillcolor='rgba(0,0,0,0)')
    fig.add_annotation(text='Unhumanlike<br>high accuracy', align='left',xref='x', yref='y', x=0.76, y=0.11, yanchor='bottom', xanchor='left', showarrow=False, font_color='lightblue', font_size=9.5)
    fig.add_annotation(text='Unhumanlike<br>low accuracy', align='left',xref='x', yref='y', x=0.51, y=0.11, yanchor='bottom', xanchor='left', showarrow=False, font_color='lightblue', font_size=9.5)
    fig.add_annotation(text='Humanlike<br>low accuracy', align='left',xref='x', yref='y', x=0.51, y=0.99, yanchor='top', xanchor='left', showarrow=False, font_color='lightblue', font_size=9.5)
    fig.add_annotation(text='Humanlike<br>high accuracy', align='left',xref='x', yref='y', x=0.76, y=0.99, yanchor='top', xanchor='left', showarrow=False, font_color='lightblue', font_size=9.5)

    fig.add_annotation(text='Unhumanlike<br>high accuracy', align='left', xref='x2', yref='y2', x=0.76, y=0.11, yanchor='bottom', xanchor='left', showarrow=False, font_color='lightblue', font_size=9.5)
    fig.add_annotation(text='Unhumanlike<br>low accuracy', align='left',xref='x2', yref='y2', x=0.51, y=0.11, yanchor='bottom', xanchor='left', showarrow=False, font_color='lightblue', font_size=9.5)
    fig.add_annotation(text='Humanlike<br>low accuracy', align='left',xref='x2', yref='y2', x=0.51, y=0.99, yanchor='top', xanchor='left', showarrow=False, font_color='lightblue', font_size=9.5)
    fig.add_annotation(text='Humanlike<br>high accuracy', align='left',xref='x2', yref='y2', x=0.76, y=0.99, yanchor='top', xanchor='left', showarrow=False, font_color='lightblue', font_size=9.5)

    fig.update_xaxes(dtick=0.1)
    fig.update_layout(width=700, height=350, template='ggplot2', font_family='times new roman', coloraxis_showscale=False)
    fig.update_layout(
        margin=dict(l=60, r=40, t=30, b=70),
    )
    fig.update_coloraxes(colorscale='viridis', showscale=True, colorbar_title="Number of<br>Rules", colorbar_title_font_size=11)
    # fig.update(layout_showlegend=False) 
    return fig, data

fig, data = find_acc_to_r2_corr('Tuned112-7B', 'gemma7b-tuned112')
fig.show()
fig.write_image('./analysis/fig/e3_versus_bayesian_model.pdf')
# # find_acc_to_r2_corr('Pretrained-7B', 'gemma7b-pretrained')

In [97]:
pd.DataFrame.from_dict(data).to_csv('./results/experiment_3/r2_to_accuracy_data.csv')

# Box Plots

In [5]:
def add_p_value_annotation(fig, array_columns, subplot=None, 
            _format=dict(interline=0.07, text_height=1.07, color='black'),
            stat_test=lambda x, y:  stats.ttest_ind(x, y, equal_var=False)):
    ''' Adds notations giving the p-value between two box plot data (t-test two-sided comparison)
    
    Parameters:
    ----------
    fig: figure
        plotly boxplot figure
    array_columns: np.array
        array of which columns to compare 
        e.g.: [[0,1], [1,2]] compares column 0 with 1 and 1 with 2
    subplot: None or int
        specifies if the figures has subplots and what subplot to add the notation to
    _format: dict
        format characteristics for the lines

    Returns:
    -------
    fig: figure
        figure with the added notation
    '''
    # Specify in what y_range to plot for each pair of columns
    y_range = np.zeros([len(array_columns), 2])
    for i in range(len(array_columns)):
        y_range[i] = [1.01+i*_format['interline'], 1.02+i*_format['interline']]

    # Get values from figure
    fig_dict = fig.to_dict()

    # Get indices if working with subplots
    if subplot:
        if subplot == 1:
            subplot_str = ''
        else:
            subplot_str =str(subplot)
        indices = [] #Change the box index to the indices of the data for that subplot
        for index, data in enumerate(fig_dict['data']):
            #print(index, data['xaxis'], 'x' + subplot_str)
            if data['xaxis'] == 'x' + subplot_str:
                indices = np.append(indices, index)
        indices = [int(i) for i in indices]
    else:
        subplot_str = ''

    # Print the p-values
    for index, column_pair in enumerate(array_columns):
        if subplot:
            data_pair = [indices[column_pair[0]], indices[column_pair[1]]]
        else:
            data_pair = column_pair

        # Mare sure it is selecting the data and subplot you want
        #print('0:', fig_dict['data'][data_pair[0]]['name'], fig_dict['data'][data_pair[0]]['xaxis'])
        #print('1:', fig_dict['data'][data_pair[1]]['name'], fig_dict['data'][data_pair[1]]['xaxis'])

        # Get the p-value
        # pvalue = stats.ttest_ind(
        #     fig_dict['data'][data_pair[0]]['y'],
        #     fig_dict['data'][data_pair[1]]['y'],
        #     equal_var=False,
        # )[1]

        pvalue = stat_test(fig_dict['data'][data_pair[0]]['y'], fig_dict['data'][data_pair[1]]['y'])[1]
        if pvalue >= 0.05:
            symbol = 'ns'
        elif pvalue >= 0.01: 
            symbol = '*'
        elif pvalue >= 0.001:
            symbol = '**'
        else:
            symbol = '***'
        # Vertical line
        fig.add_shape(type="line",
            xref="x"+subplot_str, yref="y"+subplot_str+" domain",
            x0=column_pair[0], y0=y_range[index][0], 
            x1=column_pair[0], y1=y_range[index][1],
            line=dict(color=_format['color'], width=2,)
        )
        # Horizontal line
        fig.add_shape(type="line",
            xref="x"+subplot_str, yref="y"+subplot_str+" domain",
            x0=column_pair[0], y0=y_range[index][1], 
            x1=column_pair[1], y1=y_range[index][1],
            line=dict(color=_format['color'], width=2,)
        )
        # Vertical line
        fig.add_shape(type="line",
            xref="x"+subplot_str, yref="y"+subplot_str+" domain",
            x0=column_pair[1], y0=y_range[index][0], 
            x1=column_pair[1], y1=y_range[index][1],
            line=dict(color=_format['color'], width=2,)
        )
        ## add text at the correct x, y coordinates
        ## for bars, there is a direct mapping from the bar number to 0, 1, 2...
        fig.add_annotation(dict(font=dict(color=_format['color'],size=14),
            x=(column_pair[0] + column_pair[1])/2,
            y=y_range[index][1]*_format['text_height'],
            showarrow=False,
            text=symbol,
            textangle=0,
            xref="x"+subplot_str,
            yref="y"+subplot_str+" domain"
        ))
    return fig

In [9]:
def make_r2_comparison_scatter(source_dict, pvalue_annots=None, start_at=0, stop_at=500, concept_subset='tuned112', 
                               subsets={'Propositional Rules': SUBSETS['boolean'],
                                        'FOL Rules': SUBSETS['fol']},
                               pretty_name=pretty_name):
    
    with open(f'./data/subsets/{concept_subset}.txt', 'r') as f:
        concepts = [x.strip() for x in f.readlines()]
    with open('./data/labels_to_readable.json', 'r') as f:
        labels_to_readable = json.load(f)

    fig= make_subplots(rows=1, cols=len(subsets), subplot_titles=list(subsets.keys()), shared_yaxes=True)
    # fig.update_annotations(y=1.3)
    data = {}
    for i, (name, subset_list) in enumerate(subsets.items()):
        for j, (name, filepath), in enumerate(source_dict.items()):
            data[name] = []
            df = pd.read_csv(filepath)
            df = df[df['concept'].isin(subset_list)]

            corrs = []
            for concept in df['concept'].unique():
                if concept in concepts:
                    hscores = [float(x) for x in re.sub(",", " ", df[df['concept'] == concept]['hyes'].iloc[0][1:-1]).split()[start_at:stop_at]]
                    mscores = [float(x) for x in re.sub(",", " ", df[df['concept'] == concept]['myes'].iloc[0][1:-1]).split()[start_at:stop_at]]
                    corr = np.corrcoef(hscores, mscores)[0,1] ** 2
                    corrs.append(corr)
                    data[name].append((concept, corr))

            fig.add_trace(go.Box(x=[pretty_name[name]] * len(df), y=corrs, marker_color=color_and_dash[name][0], name=name, 
                hovertext=[f"{c}: {labels_to_readable[c]}" for c in df['concept']],
                boxpoints='all',marker_size=2, jitter=0.3, showlegend=False), row=1, col=i+1)
    

    fig.update_annotations(font_size=14, font_family='times new roman')
    fig.update_annotations({'y': 1.15}, selector=lambda x: "Rules" in x.text)
    # fig.update_traces(boxpoints='all', jitter=0.2)
    if pvalue_annots is not None:
        for i, pvalue_annot in enumerate(pvalue_annots):
            fig = add_p_value_annotation(fig, pvalue_annot, subplot=i+1, stat_test=lambda x, y: stats.ttest_rel(x, y))
    
    fig.update_annotations(font_size=14, font_family='times new roman')
    fig.update_layout(width=700, height=400, template='ggplot2')
    fig.update_layout(
        margin=dict(l=70, r=50, t=70, b=60),
    )
    fig.update_yaxes(selector=lambda x: x['anchor'] == 'x2', title='R<sup>2</sup> with Human Responses', title_font={'family': 'times new roman', 'size': 14})
    # fig.update_layout(font_size=14, font_family='times new roman')
    return fig, data

In [10]:
fig, data = make_r2_comparison_scatter({k: v for k,v in data_files.items() if k in 
                            # ['PTG16', 'Tuned112-2B', 'Pretrained-2B']
                            # ['PTG16', 'Tuned112-7B', 'Tuned92-7B', 'Pretrained-7B']
                            ['PTG16', 'Tuned112-7B', 'Pretrained-7B']},
                            [[[0,1], [2,1]], [[0,1], [2,1]]]
                            )

fig.update_xaxes(tickfont={'family': 'times new roman', 'size': 12})
fig.show()
fig.write_image('./analysis/fig/e3_per_rule_r2yes.pdf')

In [34]:
pretty_name_e4 = {'PTG16': 'PLoT Bayesian Model', 
                    'Ellis23': 'LLM-Augmented<br>Bayesian Model (Ellis23)', 
                    'Tuned112-7B': 'LLM Tuned on 112 Rules', 
                    'Tuned112-7B-AnswerLoss': 'Tuned on Answers',
                    'Tuned92-7B': 'LLM Tuned on 92 Rules', 
                    'Pretrained-7B': 'Pretrained LLM'}

fig, data = make_r2_comparison_scatter({k: v for k,v in data_files.items() if k in 
                            # ['PTG16', 'Tuned112-2B', 'Pretrained-2B']
                            ['Tuned112-7B', 'Tuned92-7B', 'Pretrained-7B']},
                            [[[0,1], [2,1]]],
                            subsets = {'': SUBSETS['tuned92-heldout']},
                            pretty_name=pretty_name_e4
                            )

fig.update_layout(width=400)
fig.update_layout(
        margin=dict(l=50, r=20, t=45, b=60),
    )
fig.update_xaxes(tickfont={'family': 'times new roman', 'size': 12})
fig.update_layout(font_family='times new roman')
fig.show()
fig.write_image('./analysis/fig/e4_per_rule_r2yes.pdf')

In [213]:
pretty_name = {'PTG16': 'PLoT Bayesian Model', 
                    'Ellis23': 'LLM-Augmented Bayesian Model', 
                    'Tuned112-7B': 'KL Loss', 
                    'Tuned112-7B-AnswerLoss': 'CE Loss',
                    'Tuned92-7B': 'Tuned on 92 Rules', 
                    'Pretrained-7B': 'Pretrained',
                    'GPT2-Tuned112': "GPT2 Tuned"}

fig, data = make_r2_comparison_scatter({k: v for k,v in data_files.items() if k in 
                            # ['PTG16', 'Tuned112-2B', 'Pretrained-2B']
                            # ['PTG16', 'Tuned112-7B', 'Tuned92-7B', 'Pretrained-7B']
                            ['Tuned112-7B', 'Tuned112-7B-AnswerLoss', 'Pretrained-7B']},
                            [[[0,1], [2,1]], [[0,1], [2,1]]]
                            )
fig.show()
# fig.write_image('./analysis/fig/e3_per_rule_r2yes.pdf')

In [211]:
pretty_name = {'PTG16': 'PLoT Bayesian Model', 
                    'Ellis23': 'LLM-Augmented Bayesian Model', 
                    'Tuned112-7B': 'Gemma-7B Tuned', 
                    'Tuned112-7B-AnswerLoss': 'CE Loss',
                    'Tuned92-7B': 'Tuned on 92 Rules', 
                    'GPT2-Tuned112': "GPT2 Tuned",
                    'Pretrained-7B': 'Pretrained',}

fig, data = make_r2_comparison_scatter({k: v for k,v in data_files.items() if k in 
                            # ['PTG16', 'Tuned112-2B', 'Pretrained-2B']
                            # ['PTG16', 'Tuned112-7B', 'Tuned92-7B', 'Pretrained-7B']
                            ['Tuned112-7B', 'GPT2-Tuned112', 'PTG16']},
                            [[[0,1], [2,1], [2,0]], [[0,1], [2,1], [2,0]]]
                            )
fig.show()
# fig.write_image('./analysis/fig/e3_per_rule_r2yes.pdf')

In [149]:
llm = [x[1] for x in sorted(data['Tuned112-7B'], key=lambda x: x[0])]
bmodel = [x[1] for x in sorted(data['PTG16'], key=lambda x: x[0])]
llm2 = [x[1] for x in sorted(data['Tuned92-7B'], key=lambda x: x[0])]
pllm = [x[1] for x in sorted(data['Pretrained-7B'], key=lambda x: x[0])]

from scipy import stats
print(stats.ttest_rel(pllm, llm, alternative='less'))
print(stats.ttest_rel(bmodel, llm, alternative='less'))
print(stats.ttest_rel(bmodel, llm2, alternative='less'))
print(stats.ttest_rel(llm2, llm, alternative='less'))

TtestResult(statistic=-25.706390757322076, pvalue=7.823212062599463e-40, df=77)
TtestResult(statistic=-4.427537846246212, pvalue=1.5519418591168324e-05, df=77)
TtestResult(statistic=-4.026516295127854, pvalue=6.577483908817582e-05, df=77)
TtestResult(statistic=-2.3065179226237835, pvalue=0.011886063067266145, df=77)


In [150]:
def make_r2_bar_charts(source_dict, start_at=0, stop_at=500, show_ellis=True):
    r2s = []
    mode = []
    colors = []

    if show_ellis:
        mode += [pretty_name['Ellis23']]
        r2s += [0.81]
        colors += ['orange']

    for name, filepath in source_dict.items():
        mdf = pd.read_csv(filepath)
        
        all_hscores = []
        all_mscores = []
        for i, row in mdf.iterrows():
            all_hscores += [float(x) for x in re.sub(",", " ", row['hyes'][1:-1]).split()[start_at:stop_at]]
            all_mscores += list(np.array([float(x) for x in re.sub(",", " ", row['myes'][1:-1]).split()[start_at:stop_at]]))

        all_mscores = np.array(all_mscores)
        all_hscores = np.array(all_hscores)

        r2s.append(np.corrcoef(all_mscores, all_hscores)[0,1] ** 2)
        mode.append(pretty_name[name])
        colors.append(color_and_dash[name][0])

    fig = go.Figure()
    for i in range(len(mode)):
        fig.add_trace(go.Bar(x=[mode[i]], y=[r2s[i]], marker_color=[colors[i]], showlegend=False))
        fig.add_annotation(x=mode[i], y=r2s[i]+0.04, showarrow=False, text=str(r2s[i])[:5])

    fig.update_layout(width=500, height=400, template='ggplot2')
    fig.update_layout(
        margin=dict(l=70, r=50, t=40, b=60),
    )

    fig.update_yaxes(title='R<sup>2</sup> with Human Responses', range=[0,1])
    fig.update_layout(font_size=14, font_family='times new roman', 
            )
    fig.update_layout(width=400)
    return fig

In [416]:
fig = make_r2_bar_charts({k: v for k,v in data_files.items() if k in 
                            # ['PTG16', 'Tuned112-2B', 'Pretrained-2B']
                            ['Tuned112-7B', 'Pretrained-7B', 'Tuned112-7B-AnswerLoss']
                            }, show_ellis=False)
fig.update_layout(width=300)
fig.show()
fig.write_image('./analysis/fig/e3_tunedanswercomparison.pdf')

In [170]:
pretty_name = {
    'Ellis23': 'LLM-Augmented<br>Bayesian Model (Ellis23)',
    'PTG16': 'Bayesian pLoT Model<br>(PTG16)', 
                    'Tuned112-7B': 'LLM Tuned on<br>Human Responses', 
                    'Tuned112-7B-AnswerLoss': 'Tuned-CE', 
                    'Pretrained-7B': 'Pretrained LLM',
                    'Tuned92-7B': 'Pretrained',
                    'GPT2-Tuned112': "GPT2-Tuned"}

fig = make_r2_bar_charts({k: v for k,v in data_files.items() if k in 
                            # ['PTG16', 'Tuned112-2B', 'Pretrained-2B']
                            ['PTG16', 'Tuned112-7B', 'Pretrained-7B']
                            })
fig.update_xaxes(tickfont={'family': 'times new roman', 'size': 11})
fig.update_yaxes(tickfont={'family': 'times new roman', 'size': 11})
fig.show()
fig.write_image('./analysis/fig/e3_model_r2_comparison.pdf')

# Correlation to Bayesian

In [50]:
def get_fit_to_bayesian(start_at:int=0, stop_at:int=999):
    ptg16_df = pd.read_csv(data_files['PTG16']).sort_values('concept')
    tuned112_df = pd.read_csv(data_files['Tuned112-7B']).sort_values('concept')

    r2 = []
    concepts = []
    for concept in tuned112_df['concept'].unique():
        mscores = [float(x) for x in re.sub(",", " ", tuned112_df[tuned112_df['concept'] == concept]['myes'].iloc[0][1:-1]).split()[start_at:stop_at]]
        bscores = [float(x) for x in re.sub(",", " ", ptg16_df[ptg16_df['concept'] == concept]['myes'].iloc[0][1:-1]).split()[start_at:stop_at]]
        r2.append(np.corrcoef(mscores, bscores)[0, 1] ** 2)
        concepts.append(concept)
    return r2, concepts

In [62]:
def get_fit_to_human(start_at:int=0, stop_at:int=999):
    hdf = pd.read_csv(data_files['Human'])
    tuned112_df = pd.read_csv(data_files['Tuned112-7B']).sort_values('concept')

    r2 = []
    concepts = []
    for concept in tuned112_df['concept'].unique():
        mscores = [float(x) for x in re.sub(",", " ", tuned112_df[tuned112_df['concept'] == concept]['myes'].iloc[0][1:-1]).split()[start_at:stop_at]]
        bscores = hdf[hdf['concepts'] == concept]['hyes'].tolist()[start_at:stop_at]
        r2.append(np.corrcoef(mscores, bscores)[0, 1] ** 2)
        concepts.append(concept)
    return r2, concepts

In [97]:
with open('./data/labels_to_readable.json', 'r') as f:
    labels_to_readable = json.load(f)

fig = go.Figure()
step_size=10
for i in range(0, 80, step_size):
    r2, concepts = get_fit_to_human(i, i+step_size)
    fig.add_trace(go.Box(y=r2, x=[str(i) + "-" + str(i+step_size)] * len(r2), 
        marker_color='blue', boxpoints=False, hovertext=[labels_to_readable[c] for c in concepts], showlegend=False))

    boolean_points = [(p, labels_to_readable[c]) for p, c in zip(r2, concepts) if c not in boolean]
    fig.add_trace(go.Box(y=[x[0] for x in boolean_points], x=[str(i) + "-" + str(i+step_size)] * len(r2), 
        boxpoints='all', marker_color='blue', 
        hovertext=[x[1] for x in boolean_points],
        name='FOL Rules',
        showlegend=i==0,
        legendgroup='FOL',
        fillcolor= 'rgba(255,255,255,0)',
        line_color='rgba(255,255,255,0)'))

    boolean_points = [(p, labels_to_readable[c]) for p, c in zip(r2, concepts) if c in boolean]
    fig.add_trace(go.Box(y=[x[0] for x in boolean_points], x=[str(i) + "-" + str(i+step_size)] * len(r2), 
        boxpoints='all', marker_color='teal', 
        hovertext=[x[1] for x in boolean_points],
        name='Propositional Rules',
        showlegend=i==0,
        legendgroup='Propositional',
        fillcolor= 'rgba(255,255,255,0)',
        line_color='rgba(255,255,255,0)'))

fig.update_layout(width=800, height=400, template='ggplot2')
fig.update_layout(
    margin=dict(l=70, r=50, t=40, b=60),
)

fig.update_yaxes(title='R<sup>2</sup> with FOL-Grammar Bayesian PLoT Model')
fig.update_xaxes(title='Window of Object Indexes')
fig.update_layout(font_size=14, font_family='times new roman', 
        )
fig.show()


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


invalid value encountered in divide


Degrees of freedom <= 0 for slice


divide by zero encountered in divide


invalid value encountered in multiply


Mean of empty slice.


invalid value encountered in divide


Degrees of freedom <= 0 for slice


divide by zero encountered in divide


invalid value encountered in multiply


invalid value encountered in divide


invalid value encountered in divide



In [196]:
with open('./data/labels_to_readable.json', 'r') as f:
    labels_to_readable = json.load(f)

fig = go.Figure()
step_size = 20
for i in range(0, 80, step_size):
    r2, concepts = get_fit_to_bayesian(i, i+step_size)
    fig.add_trace(go.Box(y=r2, x=[str(i) + "-" + str(i+step_size)] * len(r2), 
        marker_color='blue', boxpoints=False, hovertext=[labels_to_readable[c] for c in concepts], showlegend=False))

    boolean_points = [(p, labels_to_readable[c]) for p, c in zip(r2, concepts) if c not in boolean]
    fig.add_trace(go.Box(y=[x[0] for x in boolean_points], x=[str(i) + "-" + str(i+step_size)] * len(r2), 
        boxpoints='all', marker_color='blue', 
        hovertext=[x[1] for x in boolean_points],
        name='FOL Rules',
        showlegend=i==0,
        legendgroup='FOL',
        fillcolor= 'rgba(255,255,255,0)',
        line_color='rgba(255,255,255,0)'))

    boolean_points = [(p, labels_to_readable[c]) for p, c in zip(r2, concepts) if c in boolean]
    fig.add_trace(go.Box(y=[x[0] for x in boolean_points], x=[str(i) + "-" + str(i+step_size)] * len(r2), 
        boxpoints='all', marker_color='teal', 
        hovertext=[x[1] for x in boolean_points],
        name='Propositional Rules',
        showlegend=i==0,
        legendgroup='Propositional',
        fillcolor= 'rgba(255,255,255,0)',
        line_color='rgba(255,255,255,0)'))

fig.update_layout(width=800, height=400, template='ggplot2')
fig.update_layout(
    margin=dict(l=70, r=50, t=40, b=60),
)

fig.update_yaxes(title='R<sup>2</sup> with FOL-Grammar Bayesian PLoT Model')
fig.update_xaxes(title='Window of Object Indexes')
fig.update_layout(font_size=14, font_family='times new roman', 
        )

fig.show()


Degrees of freedom <= 0 for slice


divide by zero encountered in divide


invalid value encountered in multiply

