In [1]:
import pandas as pd
import os, sys
os.chdir("../")
sys.path.append('./src')
from tqdm import tqdm
from utils.get_concept_subsets import SUBSETS, READABLE

In [2]:
import json
from utils.get_concept_subsets import SUBSETS, READABLE

# Combine raw files

In [35]:
dfs = []
directories = ['./results/experiment_2/raw_results/gpt4_boolean_translated', 
              './results/experiment_2/raw_results/gpt4_fol_translated']

for directory in directories:
    for file in os.listdir(directory):
        if file.endswith('.csv'):
            df = pd.read_csv(os.path.join(directory, file), index_col=0)
            df['consistency'] = df['rule_evaluation'].astype(str) == df['model_answer'].astype(str)

            current_set = 0
            sets = []
            for i, row in df.iterrows():
                if df['model_reply'].iloc[i] != df['model_reply'].iloc[i-1]:
                    current_set += 1
                sets.append(current_set)
            df['set'] = sets

            df.to_csv(os.path.join(directory, file))
            dfs.append(df)

mdf = pd.concat(dfs)
mdf.to_csv('./results/experiment_2/compiled_all_rules.csv')

In [36]:
mdf[(mdf['set'] >= 3) & (~mdf['concept_num'].isin(SUBSETS['boolean']))]['consistency'].mean()

0.8917629792698587

## Break apart FOL Rules

In [37]:
df = pd.read_csv('./results/experiment_2/compiled_all_rules.csv')
df = df[~df['concept_num'].isin(SUBSETS['boolean'])]
df.to_csv('./results/experiment_2/compiled_fol_rules.csv')


# Review Manual annotations

In [51]:
from src.translate_explanations import evaluate_python_code, process_python_code
import numpy as np
import math
def check_rule_coverage(annotated_df, code_column='manual_code', only_eval_last_set=False):
    
    # annotated_df = annotated_df.dropna()
    hdf = pd.read_csv('./results/experiment_1/human_concept_stats_to_set25.csv')
    coverage = []
    full_coverage = []
    code_evaluations = []
    chance = []
    human_lq = []

    expression = annotated_df['rule_expression'].tolist()

    for i, row in tqdm(annotated_df.iterrows(), total=len(annotated_df)):
        concept_df = annotated_df[(annotated_df['concept_num'] == row['concept_num']) & (annotated_df['set'] <= 25)]
        yesrate = concept_df['answer'].mean()
        chance.append((yesrate * yesrate) + ((1-yesrate) * (1-yesrate)))
        human_lq.append(hdf[hdf['concept'] == row['concept_num']]['last_quarter_mean'].iloc[0])

        if (row[code_column] not in ["" or "NA"] and not pd.isna(row[code_column])) and ((not only_eval_last_set) or (row['set'] == 25)):
            if "set" in row[code_column]:
                code_evaluations.append(evaluate_python_code(row[code_column], row['object'], int(row['set']), row['concept_num']))
                coverage.append((np.array([
                    evaluate_python_code(row[code_column], backrow['object'], int(backrow['set']), backrow['concept_num']) for _, backrow in concept_df.iloc[:i, :].iterrows()
                    ]) == concept_df.iloc[:i, :].loc[:, 'answer']).mean())
                full_coverage.append((np.array([
                    evaluate_python_code(row[code_column], backrow['object'], int(backrow['set']), backrow['concept_num']) for _, backrow in concept_df.iterrows()
                    ]) == concept_df.loc[:, 'answer']).mean())
            else:
                code_evaluations.append(evaluate_python_code(row[code_column], row['object']))
                # if row['concept_num'] == 'hg100':
                #     print((np.array([
                #     evaluate_python_code(row[code_column], backrow['object']) for _, backrow in concept_df.iloc[:i, :].iterrows()
                #     ]) == concept_df.iloc[:i, :].loc[:, 'answer']).mean())
                #     print((np.array([
                #         evaluate_python_code(row[code_column], backrow['object']) for _, backrow in concept_df.iloc[:i, :].iterrows()
                #     ]) == concept_df.iloc[:i, :].loc[:, 'answer']))
                coverage.append((np.array([
                    evaluate_python_code(row[code_column], backrow['object']) for _, backrow in concept_df.iloc[:i, :].iterrows()
                    ]) == concept_df.iloc[:i, :].loc[:, 'answer']).mean())
                full_coverage.append((np.array([
                    evaluate_python_code(row[code_column], backrow['object']) for _, backrow in concept_df.iterrows()
                    ]) == concept_df.loc[:, 'answer']).mean())
                
            if row['rule_expression'] == "":
                expression[i] = process_python_code(row[code_column])
        else:
            full_coverage.append(-1)
            coverage.append(-1)
            code_evaluations.append("NA")

    annotated_df['code_eval'] = code_evaluations
    annotated_df['rule_expression'] = expression
    annotated_df['human_lq'] = human_lq
    annotated_df['chance'] = chance
    annotated_df["coverage_" + code_column] = coverage
    annotated_df["full_coverage_" + code_column] = full_coverage
    annotated_df['consistency'] = annotated_df['code_eval'].astype(str).apply(lambda x: x.strip().lower()) == annotated_df['model_answer'].astype(str).apply(lambda x: x.strip().lower()) 

    return annotated_df

In [101]:
manual_rule_coverage = check_rule_coverage(
    pd.read_csv('./results/experiment_2/compiled_bool_rules_annotated.csv')
    ).sort_values(['coverage_manual_code'])
manual_rule_coverage.to_csv('./results/experiment_2/compiled_bool_rules_annotated_stats.csv')
manual_rule_coverage.loc[:, ['concept_num', 'coverage_manual_code', 'human_lq', 'concept', 'rule_expression']]

100%|██████████| 2542/2542 [05:40<00:00,  7.46it/s] 


Unnamed: 0,concept_num,coverage_manual_code,human_lq,concept,rule_expression
1878,hg78,0.142857,0.916509,blue implies circle,blue and triangle
1879,hg78,0.142857,0.916509,blue implies circle,blue and triangle
763,hg11,0.144737,0.984091,not (circle and blue),large and blue
762,hg11,0.144737,0.984091,not (circle and blue),large and blue
764,hg11,0.144737,0.984091,not (circle and blue),large and blue
...,...,...,...,...,...
867,hg12,1.000000,0.909091,not (circle or blue),(green and (rectangle or triangle)) or (yellow...
866,hg12,1.000000,0.909091,not (circle or blue),(green and (rectangle or triangle)) or (yellow...
596,hg08,1.000000,0.930526,blue or green,(green or blue)
2554,hg86,1.000000,0.917184,circle or (triangle implies blue),(rectangle) or (circle) or (triangle and blue)


In [106]:
manual_rule_coverage = pd.read_csv('./results/experiment_2/compiled_bool_rules_annotated_stats.csv')
manual_rule_coverage['score'] = manual_rule_coverage['answer'] == manual_rule_coverage['model_answer']
acc_df = manual_rule_coverage.groupby(['concept']).apply(lambda x: x[x['item_num'] >= (3 * (max(x['item_num']) / 4))]['score'].mean()).reset_index()
print("Overall accuracy", manual_rule_coverage['score'].mean())
print("Last quarter accuracy", acc_df.loc[:, 0].mean())

manual_rule_coverage['score'] = manual_rule_coverage['answer'] == manual_rule_coverage['code_eval']
acc_df = manual_rule_coverage.groupby(['concept']).apply(lambda x: x[x['item_num'] >= (3 * (max(x['item_num']) / 4))]['score'].mean()).reset_index()
print("Overall accuracy", manual_rule_coverage['score'].mean())
print("Last quarter accuracy", acc_df.loc[:, 0].mean())

Overall accuracy 0.8623131392604249
Last quarter accuracy 0.9335728192940191
Overall accuracy 0.8414634146341463
Last quarter accuracy 0.9035882722338028


In [107]:
manual_rule_coverage.groupby(['concept']).mean('coverage_manual_code').reset_index().sort_values(['consistency'])

Unnamed: 0.1,concept,Unnamed: 0,item_num,answer,model_answer,set,code_eval,human_lq,chance,coverage_manual_code,full_coverage_manual_code,consistency,score
3,(not blue) implies (not circle),2092.5,36.5,0.743243,0.648649,12.108108,0.648649,0.914787,0.618335,0.780131,0.780131,0.891892,0.77027
22,circle xor blue,1031.5,34.5,0.442857,0.285714,11.9,0.285714,0.886243,0.506531,0.776531,0.776531,0.914286,0.728571
8,blue implies circle,1924.833333,46.833333,0.738095,0.654762,13.011905,0.619048,0.916509,0.613379,0.830074,0.830074,0.916667,0.833333
2,(circle or triangle) implies blue,2225.5,30.5,0.516667,0.433333,13.6,0.366667,0.920387,0.500556,0.733333,0.733333,0.933333,0.816667
25,large or small,1464.0,39.0,0.759494,0.670886,13.101266,0.683544,0.91,0.634674,0.767986,0.767986,0.936709,0.822785
9,blue implies size 1,2162.0,32.0,0.815385,0.661538,12.430769,0.723077,0.921569,0.698935,0.746982,0.746982,0.938462,0.753846
24,large or medium,1540.0,36.0,0.690141,0.591549,13.802817,0.56338,0.875598,0.572307,0.808371,0.808371,0.943662,0.84507
30,not (circle xor blue),1105.0,38.0,0.506494,0.428571,13.87013,0.428571,0.805263,0.500084,0.664024,0.664024,0.948052,0.688312
18,circle or (triangle implies blue),2514.5,39.5,0.7875,0.6375,14.425,0.6375,0.917184,0.665312,0.789219,0.789219,0.95,0.8
29,not (circle or blue),881.0,42.0,0.517647,0.482353,13.211765,0.505882,0.909091,0.500623,0.846505,0.846505,0.952941,0.847059


In [114]:
mdf.groupby('concept').apply(lambda x: (x['full_coverage_manual_code'] == 1).sum())

concept
(circle and blue) or (triangle and green)     4
(circle or blue) or  (triangle and green)     0
(circle or triangle) implies blue             6
(not blue) implies (not circle)               3
(not blue) implies circle                     2
False                                        24
True                                          9
blue                                          5
blue implies circle                           3
blue implies size 1                           0
blue or green                                10
blue or small                                 3
circle                                       13
circle and (not blue)                        10
circle and blue                              14
circle and not blue                           2
circle implies blue                           0
circle or (blue and triangle)                 2
circle or (triangle implies blue)             1
circle or blue                               10
circle or triangle              

In [196]:
print("Consistency", manual_rule_coverage['consistency'].mean())
print("Coverage", manual_rule_coverage.drop_duplicates(['concept', 'set'])['coverage_manual_code'].mean())

mdf = manual_rule_coverage.drop_duplicates(['concept', 'set'])
print("Arrived at right rule on last set", (mdf[mdf['set'] == 25]['coverage_manual_code'] == 1).mean())
print("Arrived at right rule on any of last five", mdf[mdf['set'] >= 20].groupby('concept').apply(lambda x: (x['full_coverage_manual_code'] == 1).sum() > 0).reset_index().loc[:, 0].mean())
fdf = mdf[mdf['set'] == 25].sort_values('coverage_manual_code')
fdf.loc[:, ['concept_num', 'concept',  'rule_expression', 'coverage_manual_code', 'chance', 'model_reply', 'manual_code', ]]

Arrived at right rule on last set 0.4411764705882353
Arrived at right rule on any of last five 0.7647058823529411


Unnamed: 0,concept_num,concept,rule_expression,coverage_manual_code,chance,model_reply,manual_code
1137,hg15,not (circle xor blue),green or (triangle),0.532468,0.500084,"Based on your examples, I've learned that the ...",def is_rule(obj):\n return obj.green or (ob...
1799,hg25,blue or small,small and (triangle or rectangle),0.695652,0.508507,"Based on your examples, I've learned that the ...",def is_rule(obj):\n return obj.small and (o...
1063,hg14,circle xor blue,(circle and ((large and green) or (medium and ...,0.714286,0.506531,"Based on your examples, I've learned that the ...",def is_rule(obj):\n return (obj.circle and ...
1205,hg16,circle xor (not blue),not (medium and blue) and not (large and blue),0.738462,0.502959,"Based on your examples, I've learned that the ...",def is_rule(obj):\n return not (obj.medium ...
2319,hg83,(circle and blue) or (triangle and green),(circle or triangle) and (blue or green) and n...,0.777778,0.593355,"Based on your examples, I've learned that the ...",def is_rule(obj):\n return (obj.circle or o...
2392,hg84,(circle or blue) or (triangle and green),blue or green,0.788732,0.512002,"Based on your examples, I've learned that the ...",def is_rule(obj):\n return obj.blue or obj....
1272,hg18,large,large and not circle,0.828571,0.514694,"Based on your examples, I've learned that the ...",def is_rule(obj):\n return obj.large and no...
2242,hg82,(circle or triangle) implies blue,blue or (large and yelllow) or ((medium or sma...,0.833333,0.500556,"Based on your examples, I've learned that the ...",def is_rule(obj):\n return obj.blue or (obj...
682,hg09,circle and blue,large and blue,0.846154,0.836621,"Based on your examples, I've learned that the ...",def is_rule(obj):\n return obj.large and ob...
1498,hg21,large or small,large or (blue or green) and not (medium and b...,0.848101,0.634674,"Based on your examples, I've learned that the ...",def is_rule(obj):\n return obj.large or (ob...


In [179]:
mdf[mdf['concept_num'] == 'hg15'].sort_values(['item_num']).loc[:, ['concept_num', 'concept',  'rule_expression', 'coverage_manual_code', 'human_lq', 'model_reply', 'manual_code', ]]

Unnamed: 0,concept_num,concept,rule_expression,coverage_manual_code,human_lq,model_reply,manual_code
1064,hg15,not (circle xor blue),large and circle and blue,0.558442,0.805263,"Based on your examples, I've learned that the ...",def is_rule(obj):\n return obj.large and ob...
1066,hg15,not (circle xor blue),not (blue or green) and not (triangle or circle),0.571429,0.805263,"Based on your examples, I've learned that the ...",def is_rule(obj):\n return not (obj.blue or...
1071,hg15,not (circle xor blue),(medium and green and triangle) or (large and ...,0.662338,0.805263,"Based on your examples, I've learned that the ...",def is_rule(obj):\n return (obj.medium and ...
1072,hg15,not (circle xor blue),(green and triangle) or (large and circle),0.584416,0.805263,"Based on your examples, I've learned that the ...",def is_rule(obj):\n return (obj.green and o...
1073,hg15,not (circle xor blue),(green) or (blue and circle),0.701299,0.805263,"Based on your examples, I've learned that the ...",def is_rule(obj):\n return (obj.green) or (...
1078,hg15,not (circle xor blue),(green and triangle) or (blue and circle),0.74026,0.805263,"Based on your examples, I've learned that the ...",def is_rule(obj):\n return (obj.green and o...
1079,hg15,not (circle xor blue),(triangle and green) or (circle and blue) or (...,0.831169,0.805263,"Based on your examples, I've learned that the ...",def is_rule(obj):\n return (obj.triangle an...
1080,hg15,not (circle xor blue),(green and triangle) or (blue and circle),0.74026,0.805263,"Based on your examples, I've learned that the ...",def is_rule(obj):\n return (obj.green and o...
1082,hg15,not (circle xor blue),(green and triangle) or (blue and circle) or (...,0.831169,0.805263,"Based on your examples, I've learned that the ...",def is_rule(obj):\n return (obj.green and o...
1085,hg15,not (circle xor blue),(green and triangle) or (blue and circle) or (...,0.701299,0.805263,"Based on your examples, I've learned that the ...",def is_rule(obj):\n return (obj.green and o...


In [192]:
human_lq = manual_rule_coverage.sort_values(['concept_num', 'item_num']).drop_duplicates(['concept'])['human_lq'].tolist()
concept = manual_rule_coverage.sort_values(['concept_num', 'item_num']).drop_duplicates(['concept'])['concept'].tolist()
lq_coverage = manual_rule_coverage.sort_values(['concept_num', 'item_num']).groupby(['concept']).apply(lambda x: x[x['item_num'] >= (3 * (max(x['item_num']) / 4))]['coverage_manual_code'].mean()).reset_index().loc[:, 0].tolist()
gdf = pd.DataFrame()
gdf['human_lq'] = human_lq
gdf['concept'] = concept
gdf['lq_coverage'] = lq_coverage
gdf

In [167]:
import plotly.express as px
px.line(manual_rule_coverage.drop_duplicates(['concept', 'set']).sort_values(by=['set']), x='set', y='coverage_manual_code', color='concept', hover_data='concept', width=1000)

# FOL Rule

In [4]:
import numpy as np
np.random.choice(SUBSETS['fol'], 16)
"""
# array(['hg88', 'hg87', 'hg46', 'hg89', 'hg94', 'hg44', 'hg73', 'hg75',
       'hg62', 'hg29', 'hg31', 'hg55', 'hg44', 'hg72', 'hg61', 'hg64'],
      dtype='<U5')
"""

array(['hg88', 'hg87', 'hg46', 'hg89', 'hg94', 'hg44', 'hg73', 'hg75',
       'hg62', 'hg29', 'hg31', 'hg55', 'hg44', 'hg72', 'hg61', 'hg64'],
      dtype='<U5')

### FOL Annotated Subset

In [None]:
manual_rule_coverage = check_rule_coverage(
    pd.read_csv('./results/experiment_2/compiled_fol_rules_annotated_subset.csv')
    ).sort_values(['coverage_manual_code'])
manual_rule_coverage = manual_rule_coverage[manual_rule_coverage['to_annotate']]
manual_rule_coverage.loc[:, ['concept_num', 'coverage_manual_code', 'human_lq', 'concept', 'rule_expression']]
manual_rule_coverage.to_csv('./results/experiment_2/compiled_fol_rules_annotated_subset_stats.csv')

In [61]:
manual_rule_coverage = pd.read_csv('./results/experiment_2/compiled_fol_rules_annotated_subset_stats.csv')
mdf = manual_rule_coverage.drop_duplicates(['concept', 'set'])
print("Average Coverage (excluding first sets)", manual_rule_coverage[manual_rule_coverage['coverage_manual_code'] >= 0]['coverage_manual_code'].mean())
print("Average Consistency (excluding no-replies)", manual_rule_coverage.dropna()['consistency'].mean())
print("Arrived at right rule on last set", (mdf[mdf['set'] == 25]['coverage_manual_code'] == 1).mean())
print("Arrived at right rule on any of last five", mdf[mdf['set'] >= 20].groupby('concept').apply(lambda x: (x['full_coverage_manual_code'] == 1).sum() > 0).reset_index().loc[:, 0].mean())
fdf = mdf[mdf['set'] == 25].sort_values('coverage_manual_code')
fdf.loc[:, ['concept_num', 'concept',  'rule_expression', 'coverage_manual_code', 'chance', 'model_reply', 'manual_code', ]]

Average Coverage (excluding first sets) 0.6781448340668781
Average Consistency (excluding no-replies) 0.9554695062923524
Arrived at right rule on last set 0.0
Arrived at right rule on any of last five 0.0


Unnamed: 0,concept_num,concept,rule_expression,coverage_manual_code,chance,model_reply,manual_code
69,hg26,one of the largest or smallest,(large and green) or (large and circle) or (gr...,0.423077,0.795858,"Based on your examples, I've learned that the ...",def is_rule(obj):\n return (obj.large and o...
198,hg103,exactly one blue object in the set,(triangle or rectangle) and (small or medium),0.557143,0.549634,"Based on your examples, I've learned that the ...",def is_rule(obj):\n return (obj.triangle or...
439,hg98,the unique smallest object,(small and not yellow) or (medium and blue) or...,0.666667,0.659735,"Based on your examples, I've learned that the ...",def is_rule(obj):\n return (obj.small and n...
469,hg17,there is a triangle in the set,not(small or blue) or ((small and yellow and t...,0.679012,0.604329,"Based on your examples, I've learned that the ...",def is_rule(obj):\n return not(obj.small or...
474,hg71,does not exist another object with same shape ...,small or green or rectangle,0.679487,0.739645,"Based on your examples, I've learned that the ...",def is_rule(obj):\n return obj.small or obj...
501,hg96,the unique object,rectangle,0.68,0.875556,"Based on your examples, I've learned that the ...",def is_rule(obj):\n return obj.rectangle
602,hg35,larger than some other object,(large and blue) or (medium and yellow) or ((t...,0.705882,0.500433,"Based on your examples, I've learned that the ...",def is_rule(obj):\n return (obj.large and o...
658,hg106,exactly one blue object that is the same shape...,(large and green and rectangle) or (small and ...,0.730769,0.673899,"Based on your examples, I've learned that the ...",def is_rule(obj):\n return (obj.large and o...
687,hg99,one of the smallest,small and not yellow,0.738462,0.501065,"Based on your examples, I've learned that the ...",def is_rule(obj):\n return obj.small and no...
705,hg108,all other objects of the same shape are blue,(green and not rectangle) or blue or (yellow a...,0.742424,0.53719,"Based on your examples, I've learned that the ...",def is_rule(obj):\n return (obj.green and n...


In [None]:
manual_rule_coverage[manual_rule_coverage['coverage_manual_code'] >= 0].loc[:, ['concept', 'coverage_manual_code']].groupby(['concept']).mean()

###  FOL Annotated Last Sets

In [52]:
manual_rule_coverage = check_rule_coverage(
    pd.read_csv('./results/experiment_2/compiled_fol_rules_annotated_last_sets.csv'), only_eval_last_set=True
    ).sort_values(['coverage_manual_code'])
manual_rule_coverage = manual_rule_coverage[manual_rule_coverage['to_annotate']]
manual_rule_coverage.to_csv('./results/experiment_2/compiled_fol_rules_annotated_last_sets_stats.csv')
manual_rule_coverage.loc[:, ['concept_num', 'coverage_manual_code', 'human_lq', 'concept', 'rule_expression']]

  4%|▎         | 217/5905 [00:00<00:02, 2123.23it/s]

0.5797101449275363
0     False
1     False
2      True
3      True
4      True
      ...  
64    False
65    False
66     True
67     True
68     True
Name: answer, Length: 69, dtype: bool


100%|██████████| 5905/5905 [02:06<00:00, 46.64it/s]  


Unnamed: 0,concept_num,coverage_manual_code,human_lq,concept,rule_expression
2731,hg47,0.207317,0.724638,(same shape as one of the largest) or blue,medium and (same color as last `True’ object) ...
2735,hg47,0.207317,0.724638,(same shape as one of the largest) or blue,medium and (same color as last `True’ object) ...
2095,hg39,0.223684,0.872500,same shape as a blue or green object,((small and circle) or (small and rectangle)) ...
1130,hg26,0.230769,0.857143,one of the largest or smallest,(medium and green) or (medium and blue)
5304,hg91,0.244444,0.851779,same shape as an object that is blue or green,small and blue and (rectangle or triangle)
...,...,...,...,...,...
3171,hg53,0.953846,0.959079,the unique object that is (blue and circle),small and blue and triangle
3108,hg52,0.962025,0.909091,(blue or green) and no other object is blue or...,small and blue and circle
3107,hg52,0.962025,0.909091,(blue or green) and no other object is blue or...,small and blue and circle
3260,hg54,0.977778,0.947464,(blue or circle) and no other object is (blue ...,small and blue and triangle


In [58]:
mdf = manual_rule_coverage.drop_duplicates(['concept', 'set'])
print("Arrived at right rule on last set", (mdf[mdf['set'] == 25]['coverage_manual_code'] == 1).mean())
fdf = mdf[mdf['set'] == 25].sort_values('coverage_manual_code')
fdf.loc[:, ['concept_num', 'concept',  'rule_expression', 'coverage_manual_code', 'chance', 'model_reply', 'manual_code', ]]

Arrived at right rule on last set 0.0


Unnamed: 0,concept_num,concept,rule_expression,coverage_manual_code,chance,model_reply,manual_code
2731,hg47,(same shape as one of the largest) or blue,medium and (same color as last `True’ object) ...,0.207317,0.685901,"Based on your examples, I've learned that the ...",def is_rule(obj):\n return obj.medium and (...
2095,hg39,same shape as a blue or green object,((small and circle) or (small and rectangle)) ...,0.223684,0.771468,"Based on your examples, I've learned that the ...",def is_rule(obj):\n return ((obj.small and ...
1130,hg26,one of the largest or smallest,(medium and green) or (medium and blue),0.230769,0.795858,"Based on your examples, I've learned that the ...",def is_rule(obj):\n return (obj.medium and ...
5304,hg91,same shape as an object that is blue or green,small and blue and (rectangle or triangle),0.244444,0.785432,"Based on your examples, I've learned that the ...",def is_rule(obj):\n return obj.small and ob...
2174,hg40,(not blue) and same shape as a blue object,not (small and yellow and circle) and not (lar...,0.265823,0.621855,"Based on your examples, I've learned that the ...",def is_rule(obj):\n return not (obj.small a...
...,...,...,...,...,...,...,...
5384,hg92,unique blue object,small and blue and (rectangle or triangle),0.820513,0.757725,"Based on your examples, I've learned that the ...",def is_rule(obj):\n return obj.small and ob...
3029,hg51,unique circle,large and blue and triangle,0.822785,0.798109,"Based on your examples, I've learned that the ...",def is_rule(obj):\n return obj.large and ob...
3263,hg54,(blue or circle) and no other object is (blue ...,blue and large,0.866667,0.956543,"Based on your examples, I've learned that the ...",def is_rule(obj):\n return obj.blue and obj...
3172,hg53,the unique object that is (blue and circle),small and blue and triangle,0.953846,0.940355,"Based on your examples, I've learned that the ...",def is_rule(obj):\n return obj.small and ob...


# PTG16 Model Hypotheses

In [73]:
def convert_bayesian_model_file(in_bool=True):
    """
    hg13L2	2	1	0	11	13	0.854544341564178466796875	0.26362450824922234460245817899704	"λx.apply1(apply2(size-gt,x.o),x.o)
    """
    df_dict = {k: [] for k in ['concept', 'is_bool', 'item_num', 'set_num', 'answer', 'hyes', 'hno', 'p(yes)', 'lambda_rule']}
    with open('./data/PTG16_model.txt', 'r') as f:
        for line in f.readlines():
            concept, item_num, set_num, answer, hyes, hno, alpha, posterior, rule = line.split("\t")
            df_dict['concept'].append(concept[:-2])
            df_dict['is_bool'].append((concept[:-2] in SUBSETS['boolean']) == in_bool)
            df_dict['item_num'].append(item_num)
            df_dict['set_num'].append(set_num)
            df_dict['answer'].append(answer)
            df_dict['hyes'].append(hyes)
            df_dict['hno'].append(hno)
            df_dict['p(yes)'].append(float(posterior))
            df_dict['lambda_rule'].append(rule)

    ptg16_df = pd.DataFrame.from_dict(df_dict)
    return ptg16_df

In [75]:
from src.utils.preprocess import format_shape

def fill_in_bayesian_model_df(df):
    with open('./data/labels_to_data.json', 'r') as f:
        data = json.load(f)
    
    object = []
    data_answer = []
    model_answer = []
    df = df.sort_values(['concept', 'item_num'])
    for group in df.groupby(['concept']):
        concept = group[0][0]
        object += [format_shape(i) for s in data[concept]['L2']['sets'][:25] for i in s]
        data_answer += [i for s in data[concept]['L2']['answers'][:25] for i in s]

    model_answer = df.apply(lambda x: x['p(yes)'] > 0.5, axis=1)
    concept_names = df['concept'].apply(lambda x: READABLE[x])
    df['object'] = object
    df['concept_num'] = df['concept']
    df['set'] = df['set_num']
    df['concept'] = concept_names
    df['data_answer'] = data_answer
    df['model_answer'] = model_answer
    return df

df = fill_in_bayesian_model_df(convert_bayesian_model_file(in_bool=False))
df.to_csv('./results/experiment_2/PTG16_model_fol_rules.csv')

In [34]:
manual_rule_coverage = check_rule_coverage(df).sort_values(['coverage_manual_code'])

100%|██████████| 2555/2555 [00:16<00:00, 156.10it/s]


In [45]:
manual_rule_coverage.sort_values(by=['Unnamed: 0']).to_csv('./results/experiment_2/PTG16_model_bool_rules_annotated_stats.csv')

In [66]:
print("Consistency", manual_rule_coverage['consistency'].mean())
print("Coverage", manual_rule_coverage['coverage_manual_code'].mean())
print("Last object set rate", (manual_rule_coverage[manual_rule_coverage['set'] == 24].drop_duplicates('concept').loc[:, ['concept', 'rule_expression', 'full_coverage_manual_code']] == 1)['full_coverage_manual_code'].mean())
manual_rule_coverage[manual_rule_coverage['set'] == 24].drop_duplicates('concept').loc[:, ['concept', 'rule_expression', 'full_coverage_manual_code']]

Consistency 0.875146771037182
Coverage 0.8772671571166183
Last object set rate 0.7352941176470589


Unnamed: 0,concept,rule_expression,full_coverage_manual_code
2463,blue,yellow (and not) green,0.304878
500,(circle and blue) or (triangle and green),not ((yellow or rectangle) and (green iff tria...,0.518519
1027,large or small,not medium,0.607595
1406,not (circle and blue),not (blue or circle),0.623377
1567,circle and blue,larger than itself,0.910256
2218,circle implies blue,not circle,0.929577
418,(circle or blue) or (triangle and green),not ((rectangle and green) or yellow),0.929577
72,circle and not blue,circle and yellow,0.958904
1189,circle and (not blue),circle and green,0.958904
809,small and blue,small and blue,1.0
