In [1]:
import pandas as pd
import os, sys
os.chdir("../")
sys.path.append('./src')
from tqdm import tqdm
from utils.get_concept_subsets import SUBSETS, READABLE

In [2]:
import json
from utils.get_concept_subsets import SUBSETS, READABLE

# Functions

In [3]:
from src.translate_explanations import evaluate_python_code, process_python_code
import numpy as np
import math
def check_rule_coverage(annotated_df, code_column='manual_code', only_eval_last_set=False):
    
    # annotated_df = annotated_df.dropna()
    hdf = pd.read_csv('./results/experiment_1/human_concept_stats_to_set25.csv')
    coverage = []
    full_coverage = []
    code_evaluations = []
    chance = []
    human_lq = []

    expression = annotated_df['rule_expression'].tolist()

    for i, row in tqdm(annotated_df.iterrows(), total=len(annotated_df)):
        concept_df = annotated_df[(annotated_df['concept_num'] == row['concept_num']) & (annotated_df['set'] <= 25)].sort_values(['set', 'item_num'])
        yesrate = concept_df['answer'].mean()
        chance.append((yesrate * yesrate) + ((1-yesrate) * (1-yesrate)))
        human_lq.append(hdf[hdf['concept'] == row['concept_num']]['last_quarter_mean'].iloc[0])

        if (row[code_column] not in ["" or "NA"] and not pd.isna(row[code_column])) and ((not only_eval_last_set) or (row['set'] == 25)):
            if "set" in row[code_column]: # FOL Logic Code
                code_evaluations.append(evaluate_python_code(row[code_column], row['object'], int(row['set']), row['concept_num']))
                previous_likelihood = np.array(np.array([
                        evaluate_python_code(row[code_column], backrow['object'], int(backrow['set']), backrow['concept_num']) for _, backrow in concept_df[concept_df['set'] < row['set']].iterrows()
                        ]) == concept_df[concept_df['set'] < row['set']].loc[:, 'answer'])
                
                future_likelihood =  np.array(np.array([
                    evaluate_python_code(row[code_column], backrow['object'], int(backrow['set']), backrow['concept_num']) for _, backrow in concept_df[concept_df['set'] >= row['set']].iterrows()
                    ]) == concept_df[concept_df['set'] >= row['set']].loc[:, 'answer'])
                
                if row['set'] == 0:
                    coverage.append("NA")
                else:
                    coverage.append(previous_likelihood.mean())
                
                full_coverage.append(np.concatenate((previous_likelihood, future_likelihood)).mean()) 
                            
            else: # Propositional logic code
                code_evaluations.append(evaluate_python_code(row[code_column], row['object']))
                previous_likelihood = np.array(np.array([
                        evaluate_python_code(row[code_column], backrow['object']) for _, backrow in concept_df[concept_df['set'] < row['set']].iterrows()
                        ]) == concept_df[concept_df['set'] < row['set']].loc[:, 'answer'])
                
                future_likelihood =  np.array(np.array([
                    evaluate_python_code(row[code_column], backrow['object']) for _, backrow in concept_df[concept_df['set'] >= row['set']].iterrows()
                    ]) == concept_df[concept_df['set'] >= row['set']].loc[:, 'answer'])
                
                if row['set'] == 0:
                    coverage.append("NA")
                else:
                    coverage.append(previous_likelihood.mean())
                
                full_coverage.append(np.concatenate((previous_likelihood, future_likelihood)).mean()) 
                
            if row['rule_expression'] == "":
                expression[i] = process_python_code(row[code_column])
        else:
            full_coverage.append(-1)
            coverage.append(-1)
            code_evaluations.append("NA")

    annotated_df['code_eval'] = code_evaluations
    annotated_df['rule_expression'] = expression
    annotated_df['human_lq'] = human_lq
    annotated_df['chance'] = chance
    annotated_df["coverage_" + code_column] = coverage
    annotated_df["full_coverage_" + code_column] = full_coverage
    annotated_df['consistency'] = annotated_df['code_eval'].astype(str).apply(lambda x: x.strip().lower()) == annotated_df['model_answer'].astype(str).apply(lambda x: x.strip().lower()) 

    return annotated_df

# Combine raw files

In [249]:
dfs = []
directories = ['./results/experiment_2/raw_results/gpt4_boolean_translated', 
              './results/experiment_2/raw_results/gpt4_fol_translated']

for directory in directories:
    for file in os.listdir(directory):
        if file.endswith('.csv'):
            df = pd.read_csv(os.path.join(directory, file), index_col=0)
            df['consistency'] = df['rule_evaluation'].astype(str) == df['model_answer'].astype(str)

            current_set = 0
            sets = []
            for i, row in df.iterrows():
                if df['model_reply'].iloc[i] != df['model_reply'].iloc[i-1]:
                    current_set += 1
                sets.append(current_set)
            df['set'] = sets

            df.to_csv(os.path.join(directory, file))
            dfs.append(df)

mdf = pd.concat(dfs)
print("Consistency on boolean rules from raw GPT3.5 output", mdf[(mdf['set'] >= 3) & (mdf['concept_num'].isin(SUBSETS['boolean']))]['consistency'].mean)
print("Consistency on FOL rules from raw GPT3.5 output", mdf[(mdf['set'] >= 3) & (~mdf['concept_num'].isin(SUBSETS['boolean']))]['consistency'].mean)
mdf.to_csv('./results/experiment_2/compiled_all_rules.csv')

Consistency on boolean rules from raw GPT3.5 output <bound method Series.mean of 7     True
8     True
9     True
10    True
11    True
      ... 
66    True
67    True
68    True
69    True
70    True
Name: consistency, Length: 2359, dtype: bool>
Consistency on FOL rules from raw GPT3.5 output <bound method Series.mean of 5     True
6     True
7     True
8     True
9     True
      ... 
69    True
70    True
71    True
72    True
73    True
Name: consistency, Length: 5451, dtype: bool>


## Break apart Rule Groups

In [None]:
df = pd.read_csv('./results/experiment_2/compiled_all_rules.csv')
df = df[df['concept_num'].isin(SUBSETS['boolean'])]
df.to_csv('./results/experiment_2/compiled_bool_rules.csv')

In [250]:
df = pd.read_csv('./results/experiment_2/compiled_all_rules.csv')
df = df[~df['concept_num'].isin(SUBSETS['boolean'])]
df.to_csv('./results/experiment_2/compiled_fol_rules.csv')

# Generate Stats for Manual annotations

## Boolean Rules

In [143]:
manual_rule_coverage = check_rule_coverage(
    pd.read_csv('./results/experiment_2/compiled_bool_rules_annotated.csv')
    ).sort_values(['coverage_manual_code'])
manual_rule_coverage.to_csv('./results/experiment_2/compiled_bool_rules_annotated_stats.csv')
manual_rule_coverage.loc[:, ['concept_num', 'coverage_manual_code', 'human_lq', 'concept', 'rule_expression']]

  coverage.append(previous_likelihood.mean())
  ret = ret.dtype.type(ret / rcount)
  coverage.append(previous_likelihood.mean())
100%|██████████| 2555/2555 [03:01<00:00, 14.08it/s]


Unnamed: 0,concept_num,coverage_manual_code,human_lq,concept,rule_expression
1880,hg78,-1.0,0.916509,blue implies circle,
1881,hg78,-1.0,0.916509,blue implies circle,
1504,hg22,-1.0,0.875598,large or medium,
1882,hg78,-1.0,0.916509,blue implies circle,
1883,hg78,-1.0,0.916509,blue implies circle,
...,...,...,...,...,...
2337,hg84,,0.886243,(circle or blue) or (triangle and green),small and red
2408,hg85,,0.918159,circle or (blue and triangle),large and blue and square
2475,hg86,,0.917184,circle or (triangle implies blue),large and blue
2476,hg86,,0.917184,circle or (triangle implies blue),large and blue


In [242]:
manual_rule_coverage = pd.read_csv('./results/experiment_2/compiled_bool_rules_annotated_stats.csv')

manual_rule_coverage['score'] = manual_rule_coverage['answer'] == manual_rule_coverage['model_answer']
acc_df = manual_rule_coverage.groupby(['concept']).apply(lambda x: x[x['item_num'] >= (3 * (max(x['item_num']) / 4))]['score'].mean()).reset_index()
print("Overall accuracy (answer)", manual_rule_coverage['score'].mean())
print("Last quarter accuracy (answer)", acc_df.loc[:, 0].mean())

manual_rule_coverage['score'] = manual_rule_coverage['answer'] == manual_rule_coverage['code_eval']
acc_df = manual_rule_coverage.groupby(['concept']).apply(lambda x: x[x['item_num'] >= (3 * (max(x['item_num']) / 4))]['score'].mean()).reset_index()
print("Overall accuracy (code eval)", manual_rule_coverage['score'].mean())
print("Last quarter accuracy (code eval)", acc_df.loc[:, 0].mean())

Overall accuracy (answer) 0.8583170254403131
Last quarter accuracy (answer) 0.9293528704449142
Overall accuracy (code eval) 0.8391389432485323
Last quarter accuracy (code eval) 0.9021950833793136


In [243]:
print("Consistency", manual_rule_coverage['consistency'].mean())
print("Coverage", manual_rule_coverage[manual_rule_coverage['coverage_manual_code'] >=0].drop_duplicates(['concept', 'set'])['coverage_manual_code'].mean())

mdf = manual_rule_coverage.drop_duplicates(['concept', 'set'])
print("Arrived at right rule on last set", (mdf[mdf['set'] == 25]['coverage_manual_code'] == 1).mean())
print("Arrived at right rule on any of last five", mdf[mdf['set'] >= 20].groupby('concept').apply(lambda x: (x['full_coverage_manual_code'] == 1).sum() > 0).reset_index().loc[:, 0].mean())
fdf = mdf[mdf['set'] == 25].sort_values('coverage_manual_code')
fdf['concept'] = fdf['concept_num'].apply(lambda x: READABLE[x])
fdf.loc[:, ['concept_num', 'concept',  'rule_expression', 'full_coverage_manual_code', 'chance']]

Consistency 0.963600782778865
Coverage 0.901008696498852
Arrived at right rule on last set 0.4411764705882353
Arrived at right rule on any of last five 0.7647058823529411


Unnamed: 0,concept_num,concept,rule_expression,full_coverage_manual_code,chance
42,hg15,not (circle xor blue),green or (triangle),0.532468,0.500084
137,hg25,blue or small,small and (triangle or rectangle),0.695652,0.508507
178,hg14,circle xor blue,(circle and ((large and green) or (medium and ...,0.714286,0.506531
200,hg16,circle xor (not blue),not (medium and blue) and not (large and blue),0.742424,0.504132
285,hg83,(circle and blue) or (triangle and green),(circle or triangle) and (blue or green) and n...,0.777778,0.593355
343,hg84,(circle or blue) or (triangle and green),blue or green,0.788732,0.512002
467,hg18,large,large and not circle,0.828571,0.514694
554,hg09,circle and blue,large and blue,0.846154,0.836621
621,hg21,large or small,large or (blue or green) and not (medium and b...,0.848101,0.634674
665,hg82,(circle or triangle) implies blue,blue or (large and yelllow) or ((medium or sma...,0.836066,0.501209


In [168]:
manual_rule_coverage.groupby(['concept']).mean('coverage_manual_code').reset_index().sort_values(['consistency'])

Unnamed: 0.1,concept,Unnamed: 0,item_num,answer,set,human_lq,chance,coverage_manual_code,full_coverage_manual_code,consistency,score
8,blue implies circle,1922.5,44.5,0.755556,12.411111,0.916509,0.630617,0.750018,0.734568,0.855556,0.788889
3,(not blue) implies (not circle),2092.5,36.5,0.743243,12.108108,0.914787,0.618335,0.848838,0.780131,0.891892,0.77027
22,circle xor blue,1031.5,34.5,0.442857,11.9,0.886243,0.506531,0.831882,0.776531,0.914286,0.728571
2,(circle or triangle) implies blue,2225.0,30.0,0.52459,13.393443,0.920387,0.501209,0.817978,0.708412,0.918033,0.803279
24,large or medium,1539.5,35.5,0.694444,13.625,0.875598,0.575617,0.848734,0.784336,0.930556,0.833333
25,large or small,1464.0,39.0,0.759494,13.101266,0.91,0.634674,0.845702,0.767986,0.936709,0.822785
9,blue implies size 1,2162.0,32.0,0.815385,12.430769,0.921569,0.698935,0.831146,0.746982,0.938462,0.753846
30,not (circle xor blue),1105.0,38.0,0.506494,13.87013,0.805263,0.500084,0.725892,0.664024,0.948052,0.688312
18,circle or (triangle implies blue),2514.5,39.5,0.7875,14.425,0.917184,0.665312,0.83833,0.789219,0.95,0.8
29,not (circle or blue),881.0,42.0,0.517647,13.211765,0.909091,0.500623,0.924296,0.846505,0.952941,0.847059


In [152]:
manual_rule_coverage.groupby('concept').apply(lambda x: (x['full_coverage_manual_code'] == 1).sum())

concept
(circle and blue) or (triangle and green)    10
(circle or blue) or  (triangle and green)     0
(circle or triangle) implies blue            12
(not blue) implies (not circle)               4
(not blue) implies circle                     6
False                                        73
True                                         24
blue                                         11
blue implies circle                           6
blue implies size 1                           0
blue or green                                31
blue or small                                 6
circle                                       34
circle and (not blue)                        27
circle and blue                              44
circle and not blue                           6
circle implies blue                           0
circle or (blue and triangle)                 6
circle or (triangle implies blue)             6
circle or blue                               32
circle or triangle              

In [179]:
mdf[mdf['concept_num'] == 'hg15'].sort_values(['item_num']).loc[:, ['concept_num', 'concept',  'rule_expression', 'coverage_manual_code', 'human_lq', 'model_reply', 'manual_code', ]]

Unnamed: 0,concept_num,concept,rule_expression,coverage_manual_code,human_lq,model_reply,manual_code
1064,hg15,not (circle xor blue),large and circle and blue,0.558442,0.805263,"Based on your examples, I've learned that the ...",def is_rule(obj):\n return obj.large and ob...
1066,hg15,not (circle xor blue),not (blue or green) and not (triangle or circle),0.571429,0.805263,"Based on your examples, I've learned that the ...",def is_rule(obj):\n return not (obj.blue or...
1071,hg15,not (circle xor blue),(medium and green and triangle) or (large and ...,0.662338,0.805263,"Based on your examples, I've learned that the ...",def is_rule(obj):\n return (obj.medium and ...
1072,hg15,not (circle xor blue),(green and triangle) or (large and circle),0.584416,0.805263,"Based on your examples, I've learned that the ...",def is_rule(obj):\n return (obj.green and o...
1073,hg15,not (circle xor blue),(green) or (blue and circle),0.701299,0.805263,"Based on your examples, I've learned that the ...",def is_rule(obj):\n return (obj.green) or (...
1078,hg15,not (circle xor blue),(green and triangle) or (blue and circle),0.74026,0.805263,"Based on your examples, I've learned that the ...",def is_rule(obj):\n return (obj.green and o...
1079,hg15,not (circle xor blue),(triangle and green) or (circle and blue) or (...,0.831169,0.805263,"Based on your examples, I've learned that the ...",def is_rule(obj):\n return (obj.triangle an...
1080,hg15,not (circle xor blue),(green and triangle) or (blue and circle),0.74026,0.805263,"Based on your examples, I've learned that the ...",def is_rule(obj):\n return (obj.green and o...
1082,hg15,not (circle xor blue),(green and triangle) or (blue and circle) or (...,0.831169,0.805263,"Based on your examples, I've learned that the ...",def is_rule(obj):\n return (obj.green and o...
1085,hg15,not (circle xor blue),(green and triangle) or (blue and circle) or (...,0.701299,0.805263,"Based on your examples, I've learned that the ...",def is_rule(obj):\n return (obj.green and o...


In [192]:
human_lq = manual_rule_coverage.sort_values(['concept_num', 'item_num']).drop_duplicates(['concept'])['human_lq'].tolist()
concept = manual_rule_coverage.sort_values(['concept_num', 'item_num']).drop_duplicates(['concept'])['concept'].tolist()
lq_coverage = manual_rule_coverage.sort_values(['concept_num', 'item_num']).groupby(['concept']).apply(lambda x: x[x['item_num'] >= (3 * (max(x['item_num']) / 4))]['coverage_manual_code'].mean()).reset_index().loc[:, 0].tolist()
gdf = pd.DataFrame()
gdf['human_lq'] = human_lq
gdf['concept'] = concept
gdf['lq_coverage'] = lq_coverage
gdf

##  FOL Rules

In [4]:
import numpy as np
np.random.choice(SUBSETS['fol'], 16)
"""
# array(['hg88', 'hg87', 'hg46', 'hg89', 'hg94', 'hg44', 'hg73', 'hg75',
       'hg62', 'hg29', 'hg31', 'hg55', 'hg44', 'hg72', 'hg61', 'hg64'],
      dtype='<U5')
"""

array(['hg88', 'hg87', 'hg46', 'hg89', 'hg94', 'hg44', 'hg73', 'hg75',
       'hg62', 'hg29', 'hg31', 'hg55', 'hg44', 'hg72', 'hg61', 'hg64'],
      dtype='<U5')

In [265]:
manual_rule_coverage = pd.read_csv('./results/experiment_2/compiled_fol_rules.csv')

manual_rule_coverage = manual_rule_coverage.dropna()
manual_rule_coverage['score'] = manual_rule_coverage['answer'].astype(str).apply(lambda x: x.lower().strip()) == manual_rule_coverage['model_answer'].astype(str).apply(lambda x: x.lower().strip())
acc_df = manual_rule_coverage.groupby(['concept']).apply(lambda x: x[x['item_num'] >= (3 * (max(x['item_num']) / 4))]['score'].mean()).reset_index()
print("Overall accuracy (answer)", manual_rule_coverage['score'].mean())
print("Last quarter accuracy (answer)", acc_df.loc[:, 0].mean())


Overall accuracy (answer) 0.7016693591814755
Last quarter accuracy (answer) 0.6868402326968103


In [261]:
manual_rule_coverage = pd.read_csv('./results/experiment_2/PTG16_model_fol_rules.csv')

manual_rule_coverage = manual_rule_coverage.dropna()
manual_rule_coverage['score'] = manual_rule_coverage['answer'].astype(bool).astype(str).apply(lambda x: x.lower().strip()) == manual_rule_coverage['model_answer'].astype(str).apply(lambda x: x.lower().strip())
acc_df = manual_rule_coverage.groupby(['concept']).apply(lambda x: x[x['item_num'] >= (3 * (max(x['item_num']) / 4))]['score'].mean()).reset_index()
print("Overall accuracy (answer)", manual_rule_coverage['score'].mean())
print("Last quarter accuracy (answer)", acc_df.loc[:, 0].mean())


Overall accuracy (answer) 0.8167654530059272
Last quarter accuracy (answer) 0.8824948617275646


### FOL Annotated Subset

In [157]:
manual_rule_coverage = check_rule_coverage(
    pd.read_csv('./results/experiment_2/compiled_fol_rules_annotated_subset.csv')
    ).sort_values(['coverage_manual_code'])
manual_rule_coverage = manual_rule_coverage[manual_rule_coverage['to_annotate']]
manual_rule_coverage.loc[:, ['concept_num', 'coverage_manual_code', 'human_lq', 'concept', 'rule_expression']]
manual_rule_coverage.to_csv('./results/experiment_2/compiled_fol_rules_annotated_subset_stats.csv')

  coverage.append(previous_likelihood.mean())
  ret = ret.dtype.type(ret / rcount)
100%|██████████| 1091/1091 [06:32<00:00,  2.78it/s]


In [181]:
manual_rule_coverage = pd.read_csv('./results/experiment_2/compiled_fol_rules_annotated_subset_stats.csv')
mdf = manual_rule_coverage.drop_duplicates(['concept', 'set'])
print("Average Coverage (excluding first sets)", manual_rule_coverage[manual_rule_coverage['coverage_manual_code'] >= 0]['coverage_manual_code'].mean())
print("Average Consistency (excluding no-replies)", manual_rule_coverage.dropna()['consistency'].mean())
print("Arrived at right rule on last set", (mdf[mdf['set'] == 25]['coverage_manual_code'] == 1).mean())
print("Arrived at right rule on any of last five", mdf[mdf['set'] >= 20].groupby('concept').apply(lambda x: (x['full_coverage_manual_code'] == 1).sum() > 0).reset_index().loc[:, 0].mean())
fdf = mdf[mdf['set'] == 25].sort_values('coverage_manual_code')
fdf.loc[:, ['concept_num', 'concept',  'rule_expression', 'coverage_manual_code', 'chance', 'model_reply', 'manual_code', ]]

Average Coverage (excluding first sets) 0.7658433538207211
Average Consistency (excluding no-replies) 0.9554695062923524
Arrived at right rule on last set 0.0
Arrived at right rule on any of last five 0.0


Unnamed: 0,concept_num,concept,rule_expression,coverage_manual_code,chance,model_reply,manual_code
23,hg26,one of the largest or smallest,(large and green) or (large and circle) or (gr...,0.421053,0.795858,"Based on your examples, I've learned that the ...",def is_rule(obj):\n return (obj.large and o...
69,hg103,exactly one blue object in the set,(triangle or rectangle) and (small or medium),0.557143,0.549634,"Based on your examples, I've learned that the ...",def is_rule(obj):\n return (obj.triangle or...
227,hg98,the unique smallest object,(small and not yellow) or (medium and blue) or...,0.671875,0.659735,"Based on your examples, I've learned that the ...",def is_rule(obj):\n return (obj.small and n...
237,hg71,does not exist another object with same shape ...,small or green or rectangle,0.675676,0.739645,"Based on your examples, I've learned that the ...",def is_rule(obj):\n return obj.small or obj...
271,hg17,there is a triangle in the set,not(small or blue) or ((small and yellow and t...,0.688312,0.604329,"Based on your examples, I've learned that the ...",def is_rule(obj):\n return not(obj.small or...
275,hg96,the unique object,rectangle,0.690141,0.875556,"Based on your examples, I've learned that the ...",def is_rule(obj):\n return obj.rectangle
296,hg35,larger than some other object,(large and blue) or (medium and yellow) or ((t...,0.698413,0.500433,"Based on your examples, I've learned that the ...",def is_rule(obj):\n return (obj.large and o...
367,hg106,exactly one blue object that is the same shape...,(large and green and rectangle) or (small and ...,0.723684,0.673899,"Based on your examples, I've learned that the ...",def is_rule(obj):\n return (obj.large and o...
420,hg108,all other objects of the same shape are blue,(green and not rectangle) or blue or (yellow a...,0.737705,0.53719,"Based on your examples, I've learned that the ...",def is_rule(obj):\n return (obj.green and n...
443,hg105,exactly one blue object that is the same shape,blue or rectangle,0.742857,0.500386,"Based on your examples, I've learned that the ...",def is_rule(obj):\n return obj.blue or obj....


In [None]:
manual_rule_coverage[manual_rule_coverage['coverage_manual_code'] >= 0].loc[:, ['concept', 'coverage_manual_code']].groupby(['concept']).mean()

In [269]:
manual_rule_coverage = pd.read_csv('./results/experiment_2/compiled_fol_rules_annotated_subset_stats.csv')
mdf = manual_rule_coverage.drop_duplicates(['concept', 'set']).dropna()
mdf['uses_fol_logic'] = mdf['manual_code'].str.contains('set')
print(mdf['uses_fol_logic'].mean())
mdf[mdf['uses_fol_logic']].loc[:, ['concept', 'rule_expression', 'set']]

0.0446927374301676


Unnamed: 0,concept,rule_expression,set
7,one of the largest or smallest,another object in the group shares two features,6
10,the unique smallest object,smallest of color and shape in all objects or ...,24
11,the unique smallest object,smallest of color and shape in all objects,23
16,exactly one blue object in the set,object has not appeared as true before,21
29,exactly one blue object in the set,object with same size and color has been previ...,17
49,all other objects of the same shape are blue,object with same size and shape has been previ...,16
76,exactly one blue object in the set,unique size in group,10
78,the unique smallest object,(smallest of its shape in the group) and (shap...,16
99,all other objects of the same shape are blue,unique shape and color,10
120,the unique smallest object,(not yellow) and (unique shape and color),21


###  FOL Annotated Last Sets

In [199]:
manual_rule_coverage = check_rule_coverage(
    pd.read_csv('./results/experiment_2/compiled_fol_rules_annotated_last_sets.csv'), only_eval_last_set=True)
manual_rule_coverage = manual_rule_coverage[manual_rule_coverage['to_annotate']]
manual_rule_coverage.to_csv('./results/experiment_2/compiled_fol_rules_annotated_last_sets_stats.csv')
manual_rule_coverage.loc[:, ['concept_num', 'coverage_manual_code', 'human_lq', 'concept', 'rule_expression']]

100%|██████████| 5905/5905 [02:09<00:00, 45.48it/s]  


Unnamed: 0,concept_num,coverage_manual_code,human_lq,concept,rule_expression
69,hg100,0.579710,0.838624,one of the smallest of objects who share a sha...,(small) or (yellow and triangle)
139,hg101,0.652174,0.666667,same shape as a uniquely smallest object,small or ((medium and yellow and not rectangle...
216,hg102,0.394737,0.610417,exactly one object in the set is blue excludin...,large or (medium and triangle) or (small and (...
217,hg102,0.394737,0.610417,exactly one object in the set is blue excludin...,large or (medium and triangle) or (small and (...
218,hg102,0.394737,0.610417,exactly one object in the set is blue excludin...,large or (medium and triangle) or (small and (...
...,...,...,...,...,...
5836,hg98,0.484375,0.801932,the unique smallest object,small or ((medium and yellow and not rectangle...
5837,hg98,0.296875,0.801932,the unique smallest object,large or (medium and triangle) or (small and (...
5838,hg98,0.687500,0.801932,the unique smallest object,small and not yellow
5839,hg98,0.296875,0.801932,the unique smallest object,large or (medium and triangle) or (small and (...


In [203]:
manual_rule_coverage = pd.read_csv('./results/experiment_2/compiled_fol_rules_annotated_last_sets_stats.csv')
mdf = manual_rule_coverage.drop_duplicates(['concept', 'set'])
print("Arrived at right rule on last set", (mdf[mdf['set'] == 25]['full_coverage_manual_code'] == 1).mean())
fdf = mdf[mdf['set'] == 25].sort_values('coverage_manual_code')
fdf.loc[:, ['concept_num', 'concept',  'rule_expression', 'full_coverage_manual_code', 'chance', 'model_answer', 'manual_code', ]].to_csv('./results/experiment_2/gpt4_fol_final_rules.csv')
fdf.loc[:, ['concept_num', 'concept',  'rule_expression', 'full_coverage_manual_code', 'chance', 'model_answer', 'manual_code', ]]

Arrived at right rule on last set 0.0


Unnamed: 0,concept_num,concept,rule_expression,full_coverage_manual_code,chance,model_answer,manual_code
108,hg47,(same shape as one of the largest) or blue,medium and (same color as last `True’ object) ...,0.207317,0.685901,False,def is_rule(obj):\n return obj.medium and (...
224,hg91,same shape as an object that is blue or green,small and blue and (rectangle or triangle),0.244444,0.785432,True,def is_rule(obj):\n return obj.small and ob...
187,hg71,does not exist another object with same shape ...,(triangle or rectangle) and (blue or yellow) a...,0.269231,0.739645,False,def is_rule(obj):\n return (obj.triangle or...
251,hg99,one of the smallest,large or (medium and triangle) or (small and (...,0.353846,0.501065,False,def is_rule(obj):\n return obj.large or (ob...
177,hg68,(circle) or (same color as all other objects o...,(blue and not (circle or rectangle)) or (recta...,0.384615,0.521039,False,def is_rule(obj):\n return (obj.blue and no...
...,...,...,...,...,...,...,...
84,hg39,same shape as a blue or green object,(blue or green) or (yellow and not ((large and...,0.894737,0.771468,False,def is_rule(obj):\n return (obj.blue or obj...
66,hg34,blue or (larger or equal in size to all other ...,large or blue,0.913043,0.555556,False,def is_rule(obj):\n return obj.large or obj...
129,hg52,(blue or green) and no other object is blue or...,small and blue and circle,0.962025,0.950649,False,def is_rule(obj):\n return obj.small and ob...
135,hg54,(blue or circle) and no other object is (blue ...,small and blue and triangle,0.977778,0.956543,False,def is_rule(obj):\n return obj.small and ob...


## PTG16 Model

## Compiling files

In [255]:
def convert_bayesian_model_file():
    """
    hg13L2	2	1	0	11	13	0.854544341564178466796875	0.26362450824922234460245817899704	"λx.apply1(apply2(size-gt,x.o),x.o)
    """
    df_dict = {k: [] for k in ['concept', 'is_bool', 'item_num', 'set_num', 'answer', 'hyes', 'hno', 'p(yes)', 'lambda_rule']}
    with open('./data/PTG16_model.txt', 'r') as f:
        for line in f.readlines():
            concept, item_num, set_num, answer, hyes, hno, alpha, posterior, rule = line.split("\t")
            df_dict['concept'].append(concept[:-2])
            df_dict['is_bool'].append(concept[:-2] in SUBSETS['boolean'])
            df_dict['item_num'].append(int(item_num))
            df_dict['set_num'].append(int(set_num) + 1)
            df_dict['answer'].append(answer)
            df_dict['hyes'].append(hyes)
            df_dict['hno'].append(hno)
            df_dict['p(yes)'].append(float(posterior))
            df_dict['lambda_rule'].append(rule)

    ptg16_df = pd.DataFrame.from_dict(df_dict)
    return ptg16_df

In [256]:
from src.utils.preprocess import format_shape

def fill_in_bayesian_model_df(df):
    with open('./data/labels_to_data.json', 'r') as f:
        data = json.load(f)
    
    model_answer = []
    df = df.sort_values(['concept', 'item_num'])

    # def function(x):
    #     print(x['concept'], x['set_num'], x['item_num'])
    #     print(data[x['concept']]['L2']['sets'][x['set_num']][x['item_num']])
    #     return data[x['concept']]['L2']['sets'][x['set_num']][x['item_num']]
    # df['object'] = df.apply(lambda x: data[x['concept']]['L2']['sets'][x['set_num']][x['item_num']], axis=1)
    # df['object'] = df.apply(lambda x: function(x), axis=1)
    
    # for group in df.groupby(['concept']):
    #     concept = group[0][0]
    #     object += [format_shape(i) for s in data[concept]['L2']['sets'][:25] for i in s]
    #     print(concept, [format_shape(i) for s in data[concept]['L2']['sets'][:25] for i in s])
    #     data_answer += [i for s in data[concept]['L2']['answers'][:25] for i in s]

    object = []
    for concept in df['concept'].unique():
        object += [format_shape(i) for s in data[concept]['L2']['sets'][:25] for i in s]

    model_answer = df.apply(lambda x: x['p(yes)'] > 0.5, axis=1)
    concept_names = df['concept'].apply(lambda x: READABLE[x])
    df['object'] = object
    df['concept_num'] = df['concept']
    df['set'] = df['set_num']
    df['concept'] = concept_names
    df['model_answer'] = model_answer
    return df

In [163]:
fdf = fill_in_bayesian_model_df(convert_bayesian_model_file())
fdf[fdf['is_bool']].to_csv('./results/experiment_2/PTG16_model_bool_rules.csv')

In [257]:
bdf = fill_in_bayesian_model_df(convert_bayesian_model_file())
with open('./results/experiment_2/fol_subset.txt', 'r') as f:
    subset = [x.strip() for x in f.readlines()]
    bdf['to_annotate'] = bdf['concept_num'].isin(subset)

bdf[~bdf['is_bool']].to_csv('./results/experiment_2/PTG16_model_fol_rules.csv')

In [20]:
fdf = pd.read_csv('./results/experiment_2/PTG16_model_bool_rules.csv')
fdf['model_answer'] = fdf['p(yes)'] > 0.5
fdf['score'] = fdf['model_answer'].astype(str) == fdf['answer'].astype(bool).astype(str)

lq_accuracies = []
for gdf in fdf.groupby(['concept']):
    concept = gdf[0][0]
    cdf = gdf[1]
    lq_start = 3 * math.floor(cdf['item_num'].max() / 4)
    lq_accuracies.append(cdf[cdf['item_num'] >= lq_start]['score'].mean())

print("Mean Accuracy and Standard Deviation for pLoT model, Boolean Rules", np.mean(lq_accuracies), np.std(lq_accuracies))

(0.9940242763772176, 0.020393105938247294)

In [23]:
fdf = pd.read_csv('./results/experiment_2/PTG16_model_fol_rules.csv')
fdf['model_answer'] = fdf['p(yes)'] > 0.5
fdf['score'] = fdf['model_answer'].astype(str) == fdf['answer'].astype(bool).astype(str)

lq_accuracies = []
for gdf in fdf.groupby(['concept']):
    concept = gdf[0][0]
    cdf = gdf[1]
    lq_start = 3 * math.floor(cdf['item_num'].max() / 4)
    lq_accuracies.append(cdf[cdf['item_num'] >= lq_start]['score'].mean())

print("Mean Accuracy and Standard Deviation for pLoT model, FOL Rules", np.mean(lq_accuracies), np.std(lq_accuracies))

(0.8810646297928431, 0.11557895196620963)

## Adding stats to annotations

## Boolean Rules

In [230]:
df = pd.read_csv('./results/experiment_2/PTG16_model_bool_rules_annotated.csv')
df['concept_num'] = df['concept']
df['set'] = df['set_num'] + 1
manual_rule_coverage = check_rule_coverage(df)
manual_rule_coverage.to_csv('./results/experiment_2/PTG16_model_bool_rules_annotated_stats.csv')

  0%|          | 0/2555 [00:00<?, ?it/s]

  coverage.append(previous_likelihood.mean())
  ret = ret.dtype.type(ret / rcount)
100%|██████████| 2555/2555 [00:07<00:00, 321.00it/s]


In [239]:
manual_rule_coverage = pd.read_csv('./results/experiment_2/PTG16_model_bool_rules_annotated_stats.csv')
print("Consistency", manual_rule_coverage['consistency'].mean())
print("Coverage", manual_rule_coverage['coverage_manual_code'].mean())
print("Last object set rate", (manual_rule_coverage[manual_rule_coverage['set'] == 25].drop_duplicates('concept').loc[:, ['concept', 'rule_expression', 'full_coverage_manual_code']] == 1)['full_coverage_manual_code'].mean())
manual_rule_coverage[manual_rule_coverage['set'] == 25].drop_duplicates('concept').loc[:, ['concept', 'rule_expression', 'full_coverage_manual_code']]

Consistency 0.9021526418786693
Coverage 0.9566204658010048
Last object set rate 0.8235294117647058


Unnamed: 0,concept,rule_expression,full_coverage_manual_code
61,hg01,same color as itself,1.0
139,hg02,larger than itself,1.0
220,hg03,not yellow and not green,1.0
289,hg04,circle,1.0
379,hg05,not circle,1.0
456,hg06,circle or blue,1.0
536,hg07,not rectangle,1.0
610,hg08,not yellow,1.0
684,hg09,larger than itself,0.910256
758,hg10,circle and green,0.958904


## FOL Rules

In [204]:
df = pd.read_csv('./results/experiment_2/PTG16_model_fol_rules_annotated_subset.csv')
manual_rule_coverage = check_rule_coverage(df)

  0%|          | 0/1169 [00:00<?, ?it/s]

  coverage.append(previous_likelihood.mean())
  ret = ret.dtype.type(ret / rcount)
100%|██████████| 1169/1169 [42:28<00:00,  2.18s/it] 


In [206]:
manual_rule_coverage.to_csv('./results/experiment_2/PTG16_model_fol_rules_annotated_subset_stats.csv')

In [246]:
manual_rule_coverage = pd.read_csv('./results/experiment_2/PTG16_model_fol_rules_annotated_subset_stats.csv') 
print("Consistency", manual_rule_coverage['consistency'].mean())
print("Coverage", manual_rule_coverage['coverage_manual_code'].mean())
print("Last object set rate", (manual_rule_coverage[manual_rule_coverage['set'] == 25].drop_duplicates('concept').loc[:, ['concept', 'rule_expression', 'full_coverage_manual_code']] == 1)['full_coverage_manual_code'].mean())
manual_rule_coverage[manual_rule_coverage['set'] == 24].drop_duplicates('concept').loc[:, ['concept', 'rule_expression', 'full_coverage_manual_code']]

Consistency 0.8964927288280582
Coverage 0.8890269135058918
Last object set rate 0.26666666666666666


Unnamed: 0,concept,rule_expression,full_coverage_manual_code
68,blue and same shape as another object,larger than itself,0.842105
148,larger than all other objects,strictly larger than all other objects,1.0
225,same shape as another object that is blue,larger than itself,0.802632
304,exists another object of the same shape that i...,there is a blue object of the same shape that ...,1.0
386,no other object has the same shape,no other object has the same shape,1.0
467,the unique element and is (blue or green),larger than itself,0.925926
542,there is another object of the same shape<br>t...,there is an object of the same shape that is n...,0.888889
618,same shape as the unique largest,no circle in the set,0.833333
695,unique largest blue object,large and blue,0.897436
778,(same shape as one of the largest) and blue,(not green) and (not yellow),0.918605


## Last sets

In [247]:
df = pd.read_csv('./results/experiment_2/PTG16_model_fol_rules_annotated_last_sets.csv')
manual_rule_coverage = check_rule_coverage(df, only_eval_last_set=True)
manual_rule_coverage = manual_rule_coverage[manual_rule_coverage['set'] == 25]
manual_rule_coverage.to_csv('./results/experiment_2/PTG16_fol_rules_annotated_last_sets_stats.csv')
manual_rule_coverage.loc[:, ['concept_num', 'coverage_manual_code', 'human_lq', 'concept', 'rule_expression']]

100%|██████████| 5905/5905 [17:53<00:00,  5.50it/s]  


Unnamed: 0,concept_num,coverage_manual_code,human_lq,concept,rule_expression
0,hg64,1.000000,0.727891,no other object has the same shape,same color as (the unique object with the same...
1,hg64,1.000000,0.727891,no other object has the same shape,same color as (the unique object with the same...
2,hg64,1.000000,0.727891,no other object has the same shape,same color as (the unique object with the same...
3,hg64,1.000000,0.727891,no other object has the same shape,same color as (the unique object with the same...
4,hg98,0.640625,0.801932,the unique smallest object,strictly larger than all other objects
...,...,...,...,...,...
247,hg60,1.000000,0.824561,same size as another object,there is an object of the same size that is ((...
248,hg17,0.571429,0.829932,there is a triangle in the set,triangle
249,hg17,0.571429,0.829932,there is a triangle in the set,triangle
250,hg17,0.571429,0.829932,there is a triangle in the set,triangle


In [248]:
manual_rule_coverage = pd.read_csv('./results/experiment_2/PTG16_fol_rules_annotated_last_sets_stats.csv') 
print("Consistency", manual_rule_coverage['consistency'].mean())
print("Coverage", manual_rule_coverage['coverage_manual_code'].mean())
print("Last object set rate", (manual_rule_coverage[manual_rule_coverage['set'] == 25].drop_duplicates('concept').loc[:, ['concept', 'rule_expression', 'full_coverage_manual_code']] == 1)['full_coverage_manual_code'].mean())
manual_rule_coverage[manual_rule_coverage['set'] == 25].drop_duplicates('concept').loc[:, ['concept', 'concept_num', 'rule_expression', 'full_coverage_manual_code']].to_csv('./results/experiment_2/ptg16_fol_final_rules.csv')
manual_rule_coverage[manual_rule_coverage['set'] == 25].drop_duplicates('concept').loc[:, ['concept', 'concept_num', 'rule_expression', 'full_coverage_manual_code']]

Consistency 0.9285714285714286
Coverage 0.8631193383189815
Last object set rate 0.22666666666666666


Unnamed: 0,concept,concept_num,rule_expression,full_coverage_manual_code
0,no other object has the same shape,hg64,same color as (the unique object with the same...,1.000000
4,the unique smallest object,hg98,strictly larger than all other objects,0.637681
9,larger than all other objects,hg29,strictly larger than all other objects,1.000000
12,does not exist another object with same shape ...,hg71,same color as (the unique object with the same...,1.000000
16,one of the largest or smallest,hg26,same color as itself,0.884615
...,...,...,...,...
237,there is another object of the same shape<br>a...,hg74,there is an object of the same shape that is n...,0.782609
242,there is another object of the same shape<br>t...,hg75,there is an object of the same shape that is n...,0.888889
244,same shape as another object that is blue or g...,hg90,there is an object of the same shape that is n...,0.888889
246,same size as another object,hg60,there is an object of the same size that is ((...,1.000000
