# Evaluate the results

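Problem: a correct answer can be phrased in many ways, and it is not easy to enumerate all of them. The checks below only test whether the expected word occurs as a substring of the model's answer, so some of them are a bit shaky; we also check the answers by hand to make sure the counts capture the right gist.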

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

## Toy problem 3 colors zero shot

In [2]:
# Load the zero-shot answers for the three-color toy problem and show them.
df_toy_problem_3c_results = pd.read_csv(
    filepath_or_buffer="data/toy_problem_3_results/toy_problem_3c_results.csv",
)
df_toy_problem_3c_results

Unnamed: 0,sequence,switched,first_color,second_color,final_color,answers_start_chain,answers_second_chain,answers_into_hole
0,The blue ball hit the red ball. The red ball h...,False,blue,red,green,The blue ball.,The green ball.,The green ball.
1,The red ball hit the green ball. The blue ball...,True,blue,red,green,The blue ball.,The blue ball.,The green ball.
2,The blue ball hit the red ball. The red ball h...,False,blue,red,brown,The blue ball.,The blue ball.,The brown ball.
3,The red ball hit the brown ball. The blue ball...,True,blue,red,brown,The blue ball.,The blue ball.,The brown ball.
4,The blue ball hit the red ball. The red ball h...,False,blue,red,purple,The blue ball.,The blue ball.,The purple ball.
...,...,...,...,...,...,...,...,...
65,The purple ball hit the white ball. The brown ...,True,brown,purple,white,The purple ball.,The brown ball.,The white ball.
66,The brown ball hit the black ball. The black b...,False,brown,black,white,The brown ball.,The black ball.,The white ball.
67,The black ball hit the white ball. The brown b...,True,brown,black,white,The black ball.,The black ball.,The white ball.
68,The purple ball hit the black ball. The black ...,False,purple,black,white,The purple ball.,The black ball.,The white ball.


In [3]:
# Partition the zero-shot color results on the `switched` flag.
df_toy_problem_3c_results_switched = df_toy_problem_3c_results.loc[
    df_toy_problem_3c_results["switched"] == True
]
df_toy_problem_3c_results_non_switched = df_toy_problem_3c_results.loc[
    df_toy_problem_3c_results["switched"] == False
]
df_toy_problem_3c_results_non_switched.head(2)

Unnamed: 0,sequence,switched,first_color,second_color,final_color,answers_start_chain,answers_second_chain,answers_into_hole
0,The blue ball hit the red ball. The red ball h...,False,blue,red,green,The blue ball.,The green ball.,The green ball.
2,The blue ball hit the red ball. The red ball h...,False,blue,red,brown,The blue ball.,The blue ball.,The brown ball.


In [4]:
def eval_first_color(df):
    """Mark each row whose start-of-chain answer mentions the expected first color.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``first_color`` (expected color) and
        ``answers_start_chain`` (free-text answer).

    Returns
    -------
    list of bool
        One entry per row, in row order. ``True`` when the expected color
        occurs as a case-sensitive substring of the answer.
    """
    expected = df["first_color"].values
    answers = df["answers_start_chain"].values
    # An empty CSV cell is read back as NaN (a float); `in` would raise
    # TypeError on it, so any non-string cell is scored as incorrect.
    return [
        isinstance(color, str) and isinstance(answer, str) and color in answer
        for color, answer in zip(expected, answers)
    ]

# Number of correct first-color answers: all rows, switched, non-switched.
for results_subset in (
    df_toy_problem_3c_results,
    df_toy_problem_3c_results_switched,
    df_toy_problem_3c_results_non_switched,
):
    print(np.sum(eval_first_color(results_subset)))

56
21
35


In [5]:
def eval_second_color(df):
    """Mark each row whose second-chain answer mentions the expected second color.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``second_color`` (expected color) and
        ``answers_second_chain`` (free-text answer).

    Returns
    -------
    list of bool
        One entry per row, in row order. ``True`` when the expected color
        occurs as a case-sensitive substring of the answer.
    """
    expected = df["second_color"].values
    answers = df["answers_second_chain"].values
    # An empty CSV cell is read back as NaN (a float); `in` would raise
    # TypeError on it, so any non-string cell is scored as incorrect.
    return [
        isinstance(color, str) and isinstance(answer, str) and color in answer
        for color, answer in zip(expected, answers)
    ]

# Number of correct second-color answers: all rows, switched, non-switched.
for results_subset in (
    df_toy_problem_3c_results,
    df_toy_problem_3c_results_switched,
    df_toy_problem_3c_results_non_switched,
):
    print(np.sum(eval_second_color(results_subset)))

24
2
22


In [6]:
def eval_final_color(df):
    """Mark each row whose into-hole answer mentions the expected final color.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``final_color`` (expected color) and
        ``answers_into_hole`` (free-text answer).

    Returns
    -------
    list of bool
        One entry per row, in row order. ``True`` when the expected color
        occurs as a case-sensitive substring of the answer.
    """
    expected = df["final_color"].values
    answers = df["answers_into_hole"].values
    # An empty CSV cell is read back as NaN (a float); `in` would raise
    # TypeError on it, so any non-string cell is scored as incorrect.
    return [
        isinstance(color, str) and isinstance(answer, str) and color in answer
        for color, answer in zip(expected, answers)
    ]

# Number of correct final-color answers: all rows, switched, non-switched.
for results_subset in (
    df_toy_problem_3c_results,
    df_toy_problem_3c_results_switched,
    df_toy_problem_3c_results_non_switched,
):
    print(np.sum(eval_final_color(results_subset)))

70
35
35


## Toy problem 3 colors one shot

In [19]:
# Load the one-shot answers for the three-color toy problem and show them.
df_toy_problem_3c_os_results = pd.read_csv(
    filepath_or_buffer="data/toy_problem_3_results/toy_problem_3c_results_one_shot.csv",
)
df_toy_problem_3c_os_results

Unnamed: 0,sequence,switched,first_color,second_color,final_color,answers_start_chain,answers_second_chain,answers_into_hole
0,The blue ball hit the red ball. The red ball h...,False,blue,red,green,The blue ball.,The red ball.,The green ball.
1,The red ball hit the green ball. The blue ball...,True,blue,red,green,The blue ball.,The green ball.,The green ball.
2,The blue ball hit the red ball. The red ball h...,False,blue,red,brown,The blue ball.,The brown ball.,The brown ball.
3,The red ball hit the brown ball. The blue ball...,True,blue,red,brown,The blue ball.,The blue ball.,The brown ball.
4,The blue ball hit the red ball. The red ball h...,False,blue,red,purple,The blue ball.,The purple ball.,The purple ball.
...,...,...,...,...,...,...,...,...
65,The purple ball hit the white ball. The brown ...,True,brown,purple,white,The purple ball.,The purple ball.,The white ball.
66,The brown ball hit the black ball. The black b...,False,brown,black,white,The brown ball.,The black ball.,The white ball.
67,The black ball hit the white ball. The brown b...,True,brown,black,white,The black ball.,The brown ball.,The white ball.
68,The purple ball hit the black ball. The black ...,False,purple,black,white,The purple ball.,The black ball.,The white ball.


In [20]:
# Partition the one-shot color results on the `switched` flag.
df_toy_problem_3c_os_results_switched = df_toy_problem_3c_os_results.loc[
    df_toy_problem_3c_os_results["switched"] == True
]
df_toy_problem_3c_os_results_non_switched = df_toy_problem_3c_os_results.loc[
    df_toy_problem_3c_os_results["switched"] == False
]
df_toy_problem_3c_os_results_non_switched.head(2)

Unnamed: 0,sequence,switched,first_color,second_color,final_color,answers_start_chain,answers_second_chain,answers_into_hole
0,The blue ball hit the red ball. The red ball h...,False,blue,red,green,The blue ball.,The red ball.,The green ball.
2,The blue ball hit the red ball. The red ball h...,False,blue,red,brown,The blue ball.,The brown ball.,The brown ball.


In [21]:
# Number of correct first-color answers (one shot): all rows, switched, non-switched.
for results_subset in (
    df_toy_problem_3c_os_results,
    df_toy_problem_3c_os_results_switched,
    df_toy_problem_3c_os_results_non_switched,
):
    print(np.sum(eval_first_color(results_subset)))

49
14
35


In [22]:
# Number of correct second-color answers (one shot): all rows, switched, non-switched.
for results_subset in (
    df_toy_problem_3c_os_results,
    df_toy_problem_3c_os_results_switched,
    df_toy_problem_3c_os_results_non_switched,
):
    print(np.sum(eval_second_color(results_subset)))

22
2
20


In [23]:
# Number of correct final-color answers (one shot): all rows, switched, non-switched.
for results_subset in (
    df_toy_problem_3c_os_results,
    df_toy_problem_3c_os_results_switched,
    df_toy_problem_3c_os_results_non_switched,
):
    print(np.sum(eval_final_color(results_subset)))

70
35
35


## Toy problem nonsense words zero shot

In [9]:
# Load the zero-shot answers for the nonsense-word toy problem and show them.
df_toy_problem_3n_results = pd.read_csv(
    filepath_or_buffer="data/toy_problem_3_results/toy_problem_3nonsense_results.csv",
)
df_toy_problem_3n_results

Unnamed: 0,sequence,switched,first_word,second_word,final_word,answers_start_chain,answers_second_chain,answers_into_hole
0,The baz hit the fuu. The fuu hit the schleep. ...,False,baz,fuu,schleep,The baz.,The baz.,The schleep
1,The fuu hit the schleep. The baz hit the fuu. ...,True,baz,fuu,schleep,The baz hit the fuu.,The baz,The schleep
2,The baz hit the fuu. The fuu hit the blubb. Th...,False,baz,fuu,blubb,The baz.,The blubb.,The blubb
3,The fuu hit the blubb. The baz hit the fuu. Th...,True,baz,fuu,blubb,The baz hit the fuu.,The baz,The blubb
4,The baz hit the fuu. The fuu hit the bla. The ...,False,baz,fuu,bla,The baz.,The bla.,The bla
...,...,...,...,...,...,...,...,...
65,The bla hit the dinglebob. The blubb hit the b...,True,blubb,bla,dinglebob,The blubb hitting the bla.,The blubb.,The dinglebob
66,The blubb hit the plomp. The plomp hit the din...,False,blubb,plomp,dinglebob,The blubb.,The blubb.,The dinglebob
67,The plomp hit the dinglebob. The blubb hit the...,True,blubb,plomp,dinglebob,The plomp,The blubb.,The dinglebob
68,The bla hit the plomp. The plomp hit the dingl...,False,bla,plomp,dinglebob,The bla,The bla,The dinglebob


In [10]:
# Partition the zero-shot nonsense-word results on the `switched` flag.
df_toy_problem_3n_results_switched = df_toy_problem_3n_results.loc[
    df_toy_problem_3n_results["switched"] == True
]
df_toy_problem_3n_results_non_switched = df_toy_problem_3n_results.loc[
    df_toy_problem_3n_results["switched"] == False
]
df_toy_problem_3n_results_non_switched.head(2)

Unnamed: 0,sequence,switched,first_word,second_word,final_word,answers_start_chain,answers_second_chain,answers_into_hole
0,The baz hit the fuu. The fuu hit the schleep. ...,False,baz,fuu,schleep,The baz.,The baz.,The schleep
2,The baz hit the fuu. The fuu hit the blubb. Th...,False,baz,fuu,blubb,The baz.,The blubb.,The blubb


In [12]:
# eval first part of the chain
def eval_first_word(df):
    """Mark each row whose start-of-chain answer mentions the expected first word.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``first_word`` (expected word) and
        ``answers_start_chain`` (free-text answer).

    Returns
    -------
    list of bool
        One entry per row, in row order. ``True`` when the expected word
        occurs as a case-sensitive substring of the answer.
    """
    expected = df["first_word"].values
    answers = df["answers_start_chain"].values
    # An empty CSV cell is read back as NaN (a float); `in` would raise
    # TypeError on it, so any non-string cell is scored as incorrect.
    return [
        isinstance(word, str) and isinstance(answer, str) and word in answer
        for word, answer in zip(expected, answers)
    ]

# Number of correct first-word answers: all rows, switched, non-switched.
for results_subset in (
    df_toy_problem_3n_results,
    df_toy_problem_3n_results_switched,
    df_toy_problem_3n_results_non_switched,
):
    print(np.sum(eval_first_word(results_subset)))

46
12
34


In [14]:
def eval_second_word(df):
    """Mark each row whose second-chain answer mentions the expected second word.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``second_word`` (expected word) and
        ``answers_second_chain`` (free-text answer).

    Returns
    -------
    list of bool
        One entry per row, in row order. ``True`` when the expected word
        occurs as a case-sensitive substring of the answer.
    """
    expected = df["second_word"].values
    answers = df["answers_second_chain"].values
    # An empty CSV cell is read back as NaN (a float); `in` would raise
    # TypeError on it, so any non-string cell is scored as incorrect.
    return [
        isinstance(word, str) and isinstance(answer, str) and word in answer
        for word, answer in zip(expected, answers)
    ]

# Number of correct second-word answers: all rows, switched, non-switched.
for results_subset in (
    df_toy_problem_3n_results,
    df_toy_problem_3n_results_switched,
    df_toy_problem_3n_results_non_switched,
):
    print(np.sum(eval_second_word(results_subset)))

22
7
15


In [16]:
def eval_final_word(df):
    """Mark each row whose into-hole answer mentions the expected final word.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``final_word`` (expected word) and
        ``answers_into_hole`` (free-text answer).

    Returns
    -------
    list of bool
        One entry per row, in row order. ``True`` when the expected word
        occurs as a case-sensitive substring of the answer.
    """
    expected = df["final_word"].values
    answers = df["answers_into_hole"].values
    # An empty CSV cell is read back as NaN (a float); `in` would raise
    # TypeError on it, so any non-string cell is scored as incorrect.
    return [
        isinstance(word, str) and isinstance(answer, str) and word in answer
        for word, answer in zip(expected, answers)
    ]

# Number of correct final-word answers: all rows, switched, non-switched.
for results_subset in (
    df_toy_problem_3n_results,
    df_toy_problem_3n_results_switched,
    df_toy_problem_3n_results_non_switched,
):
    print(np.sum(eval_final_word(results_subset)))

69
34
35


## Toy problem nonsense words one shot