In [2]:
pip install penman

Collecting penman
  Downloading penman-1.3.0-py3-none-any.whl.metadata (7.3 kB)
Downloading penman-1.3.0-py3-none-any.whl (43 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.2/43.2 kB[0m [31m319.2 kB/s[0m eta [36m0:00:00[0m[36m0:00:01[0m
[?25hInstalling collected packages: penman
Successfully installed penman-1.3.0
Note: you may need to restart the kernel to use updated packages.


In [3]:
import numpy as np
import pandas as pd
import random
import re
from pathlib import Path
import penman

import openai
from openai import OpenAI
import tiktoken

import matplotlib as plt
import seaborn as sns

pd.set_option('display.max_colwidth', None)

In [4]:
# OPEN AI KEY
with open('/Users/noragarrity/Documents/LING_575_NG/LING575APIKEY.txt') as f:
    OPENAI_API_KEY = f.read().strip()
    
client = OpenAI(api_key=OPENAI_API_KEY)

In [5]:
# Loading data
phy = Path('/Users/noragarrity/Documents/LING_575_NG/data/test/college_physics_test.csv')
dev_phy = Path('/Users/noragarrity/Documents/LING_575_NG/data/dev/college_physics_dev.csv')
df_phy = pd.read_csv(phy, names=['question', 'A', 'B', 'C', 'D', 'answer'])
df_dev_phy = pd.read_csv(dev_phy, names=['question', 'A', 'B', 'C', 'D', 'answer'])

In [6]:
# Quick check / Visualization
df_phy.head()

Unnamed: 0,question,A,B,C,D,answer
0,"The quantum efficiency of a photon detector is 0.1. If 100 photons are sent into the detector, one after the other, the detector will detect photons","an average of 10 times, with an rms deviation of about 4","an average of 10 times, with an rms deviation of about 3","an average of 10 times, with an rms deviation of about 1","an average of 10 times, with an rms deviation of about 0.1",B
1,"White light is normally incident on a puddle of water (index of refraction 1.33). A thin (500 nm) layer of oil (index of refraction 1.5) floats on the surface of the puddle. Of the following, the most strongly reflected wavelength is",500 nm,550 nm,600 nm,650 nm,C
2,Which of the following is true about any system that undergoes a reversible thermodynamic process?,There are no changes in the internal energy of the system.,The temperature of the system remains constant during the process.,The entropy of the system and its environment remains unchanged.,The entropy of the system and its environment must increase.,C
3,The best type of laser with which to do spectroscopy over a range of visible wavelengths is,a dye laser,a helium-neon laser,an excimer laser,a ruby laser,A
4,Excited states of the helium atom can be characterized as para- (antiparallel electron spins) and ortho- (parallel electron spins). The observation that an ortho- state has lower energy than the corresponding para- state can be understood in terms of which of the following?,The Heisenberg uncertainty principle,The Pauli exclusion principle,The Bohr model of the atom,Nuclear hyperfine coupling,B


In [7]:
df_dev_phy.head()

Unnamed: 0,question,A,B,C,D,answer
0,A refracting telescope consists of two converging lenses separated by 100 cm. The eye-piece lens has a focal length of 20 cm. The angular magnification of the telescope is,4,5,6,20,A
1,For which of the following thermodynamic processes is the increase in the internal energy of an ideal gas equal to the heat added to the gas?,Constant temperature,Constant volume,Constant pressure,Adiabatic,B
2,"One end of a Nichrome wire of length 2L and cross-sectional area A is attached to an end of another Nichrome wire of length L and cross- sectional area 2A. If the free end of the longer wire is at an electric potential of 8.0 volts, and the free end of the shorter wire is at an electric potential of 1.0 volt, the potential at the junction of the two wires is most nearly equal to",2.4 V,3.3 V,4.5 V,5.7 V,A
3,A refracting telescope consists of two converging lenses separated by 100 cm. The eye-piece lens has a focal length of 20 cm. The angular magnification of the telescope is,4,5,6,20,A
4,"The muon decays with a characteristic lifetime of about 10^-6 second into an electron, a muon neutrino, and an electron antineutrino. The muon is forbidden from decaying into an electron and just a single neutrino by the law of conservation of",charge,mass,energy and momentum,lepton number,D


# Generating Prompts #

In next few cells, we are going to generate two prompts:
1) one prompts to create a baseline
2) one prompts to create AMR representation based on pre-generated AMRs

First we will update the dev_test set by
1) Adding manually annotated AMR to each question
2) Changing one of reduant questions (0,3) into a different question. This question was selected from college physics validation dataset 

In [8]:
# updating the one of a duplicates into a question from validation set
df_dev_phy.loc[0] = ["Electromagnetic radiation emitted from a nucleus is most likely to be in the form of",
                     "gamma rays",
                     "microwaves",
                     "ultraviolet radiation",
                     "visible light",
                     "A"
                    ]

amr_dev_manual = ["(e / emit-01 :ARG0 (n / nucleus) :ARG1 (r / radiation-01 :medium (e2 / electromagnetic) :ARG1-of (l / likely-01 :degree (m / most) :ARG1 (f / form-01 :ARG1 (g / gamma-rays)))))",
                  "(q / question :domain (p / process-01 :mod (t / thermodynamic) :ARG1 (g / gas :mod (i / ideal))) :ARG1 (e / equal-01 :ARG1 (i2 / increase-01 :ARG1 (e2 / energy-03 :ARG1 g :mod (i3 / internal))) :ARG2 (a / add-01 :ARG2 (h / heat) :ARG1 g)))",
                  "(q / question :ARG1 (p / potential :location (j / junction :part-of (w1 / wire :mod (n1 / nichrome) :ARG1-of (a1 / attach-01) :quant (l2 / length :quant 2 :unit (l / L)) :ARG1-of (c1 / cross-section-01 :quant (a / area :quant A))) :part-of (w2 / wire :mod (n2 / nichrome) :ARG1-of (a2 / attach-01) :quant (l1 / length :quant 1 :unit l) :ARG1-of (c2 / cross-section-01 :quant (a2 / area :quant 2 :unit A)))) :mod (n / nearly) :degree (m / most) :condition (and :op1 (e1 / electric :mod (p1 / potential :quant 8.0 :unit (v / volt)) :location (e / end :part-of w1)) :op2 (e2 / electric :mod (p2 / potential :quant 1.0 :unit v) :location (e3 / end :part-of w2)))))",
                  "(t / telescope :mod (r / refract-01) :consist-of (l / lens :quantity 2 :mod (c / converge-01)) :separation (d / distance :quant 100 :unit (c / cm)) :part (e / eyepiece :mod (l2 / lens) :ARG1-of (h / have-01 :ARG2 (f / focal-length :quant 20 :unit c))) :ARG1-of (q / query-01 :ARG1 (a / angular-magnification)))",
                  "(d / decay-01 :ARG0 (m / muon) :duration (l / lifetime :mod (c / characteristic) :quant (t / time-quantity :quant 1e-6 :unit (s / second))) :result (a / and :op1 (e / electron) :op2 (mn / neutrino :mod (m2 / muon)) :op3 (an / antineutrino :mod (e2 / electron))) :condition (f / forbid-01 :ARG0 m :ARG1 (a2 / and :op1 e :op2 (n / neutrino :quantity 1)) :ARG2 (l2 / law :mod (c2 / conservation))))"
                 ]

***Checking the quality of amr_dev_manual***

In [9]:
for idx, manual_amr in enumerate(amr_dev_manual):
    try:
        penman.decode(manual_amr)
        print(f"Manual AMR {idx+1} has no issue!")
    except penman.DecodeError as e:
        print(f"{idx+1}, Decoding error occured {e}")
    except:
        print("Something else went wrong")

Manual AMR 1 has no issue!
Manual AMR 2 has no issue!
Manual AMR 3 has no issue!
Manual AMR 4 has no issue!
Manual AMR 5 has no issue!


# Function to generate prompts #

In [10]:
def generate_prompts(df, df_dev=None, amr_dev_steps=None, amrs_test_steps=None, prompt_type='base'):
    """
    Generate prompts for College Physics questions.
    
    Parameters:
    - df: DataFrame containing the questions and choices from College Physics Test Data.
    - df_dev: DataFrame for College Physics dev dataset, used in generating AMRs.
    - amr_steps: List containing pre-generated amr representations for dev_set.
    - amrs: List of pre-generated amr representation for test_data.
    - prompt_type: type of prompt: 'base', 'amr_steps', 'amr_base', 'amr_cot'.
    
    Returns:
    - A list of prompts according to the specified type.
        - (amr) base: system instruct: task explanation
                      user instruct: questions and answer choice from df

        - amr_steps: system instruct: task explanation of generating AMR
                     user instruct: questions and answer choice from df_dev

        - amr_cot: system instruct: task explanation
               user instruct: questions and answer choice from df_dev with accompanying AMRs.
               assistant instruct: answer 
               ... (x5 examples)
               user instrcut: questions and answer choice from df
    """
    
    prompts = []
    system_instruct = {
        "role": "system",
        "content": ""
    }

    if prompt_type == 'base':
        system_instruct["content"] = "You are given a College Physics question. Read the question, then answer the question by picking the correct answer from A, B, C, and D."
        for index, row in df.iterrows():
            prompt_content = f"{row['question']}\nA: {row['A']}\nB: {row['B']}\nC: {row['C']}\nD: {row['D']}"
            user_instruct = {"role": "user", "content": prompt_content}
            prompts.append([system_instruct, user_instruct])

    # short_amr_base + amr_base prompts
    elif prompt_type == 'amr_base' and amrs_test_steps is not None:
        system_instruct["content"] = "You are given a College Physics question and its Abstract Meaning Representation(AMR). Read the provided question and its AMR pair, then answer the question by picking the correct answer from A, B, C, and D."
        for index, row in df.iterrows():
            amr = amrs_test_steps[index]
            prompt_content = f"{row['question']}\n {amr}\n A: {row['A']}\nB: {row['B']}\nC: {row['C']}\nD: {row['D']}"
            user_instruct = {"role": "user", "content": prompt_content}
            prompts.append([system_instruct, user_instruct])
    
    elif prompt_type == 'amr_steps' and df_dev is not None and amr_dev_steps is not None:
        system_instruct["content"] = "You are given a College Physics question and its Abstract Meaning Representation(AMR). Read the provided question and its AMR pair, then generate AMR for the given sentence."
        amr_instructions = []
        for index, row in df_dev.iterrows():
            amr = amr_dev_steps[index]
            prompt_content = f"{row['question']}"
            amr_instructions.append({"role": "user", "content": prompt_content})
            amr_instructions.append({"role": "assistant", "content": amr})

        for index, row in df.iterrows():
            prompt_content = f"{row['question']}"
            final_question = {"role": "user", "content": prompt_content}
            prompts.append([system_instruct] + amr_instructions + [final_question]) 
            
    elif prompt_type == 'amr_cot' and df_dev is not None and amr_dev_steps is not None and amrs_test_steps is not None:
        system_instruct["content"] = "You are given a College Physics question and its Abstract Meaning Representation(AMR). Read the provided question and its AMR pair, then answer the question by picking the correct answer from A, B, C, and D."
        amr_instructions = []
        for index, row in df_dev.iterrows():
            amr = amr_dev_steps[index]
            prompt_content = f"{row['question']}\n {amr}\n A: {row['A']}\nB: {row['B']}\nC: {row['C']}\nD: {row['D']}"
            amr_instructions.append({"role": "user", "content": prompt_content})
            amr_instructions.append({"role": "assistant", "content": row['answer']})

        for index, row in df.iterrows():
            amr = amrs_test_steps[index]
            prompt_content = f"{row['question']}\n {amr}\n A: {row['A']}\nB: {row['B']}\nC: {row['C']}\nD: {row['D']}"
            final_question = {"role": "user", "content": prompt_content}
            prompts.append([system_instruct] + amr_instructions + [final_question])
            
    return prompts

# Example of base and amr_steps prompts #

In [11]:
base_prompts = generate_prompts(df_phy)
amr_step_prompts = generate_prompts(df_phy, df_dev=df_dev_phy, amr_dev_steps=amr_dev_manual, prompt_type='amr_steps')

In [12]:
print(f" Prompts for baseline experiments: \n {base_prompts[1]}\n")
print(f" Prompts for generating amrs: \n {amr_step_prompts[1]}")

 Prompts for baseline experiments: 
 [{'role': 'system', 'content': 'You are given a College Physics question. Read the question, then answer the question by picking the correct answer from A, B, C, and D.'}, {'role': 'user', 'content': 'White light is normally incident on a puddle of water (index of refraction 1.33). A thin (500 nm) layer of oil (index of refraction 1.5) floats on the surface of the puddle. Of the following, the most strongly reflected wavelength is\nA: 500 nm\nB: 550 nm\nC: 600 nm\nD: 650 nm'}]

 Prompts for generating amrs: 
 [{'role': 'system', 'content': 'You are given a College Physics question and its Abstract Meaning Representation(AMR). Read the provided question and its AMR pair, then generate AMR for the given sentence.'}, {'role': 'user', 'content': 'Electromagnetic radiation emitted from a nucleus is most likely to be in the form of'}, {'role': 'assistant', 'content': '(e / emit-01 :ARG0 (n / nucleus) :ARG1 (r / radiation-01 :medium (e2 / electromagnetic) 

# Setting up api calls #

In [13]:
# Generating logit bias (token limitation) for prompt answers
def generate_logit_bias(mode='standard'):
    """
    Generate a dictionary of logit biases based on the specified mode.
    
    Parameters:
    - mode: A string specifying the mode ('standard' or 'amr_steps').
    - encoding: The encoding object or function capable of encoding text into token IDs for the model.

    Returns:
    - A dictionary where keys are token IDs and values are the logit biases for those tokens.
    """
    logit_bias = {}
    encoding = tiktoken.encoding_for_model('gpt-3.5-turbo')

    excluded = ['The', 'Answer', 'Correct', 'None']
    for e in excluded:
        logit_bias[encoding.encode(e)[0]] = -100    
    
    if mode != 'amr_steps':
        for j in encoding.encode("A B C D"):
            logit_bias[j] = -0.01
            
    return logit_bias

In [14]:
standard_logit_bias = generate_logit_bias()
amr_step_logit_bias = generate_logit_bias(mode='amr_steps')

In [15]:
def send_prompt(messages, logit_bias, mode='default'):
    """
    Send a prompt to the OpenAI API and receive a completion. Can operate in two modes: 'default' or 'amr_steps'.

    Parameters:
    - messages: The input messages for the completion.
    - mode: Specifies the operation mode. 'default' for default behavior, 'amr_steps' for generating amrs.
    - logit_bias: Optional dictionary specifying logit biases for certain tokens, applicable in 'amr_steps' mode.

    Returns:
    - For 'default' mode, returns the completion's content and its log probabilities (if applicable).
    - For 'amr_steps' mode, returns the completion's content.
    """
    if mode == 'default':  
        max_new_tokens = 1
        return_logprobs = True
    else:  
        max_new_tokens = 300
        return_logprobs = False  

    # Common completion arguments
    completion_args = {
        "model": 'gpt-3.5-turbo',
        "messages": messages,
        "temperature": 0.0,
        "max_tokens": max_new_tokens,
        "logit_bias": logit_bias,
        "frequency_penalty": 0.0,
        "presence_penalty": 0.0,
    }
    
    # Handle logprobs for 'default' mode
    if mode == 'default' and return_logprobs:
        completion_args.update({"logprobs": True, "top_logprobs": 3})

    # Send the completion request
    completion = client.chat.completions.create(**completion_args)

    # Extract the completion's content
    answer = completion.choices[0].message.content
    
    if mode == 'default' and return_logprobs:
        log_prob = completion.choices[0].logprobs.content[0].top_logprobs
        return answer, log_prob
    else:
        return answer

In [16]:
def run_experiments(PROMPTS, logit_bias, mode='default'):
    results = []
    if mode == 'default':
        for prompt in PROMPTS:
            messages = prompt
            final_answer, log_prob = send_prompt(messages, logit_bias, mode=mode)
            # print(f"Model answer: {final_answer}")
            log_probs_formatted = ", ".join([f"{token.token}: {token.logprob:.4f}, {round(np.exp(token.logprob), 2)}" for token in log_prob])
            # print(f"Log Probabilities: {log_probs_formatted}")
            results.append((final_answer, log_probs_formatted))
    elif mode == 'amr_steps':
        for prompt in PROMPTS:
            messages = prompt
            amr = send_prompt(messages, logit_bias, mode=mode)
            results.append(amr)
    else:
        raise ValueError("Invalid mode specified. Use 'default' or 'amr_steps'.")

    return results

# Generating AMRs for test data #

In [17]:
amr_steps = run_experiments(amr_step_prompts, amr_step_logit_bias, mode='amr_steps')

In [18]:
amr_steps[0]

'(q / query :op1 (d / detect-01 :ARG0 (p / photon) :ARG1 (d2 / detector :mod (e / efficiency :quant 0.1))) :op2 (s / send-01 :ARG0 (p2 / photon :quant 100) :ARG1 (d3 / detector) :ARG2 (a / after) :ARG3 (o / one) :ARG4 (a2 / after) :ARG5 (o2 / other)))'

***Checking the results and improving the AMR***\
Using penman library, we checked the quality of the machine generated AMR.\
There are two types of error:
1) Expected: ROLE
2) Unexpected end of input

In [19]:
def check_amr_quality(amr_list):
    no_issue_indices = []
    unexpected_end_indices = []
    expected_role_indices = []
    other_error_indices = []
    for idx, amr in enumerate(amr_list):
        try:
            penman.decode(amr)
            print(f"Machine Generated AMR {idx} has no issue!")
            no_issue_indices.append(idx)
        except penman.DecodeError as e:
            if "Unexpected end of input" in str(e):
                print(f"{idx}, Decoding error occurred: Unexpected end of input")
                unexpected_end_indices.append(idx)
            elif "Expected: ROLE" in str(e):
                print(f"{idx}, Decoding error occurred: Expected: ROLE")
                expected_role_indices.append(idx)
            else:
                print(f"{idx}, Decoding error occurred: {e}")
                other_error_indices.append(idx)
        except Exception as e:
            print(f"{idx}, An unexpected error occurred: {e}")
            other_error_indices.append(idx)
    return no_issue_indices, unexpected_end_indices, expected_role_indices, other_error_indices

In [20]:
no_issue_indices, unexpected_end_indices, expected_role_indices, other_error_indices = check_amr_quality(amr_steps)

ignoring epigraph data for duplicate triple: ('k2', ':instance', 'kinetic-energy')
ignoring epigraph data for duplicate triple: ('Hz', ':instance', 'Hz')
ignoring epigraph data for duplicate triple: ('m', ':instance', 'measure-01')
ignoring epigraph data for duplicate triple: ('k', ':instance', 'kinetic-energy')
ignoring epigraph data for duplicate triple: ('d', ':ARG1', 'k')


Machine Generated AMR 0 has no issue!
Machine Generated AMR 1 has no issue!
Machine Generated AMR 2 has no issue!
Machine Generated AMR 3 has no issue!
4, Decoding error occurred: Unexpected end of input
Machine Generated AMR 5 has no issue!
6, Decoding error occurred: Unexpected end of input
Machine Generated AMR 7 has no issue!
Machine Generated AMR 8 has no issue!
Machine Generated AMR 9 has no issue!
10, Decoding error occurred: Unexpected end of input
11, Decoding error occurred: Unexpected end of input
Machine Generated AMR 12 has no issue!
Machine Generated AMR 13 has no issue!
Machine Generated AMR 14 has no issue!
Machine Generated AMR 15 has no issue!
16, Decoding error occurred: Unexpected end of input
17, Decoding error occurred: Expected: ROLE
Machine Generated AMR 18 has no issue!
Machine Generated AMR 19 has no issue!
Machine Generated AMR 20 has no issue!
Machine Generated AMR 21 has no issue!
22, Decoding error occurred: Unexpected end of input
Machine Generated AMR 23

## Analyzing expected_role_indices ##

In [21]:
# just in case 
new_amr = amr_steps.copy()

In [22]:
# analyzing AMR expected_role_indices errors
for idx in expected_role_indices:
    try: 
        penman.decode(new_amr[idx])
    except penman.DecodeError as e:
        print(f"error{e}")

error
  line 1
    (q / question :ARG1 (e / energy :mod (m / mechanical) :mod (t / total) :ARG1-of (b / be :polarity (a / absence-of :mod (f / friction)))) :condition (a2 / and :op1 (d / displace-01 :ARG0 (m / mass :quant 0.30 :unit (k / kg)) :ARG1 (p / position :mod (e2 / equilibrium)) :ARG2 (d2 / distance :quant 0.030 :unit (m / m)) :ARG3 (r / release-01) :ARG4 (s / speed-01 :quant 0.040 :unit m/s) :ARG5 (p2 / pass-01 :ARG1 p)) :op2 (a3 / attach-01 :ARG1 (e2 / end :mod (h / horizontal) :mod (m2 / massless) :ARG1-of (s / spring)) :ARG2 (w / wall) :ARG3 (m3 / mass :quant 0.30 :unit kg) :ARG4 (t2 / table)))
                                                                                                                                                                                                                                                                                                                                                                                                 ^

***Since it's hard to analyze the results, we implemented a visualization function:***

In [23]:
def format_amr_to_penman(amr_string):
    indent_size = 3  # Number of spaces for each indent level
    formatted_amr = ""
    indent_level = 0
    i = 0
    first_colon_encountered = False  # Flag to track the first colon

    while i < len(amr_string):
        char = amr_string[i]

        # Handle opening parenthesis
        if char == "(":
            if i > 0 and amr_string[i - 1] != " " and first_colon_encountered:
                formatted_amr += "\n" + " " * indent_size * indent_level
            formatted_amr += char
            indent_level += 1
        elif char == ")":
            indent_level -= 1
            formatted_amr += char
        elif char == ":":
            if not first_colon_encountered:  # Adjust indentation after the first colon
                indent_level += 1
                first_colon_encountered = True
            formatted_amr += "\n" + " " * indent_size * (indent_level - 1) + char
        else:
            formatted_amr += char

        # Special handling for concept/identifier pairs
        if char == "/":
            # Skip ahead to include the identifier without breaking it
            while i + 1 < len(amr_string) and amr_string[i + 1] != " ":
                i += 1
                formatted_amr += amr_string[i]

        i += 1

    return formatted_amr

In [24]:
for idx in expected_role_indices:
    print(f"{format_amr_to_penman(amr_steps[idx])}\n")

(q / question 
   :ARG1 (e / energy 
      :mod (m / mechanical) 
      :mod (t / total) 
      :ARG1-of (b / be 
         :polarity (a / absence-of 
            :mod (f / friction)))) 
   :condition (a2 / and 
      :op1 (d / displace-01 
         :ARG0 (m / mass 
            :quant 0.30 
            :unit (k / kg)) 
         :ARG1 (p / position 
            :mod (e2 / equilibrium)) 
         :ARG2 (d2 / distance 
            :quant 0.030 
            :unit (m / m)) 
         :ARG3 (r / release-01) 
         :ARG4 (s / speed-01 
            :quant 0.040 
            :unit m/s) 
            :ARG5 (p2 / pass-01 
               :ARG1 p)) 
         :op2 (a3 / attach-01 
            :ARG1 (e2 / end 
               :mod (h / horizontal) 
               :mod (m2 / massless) 
               :ARG1-of (s / spring)) 
            :ARG2 (w / wall) 
            :ARG3 (m3 / mass 
               :quant 0.30 
               :unit kg) 
            :ARG4 (t2 / table)))

(d / driver 
   :ARG0-of (h / hea

In [25]:
expected_role_indices

[17, 46, 48, 76, 78, 83, 95, 98, 99]

***three types of Expected: ROLE errors***\
From the above graph, we were able to locate following errors:
1) Unit 'm/s' is breaking the structure since it is trying to interprete seperately
   - index = 46, 72, 76, 83, 95, 99
   - :unit (m / m/s))) => replace to 'meterpersecond' 
2) Messed up quantification
   - index = 98
   - :quant (* 2 (c2 / electron)))  => :mod (c2 / electron) :quant 2
   - :quant (/ pi 4) => quarter of pi => 0.785
3) Missing arguments
   - index 39
   - :ARG2 (m2 / melt-01 i))) => 'i' should be followed by ':' with correct node
   - index 67   
   - :ARG2 (r3 / remove-01 e))) => 'e' should be followed by ':' with correct node
    - index 78    - :quant (h2 / half-01 c)))))) => 'c' should be followed by ':' with correct node

In [26]:
# solution 1
for idx in expected_role_indices:
    if 'm/s' in new_amr[idx]:
        # Replace 'm/s' with 'meterpersecond'
        updated_amr_string = new_amr[idx].replace('m/s', 'meterpersecond')
        # Update the entry in new_amr with the replaced string
        new_amr[idx] = f"{updated_amr_string}"

In [27]:
# solution 2
def replace_patterns(amr_strings):
    """
    Replace specific patterns in AMR strings:
    - '*' with an empty string
    - '/ pi 4' with '0.785'
    - 'number/number' with the calculated value
    """
    for idx, amr in enumerate(amr_strings):
        # Replace '*' with an empty string
        amr = amr.replace('* ', '')
        
        # Replace '/ pi 4' with '0.785'
        amr = amr.replace('/ pi 4', '0.785')

        # Replace 'number/number' with the calculated value
        # This regex matches simple fractions like '3/4' and replaces them with their calculated float value
        def eval_match(match):
            return str(eval(match.group(0)))

        amr = re.sub(r'\b\d+/\d+\b', lambda match: eval_match(match), amr)
        
        # Update the AMR string in the list
        amr_strings[idx] = amr

In [28]:
replace_patterns(new_amr)

In [29]:
check_amr_quality(new_amr)

ignoring epigraph data for duplicate triple: ('k2', ':instance', 'kinetic-energy')
ignoring epigraph data for duplicate triple: ('Hz', ':instance', 'Hz')
ignoring epigraph data for duplicate triple: ('m', ':instance', 'measure-01')
ignoring epigraph data for duplicate triple: ('k', ':instance', 'kinetic-energy')
ignoring epigraph data for duplicate triple: ('d', ':ARG1', 'k')


Machine Generated AMR 0 has no issue!
Machine Generated AMR 1 has no issue!
Machine Generated AMR 2 has no issue!
Machine Generated AMR 3 has no issue!
4, Decoding error occurred: Unexpected end of input
Machine Generated AMR 5 has no issue!
6, Decoding error occurred: Unexpected end of input
Machine Generated AMR 7 has no issue!
Machine Generated AMR 8 has no issue!
Machine Generated AMR 9 has no issue!
10, Decoding error occurred: Unexpected end of input
11, Decoding error occurred: Unexpected end of input
Machine Generated AMR 12 has no issue!
Machine Generated AMR 13 has no issue!
Machine Generated AMR 14 has no issue!
Machine Generated AMR 15 has no issue!
16, Decoding error occurred: Unexpected end of input
17, Decoding error occurred: Unexpected end of input
Machine Generated AMR 18 has no issue!
Machine Generated AMR 19 has no issue!
Machine Generated AMR 20 has no issue!
Machine Generated AMR 21 has no issue!
22, Decoding error occurred: Unexpected end of input
Machine Generat

([0,
  1,
  2,
  3,
  5,
  7,
  8,
  9,
  12,
  13,
  14,
  15,
  18,
  19,
  20,
  21,
  23,
  24,
  25,
  26,
  30,
  33,
  34,
  35,
  36,
  37,
  38,
  39,
  43,
  46,
  49,
  50,
  51,
  52,
  57,
  59,
  60,
  61,
  62,
  63,
  65,
  66,
  67,
  68,
  69,
  70,
  71,
  73,
  77,
  80,
  81,
  82,
  84,
  85,
  86,
  88,
  90,
  91,
  92,
  94,
  96,
  101],
 [4,
  6,
  10,
  11,
  16,
  17,
  22,
  27,
  28,
  29,
  31,
  32,
  40,
  41,
  42,
  44,
  45,
  47,
  48,
  53,
  54,
  55,
  56,
  58,
  64,
  72,
  74,
  75,
  76,
  79,
  83,
  87,
  89,
  93,
  95,
  97,
  100],
 [78, 98, 99],
 [])

In [30]:
def fix_unexpected_end_of_input(amr_steps, max_attempts=10):
    """
    Recursively fixes "Unexpected end of input" errors in AMR strings by adding missing parentheses.
    
    Parameters:
    - amr_steps: List of AMR strings to be fixed.
    - max_attempts: Maximum number of attempts to add parentheses to fix decoding errors.
    
    Returns:
    - A tuple containing the updated list of AMR strings and a list of indices that still have issues after attempts.
    """
    def try_decode(amr, attempts=0):
        """
        Tries to decode an AMR string, adding closing parentheses if necessary.
        
        Parameters:
        - amr: The AMR string to be decoded.
        - attempts: Current attempt count.
        
        Returns:
        - Updated AMR string if fixed, or the original AMR string if not fixed after max_attempts.
        - Boolean indicating whether the string was successfully decoded.
        """
        if attempts >= max_attempts:
            return amr, False
        try:
            penman.decode(amr)
            return amr, True
        except penman.DecodeError as e:
            if "Unexpected end of input" in str(e):
                # Try fixing by adding a closing parenthesis and decode again
                return try_decode(amr + ')', attempts + 1)
            else:
                return amr, False

    fixed_amr_steps = []
    still_issues_indices = []

    for idx, amr_string in enumerate(amr_steps):
        fixed_amr, success = try_decode(amr_string)
        fixed_amr_steps.append(fixed_amr)
        if not success:
            still_issues_indices.append(idx)

    return fixed_amr_steps, still_issues_indices

In [31]:
fixed_amr, issues_indices = fix_unexpected_end_of_input(new_amr)
check_amr_quality(fixed_amr)

ignoring epigraph data for duplicate triple: ('s2', ':instance', 'sound')
ignoring epigraph data for duplicate triple: ('k2', ':instance', 'kinetic-energy')
ignoring epigraph data for duplicate triple: ('t2', ':ARG1', 'd')
ignoring epigraph data for duplicate triple: ('Hz', ':instance', 'Hz')
ignoring epigraph data for duplicate triple: ('m', ':instance', 'measure-01')
ignoring epigraph data for duplicate triple: ('k', ':instance', 'kinetic-energy')
ignoring epigraph data for duplicate triple: ('d', ':ARG1', 'k')
ignoring epigraph data for duplicate triple: ('s2', ':instance', 'sound')
ignoring epigraph data for duplicate triple: ('k2', ':instance', 'kinetic-energy')
ignoring epigraph data for duplicate triple: ('t2', ':ARG1', 'd')
ignoring epigraph data for duplicate triple: ('Hz', ':instance', 'Hz')
ignoring epigraph data for duplicate triple: ('m', ':instance', 'measure-01')
ignoring epigraph data for duplicate triple: ('k', ':instance', 'kinetic-energy')
ignoring epigraph data for 

Machine Generated AMR 0 has no issue!
Machine Generated AMR 1 has no issue!
Machine Generated AMR 2 has no issue!
Machine Generated AMR 3 has no issue!
Machine Generated AMR 4 has no issue!
Machine Generated AMR 5 has no issue!
Machine Generated AMR 6 has no issue!
Machine Generated AMR 7 has no issue!
Machine Generated AMR 8 has no issue!
Machine Generated AMR 9 has no issue!
Machine Generated AMR 10 has no issue!
Machine Generated AMR 11 has no issue!
Machine Generated AMR 12 has no issue!
Machine Generated AMR 13 has no issue!
Machine Generated AMR 14 has no issue!
Machine Generated AMR 15 has no issue!
Machine Generated AMR 16 has no issue!
Machine Generated AMR 17 has no issue!
Machine Generated AMR 18 has no issue!
Machine Generated AMR 19 has no issue!
Machine Generated AMR 20 has no issue!
Machine Generated AMR 21 has no issue!
Machine Generated AMR 22 has no issue!
Machine Generated AMR 23 has no issue!
Machine Generated AMR 24 has no issue!
Machine Generated AMR 25 has no iss

([0,
  1,
  2,
  3,
  4,
  5,
  6,
  7,
  8,
  9,
  10,
  11,
  12,
  13,
  14,
  15,
  16,
  17,
  18,
  19,
  20,
  21,
  22,
  23,
  24,
  25,
  26,
  27,
  28,
  29,
  30,
  31,
  32,
  33,
  34,
  35,
  36,
  37,
  38,
  39,
  40,
  41,
  42,
  43,
  44,
  45,
  46,
  47,
  48,
  49,
  50,
  51,
  52,
  53,
  54,
  55,
  56,
  57,
  58,
  59,
  60,
  61,
  62,
  63,
  64,
  65,
  66,
  67,
  68,
  69,
  70,
  71,
  72,
  73,
  74,
  75,
  76,
  77,
  79,
  80,
  81,
  82,
  83,
  84,
  85,
  86,
  87,
  88,
  89,
  90,
  91,
  92,
  93,
  94,
  95,
  96,
  97,
  100,
  101],
 [],
 [78, 98, 99],
 [])

# Helper functions for shortening the AMR Steps #
***After algorithimically improve the AMR, most of AMRs are well formed***\
Even if the temperature is set to 0, there is always a set of variety in the generated AMRs.\
Our approach above is a basic filtering and developing methodology for preparing better AMR for experiment.\
However, it is fair enough for us to proceed to next step.

In [32]:
def shorten_amr(amr):
    depth = 0
    result = ''
    skip = False  # Indicates whether to skip adding characters to the result

    for i, char in enumerate(amr):
        if char == '(':
            depth += 1
            if depth == 3:  # Start skipping characters at depth 3
                skip = True
        elif char == ')':
            depth -= 1
            if depth == 2:  # Stop skipping characters when returning to depth 2
                skip = False
                continue  # Avoid appending this character immediately; it will be added if depth <= 2
        elif char == ':' and depth == 2:  # Start skipping characters after ':' at depth 2
            skip = True
            continue
        elif char == ' ' and skip:  # Stop skipping characters after a space
            skip = False

        # Append character if not skipping and at depth 1 or handling parentheses
        if not skip and (depth <= 2):
            result += char

    # Post-process to clean up any roles leading into empty structures
    result = re.sub(r'\s+\)', ')', result)
    result = re.sub(r'\(\s+\)', '()', result)

    return result

In [33]:
# short_amr for physics_test data
shortened_amr = []
for amr in fixed_amr:
    short_amr = shorten_amr(amr)
    shortened_amr.append(short_amr)

In [34]:
# short_amr for physics_dev data
short_dev_amr = [] 
for dev_amr in amr_dev_manual:
    short_dev_amr_string = shorten_amr(dev_amr)
    short_dev_amr.append(short_dev_amr_string)

***Visualize the shortening***

In [35]:
def visualize_n_many_amr(df, amr_results, short_results, n):
    random_index = random.sample(range(0,len(df)), n)
    for idx in random_index:
        original_question = df.loc[idx]['question']
        amr_result = amr_results[idx]
        short_result = short_results[idx]
        amr_graph = format_amr_to_penman(amr_result)
        amr_short_graph = format_amr_to_penman(short_result)
        print(f"Original question is: {original_question}\n")
        print(f"AMR graph:\n {amr_graph}\n")
        print(f"Short AMR Graph:\n {amr_short_graph}\n")

In [36]:
visualize_n_many_amr(df_phy, fixed_amr, shortened_amr, 3)

Original question is: Under ideal conditions, the electric and magnetic fields inside a superconductor are zero. Maxwell’s equations imply that which of the following must be true just outside the surface of the superconductor?

AMR graph:
 (q / question 
   :ARG1 (t / true 
      :mod (m / must) 
      :op1 (o / outside 
         :location (s / surface 
            :mod (s2 / superconductor))) 
      :op2 (i / imply-01 
         :ARG1 (e / equation 
            :mod (m2 / Maxwell)) 
         :ARG2 (b / be-located-at 
            :ARG1 (f / field 
               :mod (e2 / electric)) 
            :ARG2 (f2 / field 
               :mod (m2 / magnetic)) 
            :ARG3 (z / zero)))))

Short AMR Graph:
 (q / question 
   :ARG1 (t / true))

Original question is: A proton moves in the +z-direction after being accelerated from rest through a potential difference V. The proton then passes through a region with a uniform electric field E in the +x-direction and a uniform magnetic field B in

In [37]:
visualize_n_many_amr(df_dev_phy, amr_dev_manual, short_dev_amr, 1)

Original question is: The muon decays with a characteristic lifetime of about 10^-6 second into an electron, a muon neutrino, and an electron antineutrino. The muon is forbidden from decaying into an electron and just a single neutrino by the law of conservation of

AMR graph:
 (d / decay-01 
   :ARG0 (m / muon) 
   :duration (l / lifetime 
      :mod (c / characteristic) 
      :quant (t / time-quantity 
         :quant 1e-6 
         :unit (s / second))) 
   :result (a / and 
      :op1 (e / electron) 
      :op2 (mn / neutrino 
         :mod (m2 / muon)) 
      :op3 (an / antineutrino 
         :mod (e2 / electron))) 
   :condition (f / forbid-01 
      :ARG0 m 
      :ARG1 (a2 / and 
         :op1 e 
         :op2 (n / neutrino 
            :quantity 1)) 
      :ARG2 (l2 / law 
         :mod (c2 / conservation))))

Short AMR Graph:
 (d / decay-01 
   :ARG0 (m / muon) 
   :duration (l / lifetime) 
   :result (a / and) 
   :condition (f / forbid-01  m))



# Finally we are running an experiment # 

In [38]:
# def generate_prompts(df, df_dev=None, amr_dev_steps=None, amrs_test_steps=None, prompt_type='base'):
amr_base_prompts = generate_prompts(df_phy, amrs_test_steps=fixed_amr, prompt_type='amr_base')
amr_base_short_prompts = generate_prompts(df_phy, amrs_test_steps=shortened_amr, prompt_type='amr_base')

amr_cot_prompts = generate_prompts(df_phy, df_dev_phy, amr_dev_manual, fixed_amr, prompt_type='amr_cot')
amr_cot_short_prompts = generate_prompts(df_phy, df_dev_phy, short_dev_amr, shortened_amr, prompt_type='amr_cot')

In [39]:
print(f" Prompts for amr_base experiments: \n {amr_base_prompts[1]}\n")
print(f" Prompts for amr_base_short experiments: \n {amr_base_short_prompts[1]}\n")
print(f" Prompts for amr_cot experiments: \n {amr_cot_prompts[1]}\n")
print(f" Prompts for amr_cot_short experiments: \n {amr_cot_short_prompts[1]}")

 Prompts for amr_base experiments: 
 [{'role': 'system', 'content': 'You are given a College Physics question and its Abstract Meaning Representation(AMR). Read the provided question and its AMR pair, then answer the question by picking the correct answer from A, B, C, and D.'}, {'role': 'user', 'content': 'White light is normally incident on a puddle of water (index of refraction 1.33). A thin (500 nm) layer of oil (index of refraction 1.5) floats on the surface of the puddle. Of the following, the most strongly reflected wavelength is\n (i / incident :ARG1 (l / light :mod (w / white)) :ARG2 (p / puddle :mod (n / normal) :part-of (w2 / water :mod (i2 / index :quant 1.33))) :part-of (l2 / layer :mod (t / thin) :quant (d / distance :quant 500 :unit (n2 / nm)) :mod (o / oil :mod (i3 / index :quant 1.5))) :ARG1-of (r / reflect-01 :degree (s / strong) :ARG1 (w2 / wavelength)) :ARG2-of (m / most))\n A: 500 nm\nB: 550 nm\nC: 600 nm\nD: 650 nm'}]

 Prompts for amr_base_short experiments: 
 [{

# Running experiments #

In [40]:
#def run_experiments(PROMPTS, logit_bias, mode='default'):
base_results = run_experiments(base_prompts, standard_logit_bias)

In [41]:
amr_base_results = run_experiments(amr_base_prompts, standard_logit_bias)

In [42]:
amr_base_short_results = run_experiments(amr_base_short_prompts, standard_logit_bias)

In [43]:
amr_cot_results = run_experiments(amr_cot_prompts, standard_logit_bias)

In [44]:
amr_cot_short_results = run_experiments(amr_cot_short_prompts, standard_logit_bias)

In [64]:
print(f" Output for amr_base_short_results: \n {amr_base_short_results[1]}\n")

 Output for amr_base_short_results: 
 ('(i', '(i: -0.0034, 1.0, i: -7.0666, 0.0, l: -7.4859, 0.0')



## RESULTS ##

***Helper function for results***

In [45]:
def calculate_accuracy(df, results):
    correct = 0
    for i in range(len(df)):
        actual_answer = df.iloc[i]['answer']
        predicted_answer = results[i][0]
        if actual_answer == predicted_answer:
            correct += 1
    total = len(results)
    accuracy = correct / total
    return correct, total, accuracy

In [46]:
results_list = [base_results, 
                amr_base_results, 
                amr_base_short_results, 
                amr_cot_results, 
                amr_cot_short_results
               ]

# List of experiment names
experiment_names = ["base", "amr_base", "amr_base_short", "amr_cot", "amr_cot_short"]

# Initialize a DataFrame to store accuracy results
accuracy_df = pd.DataFrame(columns=["Experiment", "Correct", "Total", "Accuracy"])

for idx, (results, name) in enumerate(zip(results_list, experiment_names), 1):
    correct, total, accuracy = calculate_accuracy(df_phy, results)
    accuracy_df.loc[idx] = [name, correct, total, accuracy]

***Accuracy Result***

In [47]:
print(accuracy_df)

       Experiment  Correct  Total  Accuracy
1            base       42    102  0.411765
2        amr_base       20    102  0.196078
3  amr_base_short        1    102  0.009804
4         amr_cot       51    102  0.500000
5   amr_cot_short       49    102  0.480392


In [48]:
amr_base_short_results[:10]

[('(q', '(q: -0.7127, 0.49, q: -1.2691, 0.28, Question: -2.0970, 0.12'),
 ('(i', '(i: -0.0034, 1.0, i: -7.0666, 0.0, l: -7.4859, 0.0'),
 ('(question',
  '(question: -0.8100, 0.44, (q: -0.8123, 0.44, q: -2.3647, 0.09'),
 ('(q', '(q: -0.3679, 0.69, q: -1.4477, 0.24, (question: -2.9559, 0.05'),
 ('Question', 'Question: -0.2366, 0.79, (Q: -2.6365, 0.07, (: -2.8688, 0.06'),
 ('(q', '(q: -0.2608, 0.77, (question: -2.9428, 0.05, 1: -2.9596, 0.05'),
 ('(', '(: -1.5448, 0.21, q: -1.9020, 0.15, (question: -2.1552, 0.12'),
 ('(a', '(a: -1.0648, 0.34, (q: -1.3704, 0.25, (: -2.1709, 0.11'),
 ('(', '(: -0.1308, 0.88, (A: -3.3242, 0.04, (a: -3.3398, 0.04'),
 ('(', '(: -0.2154, 0.81, (A: -2.2988, 0.1, (a: -3.3092, 0.04')]

In [49]:
amr_base_results[:10]

[('B', 'B: -0.7043, 0.49, The: -1.2417, 0.29, Answer: -2.3855, 0.09'),
 ('B', 'B: -0.2928, 0.75, The: -2.2569, 0.1, A: -2.4851, 0.08'),
 ('AM', 'The: -0.3136, 0.73, Answer: -1.9123, 0.15, AM: -2.5164, 0.08'),
 ('(q', '(q: -0.6476, 0.52, q: -1.5520, 0.21, (: -2.8531, 0.06'),
 ('B', 'B: -0.0057, 0.99, What: -5.6599, 0.0, Which: -6.6632, 0.0'),
 ('(q', '(q: -0.1211, 0.89, (question: -3.0813, 0.05, (answer: -3.7332, 0.02'),
 ('B', 'B: -0.6873, 0.5, The: -1.2540, 0.29, Answer: -2.4858, 0.08'),
 ('B', 'B: -0.2587, 0.77, A: -1.5755, 0.21, The: -4.8861, 0.01'),
 ('AM', 'AM: -1.1790, 0.31, (A: -1.2411, 0.29, Question: -1.3054, 0.27'),
 ('AM', 'The: -0.5948, 0.55, AM: -1.5261, 0.22, Answer: -2.1045, 0.12')]

# Analysis and Follow Up Experiment #
Since I got sooo shocking results, I am going to run a follow up experiment,\
where I switch the order of amr and original sentence in the query. 

In [50]:
def generate_prompts_flipped(df, df_dev=None, amr_dev_steps=None, amrs_test_steps=None, prompt_type='base'):
    """
    Generate prompts for College Physics questions with different order between question and amr.
    
    Parameters:
    - df: DataFrame containing the questions and choices from College Physics Test Data.
    - df_dev: DataFrame for College Physics dev dataset, used in generating AMRs.
    - amr_steps: List containing pre-generated amr representations for dev_set.
    - amrs: List of pre-generated amr representation for test_data.
    - prompt_type: type of prompt: 'base', 'amr_steps', 'amr_base', 'amr_cot'.
    
    Returns:
    - A list of prompts according to the specified type.
        - (amr) base: system instruct: task explanation
                      user instruct: questions and answer choice from df

        - amr_steps: system instruct: task explanation of generating AMR
                     user instruct: questions and answer choice from df_dev

        - amr_cot: system instruct: task explanation
               user instruct: questions and answer choice from df_dev with accompanying AMRs.
               assistant instruct: answer 
               ... (x5 examples)
               user instrcut: questions and answer choice from df
    """
    
    prompts = []
    system_instruct = {
        "role": "system",
        "content": ""
    }

    if prompt_type == 'base':
        system_instruct["content"] = "You are given a College Physics question. Read the question, then answer the question by picking the correct answer from A, B, C, and D."
        for index, row in df.iterrows():
            prompt_content = f"{row['question']}\nA: {row['A']}\nB: {row['B']}\nC: {row['C']}\nD: {row['D']}"
            user_instruct = {"role": "user", "content": prompt_content}
            prompts.append([system_instruct, user_instruct])

    # short_amr_base + amr_base prompts
    elif prompt_type == 'amr_base' and amrs_test_steps is not None:
        system_instruct["content"] = "You are given a College Physics question and its Abstract Meaning Representation(AMR). Read the provided question and its AMR pair, then answer the question by picking the correct answer from A, B, C, and D."
        for index, row in df.iterrows():
            amr = amrs_test_steps[index]
            prompt_content = f"{amr}\n {row['question']}\n A: {row['A']}\nB: {row['B']}\nC: {row['C']}\nD: {row['D']}"
            user_instruct = {"role": "user", "content": prompt_content}
            prompts.append([system_instruct, user_instruct])
    
    elif prompt_type == 'amr_steps' and df_dev is not None and amr_dev_steps is not None:
        system_instruct["content"] = "You are given a College Physics question and its Abstract Meaning Representation(AMR). Read the provided question and its AMR pair, then generate AMR for the given sentence."
        amr_instructions = []
        for index, row in df_dev.iterrows():
            amr = amr_dev_steps[index]
            prompt_content = f"{row['question']}"
            amr_instructions.append({"role": "user", "content": prompt_content})
            amr_instructions.append({"role": "assistant", "content": amr})

        for index, row in df.iterrows():
            prompt_content = f"{row['question']}"
            final_question = {"role": "user", "content": prompt_content}
            prompts.append([system_instruct] + amr_instructions + [final_question]) 
            
    elif prompt_type == 'amr_cot' and df_dev is not None and amr_dev_steps is not None and amrs_test_steps is not None:
        system_instruct["content"] = "You are given a College Physics question and its Abstract Meaning Representation(AMR). Read the provided question and its AMR pair, then answer the question by picking the correct answer from A, B, C, and D."
        amr_instructions = []
        for index, row in df_dev.iterrows():
            amr = amr_dev_steps[index]
            prompt_content = f"{amr}\n {row['question']}\n A: {row['A']}\nB: {row['B']}\nC: {row['C']}\nD: {row['D']}"
            amr_instructions.append({"role": "user", "content": prompt_content})
            amr_instructions.append({"role": "assistant", "content": row['answer']})

        for index, row in df.iterrows():
            amr = amrs_test_steps[index]
            prompt_content = f"{amr}\n {row['question']}\n A: {row['A']}\nB: {row['B']}\nC: {row['C']}\nD: {row['D']}"
            final_question = {"role": "user", "content": prompt_content}
            prompts.append([system_instruct] + amr_instructions + [final_question])
            
    return prompts

In [51]:
amr_base_prompts_flipped = generate_prompts_flipped(df_phy, amrs_test_steps=fixed_amr, prompt_type='amr_base')
amr_base_short_prompts_flipped = generate_prompts_flipped(df_phy, amrs_test_steps=shortened_amr, prompt_type='amr_base')
amr_cot_prompts_flipped = generate_prompts_flipped(df_phy, df_dev_phy, amr_dev_manual, fixed_amr, prompt_type='amr_cot')
amr_cot_short_prompts_flipped = generate_prompts_flipped(df_phy, df_dev_phy, short_dev_amr, shortened_amr, prompt_type='amr_cot')

In [52]:
print(f" Prompts for amr_base_flipped experiments: \n {amr_base_prompts_flipped[1]}\n")
print(f" Prompts for amr_base_short_flipped experiments: \n {amr_base_short_prompts_flipped[1]}\n")
print(f" Prompts for amr_cot_flipped experiments: \n {amr_cot_prompts_flipped[1]}\n")
print(f" Prompts for amr_cot_short_flipped experiments: \n {amr_cot_short_prompts_flipped[1]}")

 Prompts for amr_base_flipped experiments: 
 [{'role': 'system', 'content': 'You are given a College Physics question and its Abstract Meaning Representation(AMR). Read the provided question and its AMR pair, then answer the question by picking the correct answer from A, B, C, and D.'}, {'role': 'user', 'content': '(i / incident :ARG1 (l / light :mod (w / white)) :ARG2 (p / puddle :mod (n / normal) :part-of (w2 / water :mod (i2 / index :quant 1.33))) :part-of (l2 / layer :mod (t / thin) :quant (d / distance :quant 500 :unit (n2 / nm)) :mod (o / oil :mod (i3 / index :quant 1.5))) :ARG1-of (r / reflect-01 :degree (s / strong) :ARG1 (w2 / wavelength)) :ARG2-of (m / most))\n White light is normally incident on a puddle of water (index of refraction 1.33). A thin (500 nm) layer of oil (index of refraction 1.5) floats on the surface of the puddle. Of the following, the most strongly reflected wavelength is\n A: 500 nm\nB: 550 nm\nC: 600 nm\nD: 650 nm'}]

 Prompts for amr_base_short_flipped e

***Experiments***

In [53]:
amr_base_flipped_results = run_experiments(amr_base_prompts_flipped, standard_logit_bias)

In [54]:
amr_base_short_flipped_results = run_experiments(amr_base_short_prompts_flipped, standard_logit_bias)

In [55]:
amr_cot_flipped_results = run_experiments(amr_cot_prompts_flipped, standard_logit_bias)

In [56]:
amr_cot_short_flipped_results = run_experiments(amr_cot_short_prompts_flipped, standard_logit_bias)

***Result***

In [57]:
results_list = [base_results, 
                amr_base_flipped_results, 
                amr_base_short_flipped_results, 
                amr_cot_flipped_results, 
                amr_cot_short_flipped_results
               ]

# List of experiment names
experiment_names = ["base", "amr_base_flipped", "amr_base_short_flipped", "amr_cot_flipped", "amr_cot_short_flipped"]

# Initialize a DataFrame to store accuracy results
accuracy_flipped_df = pd.DataFrame(columns=["Experiment", "Correct", "Total", "Accuracy"])

for idx, (results, name) in enumerate(zip(results_list, experiment_names), 1):
    correct, total, accuracy = calculate_accuracy(df_phy, results)
    accuracy_flipped_df.loc[idx] = [name, correct, total, accuracy]

In [58]:
print(f"original:\n {accuracy_df}\n")
print(f"flipped:\n {accuracy_flipped_df}")

original:
        Experiment  Correct  Total  Accuracy
1            base       42    102  0.411765
2        amr_base       20    102  0.196078
3  amr_base_short        1    102  0.009804
4         amr_cot       51    102  0.500000
5   amr_cot_short       49    102  0.480392

flipped:
                Experiment  Correct  Total  Accuracy
1                    base       42    102  0.411765
2        amr_base_flipped       16    102  0.156863
3  amr_base_short_flipped        4    102  0.039216
4         amr_cot_flipped       41    102  0.401961
5   amr_cot_short_flipped       45    102  0.441176


In [61]:
def calculate_average_token_count(outputs):
    total_tokens = 0
    total_outputs = len(outputs)
    for output in outputs:
        if isinstance(output, dict):
            for value in output.values():
                if isinstance(value, str):
                    total_tokens += len(value.split())  # Splitting by whitespace to count tokens
                elif isinstance(value, list):  # Assuming nested list
                    for sub_value in value:
                        total_tokens += len(sub_value.split())
    average_token_count = total_tokens / total_outputs
    return average_token_count

# Assuming `generate_prompts` returns a list of dictionaries
amr_base_prompts = generate_prompts(df_phy, amrs_test_steps=fixed_amr, prompt_type='amr_base')

average_token_count = calculate_average_token_count(amr_base_prompts)
print("Average token count:", average_token_count)

Average token count: 0.0


***Flipping the ordering did not change the result at all***\
Just providing shorter_amr did not improve the performance.