In [24]:
import os
import sys
import anthropic
import ollama
import random
import pandas as pd
from tqdm import tqdm
from google.generativeai.types import RequestOptions
from google.api_core import retry
from typing import List, Tuple
import json
from openai import OpenAI
import datetime

current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)

if parent_dir not in sys.path:
    sys.path.append(parent_dir)

from concurrent.futures import ThreadPoolExecutor, TimeoutError

# Prompts

In [13]:
mega_prompt = """
You are a helpful assistant tasked with tagging key facts in the text and then using those facts to answer the final question. Your goal is to analyze the input question, identify distinct key points needed to answer the question, and then wrap each of these points in custom HTML-like tags like <fact1>, <fact2>, <fact3>, etc and then use these tags to answer the question. After you have tagged the question in tags, use these tags in your reasoning process to answer the question. The <fact> tags should be interweaved in the sentences in your reasoning.

Instructions:

1. **Read and Understand the Input Question**:
   - Carefully analyze the input question to understand its context.
   - Identify distinct facts, entities, or concepts that contribute to the meaning of the question.

2. **Identify Key Points (Facts)**:
   - Each distinct and meaningful segment of the question that provides important information should be considered a "fact."
   - This can include the subject, object, context, or any qualifiers that make the question specific.

3. **Tag Each Fact**:
   - Assign a unique tag to each fact in the form <fact1>, <fact2>, etc.
   - Wrap each identified key point in these tags.
   - The tags should start from <fact1> for the first key point, and increment for each new fact identified.

4. **Formatting Requirements**:
   - Maintain the original structure of the question as much as possible.
   - Make sure each tag encapsulates the entire key point clearly without splitting phrases unnecessarily.

### Example

#### Input Question:
"Does climate change positively affect polar bear populations in the Arctic? Anwser Options: Yes, No, Depends"

#### Step-by-Step Identification:
1. Fact 1: "Climate change" (main topic causing the effect)
2. Fact 2: "Polar bear populations" (who is affected)
3. Fact 3: "In the Arctic" (the location/context)

#### Reformatted Output:
"How does <fact1>climate change</fact1> affect <fact2>polar bear populations</fact2> in <fact3>the Arctic</fact3>?"

#### Answer Reasoning
"<fact1>Climate change</fact1> primarily affects <fact2>polar bears</fact2> through the loss of <fact3>sea ice</fact3>, which is crucial for their hunting and survival. As <fact3>Arctic ice</fact3> melts earlier and forms later due to <fact1>rising temperatures</fact1>, <fact2>polar bears</fact2> face reduced access to their main prey (seals), leading to nutritional stress, longer fasting periods, and <fact2>overall population decline</fact2>. The impact of <fact1>climate change</fact1> on <fact2>polar bear populations</fact2> is therefore negative, so the answer is {No}"

### Final Output Format

Ensure that the final output is:
- Grammatically correct.
- Properly formatted with each key point enclosed in <fact> tags.
- Consistent with the original meaning of the input question.
- Final answer is enclosed in curly braces e.g. {answer}
"""

In [35]:
semantic_zero_prompt = """
# General Instructions
You are a helpful assistant tasked with tagging key facts in the text and then using those facts to answer the final question. Your first goal is to analyze the input question and identify distinct key points needed to answer the question and then wrap each of these points in custom HTML-like tags like <fact1>, <fact2>, <fact3>. After you have tagged the reformatted question in tags, use these tags in your reasoning process to answer the question. The <tags> should be interweaved in the sentences in your reasoning. You are only concerned with highlighting facts that are essential to answering the question. You should not tag irrelevant information. Once you have determined the answer to the question, you should put the concise version of your final answer in curly braces. For example, {3} or {True} or {A}.

Your response should follow this format:

Reformatted Question:
.... (reformatted question with tags)

Answer Reasoning:
.... (reasoning with tags)

Final Answer: {answer}

# Tagging Information
Each tag should be named according to the type of information it represents. There are multiple distinct types of tags:

- <person> for people or characters
- <location> for places or regions
- <number> for numerical values
- <time> for time-related information
- <concept> for abstract ideas
- <object> for physical things


## Tagging Examples 

### Person Example
For the text "Adam worked as a volunteer firefighter in order to help serve his community.", you would tag the text as follows: "<person>Adam worked as a volunteer firefighter</person> in order to help serve his community."

The text in the <person> tag should include key information relevant to that person. While acceptable, it does not always need to be just their name. 

### Location Example
For the text "What is the capital of France?", you would tag the question as follows: "What is the <location>capital of France</location>?"

### Number Example
For the text "The average temperature in the desert is 110 degrees Fahrenheit.", you would tag the text as follows: "The average temperature in the desert is <number>110 degrees Fahrenheit</number>."

The text in the <number> tag should include any information relevant to a quantity of something. 

### Time Example
For the text "The earliest recorded human writing was composed nearly 4,000 years ago, sometime around 2000 B.C." would be tagged as "The earliest recorded human writing was composed nearly <time1>4,000 years</time1> ago, sometime around <time2>2000 B.C.</time2>"

### Concept Example
For the text "The field of machine learning is a rapidly growing area of study.", you would tag the text as follows: "The field of <concept>machine learning</concept> is a rapidly growing area of study."

### Object Example
For the text "The astronaut used a special tool to repair the damaged satellite.", you would tag the text as follows: "The astronaut used a <object1>special tool</object2> to repair the <object2>damaged satellite</object2>."

## What NOT to Tag
Given an inputted question, there are large amounts of possible tags that could be used. However, not all of these tags are relevant to the final question. You should only tag information that is ESSENTIAL to answering the question. Here is an example of a question that has been over-tagged:

Original Question:
Kaden is a computer science major at NYU where he studies subjects such as physics, math, and the principles of programming languages. He has a pet dog named Max who is 3 years old. Kaden has a part-time job at the local grocery store where he works every Saturday and Sunday. What major ocean does Kaden live near?

Reformatted Question:
"<person1>Kaden</person1> is a <concept1>computer science major</concept1> at <location1>NYU</location1> where he studies subjects such as <concept2>physics</concept2>, <concept3>math</concept3>, and the <concept4>principles of programming languages</concept4>. <person2>He has a pet dog</person2> named <person3>Max</person3> who is <time1>3 years old</time1>. <person4>Kaden has a part-time job</person4> at the <location2>local grocery store</location2> where he works every <time2>Saturday</time2> and <time3>Sunday</time3>. What <object1>major ocean</object1> does <person1>Kaden</person1> live near?"

While this question does have valid tags, almost all of them are not relevant to the question. Here is an exmaple of the same question with only the relevant tags:

Reformatted Question:
"<person1>Kaden</person1> is a computer science major at <location1>NYU</location1> where he studies subjects such as physics, math, and the principles of programming languages. He has a pet dog named Max who is 3 years old. Kaden has a part-time job at the local grocery store where he works every Saturday and Sunday. What <object1>major ocean</object1> does <person1>Kaden</person1> live near?"

Here is an example answer for the properly formatted question:
"Since <person1>Kaden</person1> attends <location1>NYU</location1>, he lives in New York City. Given that New York City is on the east coast of the United States, the nearest major ocean that <person1>Kaden</person1> lives near is the Atlantic Ocean. 
Final answer: {Atlantic Ocean}"
"""

In [49]:
extract_facts = """
# General Instructions
You are a helpful assistant tasked with extracting key facts from the text. Your goal is to analyze the input question and identify distinct sections of the text needed to answer the question. You are only concerned with highlighting facts that are ESSENTIAL to answering the question. You should ignore irrelevant information that does not help answer the final question. 

Reasoning Process:
....

Key Facts:
....

# Fact Extraction Details

## Properly Done Example
Here is an example of properly extracting the information needed to answer the final question:

Original Question:
Betty is saving money for a new wallet which costs $100. Betty has only half of the money she needs. Her parents decided to give her $15 for that purpose, and her grandparents twice as much as her parents. How much more money does Betty need to buy the wallet?

Reasoning Process:
Identifying the important information in this problem centers on determining what leads to finding the difference between the wallet's cost and Betty's total available money.
The wallet's cost of $100 serves as the target amount, establishing the baseline for all calculations. Understanding Betty's available money requires summing all her financial sources. Her initial savings, described as half the needed amount, translates to $50. The parents' contribution of $15 directly adds to this sum. The grandparents' contribution, defined as twice the parents' amount, equals $30 and represents another essential component.
Each numerical value and relationship presented in the problem contributes directly to calculating Betty's total available funds. The absence of any single piece - the wallet cost, initial savings, parental contribution, or the relationship between parental and grandparental gifts - would make it impossible to determine the remaining amount needed.

Key Facts:
- The wallet costs $100
- Betty has half of the money needed
- Bett's parents gave her $15
- Betty's Grandparents gave twice as much as parents

## What NOT to Extract
Given an inputted question, there are large amounts of possible facts that could be used. However, not all of these citations are relevant to the final question. You should only extract information that is ESSENTIAL to answering the question. Here is an example of a question that has too many extracted facts:

Original Question:
Kaden is a computer science major at NYU where he studies subjects such as physics, math, and the principles of programming languages. He has a pet dog named Max who is 3 years old. Kaden has a part-time job at the local grocery store where he works every Saturday and Sunday. What major ocean does Kaden live near?

Reasoning Process:
....

Key Facts:
- Kaden is a computer science major
- Kaden attends NYU
- Kaden studies physics, math and programming languages
- Kaden has a dog
- Max, the dog, is 3 years old
- Kaden has a part time job
- Kaden works at a grocery store
- Kaden works every Saturday and Sunday

While these are all valid pieces of information, almost all of them are not relevant to the question. Here is what the key facts section should actually look like:

Key Facts:
- Kaden attends NYU
"""


In [2]:
extract_facts_reasoning = """
# General Instructions
You are a helpful assistant tasked with extracting key facts from the text. Your goal is to analyze the input question and identify distinct sections of the text needed to answer the question. You are only concerned with identifying quotes that are ESSENTIAL to answering the question. You should ignore irrelevant information that does not help answer the final question. You should extract the exact parts of the text, not summarized versions of the text. After you have decided what specific quotes to use, tag the original question with xml tags around those quotes. Here is what your response should look like:

### Fact Extraction Reasoning:
....

### Key Facts:
....

### Reformatted Question:
...

# Fact Extraction Details
Each block of text has a large amount of information that could be a valid fact. However, you should only extract the most important quotes. Try to keep each quote as short as possible while still maintaining the essential information.

## Properly Done Example
Here is an example of properly extracting the information needed to answer the final question:

### Original Question:
Betty is saving money for a new wallet which costs $100. Betty has only half of the money she needs. Her parents decided to give her $15 for that purpose, and her grandparents twice as much as her parents. How much more money does Betty need to buy the wallet?

### Fact Extraction Reasoning:
Identifying the important information in this problem centers on determining what leads to finding the difference between the wallet's cost and Betty's total available money.
The wallet's cost of $100 serves as the target amount, establishing the baseline for all calculations. Understanding Betty's available money requires summing all her financial sources. Her initial savings, described as half the needed amount, translates to $50. The parents' contribution of $15 directly adds to this sum. The grandparents' contribution, defined as twice the parents' amount, equals $30 and represents another essential component.
Each numerical value and relationship presented in the problem contributes directly to calculating Betty's total available funds. The absence of any single piece - the wallet cost, initial savings, parental contribution, or the relationship between parental and grandparental gifts - would make it impossible to determine the remaining amount needed.

### Key Facts:
<fact1>wallet which costs $100</fact1>
<fact2>Betty has only half of the money</fact2>
<fact3>parents decided to give her $15</fact3>
<fact4>grandparents twice as much as her parents</fact4>

### Reformatted Question:
Betty is saving money for a new <fact1>wallet which costs $100</fact1>. <fact2>Betty has only half of the money</fact2> she needs. Her <fact3>parents decided to give her $15</fact3> for that purpose, and her <fact4>grandparents twice as much as her parents</fact4>. How much more money does Betty need to buy the wallet?

## What NOT to Extract
Given an inputted question, there are large amounts of possible facts that could be used. However, not all of these citations are relevant to the final question. You should only extract information that is essential to answering the question. Here is an example of a question that has too many extracted facts:

### Original Question:
Kaden is a computer science major at NYU where he studies subjects such as physics, math, and the principles of programming languages. He has a pet dog named Max who is 3 years old. Kaden has a part-time job at the local grocery store where he works every Saturday and Sunday. What major ocean does Kaden live near?

### Fact Extraction Reasoning:
....

### Key Facts:
<fact1>Kaden is a computer science major</fact1>
<fact2>NYU</fact2>
<fact3>he studies subjects such as physics, math, and the principles of programming languages</fact3>
<fact4>pet dog named Max</fact4>
<fact5>Max, the dog, is 3 years old</fact5>
<fact6>Kaden has a part-time job</fact6>
<fact7>local grocery store</fact7>
<fact8>he works every Saturday and Sunday</fact8>

### Reformatted Question:
...

While these are all valid pieces of information, almost none of them are relevant to the question. We only want information that is relevant to the final question, which in this case is "What major ocean does Kaden live near?" This is what the key facts section and the associated reformatted question should actually look like:

### Key Facts:
<fact1>Kaden is a computer science major at NYU</fact1>

### Reformatted Question 
<fact1>Kaden is a computer science major at NYU</fact1> where he studies subjects such as physics, math, and the principles of programming languages. He has a pet dog named Max who is 3 years old. Kaden has a part-time job at the local grocery store where he works every Saturday and Sunday. What major ocean does Kaden live near?
### Tagg An
"""

In [4]:
extract_facts_only = """
# General Instructions
You are a helpful assistant tasked with extracting key facts from the text. Your goal is to analyze the input question and identify distinct sections of the text needed to answer the question. You are only concerned with identifying quotes that are ESSENTIAL to answering the question. You should ignore irrelevant information that does not help answer the final question. You should extract the exact parts of the text, not summarized versions of the text. Try to keep the length of your key facts as short as possible without losing important context. Specific numbers, locations, details, etc tend to be good facts to extract. After you have decided what specific quotes to use from the text, tag the original question with xml tags around those quotes. If the question includes multiple choice questions, make sure to include those in the formatted question. Here is what your response should look like:

### Fact Extraction Reasoning:
....

### Key Facts:
....

### Reformatted Question:
...

# Fact Extraction Details
Each block of text has a large amount of information that could be a valid fact. However, you should only extract the most important quotes. Try to keep each quote as short as possible while still maintaining the essential information.

# Properly Done Examples
Here are examples of properly extracting the information needed to answer the final question:

## Example 1
### Original Question:
The following paragraphs each describe a set of seven objects arranged in a fixed order. The statements are logically consistent within each paragraph. In a golf tournament, there were seven golfers: Eve, Rob, Dan, Mel, Ana, Eli, and Ada. Ada finished above Rob. Eve finished below Rob. Mel finished above Eli. Ada finished below Dan. Ana finished third. Eli finished second. Options: (A) Eve finished first (B) Rob finished first (C) Dan finished first (D) Mel finished first (E) Ana finished first (F) Eli finished first (G) Ada finished first

### Fact Extraction Reasoning:
Given that all the answer options contain information about all seven golfers and there is no immediate obvious answer based off the given answer options, we need to identify the relative positions of each of the golfers. The key facts in the statement are the positions of Eve, Rob, Dan, Mel, Ana, Eli, and Ada.

### Key Facts:
<fact1>Ada finished above Rob</fact1>
<fact2>Eve finished below Rob</fact2>
<fact3>Mel finished above Eli</fact3>
<fact4>Ada finished below Dan</fact4>
<fact5>Ana finished third</fact5>
<fact6>Eli finished second</fact6>

### Reformatted Question:
The following paragraphs each describe a set of seven objects arranged in a fixed order. The statements are logically consistent within each paragraph. In a golf tournament, there were seven golfers: Eve, Rob, Dan, Mel, Ana, Eli, and Ada. <fact1>Ada finished above Rob</fact1>. <fact2>Eve finished below Rob</fact2>. <fact3>Mel finished above Eli</fact3>. <fact4>Ada finished below Dan</fact4>. <fact5>Ana finished third</fact5>. <fact6>Eli finished second</fact6>. Options: (A) Eve finished first (B) Rob finished first (C) Dan finished first (D) Mel finished first (E) Ana finished first (F) Eli finished first (G) Ada finished first

## Example 2
### Original Question:
Betty is saving money for a new wallet which costs $100. Betty has only half of the money she needs. Her parents decided to give her $15 for that purpose, and her grandparents twice as much as her parents. How much more money does Betty need to buy the wallet?
A) $10
B) $15
C) $20
D) $25

### Fact Extraction Reasoning:
Identifying the important information in this problem centers on determining what leads to finding the difference between the wallet's cost and Betty's total available money.
The wallet's cost of $100 serves as the target amount, establishing the baseline for all calculations. Understanding Betty's available money requires summing all her financial sources. Her initial savings, described as half the needed amount, translates to $50. The parents' contribution of $15 directly adds to this sum. The grandparents' contribution, defined as twice the parents' amount, equals $30 and represents another essential component.
Each numerical value and relationship presented in the problem contributes directly to calculating Betty's total available funds. The absence of any single piece - the wallet cost, initial savings, parental contribution, or the relationship between parental and grandparental gifts - would make it impossible to determine the remaining amount needed.

### Key Facts:
<fact1>wallet which costs $100</fact1>
<fact2>Betty has only half of the money</fact2>
<fact3>parents decided to give her $15</fact3>
<fact4>grandparents twice as much as her parents</fact4>

### Reformatted Question:
Betty is saving money for a new <fact1>wallet which costs $100</fact1>. <fact2>Betty has only half of the money</fact2> she needs. Her <fact3>parents decided to give her $15</fact3> for that purpose, and her <fact4>grandparents twice as much as her parents</fact4>. How much more money does Betty need to buy the wallet?
A) $10
B) $15
C) $20
D) $25

## Example 3
### Original Question:
Are Doctors of Homeopathy more likely than Doctors of Internal Medicine to recommend Quartz as a treatment?

### Fact Extraction Reasoning:
The key facts in this question are the types of doctors being compared and the treatment they are likely to recommend. The specific types of doctors are Doctors of Homeopathy and Doctors of Internal Medicine. The treatment being considered is Quartz. If any of these facts were missing, it would be impossible to answer the question.

### Key Facts:
<fact1>Doctors of Homeopathy</fact1>
<fact2>Doctors of Internal Medicine</fact2>
<fact3>Quartz</fact3>

### Reformatted Question: 
Are <fact1>Doctors of Homeopathy</fact1> more likely than <fact2>Doctors of Internal Medicine</fact2> to recommend <fact3>Quartz</fact3> as a treatment?

## Example 4
### Original Question:
Alice and Bob are sister and brother. Alice has 4 sisters and Bob has 1 brother. How many sisters does Bob have?

### Fact Extraction Reasoning:
In this question, it's given that Alice and Bob are sister and brother and that Alice has 4 sisters. The question asks for the number of sisters Bob has, which is the missing piece of information needed to answer the question. The key facts in this question are the relationships between Alice and Bob and the number of sisters between them. Given that the question does not ask about the number of brothers Bob has, this information is not relevant to the final answer and does not need to be tagged.

### Key Facts:
<fact1>Alice and Bob are sister and brother</fact1>
<fact2>Alice has 4 sisters</fact2>

### Reformatted Question:
<fact1>Alice and Bob are sister and brother</fact1>. <fact2>Alice has 4 sisters</fact2> and Bob has 1 brother. How many sisters does Bob have?

## Example 5
### Original Question:
Are both Kurram Garhi and Trojkrsti located in the same country?

### Fact Extraction Reasoning:
The question asks about the location of two places, Kurram Garhi and Trojkrsti, and whether they are in the same country. The key facts in this question are the names of the two places. Without this information, it would be impossible to determine if they are in the same country.

### Key Facts:
<fact1>Kurram Garhi</fact1>
<fact2>Trojkrsti</fact2>

### Reformatted Question:
Are both <fact1>Kurram Garhi</fact1> and <fact2>Trojkrsti</fact2> located in the same country?

## Example 6
### Original Question:
Two friends plan to walk along a 43-km trail, starting at opposite ends of the trail at the same time. If Friend P's rate is 15% faster than Friend Q's, how many kilometers will Friend P have walked when they pass each other? A)21, B)21.5, C)22, D)22.5, E)23

### Fact Extraction Reasoning:
The key facts in this question are the length of the trail, the starting positions of the friends and the relative speeds of the friends. Without any of these pieces of information, it would be impossible to solve the problem.

### Key Facts:
<fact1>43-km trail</fact1>
<fact2>starting at opposite ends of the trail at the same time</fact2>
<fact3>P's rate is 15% faster than Friend Q's</fact3>

### Reformatted Question:
Two friends plan to walk along a <fact1>43-km trail</fact1>, <fact2>starting at opposite ends of the trail at the same time</fact2>. If Friend P's rate is <fact3>15% faster than Friend Q's</fact3>, how many kilometers will Friend P have walked when they pass each other? A)21, B)21.5, C)22, D)22.5, E)23

# What NOT to Extract
Given an inputted question, there are large amounts of possible facts that could be used. However, not all of these citations are relevant to the final question. You should only extract information that is essential to answering the question. Here is an example of a question that has too many extracted facts:

## Negative Example
### Original Question:
Kaden is a computer science major at NYU where he studies subjects such as physics, math, and the principles of programming languages. He has a pet dog named Max who is 3 years old. Kaden has a part-time job at the local grocery store where he works every Saturday and Sunday. What major ocean does Kaden live near?

### Fact Extraction Reasoning:
....

### Key Facts:
<fact1>Kaden is a computer science major</fact1>
<fact2>NYU</fact2>
<fact3>he studies subjects such as physics, math, and the principles of programming languages</fact3>
<fact4>pet dog named Max</fact4>
<fact5>3 years old</fact5>
<fact6>Kaden has a part-time job</fact6>
<fact7>local grocery store</fact7>
<fact8>he works every Saturday and Sunday</fact8>

### Reformatted Question:
<fact1>Kaden is a computer science major</fact1> at <fact2>NYU</fact2> where <fact3>he studies subjects such as physics, math, and the principles of programming languages</fact3>. He has a <fact4>pet dog named Max</fact4> who is <fact5>3 years old</fact5>. <fact6>Kaden has a part-time job</fact6> at the <fact7>local grocery store</fact7> where <fact8>he works every Saturday and Sunday</fact8>. What major ocean does Kaden live near?

While these are all valid pieces of information, almost none of these facts are relevant to the question. We only want information that is relevant to the final question, which in this case is "What major ocean does Kaden live near?" This is what the key facts section and the associated reformatted question should actually look like:

### Key Facts:
<fact1>Kaden is a computer science major at NYU</fact1>

### Reformatted Question 
<fact1>Kaden is a computer science major at NYU</fact1> where he studies subjects such as physics, math, and the principles of programming languages. He has a pet dog named Max who is 3 years old. Kaden has a part-time job at the local grocery store where he works every Saturday and Sunday. What major ocean does Kaden live near?

Extract the key facts and reformat the following question:
"""

In [1]:
second_turn_facts = """
Now that you have put the key facts in tags, use these tags in your reasoning process to answer the question. The <fact...> tags should be interweaved in the sentences in your reasoning. Put your final answer in curly braces. For example, {3} or {True} or {A}.

Here is an example of how you would answer the question based off the Reformatted Question you previously created:

### Reformatted Question:
How does <fact1>climate change</fact1> affect <fact2>polar bear populations</fact2> in <fact3>the Arctic</fact3>?

### Final Answer Reasoning
<fact1>Climate change</fact1> primarily affects <fact2>polar bears</fact2> through the loss of <fact3>sea ice</fact3>, which is crucial for their hunting and survival. As <fact3>Arctic ice</fact3> melts earlier and forms later due to <fact1>rising temperatures</fact1>, <fact2>polar bears</fact2> face reduced access to their main prey (seals), leading to nutritional stress, longer fasting periods, and <fact2>overall population decline</fact2>. The impact of <fact1>climate change</fact1> on <fact2>polar bear populations</fact2> is therefore negative, so the answer is {No}

Perform your final answering reasoning on the Reformatted Question you created earlier. Do not repeat the Reformatted Question in your new response.
"""

In [5]:
answer_question_only = """
You are a helpful assistant tasked with using the key facts in a block of text to reason through the final question. The most important parts of the text are wrapped in custom XML-like tags like <fact1>, <fact2>, <fact3>. While you are reasoning through your answer, the <fact> tags should be interweaved in your response. Think through your answer step by step using the information in these tags. Once you have determined the answer to the question, you should put the concise version of your final answer in curly braces.

# Example 1
## Question:
The following paragraphs each describe a set of seven objects arranged in a fixed order. The statements are logically consistent within each paragraph. In a golf tournament, there were seven golfers: Eve, Rob, Dan, Mel, Ana, Eli, and Ada. <fact1>Ada finished above Rob</fact1>. <fact2>Eve finished below Rob</fact2>. <fact3>Mel finished above Eli</fact3>. <fact4>Ada finished below Dan</fact4>. <fact5>Ana finished third</fact5>. <fact6>Eli finished second</fact6>. Options: (A) Eve finished first (B) Rob finished first (C) Dan finished first (D) Mel finished first (E) Ana finished first (F) Eli finished first (G) Ada finished first

## Answer:
Step-by-Step Deduction
Step 1: Placing Eli and Ana
Since we know:
<fact6>Eli finished second</fact6> and
<fact5>Ana finished third</fact5>,
we can place them immediately in the following positions:

Position	Golfer
1	?
2	Eli
3	Ana
4	?
5	?
6	?
7	?
Step 2: Applying Ada and Dan's Information
<fact4>Ada finished below Dan</fact4>: This means Dan must be placed in a higher position than Ada.
Since <fact6>Eli is in the second position</fact6> and <fact5>Ana is in the third position</fact5>, Dan must be in the 1st, 4th, 5th, 6th, or 7th position.

Step 3: Applying Mel and Eli's Information
<fact3>Mel finished above Eli</fact3>: Since <fact6>Eli is in 2nd</fact6>, Mel must be in the 1st position.
So, we place Mel in the 1st position:

Position	Golfer
1	Mel
2	Eli
3	Ana
4	?
5	?
6	?
7	?
Step 4: Applying Ada and Rob's Information
<fact1>Ada finished above Rob</fact1>: This means Rob must be placed in a lower position than Ada.
<fact2>Eve finished below Rob</fact2>: This means Rob must be above Eve.
Since <fact4>Dan must be above Ada</fact4> and we still have the 4th, 5th, 6th, and 7th positions open, let's try placing Dan in the 4th position and Ada in the 5th position (since <fact4>Ada has to be below Dan</fact4>).

Position	Golfer
1	Mel
2	Eli
3	Ana
4	Dan
5	Ada
6	?
7	?
Step 5: Placing Rob and Eve
Since Ada finished above Rob and Eve finished below Rob, we can place Rob in the 6th position and Eve in the 7th position.

Position	Golfer
1	Mel
2	Eli
3	Ana
4	Dan
5	Ada
6	Rob
7	Eve
Conclusion
The final order of golfers is:

Mel
Eli
Ana
Dan
Ada
Rob
Eve
According to this arrangement, Mel finished first.

Answer
Since Mel finished first, the correct answer is {D}.

# Example 2
## Question:
<fact1>Betty picked 16 strawberries</fact1>. <fact2>Matthew picked 20 more strawberries than Betty</fact2> and <fact3>twice as many as Natalie</fact3>. They used their strawberries to make jam. <fact4>One jar of jam used 7 strawberries</fact4> and they sold <fact5>each jar at $4</fact5>. How much money were they able to make from the strawberries they picked?

## Answer:
Matthew picked <fact1>16</fact1> <fact2>+ 20</fact2> = 36 strawberries.\nNatalie picked 36<fact3>/2</fact3> = 18 strawberries.\nAll together, they have 16 + 36 + 18 = 70 strawberries.\nThey can make 70<fact4>/7</fact4> = 10 jars of strawberries.\nThey earn 10 <fact5>x $4</fact5> = 40 from the strawberries they picked. The total amount of money they earned was {40}.

# Example 3
## Question:
Are <fact1>Doctors of Homeopathy</fact1> more likely than <fact2>Doctors of Internal Medicine</fact2> to recommend <fact3>Quartz</fact3> as a treatment?

## Answer:
<fact1>Doctors of homeopathy</fact1> are more likely than <fact2>doctors of internal medicine</fact2> to recommend substances like <fact3>quartz</fact3> or other alternative therapies. <fact1>People practicing homeopathic medicine</fact1> often recommend natural elements and alternative remedies that are not generally supported by conventional scientific research, including crystals like <fact3>quartz</fact3>. <fact3>Quartz</fact3> may be suggested by some <fact1>homeopathic practitioners</fact1> for its perceived energetic or vibrational properties, though these claims lack empirical support in mainstream medical science.

In contrast, <fact2>doctors of internal medicine</fact2>, who adhere to evidence-based practices, would be unlikely to recommend <fact3>quartz</fact3> or similar treatments, as there is no scientifically validated therapeutic benefit attributed to <fact3>quartz</fact3> in medical literature. Therefore, the answer is {Yes}.

# Example 4
## Question:
<fact1>Alice and Bob are sister and brother</fact1>. <fact2>Alice has 4 sisters</fact2> and Bob has 1 brother. How many sisters does Bob have?

## Answer
Since <fact1>Alice is one of Bob's sisters</fact1>, and <fact2>Alice has 4 sisters</fact2>, we know that Bob must also have those same <fact2>4 sisters</fact2> in addition to having <fact1>Alice as a sister</fact1>. Therefore, Bob has {5} sisters.

# Example 5
## Question:
Are both <fact1>Kurram Garhi</fact1> and <fact2>Trojkrsti</fact2> located in the same country?
## Answer
Kurram Garhi is located in Pakistan. It’s a small area within the Kurram District, in the Khyber Pakhtunkhwa region.

Trojkrsti, on the other hand, is not located in Pakistan. It is a village situated in North Macedonia, a country in Southeast Europe.

Since Kurram Garhi is in Pakistan and Trojkrsti is in North Macedonia, they are not located in the same country. The answer is {No}.

# Example 6
## Question:
Two friends plan to walk along a <fact1>43-km trail</fact1>, <fact2>starting at opposite ends of the trail at the same time</fact2>. If Friend P's rate is <fact3>15% faster than Friend Q's</fact3>, how many kilometers will Friend P have walked when they pass each other? A)21, B)21.5, C)22, D)22.5, E)23
## Answer
If friend Q complete x kilometers, then P completes <fact3>1.15x kilometers</fact3>.\nx + <fact3>1.15x</fact3> = <fact1>43</fact1>\n2.15x=<fact1>43</fact1>\nx = <fact1>43</fact1>/2.15 = 20\nThen P will have have walked 1.15*20=23 km.\nThe answer is {E}.

# Example 7
## Question:
<fact1>Kaden is a computer science major at NYU</fact1> where he studies subjects such as physics, math, and the principles of programming languages. He has a pet dog named Max who is 3 years old. Kaden has a part-time job at the local grocery store where he works every Saturday and Sunday. What major ocean does Kaden live near?
## Answer

New York City, where <fact1>NYU</fact1> is located, is on the eastern coast of the United States, adjacent to the Atlantic Since Kaden is studying at <fact1>NYU</fact1>, he lives near the Atlantic Ocean. The answer is {Atlantic Ocean}.

Using the key facts in the text, think step by step to get your answer. Put your final answer in curly braces e.g. {3}. Your final answer should only be the final number with no other text.
"""

## Tin Prompt

In [1]:
examples_for_grounding_in_question = """
# EXAMPLES
Below are examples of questions before and after key phrases are tagged using <fact> tags.
If one key phrase was absent, it would be impossible for one to answer the question correctly.

## Question 1: 
### BEFORE: 
Sam works at the Widget Factory, assembling Widgets. He can assemble 1 widget every 10 minutes. Jack from the loading dock can help assemble widgets when he doesn't have anything else to do. When he helps, they put together 2 complete widgets every 15 minutes. Recently the factory hired Tony to help assemble widgets. Being new to the job, he doesn't work as fast as Sam or Jack. Yesterday Sam worked for 6 hours before he had to leave work early for a dentist appointment. Jack was able to help out for 4 hours before he had to go back to the loading dock to unload a new shipment of widget materials. Tony worked the entire 8-hour shift. At the end of the day, they had completed 68 widgets. How long does it take Tony to assemble a Widget, in minutes?

### AFTER:
Sam works at the Widget Factory, assembling Widgets. He can assemble <fact1>1 widget every 10 minutes</fact1>. Jack from the loading dock can help assemble widgets when he doesn't have anything else to do. When he helps, they put together <fact2>2 complete widgets every 15 minutes</fact2>. Recently the factory hired Tony to help assemble widgets. Being new to the job, he doesn't work as fast as Sam or Jack. Yesterday Sam worked for <fact3>6 hours</fact3> before he had to leave work early for a dentist appointment. Jack was able to help out for <fact4>4 hours</fact4> before he had to go back to the loading dock to unload a new shipment of widget materials. Tony worked the entire <fact5>8-hour shift</fact5>. At the end of the day, they had completed <fact6>68 widgets</fact6>. How long does it take Tony to assemble a Widget, in minutes?

Sam assembles <fact1>1 widget every 10 minutes</fact1>, or 6 per hour. He worked <fact3>6 hours</fact3> total, but for <fact4>4</fact4> of those, Jack helped, and together they assembled <fact2>2 complete widgets every 15 minutes</fact2>, which is 8 widgets per hour. In those <fact4>4 hours</fact4>, Sam and Jack assembled 32 widgets. For the remaining 2 hours, Sam worked alone, assembling 12 more widgets. Altogether, Sam and Jack contributed 44 widgets.

Tony worked 8 hours, and since the total number of widgets completed was 68, Tony assembled the remaining 24 widgets. His rate is 24 widgets over 8 hours, or 3 widgets per hour.

Since Tony assembles 3 widgets per hour, he takes 20 minutes to assemble one widget.

Answer: Tony takes 20 minutes per widget.


## Question 2: 
### BEFORE: 
For every 12 cans you recycle, you receive $0.50, and for every 5 kilograms of newspapers, you receive $1.50. If your family collected 144 cans and 20 kilograms of newspapers, how much money would you receive?

### AFTER: 
For <fact1>every 12 cans</fact1> you recycle, you receive <fact2>$0.50</fact2>, and for <fact3>every 5 kilograms of newspapers</fact3>, you receive <fact4>$1.50</fact4>. If your family collected <fact5>144 cans</fact5> and <fact6>20 kilograms of newspapers</fact6>, how much money would you receive?

## Question 3: 
### BEFORE: 
At a presentation about post traumatic stress disorder, would Ariana Grande be a topic of relevance?

### AFTER: 
At a presentation about <fact1>post traumatic stress disorder</fact1>, would <fact2>Ariana Grande</fact2> be a topic of relevance?

## Question 4: 
### BEFORE: 
Has the Indian Ocean garbage patch not completed two full rotations of debris since its discovery?

### AFTER:
Has the <fact1>Indian Ocean garbage patch</fact1> <fact2>not</fact2> completed <fact3>two full rotations</fact3> of debris since its discovery?

## Question 5: 
### BEFORE:
The following paragraphs each describe a set of seven objects arranged in a fixed order. The statements are logically consistent within each paragraph. In a golf tournament, there were seven golfers: Ana, Eve, Ada, Dan, Rob, Amy, and Joe. Dan finished third. Ana finished above Ada. Amy finished last. Dan finished below Rob. Eve finished below Ada. Rob finished below Joe. Choose one correct option: (A) Ana finished third (B) Eve finished third (C) Ada finished third (D) Dan finished third (E) Rob finished third (F) Amy finished third (G) Joe finished third

### AFTER:
The following paragraphs each describe a set of seven objects arranged in a fixed order. The statements are logically consistent within each paragraph. In a golf tournament, there were seven golfers: Ana, Eve, Ada, Dan, Rob, Amy, and Joe. <fact1>Dan finished third</fact1>. Ana finished above Ada. Amy finished last. Dan finished below Rob. Eve finished below Ada. Rob finished below Joe. Choose one correct option: (A) Ana finished third (B) Eve finished third (C) Ada finished third (D) Dan finished third (E) Rob finished third (F) Amy finished third (G) Joe finished third

## Question 6: 
### BEFORE:
The following paragraphs each describe a set of seven objects arranged in a fixed order. The statements are logically consistent within each paragraph. On a branch, there are seven birds: a cardinal, a blue jay, a robin, a crow, a falcon, a hawk, and a raven. The hawk is the second from the right. The raven is the fourth from the left. The robin is the second from the left. The cardinal is to the left of the raven. The falcon is to the left of the robin. The crow is to the right of the blue jay. Choose one correct option: (A) The cardinal is the second from the left (B) The blue jay is the second from the left (C) The robin is the second from the left (D) The crow is the second from the left (E) The falcon is the second from the left (F) The hawk is the second from the left (G) The raven is the second from the left

### AFTER:
On a branch, there are seven birds: a cardinal, a blue jay, a robin, a crow, a falcon, a hawk, and a raven. <fact1>The hawk is the second from the right</fact1>. <fact2>The raven is the fourth from the left</fact2>. <fact3>The robin is the second from the left</fact3>. <fact4>The cardinal is to the left of the raven</fact4>. <fact5>The falcon is to the left of the robin</fact5>. <fact6>The crow is to the right of the blue jay</fact6>. Options: (A) The cardinal is the second from the left (B) The blue jay is the second from the left (C) The robin is the second from the left (D) The crow is the second from the left (E) The falcon is the second from the left (F) The hawk is the second from the left (G) The raven is the second from the left

## Question 7:
### BEFORE:
How would a typical person answer each of the following questions about causation? A machine is set up in such a way that it will short circuit if both the black wire and the red wire touch the battery at the same time. The machine will not short circuit if just one of these wires touches the battery. The black wire is designated as the one that is supposed to touch the battery, while the red wire is supposed to remain in some other part of the machine. One day, the black wire and the red wire both end up touching the battery at the same time. There is a short circuit. Did the black wire cause the short circuit? Options: - Yes - No

### AFTER:
How would a typical person answer each of the following questions about causation? A <fact1>machine is set up in such a way that it will short circuit</fact1> if both the <fact2>black wire and the red wire touch the battery at the same time</fact2>. The <fact3>machine will not short circuit if just one of these wires touches the battery</fact3>. The black wire is designated as the one that is supposed to touch the battery, while the red wire is supposed to remain in some other part of the machine. One day, the <fact2>black wire and the red wire both end up touching the battery at the same time</fact2>. There is a short circuit. Did the black wire cause the short circuit?
Options: - Yes - No

## Question 8:
### BEFORE:
A coin is heads up. roxas does not flip the coin. scheideman does not flip the coin.  Is the coin still heads up? Flip means reverse.

### AFTER:
<fact1>A coin is heads up</fact1>. <fact2>roxas does not flip the coin</fact2>. <fact3>scheideman does not flip the coin</fact3>.  Is the coin still heads up? Flip means reverse.

## Question 9:
### BEFORE:
I have four pianos, four snails, three chickens, a pig, a dog, and two cows. How many animals do I have?

### AFTER:
I have four pianos, <fact1>four snails</fact1>, <fact2>three chickens</fact2>, <fact3>a pig</fact3>, <fact4>a dog</fact4>, and <fact5>two cows</fact5>. How many animals do I have?

## Question 10:
### BEFORE:
2015 is coming in 36 hours. What is the date one week from today in MM/DD/YYYY?

### AFTER:
2015 is coming in <fact1>36 hours</fact1>. What is the date <fact2>one week from today</fact2> in MM/DD/YYYY?

## Question 10:
### BEFORE:
If you follow these instructions, do you return to the starting point? Always face forward. Take 1 step right. Take 3 steps left. Take 2 steps right. Options: - Yes - No

### AFTER:
If you follow these instructions, do you return to the starting point? Always face forward. Take <fact1>1 step right</fact1>. Take <fact2>3 steps left</fact2>. Take <fact3>2 steps right</fact3>. Options: - Yes - No

## Question 11: #reasoining color
### BEFORE:
Question: On the desk, you see a set of things arranged in a row: a grey cup, a purple mug, and a blue teddy bear. What is the color of the thing directly to the right of the cup? Options: (A) red (B) orange (C) yellow (D) green (E) blue (F) brown (G) magenta (H) fuchsia (I) mauve (J) teal (K) turquoise (L) burgundy (M) silver (N) gold (O) black (P) grey (Q) purple (R) pink

### AFTER:
On the desk, you see a set of things arranged in a row: a <fact1>grey cup</fact1>, a <fact2>purple mug</fact2>, and a blue teddy bear. What is <fact3>the color of the thing directly to the right of the cup</fact3>? Options: (A) red (B) orange (C) yellow (D) green (E) blue (F) brown (G) magenta (H) fuchsia (I) mauve (J) teal (K) turquoise (L) burgundy (M) silver (N) gold (O) black (P) grey (Q) purple (R) pink

## Question 12:
### BEFORE:
Among the various models of Delta vacuum cleaners, one cannot accurately predict how effectively a particular model cleans simply by determining how powerful its motor is. The efficiency of dust filtration systems varies significantly, even between models of Delta vacuum cleaners equipped with identically powerful motors. The argument's conclusion is properly drawn if which one of the following is assumed?
Answer Choices:
(a) All Delta vacuum cleaners that clean equally effectively have identically powerful motors.
(b) One cannot accurately assess how effectively any Delta vacuum cleaner cleans without knowing how powerful that vacuum cleaner's motor is.
(c) For each Delta vacuum cleaner, the efficiency of its dust filtration system has a significant impact on how effectively it cleans.
(d) For any two Delta vacuum cleaners with equally efficient dust filtration systems, the one with the more powerful motor cleans more effectively.

### AFTER:
Among the various models of Delta vacuum cleaners, <fact1>one cannot accurately predict how effectively a particular model cleans</fact1> simply by <fact2>determining how powerful its motor is</fact2>. The efficiency of <fact3>dust filtration systems varies significantly</fact3>, even between models of <fact4>Delta vacuum cleaners equipped with identically powerful motors</fact4>. The argument's conclusion is properly drawn if which one of the following is assumed?
(a) All Delta vacuum cleaners that clean equally effectively have identically powerful motors.
(b) One cannot accurately assess how effectively any Delta vacuum cleaner cleans without knowing how powerful that vacuum cleaner's motor is.
(c) For each Delta vacuum cleaner, the efficiency of its dust filtration system has a significant impact on how effectively it cleans.
(d) For any two Delta vacuum cleaners with equally efficient dust filtration systems, the one with the more powerful motor cleans more effectively.

## Question 13:
### BEFORE:
We have three blocks, A, B and C. Block A has a medium blue square. Below block A is block B which has one medium black square. To the left of block B there is block C which has two medium blue squares. Medium blue square number one is below medium blue square number two. A medium yellow square is below medium blue square number two and medium blue square number one. What is to the left of the black thing? a medium blue square that is in block A or a medium blue square number two?
(a) medium blue square  that is in block A
(b) medium blue square  number two
(c) both of them
(d) none of them

### AFTER:
We have three blocks, A, B, and C. Block A has a medium blue square. <fact1>Below block A is block B, which has one medium black square</fact1>. <fact2>To the left of block B, there is block C, which has two medium blue squares</fact2>. Medium blue square number one is below medium blue square number two. A medium yellow square is below medium blue square number two and medium blue square number one. What is to the left of the black thing? A medium blue square that is in block A or a medium blue square number two?

My Question is:
"""

instruction_for_grounding_in_question = 'Read the question. Detect the exact key facts in the question via following rules:\
1. Do not change, paraphrase, or introduce new words or phrases to the key facts. \
2. If the question just mentions about one object, one character or one location, etc, then you do not need to include that in the fact, whereas, if the question has many objects, characters or locations, etc, please include them in the fact as well. \
3. Extract the shortest and most concise key facts, and make sure that if any of them were removed, it would make it impossible to answer the question. \
4. Do not tag irrelevant key facts. \
5. If two or more facts are consecutive and cannot be meaningfully split, tag them together. \
Provide your detected key facts as the following form:\
    Key Information: '

question = 'Betty picked 16 strawberries. Matthew picked 20 more strawberries than Betty and twice as many as Natalie. They used their strawberries to make jam. One jar of jam used 7 strawberries and they sold each jar at $4. How much money were they able to make from the strawberries they picked?'

prompt = f"{examples_for_grounding_in_question}\n{question}\n{instruction_for_grounding_in_question}"

# Main Functions

In [25]:
def query_4o_multiturn(prompt: str) -> str:
    client = OpenAI()
    messages = [
        {"role": "user", "content": prompt}
    ]
    
    try:
        # First turn
        completion1 = client.chat.completions.create(
            model="gpt-4o-2024-08-06",
            messages=messages,
            temperature=0
        )
        response1 = completion1.choices[0].message.content.strip()
        
        # Append assistant's first response
        messages.append({"role": "assistant", "content": response1})
        
        # Second turn: follow-up prompt
        follow_up_prompt = second_turn_facts
        messages.append({"role": "user", "content": follow_up_prompt})
        
        completion2 = client.chat.completions.create(
            model="gpt-4o-2024-08-06",
            messages=messages,
            temperature=0
        )
        response2 = completion2.choices[0].message.content.strip()
        
        # Combine both responses
        combined_response = f"{response1}\n\n{response2}"
        # print(f"Combined response: {combined_response}")
        return combined_response
    
    except Exception as e:
        print(f"Error in multiturn query_4o: {str(e)}")
        return ""
    
# def query_gemini(prompt: str, problem_id) -> str:
#     """
#     Queries the Gemini LLM with the given prompt and returns the response text.
#     """
#     genai.configure(api_key=get_gemini_key(problem_id))
#     model = genai.GenerativeModel('gemini-1.5-pro-latest')
#     response = model.generate_content(prompt, request_options=RequestOptions(retry=retry.Retry(initial=20, multiplier=3, maximum=121, timeout=60)))
#     text = response.candidates[0].content.parts[0].text
#     return text

def extract_question(text):
    marker = "Reformatted Question:"
    if marker not in text:
        return None
    
    extracted_text = text.split(marker)[1].strip()
    
    return extracted_text

def query_4o_multiconvo(fact_prompt, answer_prompt) -> str:
    client = OpenAI()
    message = [
        {"role": "user", "content": fact_prompt}
    ]
    
    try:
        completion1 = client.chat.completions.create(
            model="gpt-4o-2024-08-06",
            messages=message,
            temperature=0
        )
        extracted_facts = completion1.choices[0].message.content.strip()
        
        extracted_question = extract_question(extracted_facts)
        answer_prompt += "\n" + extracted_question
        message2 = [
            {"role": "user", "content": answer_prompt}
        ]
        completion2 = client.chat.completions.create(
            model="gpt-4o-2024-08-06",
            messages=message2,
            temperature=0
        )
        answer_reasoning = completion2.choices[0].message.content.strip()
        
        full_convo = extracted_facts + "\n--------------- End of Conversation ---------------\n" +  extracted_question +"\n--------------- Extracted Question ---------------\n"+ "\n" + answer_reasoning
        return full_convo 
    
    except Exception as e:
        print(f"Error in multiturn query_4o: {str(e)}")
        return ""
    
def query_llama_multiconvo(fact_prompt: str, answer_prompt: str) -> str:
    try:
        # First turn - fact extraction
        fact_response = ollama.generate(
            model='llama3.1',  # Update this to match your specific LLaMA model
            prompt=fact_prompt,
            options={
                'temperature': 0  # Keep temperature 0 for consistent outputs
            }
        )
        extracted_facts = fact_response['response'].strip()
        
        # Extract question from facts
        extracted_question = extract_question(extracted_facts)
        # print(extracted_question)
        # Second turn - answer generation
        full_answer_prompt = answer_prompt + "\n" + extracted_question
        answer_response = ollama.generate(
            model='llama3.1',  # Update this to match your specific LLaMA model
            prompt=full_answer_prompt,
            options={
                'temperature': 0
            }
        )
        answer_reasoning = answer_response['response'].strip()
        
        # Combine all parts into final output
        full_convo = (
            f"{extracted_facts}\n"
            f"--------------- End of Conversation ---------------\n"
            f"{extracted_question}\n"
            f"--------------- Extracted Question ---------------\n"
            f"{answer_reasoning}"
        )
        
        return full_convo
    
    except Exception as e:
        print(f"Error in multiturn query_llama: {str(e)}")
        return ""

def query_claude(prompt: str) -> str:
    """
    Queries the Claude LLM with the given prompt and returns the response text.
    """
    client = anthropic.Anthropic(api_key=API_KEYS['claude'])
    response = client.messages.create(
        model="claude-3-5-sonnet-20240620",
        max_tokens=1024,
        messages=[{"role": "user", "content": prompt}]
    )
    return response.content[0].text

def query_4o(prompt: str) -> str:
    client = OpenAI()

    completion = client.chat.completions.create(
        model="gpt-4o-2024-08-06",
        messages=[
            {
                "role": "user",
                "content": f"{prompt}"
            }
        ],
        temperature=0
    )

    return completion.choices[0].message.content

def query_llama(prompt: str, timeout_duration=10) -> str:
    def generate():
        response = ollama.generate(
            model='llama3.1',  # Update this to match your specific LLaMA model
            prompt=prompt,
            options={
                'temperature': 0  # Keep temperature 0 for consistent outputs
            }
        )
        return response['response'].strip()

    with ThreadPoolExecutor(max_workers=1) as executor:
        future = executor.submit(generate)
        try:
            result = future.result(timeout=timeout_duration)
            return result
        except TimeoutError:
            return "timeout"


In [14]:
def save_results(save_path: str, ids: List[str], questions: List[str], answers: List[str], append: bool = False):
    """
    Saves the results to a CSV file. If append is True and the file exists, it appends without headers.
    Otherwise, it writes a new file with headers.
    """
    df = pd.DataFrame({'id': ids, 'question': questions, 'answer': answers})
    if append and os.path.exists(save_path):
        df.to_csv(save_path, mode='a', index=False, header=False)
    else:
        df.to_csv(save_path, index=False)

def read_jsonl_file(filepath: str) -> List[dict]:
    """
    Reads a JSONL file and returns a list of JSON objects.
    """
    data = []
    with open(filepath, 'r') as file:
        for line in file:
            json_obj = json.loads(line)
            data.append(json_obj)
    return data

def get_prompt(prompt_type: str, few_shot_prompt: str, question: str) -> str:
    prompts = {
        "cot": f"{few_shot_prompt}\n{question}\nPlease generate your explanation first, then generate the answer in the bracket as follow:\n" +"Answer: {}",
        "log_cot_mcq": f"{few_shot_prompt}\n{question}\nThink through your answer step by step and then choose the answer option that is the most correct. Then put your final answer in curly brackets. For example, Final_Answer:{{3}}",
        "vanilla_cot": f"{question}\nThink through your answer step by step. Then put your final answer in curly brackets. Your final answer should just be a number for example, Final answer:{{0}}",
        "fs": f"{few_shot_prompt}\n{question}",
        "fs_inst": f"{few_shot_prompt}\n{question}\nI want you to answer this question but your explanation should contain references referring back to the information in the question. To do that, first, re-generate the question with proper tags and then generate your answers. The output format is as follow:\n\
            Reformatted Question: \
                Answer:",
        "zs": f"{question}\nI want you to answer this question but your explanation should contain references referring back to the information in the question. To do that, first, re-generate the question with proper tags (<a>, <b>, <c>, etc) for refered information and then generate your answers that also have the tag (<a>, <b>, <c>, etc) for the grounded information. Give your answer by analyzing step by step, and give only numbers in the final answer. The output format is as follow:\n\
            Reformatted Question: \
                Answer:\
                    Final answer:",
        "fs_xml": f"{few_shot_prompt}\n\nRecreate the following question in the style of the correctly formatted examples shown previously. Make sure that your response has all its information inclosed in the proper <tags>. Begin your response with the <key_facts> section. Make sure that every fact in <key_facts> is very concise and contains a very short reference to the <question>. Do not include a <question> section in your response\n\n<question>\n{question}\n</question>",
        "fs_log_inst": f"{few_shot_prompt}\n\n{question}\nTo answer this question, your explanation should contain references referring back to the information in the question. To do that, first, re-generate the question with proper tags and then generate your answers based off the tags. Put your final answer in curly brackets e.g. Final_Answer: {{false}}. Your final answer should only be \"true\" or \"false\".",
        "fs_clause_inst": f"{few_shot_prompt}\n\n{question}\nTo answer this question, first regenerate the question with <fact> tags around each clause or phrase in the text. Each clause or phrase should be as concise as possible so that long sentences will be broken up into multiple segments. Then, to answer the original question, your explanation should contain references back to the information in the tagged question. After you have generated the reformatted question and your reasoning which contains references to the tagged reformatted question, put your answer in curly brackets e.g. Final_Answer: {{false}}. Your final answer should either be \"true\" or \"false\".",
        "stripped_clause": f"{few_shot_prompt}\n\n{question}\nTo answer this question, your explanation should contain references referring back to the information in the question. Generate your answers based off the tags in the question. Use the example answers as a guide for what your answer format should look like. The <fact> tags should be interweaved in the sentences in your reasoning. Put your final answer in curly brackets e.g. Final_Answer: {{false}}. Your final answer should either be \"true\" or \"false\".",
        "mermaid_get_answer": f"{few_shot_prompt}\n\n Your job is to extract the key facts from a question relevant to answering the question. The facts should be represented in a hierarchal format through a mermaid diagram. Do not create duplicate facts across multiple branches that represent the same information. Create a mermaid diagram that represents the key facts in the following question. Then, use the nodes from this graph to cite specific facts in your answer reasoning. Put your final answer in curly brackets e.g. Final_Answer: {{30}} \n\nquestion: {question}", 
        # "mega_prompt": f"{mega_prompt}\nYour final answer to this question should ONLY be {{true}} or {{false}} \n\n{question}",
        # "semantic_zero_prompt": f"{semantic_zero_prompt}\n\n{question}\n\n ONLY include the number in your final answer. For example, {3}",
        # "extract_facts": f"{extract_facts_quote}\n\n{question}",
        # "extract_facts_reasoning": f"{extract_facts_reasoning}\n\n{question}",
        # "extract_facts_no_reasoning": f"{extract_facts_no_reasoning}\n\n{question}",
        "fact_prompt": f"{extract_facts_only}\n{question}",
        "answer_prompt": f"{answer_question_only}\n{question}"
    }
    return prompts.get(prompt_type, "")


def query_llm(llm_model: str, ids: List[str], questions: List[str], few_shot_prompt: str, prompt_type: str, save_path: str, already_answered_ids: set) -> Tuple[List[str], List[str], List[str]]:
    answers = []
    ids_can_be_answered = []
    questions_can_be_answered = []
    
    for id, q in tqdm(zip(ids, questions), total=len(ids)):
        # print(f"Processing ID: {id}")
        if id in already_answered_ids:
            print(f"Skipping already answered ID: {id}")
            continue
        
        prompt = get_prompt(prompt_type, few_shot_prompt, q)
        try:
            if llm_model == 'gemini':
                answer = query_gemini(prompt, id)
            elif llm_model == 'claude':
                answer = query_claude(prompt)
            elif llm_model == '4o':
                # answer = query_4o_multiturn(prompt)
                if prompt_type == 'multi_convo':
                    fact_prompt = get_prompt(prompt_type="fact_prompt", few_shot_prompt="", question=q)
                    answer_prompt = get_prompt(prompt_type="answer_prompt", few_shot_prompt="", question=q)
                    answer = query_4o_multiconvo(fact_prompt=fact_prompt, answer_prompt=answer_prompt)
                else:
                    answer = query_4o(prompt)
                
            elif llm_model == 'llama3.1':
                if prompt_type == 'multi_convo':
                    fact_prompt = get_prompt(prompt_type="fact_prompt", few_shot_prompt="", question=q)
                    answer_prompt = get_prompt(prompt_type="answer_prompt", few_shot_prompt="", question=q)
                    answer = query_4o_multiconvo(fact_prompt=fact_prompt, answer_prompt=answer_prompt)                
                else:
                    print(prompt)
                    answer = query_llama(prompt)
            else:
                raise ValueError(f"Unsupported LLM model: {llm_model}")
            # print(f"Answer for ID {id}: {answer}")
            
            answers.append(answer)
            questions_can_be_answered.append(q)
            ids_can_be_answered.append(id)

            # Save after each answer
            save_results(save_path, [id], [q], [answer], append=True)
        except Exception as e:
            print(f"Error processing question {id}: {str(e)}")
            continue
    
    return ids_can_be_answered, questions_can_be_answered, answers

def load_data_size_specific(data_path: str, sample_size: int = 0, random_seed: int = 0):
    random.seed(random_seed)

    data = read_jsonl_file(data_path)
    # with open(data_path, 'r') as file:
    #     data = json.load(file)
    
    question_length = 0 # 336  # 526 # 800
    eligible_data = [x for x in data if len(x["question"]) >= question_length]
    
    if sample_size > 0 and sample_size < len(eligible_data):
        sampled_data = random.sample(eligible_data, sample_size)
    else:
        sampled_data = eligible_data
    
    ids = [x["id"] for x in sampled_data]
    questions = [x["question"] for x in sampled_data]
    
    return ids, questions

def load_data_csv(data_path, sample_size: int = 0, random_seed: int = 0):
    data = pd.read_csv(data_path)
    ids = []
    questions = []
    for row in data.iterrows():
        id = row[1]['id']
        extracted_question = row[1]['extracted_question']
        ids.append(id)
        questions.append(extracted_question)
    return ids[:sample_size], questions[:sample_size]

def load_few_shot_prompt(prompt_path: str) -> str:
    with open(prompt_path, 'r') as file:
        prompt = file.read()
    # print(f"Loaded few-shot prompt from: {prompt_path}")
    return prompt

def load_already_answered_ids(save_path: str) -> set:
    if os.path.exists(save_path):
        df = pd.read_csv(save_path)
        answered_ids = set(df['id'].astype(int).tolist())
        # print(f"Loaded {len(answered_ids)} already answered IDs from: {save_path}")
        print(f"Already answered IDs: {answered_ids}")
        return answered_ids
    else:
        print(f"No existing save file found at: {save_path}. Starting fresh.")
        return set()

def initialize_save_file(save_path: str):
    if not os.path.exists(save_path):
        # Create an empty DataFrame with headers and save
        df = pd.DataFrame(columns=['id', 'question', 'answer'])
        df.to_csv(save_path, index=False)
        print(f"Initialized new save file with headers at: {save_path}")

# Driver

In [None]:
# time = datetime.datetime.now().strftime("%m%d_%H%M%S")
time = '1028_231620'
project_root = '/Users/log/Github/textual_grounding/'
dataset = 'GSM8K'

llm_model = 'llama3.1'
prompt_type = 'vanilla_cot'
# prompt_type = 'multi_convo'
# prompt_type = 'fs_clause_inst'
# few_shot_txt = 'clause_fs.txt'
few_shot_txt = None

# Paths
data_path = os.path.join(project_root, 'data', dataset, 'test.jsonl')
# data_path = os.path.join(project_root, 'data', dataset, 'test.json')
# data_path = '/Users/log/Github/textual_grounding/logan/results/GSM8K/llama/mermaid/mermaid_get_graph_llama3.1_20240924_001821.csv'

if few_shot_txt:
    fewshot_prompt_path = os.path.join(project_root, "prompt", dataset, few_shot_txt)
# fewshot_prompt_path = '/Users/log/Github/textual_grounding/prompt/GSM8K/fewshot_mermaid_full.txt'
save_dir = os.path.join(project_root, 'logan/results', dataset, f'{llm_model}/grounded_fact')
os.makedirs(save_dir, exist_ok=True)  # Ensure the directory exists
save_path = os.path.join(save_dir, f'{prompt_type}_{few_shot_txt}_{llm_model}_{time}.csv')

ids, questions = load_data_size_specific(data_path, sample_size=200)
# csv_path = '/Users/log/Github/textual_grounding/logan/results/SPARTQA/4o/grounded_fact/multi_convo_None_4o_1028_154404.csv'
# ids, questions = load_data_csv(csv_path, sample_size=200)
if few_shot_txt:
    few_shot_prompt = load_few_shot_prompt(fewshot_prompt_path)
else:
    few_shot_prompt = ""

initialize_save_file(save_path)
already_answered_ids = load_already_answered_ids(save_path)

ids_answered, questions_answered, answers = query_llm(
    llm_model=llm_model,
    ids=ids,
    questions=questions,
    few_shot_prompt=few_shot_prompt,
    prompt_type=prompt_type,
    save_path=save_path,
    already_answered_ids=already_answered_ids
)

print(f"Processing complete. {len(ids_answered)} new answers saved to {save_path}.")

Already answered IDs: {513, 1033, 530, 788, 1047, 151, 285, 286, 676, 1194, 300, 829, 447, 577, 194, 1090, 966, 202, 976, 1232, 82, 733, 861, 995, 621, 1266, 635}


  0%|          | 0/200 [00:00<?, ?it/s]

Skipping already answered ID: 788
Skipping already answered ID: 861
Skipping already answered ID: 82
Skipping already answered ID: 530
Skipping already answered ID: 1047
Skipping already answered ID: 995
Skipping already answered ID: 829
Skipping already answered ID: 621
Skipping already answered ID: 976
Skipping already answered ID: 733
Skipping already answered ID: 1194
Skipping already answered ID: 447
Skipping already answered ID: 1033
Skipping already answered ID: 285
Skipping already answered ID: 577
Skipping already answered ID: 286
Skipping already answered ID: 194
Skipping already answered ID: 1266
Skipping already answered ID: 513
Skipping already answered ID: 1090
Skipping already answered ID: 1232
Skipping already answered ID: 300
Skipping already answered ID: 635
Skipping already answered ID: 202
Skipping already answered ID: 151
Skipping already answered ID: 676
Skipping already answered ID: 966


# todo

messed up GT options for the multi conv for aqua

the multi conv for aqua answers with num instead of letter sometimes so its not a fair comp. Should be able to just run code now to make it work though. 

## LogiQA

In [None]:
import sys
import os
from datasets import load_dataset

# Add the directory containing logiqa.py to the Python path
logiqa_path = "/Users/log/Github/textual_grounding/data/logiqa"
sys.path.append(logiqa_path)

# Import the LogiQA class from the logiqa module if needed
from logiqa import LogiQA

# Load the dataset using Hugging Face load_dataset method
dataset = load_dataset('/Users/log/Github/textual_grounding/data/logiqa/logiqa.py', split='test')

# Print out the first 5 examples from the test set
for idx in range(5):
    example = dataset[idx]
    print(f"Example {idx + 1}:")
    print(f"Context: {example['context']}")
    print(f"Query: {example['query']}")
    print(f"Options: {example['options']}")
    print(f"Correct Option Index: {example['correct_option']}")
    print("-" * 50)


In [None]:
import json
from datasets import load_dataset

# Load the dataset (adjust the path as needed)
dataset = load_dataset('/Users/log/Github/textual_grounding/data/logiqa/logiqa.py', split='test')

# Prepare to write the first 300 examples to a JSONL file
output_file = 'logiqa_300_examples.jsonl'
with open(output_file, 'w', encoding='utf-8') as f:
    for idx, example in enumerate(dataset):
        if idx >= 300:
            break
        
        # Create the "question" field by concatenating context, query, and options
        context = example['context']
        query = example['query']
        options = example['options']
        options_str = " ".join([f"({chr(65 + i)}) {opt}" for i, opt in enumerate(options)])
        question = f"{context} {query}\n{options_str}"
        
        # Create the dictionary for the current example
        example_dict = {
            "id": idx,
            "question": question,
            "answer": chr(65 + example['correct_option'])  # Convert index to letter (A, B, C, D)
        }
        
        # Write the example as a JSON object to the JSONL file
        f.write(json.dumps(example_dict) + '\n')

print(f"Saved 300 examples to {output_file}")

# Visualization

# Grounded Visual

In [13]:
import csv
import re
import json
import os

def add_color_to_tags_new(text):
    # Find all unique tags in the text using regex
    tags = set(re.findall(r'<([A-Za-z]+\d*)>', text))

    # Predefined color palette
    color_palette = [
        'lightyellow', 'lightblue', 'lightgreen', 'lightcoral',
        'lightcyan', 'lightpink', 'lightsalmon', 'lightgray',
        'lightgoldenrodyellow', 'lightseagreen', 'lightskyblue',
        'lightsteelblue',
        'lavender', 'peachpuff', 'paleturquoise', 'wheat', 'mistyrose'
    ]

    # Dictionary to hold tag-color mapping
    tag_color_mapping = {}

    # Assign colors to tags, cycling through the color palette if necessary
    for i, tag in enumerate(sorted(tags)):
        color = color_palette[i % len(color_palette)]
        tag_color_mapping[tag] = color

    # Function to replace tags with styled spans including class names
    def replace_tag(match):
        tag = match.group(1)
        content = match.group(2)
        color = tag_color_mapping.get(tag, 'lightgray')  # Default color if not found
        return f'<span class="{tag}" style="background-color: {color}; font-weight: bold;">{content}</span>'

    # Regex to find tags and replace them with styled spans
    tag_regex = re.compile(r'<([A-Za-z]+\d*)>\s*([\s\S]*?)\s*</\1>')

    # Replace all tags with styled spans
    text = tag_regex.sub(replace_tag, text)

    return text


def parse_csv_file(file_path):
    qa_pairs = []
    with open(file_path, 'r', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            question = row.get('question', 'No question found.').strip()
            answer_text = row.get('answer', 'No answer found.').strip()
            id_ = row.get('id')
            if id_ is not None:
                try:
                    id_int = int(id_)
                except ValueError:
                    print(f"Skipping a row due to invalid 'id' (not an integer): {id_}")
                    continue
                qa_pairs.append((id_int, question, answer_text))
            else:
                # Handle cases without 'id' by skipping
                print(f"Skipping a row due to missing 'id': {row}")
    return qa_pairs


def read_ground_truth(jsonl_path):
    ground_truth = {}
    with open(jsonl_path, 'r', encoding='utf-8') as f:
        # data_list = json.load(f)  # Load the entire JSON content
        # for data in data_list:
        for line in f:
            data = json.loads(line)
            id_ = data.get('id')
            answer = str(data.get('answer')).lower()
            if id_ is not None and answer is not None:

                # GSM ONLY
                answer = answer.split('####')[1].strip()
                ground_truth[id_] = answer
            else:
                print(f"Invalid ground truth entry: {data}")
                
            
    return ground_truth

# def read_ground_truth_option(jsonl_path):
    ground_truth = {}
    with open(jsonl_path, 'r', encoding='utf-8') as f:
        # data_list = json.load(f)  # Load the entire JSON content
        # for data in data_list:
        for line in f:
            data = json.loads(line)
            id_ = data.get('id')
            answer = str(data.get('answer')).lower()
            options = data.get('options')
            for option in options:
                # print(option, answer)
                if option[0].lower() == answer:
                    answer = option
                    print
                    break
            if id_ is not None and answer is not None:

                # GSM ONLY
                # answer = answer.split('####')[1].strip()
                # ground_truth[id_] = answer
                numbers_only = re.sub(r'[^0-9]', '', answer)
                ground_truth[id_] = numbers_only
            else:
                print(f"Invalid ground truth entry: {data}")
    return ground_truth

def create_highlight_html_new(qa_pairs, ground_truth, ground_truth_option=None):
    html_content = """
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <title>Question and Answer Highlights</title>
        <style>
            body {
                font-family: Arial, sans-serif;
                margin: 20px;
                background-color: #f0f0f0;
            }
            .container {
                background-color: #ffffff;
                padding: 20px;
                margin-bottom: 20px;
                border-radius: 8px;
                box-shadow: 0 2px 5px rgba(0,0,0,0.1);
            }
            .question {
                font-size: 1.2em;
                margin-bottom: 10px;
            }
            .full-response, .final-answer, .ground-truth-answer {
                margin-bottom: 10px;
                white-space: pre-wrap; /* Add this line to preserve newlines */
            }
            .final-answer {
                font-weight: bold;
            }
            .ground-truth-answer {
                font-weight: bold;
            }
            /* Styles for the highlighted spans */
            .highlighted {
                padding: 2px 4px;
                border-radius: 3px;
                display: inline-block;
            }
            /* Styles for the summary section */
            .summary {
                background-color: #e0ffe0;
                padding: 15px;
                border: 2px solid #00cc00;
                border-radius: 8px;
                font-size: 1.2em;
                margin-top: 30px;
            }
        </style>
    </head>
    <body>
    <h1>Question and Answer Highlights</h1>
    """

    # Initialize counters for correct and total answers
    correct_answers = 0
    total_answers = 0

    for i, (id_, question, answer_text) in enumerate(qa_pairs, 1):
        try:
            full_response = answer_text.strip()
        except Exception as e:
            print(f"Cannot process answer for question ID {id_}: {e}")
            continue

        # Apply color to tags in the full_response
        highlighted_response = add_color_to_tags_new(full_response)
        
        # Replace newline characters with <br> tags to ensure they are rendered in HTML
        highlighted_response = highlighted_response.replace('\n', '<br>')

        # Extract the final answer within curly brackets {}
        # final_answer_match = re.search(r'\{([^}]+)\}', full_response)
        final_answer_match = re.search(r'\{([^}]+)\}(?=[^}]*$)', full_response, re.DOTALL)
        
        if final_answer_match:
            final_answer = final_answer_match.group(1).replace(',', '').replace('$', '').strip().lower()
            # Currentyl removes everything excecpt numbers
            extracted = final_answer_match.group(1)
            final_answer = re.sub(r'[^\d.]', '', extracted)
            
            # only get first letter for mcq
            # final_answer_mcq = final_answer[0]
            if "no" in final_answer or "false" in final_answer:
                final_answer = "false"
            elif "yes" in final_answer or "true" in final_answer:
                final_answer = "true"
            
        else:
            final_answer = ""

        # Retrieve ground truth answer
        gt_answer = str(ground_truth.get(id_))
        gt_answer = gt_answer.replace(',', '').replace('$', '')
        # print(f"id: {id_}: {gt_answer}")
        if gt_answer is None:
            gt_answer_display = "<span style='color: gray;'>Ground truth not available.</span>"
            is_correct = False
        else:
            # option = ground_truth_option.get(id_)
            # In case conversion fails, fallback to string comparison
            is_correct = final_answer == gt_answer
            final_answer_display = final_answer
            gt_answer_display = gt_answer

        # Style the final answer based on correctness
        if is_correct:
            highlighted_final_answer = f"<span style='font-size:1.1em; color: green;'>{final_answer_display}</span>"
            correct_answers += 1
        else:
            highlighted_final_answer = f"<span style='font-size:1.1em; color: red;'>{final_answer_display}</span>"
        total_answers += 1

        # Display ground truth answer
        if gt_answer is not None:
            ground_truth_html = f"<div class='ground-truth-answer'><strong>Ground Truth Answer:</strong> {gt_answer_display}</div>"
        else:
            ground_truth_html = f"<div class='ground-truth-answer'><strong>Ground Truth Answer:</strong> Not available.</div>"

        # Build the HTML structure
        html_content += f"<div class='container'>"
        html_content += f"<div class='question'><strong>Question:</strong> {question}</div>"
        html_content += f"<div class='full-response'>{highlighted_response}</div>"
        html_content += f"<div class='final-answer'><strong>Final Answer:</strong> {highlighted_final_answer}</div>"
        html_content += f"{ground_truth_html}"
        html_content += "</div>\n"

    # After processing all QA pairs, add the summary section
    summary_percentage = (correct_answers / total_answers * 100) if total_answers > 0 else 0
    summary_html = f"""
    <div class='summary'>
        <strong>Summary:</strong> Correct Answers: {correct_answers} / {total_answers} ({summary_percentage:.2f}%)
    </div>
    """
    # Close the HTML tags
    html_content += """
    </body>
    </html>
    """
    output_html = summary_html + html_content
    return output_html


def main():
    input_csv = '/Users/log/Github/textual_grounding/logan/results/GSM8K/llama3.1/grounded_fact/multi_convo_None_llama3.1_1028_224532.csv'  
    ground_truth_file = '/Users/log/Github/textual_grounding/data/GSM8K/test.jsonl'  
    output_file = 'multi_conv_GSM8K_llama8b.html' 

    # Check if input files exist
    if not os.path.isfile(input_csv):
        print(f"Input CSV file not found: {input_csv}")
        return
    if not os.path.isfile(ground_truth_file):
        print(f"Ground truth JSON file not found: {ground_truth_file}")
        return

    # Parse the input CSV file to extract IDs, questions, and answers
    qa_pairs = parse_csv_file(input_csv)
    print(f"Total QA Pairs Parsed: {len(qa_pairs)}")  # Debug: Print the number of QA pairs parsed

    # Read the ground truth answers
    ground_truth = read_ground_truth(ground_truth_file)
    # ground_truth_options = read_ground_truth_option(ground_truth_file)
    # print(ground_truth)
    # print(ground_truth_options)
    # ground_truth_options = None
    print(f"Total Ground Truth Entries: {len(ground_truth)}")  # Debug: Print the number of ground truth entries

    # Check if any QA pairs were found
    if not qa_pairs:
        print("No question-answer pairs were found in the input file.")
        return

    # Generate the HTML content
    html_content = create_highlight_html_new(qa_pairs, ground_truth)

    # Write the HTML content to the output file
    with open(output_file, 'w', encoding='utf-8') as file:
        file.write(html_content)

    print(f"HTML content has been successfully written to {output_file}")


if __name__ == "__main__":
    main()

Total QA Pairs Parsed: 200
Total Ground Truth Entries: 1319
HTML content has been successfully written to multi_conv_GSM8K_llama8b.html


## CoT - Visualize

In [57]:
import csv
import re
import json
import os

import re

def extract_final_answer(answer_text):
    # Regex pattern to match anything inside curly braces
    final_answer_pattern = re.compile(r'\{([^}]+)\}')
    
    # Find all matches of text inside curly braces
    matches = list(final_answer_pattern.finditer(answer_text))
    
    # If we have at least one match, get the content of the last match
    if matches:
        final_answer = matches[-1].group(1).strip()
        if final_answer[-1] == '\\':
            final_answer = final_answer[:-1]
        return final_answer
    else:
        return ""


def parse_csv_file(file_path):
    qa_pairs = []
    with open(file_path, 'r', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            question = row.get('question', 'No question found.').strip()
            answer_text = row.get('answer', 'No answer found.').strip()
            id_ = row.get('id')
            if id_ is not None:
                try:
                    id_int = int(id_)
                    qa_pairs.append((id_int, question, answer_text))
                except ValueError:
                    print(f"Skipping a row due to invalid 'id' (not an integer): {id_}")
            else:
                print(f"Skipping a row due to missing 'id': {row}")
    return qa_pairs

def read_ground_truth(jsonl_path):
    ground_truth = {}
    with open(jsonl_path, 'r', encoding='utf-8') as f:
        for line in f:
            data = json.loads(line)
            id_ = data.get('id')
            answer = data.get('answer')
            if id_ is not None and answer is not None:
                # GSM ONLY
                # answer = answer.split('####')[1].strip()
                ground_truth[id_] = answer
            else:
                print(f"Invalid ground truth entry: {data}")
    return ground_truth

def create_simple_html(qa_pairs, ground_truth):
    html_content = """
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <title>Question and Answer Comparison</title>
        <style>
            body {
                font-family: Arial, sans-serif;
                margin: 20px;
                background-color: #f9f9f9;
            }
            .container {
                background-color: #ffffff;
                padding: 15px 20px;
                margin-bottom: 15px;
                border-radius: 6px;
                box-shadow: 0 1px 3px rgba(0,0,0,0.1);
            }
            .question {
                font-size: 1.1em;
                margin-bottom: 10px;
                color: #333333;
            }
            .answer-text {
                background-color: #f4f4f4;
                padding: 10px;
                border-left: 4px solid #2196F3;
                margin-bottom: 10px;
                white-space: pre-wrap;
                font-family: Consolas, "Courier New", monospace;
            }
            .final-answer, .ground-truth-answer {
                margin-bottom: 5px;
            }
            .final-answer span.correct {
                color: green;
                font-weight: bold;
            }
            .final-answer span.incorrect {
                color: red;
                font-weight: bold;
            }
            .ground-truth-answer {
                color: #555555;
            }
            .summary {
                background-color: #e0ffe0;
                padding: 15px;
                border: 2px solid #00cc00;
                border-radius: 8px;
                font-size: 1.2em;
                margin-top: 30px;
            }
        </style>
    </head>
    <body>
    <h1>Question and Answer Comparison</h1>
    """

    # Initialize counters for correct and total answers
    correct_answers = 0
    total_answers = 0

    for id_, question, answer_text in qa_pairs:
        final_answer = extract_final_answer(answer_text)
        gt_answer = ground_truth.get(id_)

        if gt_answer is None:
            gt_answer_display = "<span style='color: gray;'>Ground truth not available.</span>"
            is_correct = False
        else:
            # Normalize both final_answer and gt_answer for comparison
            try:
                final_answer_num = float(final_answer.replace(',', '').replace('$', ''))
                if isinstance(gt_answer, list):  # Handle list of answers if applicable
                    gt_answer_num = float(gt_answer[0].replace(',', '').replace('$', ''))
                else:
                    gt_answer_num = float(str(gt_answer).replace(',', '').replace('$', ''))
                is_correct = final_answer_num == gt_answer_num
                # Format numbers with commas and two decimal places if needed
                final_answer_display = f"{final_answer_num:,.2f}" if not final_answer_num.is_integer() else f"{int(final_answer_num):,}"
                gt_answer_display = f"{gt_answer_num:,.2f}" if not gt_answer_num.is_integer() else f"{int(gt_answer_num):,}"
            except (ValueError, TypeError):
                # Fallback to string comparison if conversion fails
                is_correct = final_answer.strip().lower() == str(gt_answer).strip().lower()
                final_answer_display = final_answer
                gt_answer_display = gt_answer

        # Style the final answer based on correctness
        if is_correct:
            final_answer_html = f"<span class='correct'>{final_answer_display}</span>"
            correct_answers += 1
        else:
            final_answer_html = f"<span class='incorrect'>{final_answer_display}</span>"
        total_answers += 1

        # Display ground truth answer
        if gt_answer is not None:
            ground_truth_html = f"<div class='ground-truth-answer'><strong>Ground Truth Answer:</strong> {gt_answer_display}</div>"
        else:
            ground_truth_html = f"<div class='ground-truth-answer'><strong>Ground Truth Answer:</strong> Not available.</div>"

        # Build the HTML structure for each QA pair
        html_content += f"<div class='container'>"
        html_content += f"<div class='question'><strong>Question:</strong> {question}</div>"
        html_content += f"<div class='answer-text'><strong>Model Response:</strong><br>{answer_text}</div>"
        html_content += f"<div class='final-answer'><strong>Final Answer:</strong> {final_answer_html}</div>"
        html_content += f"{ground_truth_html}"
        html_content += "</div>\n"

    # Add the summary section
    summary_percentage = (correct_answers / total_answers * 100) if total_answers > 0 else 0
    summary_html = f"""
    <div class='summary'>
        <strong>Summary:</strong> Correct Answers: {correct_answers} / {total_answers} ({summary_percentage:.2f}%)
    </div>
    """
    # html_content += summary_html

    # Close the HTML tags
    html_content += """
    </body>
    </html>
    """

    final_content = summary_html + html_content
    return final_content

def main():
    input_csv = '/Users/log/Github/textual_grounding/logan/results/logical_deduction_seven_objects/4o/grounded_fact/multi_convo_None_4o_1028_200316.csv'  
    ground_truth_file = '/Users/log/Github/textual_grounding/data/logical_deduction_seven_objects/test.json'  # Path to the ground truth JSONL file
    output_file = 'extract_facts_SPARTQA_4o.html'  # Desired output HTML file path

    # Check if input files exist
    if not os.path.isfile(input_csv):
        print(f"Input CSV file not found: {input_csv}")
        return
    if not os.path.isfile(ground_truth_file):
        print(f"Ground truth JSONL file not found: {ground_truth_file}")
        return

    # Parse the input CSV file to extract IDs, questions, and answers
    qa_pairs = parse_csv_file(input_csv)
    print(f"Total QA Pairs Parsed: {len(qa_pairs)}")

    # Read the ground truth answers
    ground_truth = read_ground_truth(ground_truth_file)
    print(f"Total Ground Truth Entries: {len(ground_truth)}")

    # Check if any QA pairs were found
    if not qa_pairs:
        print("No question-answer pairs were found in the input file.")
        return

    # Generate the HTML content
    html_content = create_simple_html(qa_pairs, ground_truth)

    # Write the HTML content to the output file
    with open(output_file, 'w', encoding='utf-8') as file:
        file.write(html_content)

    print(f"HTML content has been successfully written to {output_file}")

if __name__ == "__main__":
    main()


Total QA Pairs Parsed: 25
Total Ground Truth Entries: 3591
HTML content has been successfully written to extract_facts_SPARTQA_4o.html


In [None]:
from datasets import load_dataset

ds = load_dataset("tasksource/spartqa-mchoice")

# Response Statistics

In [19]:
import csv
import re
import json  # For handling JSONL
import os

def extract_parts_regular_cot(answer_text):
    # Attempt to extract Final Answer from 'Final Answer:'
    final_match = re.search(r'Final Answer:\s*(\S+)', answer_text, re.IGNORECASE)
    if final_match and final_match.group(1).strip():
        final_answer = final_match.group(1).strip()
        has_curly = False
    else:
        # Fallback: Extract Final Answer from '{...}' in the reasoning
        curly_match = re.search(r'\{([\d.]+)\}', answer_text)
        final_answer = curly_match.group(1).strip() if curly_match else ""
        has_curly = bool(curly_match)

    return answer_text.strip(), final_answer, has_curly

def parse_csv_file(file_path):
    qa_pairs = []
    with open(file_path, 'r', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            question = row.get('question', 'No question found.').strip()
            answer_text = row.get('answer', 'No answer found.').strip()
            id_ = row.get('id')
            if id_ is not None:
                try:
                    id_int = int(id_)
                except ValueError:
                    print(f"Skipping a row due to invalid 'id' (not an integer): {id_}")
                    continue
                qa_pairs.append((id_int, question, answer_text))
            else:
                # Handle cases without 'id' by skipping
                print(f"Skipping a row due to missing 'id': {row}")
    return qa_pairs

def read_ground_truth(jsonl_path):
    ground_truth = {}
    with open(jsonl_path, 'r', encoding='utf-8') as f:
        for line in f:
            data = json.loads(line)
            id_ = data.get('id')
            answer = data.get('answer')
            if id_ is not None and answer is not None:
                # Extract the last number or text after '####'
                match = re.search(r'####\s*([\d.]+)', answer)
                if match:
                    ground_truth[id_] = match.group(1).strip()
                else:
                    print(f"No ground truth answer found for ID {id_}")
            else:
                print(f"Invalid ground truth entry: {data}")
    return ground_truth

def create_statistics(qa_pairs, ground_truth):
    total_responses = len(qa_pairs)
    responses_with_curly = 0
    responses_without_curly = 0
    correct_answers = 0
    incorrect_answers = 0
    no_ground_truth = 0

    # Variables for tag statistics
    total_tags = 0
    total_tag_length = 0
    tag_counts = []  # List to store number of tags per response
    tag_lengths = []  # List to store lengths of tag content across all responses

    for id_, question, answer_text in qa_pairs:
        try:
            answer_reasoning, final_answer, has_curly = extract_parts_regular_cot(answer_text)
        except Exception as e:
            print(f"Cannot extract parts for question ID {id_}: {e}")
            continue

        if has_curly:
            responses_with_curly += 1
        else:
            responses_without_curly += 1

        # Extract tags and their content
        tags_in_response = re.findall(r'<([A-Za-z]+\d*)>(.*?)</\1>', answer_text)
        number_of_tags = len(tags_in_response)
        tag_counts.append(number_of_tags)
        total_tags += number_of_tags

        for tag, content in tags_in_response:
            content_length = len(content)
            tag_lengths.append(content_length)
            total_tag_length += content_length

        # Retrieve ground truth answer
        gt_answer = ground_truth.get(id_)
        if gt_answer is None:
            no_ground_truth += 1
            continue

        # Compare final_answer with ground truth
        if final_answer == gt_answer:
            correct_answers += 1
        else:
            incorrect_answers += 1

    # Calculate additional metrics
    accuracy_percentage = (correct_answers / (correct_answers + incorrect_answers) * 100) if (correct_answers + incorrect_answers) > 0 else 0
    curly_percentage = (responses_with_curly / total_responses * 100) if total_responses > 0 else 0
    no_curly_percentage = (responses_without_curly / total_responses * 100) if total_responses > 0 else 0
    ground_truth_available = total_responses - no_ground_truth
    ground_truth_available_percentage = (ground_truth_available / total_responses * 100) if total_responses > 0 else 0

    # Calculate tag statistics
    average_tags_per_response = (total_tags / total_responses) if total_responses > 0 else 0
    average_tag_length = (total_tag_length / total_tags) if total_tags > 0 else 0

    # Print the statistics
    print("\n===== Analysis Statistics =====\n")
    print(f"Total Responses Analyzed: {total_responses}")
    print(f"Responses with Final Answer in Curly Brackets: {responses_with_curly} ({curly_percentage:.2f}%)")
    print(f"Responses without Final Answer in Curly Brackets: {responses_without_curly} ({no_curly_percentage:.2f}%)")
    print(f"Responses with Ground Truth Available: {ground_truth_available} ({ground_truth_available_percentage:.2f}%)")
    print(f"Correct Answers: {correct_answers}")
    print(f"Incorrect Answers: {incorrect_answers}")
    print(f"Accuracy: {accuracy_percentage:.2f}%")
    print(f"Responses without Ground Truth: {no_ground_truth}")

    # Tag Statistics
    print("\n----- Tag Statistics -----")
    print(f"Total Tags Found: {total_tags}")
    print(f"Average Number of Tags per Response: {average_tags_per_response:.2f}")
    print(f"Average Length of Tag Content: {average_tag_length:.2f} characters")
    print("--------------------------\n")
    print("===== End of Statistics =====\n")

def main():
    input_csv = '/Users/log/Github/textual_grounding/logan/results/GSM8K/llama/mermaid/mermaid_get_answer_llama3.1_20240926_215344.csv'  # Replace with your input CSV file path
    ground_truth_file = '/Users/log/Github/textual_grounding/data/GSM8K/test.jsonl'  # Path to the ground truth JSONL file

    # Check if input files exist
    if not os.path.isfile(input_csv):
        print(f"Input CSV file not found: {input_csv}")
        return
    if not os.path.isfile(ground_truth_file):
        print(f"Ground truth JSONL file not found: {ground_truth_file}")
        return

    # Parse the input CSV file to extract IDs, questions, and answers
    qa_pairs = parse_csv_file(input_csv)
    print(f"Total QA Pairs Parsed: {len(qa_pairs)}")  # Debug: Print the number of QA pairs parsed

    # Read the ground truth answers
    ground_truth = read_ground_truth(ground_truth_file)
    print(f"Total Ground Truth Entries: {len(ground_truth)}")  # Debug: Print the number of ground truth entries

    # Check if any QA pairs were found
    if not qa_pairs:
        print("No question-answer pairs were found in the input file.")
        return

    # Generate and print the statistics
    create_statistics(qa_pairs, ground_truth)

    print("Statistics analysis completed successfully.")

if __name__ == "__main__":
    main()


Total QA Pairs Parsed: 200
No ground truth answer found for ID 489
No ground truth answer found for ID 1113
Total Ground Truth Entries: 1317

===== Analysis Statistics =====

Total Responses Analyzed: 200
Responses with Final Answer in Curly Brackets: 136 (68.00%)
Responses without Final Answer in Curly Brackets: 64 (32.00%)
Responses with Ground Truth Available: 200 (100.00%)
Correct Answers: 94
Incorrect Answers: 106
Accuracy: 47.00%
Responses without Ground Truth: 0

----- Tag Statistics -----
Total Tags Found: 501
Average Number of Tags per Response: 2.50
Average Length of Tag Content: 8.68 characters
--------------------------

===== End of Statistics =====

Statistics analysis completed successfully.


In [2]:
import json

# Define the path to your JSON file
input_file = '/Users/log/Github/textual_grounding/data/AIW/test.json'

# Load the JSON data from the file
with open(input_file, 'r', encoding='utf-8') as file:
    try:
        data = json.load(file)
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON: {e}")
        exit(1)

# Process each entry in the JSON data
for idx, entry in enumerate(data):
    # Get the current prompt
    prompt = entry.get('prompt', '')

    # Check if prompt is a string
    if isinstance(prompt, str):
        delimiter = 'have?'
        index = prompt.find(delimiter)

        if index != -1:
            # Truncate the prompt after "have?"
            truncated_prompt = prompt[:index + len(delimiter)]
            entry['prompt'] = truncated_prompt
        else:
            print(f"Warning: 'have?' not found in prompt of entry ID {entry.get('id', 'Unknown')}. Prompt left unchanged.")
    elif isinstance(prompt, list):
        print(f"Warning: 'prompt' is a list in entry ID {entry.get('id', 'Unknown')}. Attempting to join into a string.")
        # Attempt to join the list into a single string
        joined_prompt = ' '.join(str(item) for item in prompt)
        delimiter = 'have?'
        index = joined_prompt.find(delimiter)

        if index != -1:
            truncated_prompt = joined_prompt[:index + len(delimiter)]
            entry['prompt'] = truncated_prompt
        else:
            print(f"Warning: 'have?' not found after joining prompt in entry ID {entry.get('id', 'Unknown')}. Prompt left unchanged.")
    else:
        print(f"Warning: 'prompt' is neither a string nor a list in entry ID {entry.get('id', 'Unknown')}. Prompt left unchanged.")

    # Rename 'right_answer' to 'answer' if it exists
    if 'right_answer' in entry:
        entry['answer'] = entry.pop('right_answer')

# Save the updated data back to the same JSON file
with open(input_file, 'w', encoding='utf-8') as file:
    json.dump(data, file, indent=4, ensure_ascii=False)

print("JSON file has been updated successfully.")


JSON file has been updated successfully.
