# Experiment on Chain of Tables using Gemma2-9b-it model from groq 

Groq: https://groq.com/

Paper: https://arxiv.org/abs/2401.04398

##### Note: Groq provides API key for free, and some free usage, which has some limits such as token or requests per minute.  To get API get go to https://console.groq.com/keys

## Import libraries

In [2]:
# !pip install Groq

In [None]:
import pandas as pd
import requests
import zipfile
import io
import os
import re
import openai
import time
import glob 
from groq import Groq

from utils.load_data import wrap_input_for_demo
from utils.llm import ChatGPT
from utils.helper import *
from utils.evaluate import *
from utils.chain import *
from operations import *

## Define model

In [None]:
# User parameters
model_name: str = "gemma2-9b-it"
api_key: str = "YOUR_API_KEY"

In [26]:
gpt_llm = ChatGPT(
    model_name=model_name,
    key=api_key,
)

## Prepare WikiTQ dataset

In [27]:
wiki_tq_dir = "WikiTableQuestions/"
if os.path.exists(wiki_tq_dir) and os.path.isdir(wiki_tq_dir):
    print(f"WikiTableQuestions is already downloaded")
else:
    # Step 1: Download the zip file
    url = "https://github.com/ppasupat/WikiTableQuestions/releases/download/v1.0.2/WikiTableQuestions-1.0.2-compact.zip"
    response = requests.get(url)

    # Step 2: Unzip the contents
    with zipfile.ZipFile(io.BytesIO(response.content)) as zip_ref:
        zip_ref.extractall("WikiTableQuestions")

    print("Download and extraction complete!")

WikiTableQuestions is already downloaded


Here we use a subset of test datset, including 100 questions

In [28]:
test_cases = pd.read_csv(wiki_tq_dir + "data/random-split-4-dev.tsv", sep="\t").head(100)
test_cases = test_cases.set_index("id")
test_cases.head(20)

Unnamed: 0_level_0,utterance,context,targetValue
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
nt-2,which team won previous to crettyard?,csv/204-csv/772.csv,Wolfe Tones
nt-9,which players played the same position as ardo...,csv/203-csv/116.csv,Siim Ennemuist|Andri Aganits
nt-24,who ranked right after turkey?,csv/203-csv/812.csv,Sweden
nt-36,who was the top winner in 2002 of the division...,csv/204-csv/879.csv,Princeton
nt-42,what is the total number of popular votes cast...,csv/203-csv/558.csv,459640
nt-43,which division three team also played in the d...,csv/202-csv/73.csv,Seaford Town
nt-54,does theodis or david play center?,csv/204-csv/847.csv,Theodis Tarver
nt-72,what is the number of formula one series races...,csv/203-csv/198.csv,2
nt-75,how many places list no zip code in either the...,csv/204-csv/356.csv,18
nt-80,has the dominican republic won more or less me...,csv/203-csv/535.csv,less


## Helpful functions

In [29]:
def convert_df_to_table_text(df):
    return [list(df.columns)] + df.astype(str).values.tolist()

def normalize_answer(ans, normalize_numbers=True):
    # Remove "assistant:" prefix
    ans = re.sub(r'^Answer:\s*', '', ans, flags=re.IGNORECASE)

    # Lowercase
    ans = ans.lower()

    # Replace " and " with "|"
    ans = ans.replace(" and ", "|")

    # Remove punctuation (except "|")
    ans = re.sub(r"[^\w\s|]", "", ans)

    # Normalize numbers by removing text after numeric values if needed
    if normalize_numbers:
        ans = re.sub(r'(\d+)[^\d|]*', r'\1', ans)

    # Remove extra spaces around delimiters
    ans = "|".join(part.strip() for part in ans.split("|"))

    return ans.strip()



def merge_all_results(csv_folder):
    csv_files = glob.glob(os.path.join(csv_folder, "results_batch_*.csv"))
    dfs = []

    # Read and append each CSV file
    for file in csv_files:
        df = pd.read_csv(file)
        dfs.append(df)

    # Merge all DataFrames
    merged_df = pd.concat(dfs, ignore_index=True)
    merged_csv_path = os.path.join(csv_folder, "merged_results.csv")
    merged_df.to_csv(merged_csv_path, index=False)

    print(f"All CSV files merged successfully into '{merged_csv_path}'")


## WikiTQ With gemma2-9b-it

In [None]:
acc = 0
test_count = 0
results=[]
result_dir: str = "results/wiqiTQ"


# Ensure the result directory exists
os.makedirs(result_dir, exist_ok=True)

for testcase_id in test_cases.index:
    try:
        file_path = os.path.join(result_dir, f"{testcase_id}.pkl")

        # Skip test case if pkl file already exists
        if os.path.exists(file_path):
            print(f"Skipping {testcase_id} - already processed.")
            continue

        df_path = wiki_tq_dir + test_cases.loc[testcase_id]["context"]
        df = pd.read_csv(df_path)
        statement = test_cases.loc[testcase_id]["utterance"]
        answer = test_cases.loc[testcase_id]["targetValue"]
        
        table_caption = ""
        table_text = convert_df_to_table_text(df)
        
        demo_sample = wrap_input_for_demo(
        statement=statement, table_caption=table_caption, table_text=table_text
        )
        proc_sample, dynamic_chain_log = dynamic_chain_exec_one_sample(
            sample=demo_sample, llm=gpt_llm
        )
        output_sample = simple_query(
            sample=proc_sample,
            table_info=get_table_info(proc_sample),
            llm=gpt_llm,
            use_demo=False,
            llm_options=gpt_llm.get_model_options(
                temperature=0.0, per_example_max_decode_steps=200, per_example_top_p=1.0
            ),
        )
        cotable_log = get_table_log(output_sample)


        # Save the log to a .pkl file
        with open(file_path, "wb") as f:
            pickle.dump(cotable_log, f)
        
        response = cotable_log[-1]['cotable_result']
        response = response.strip().strip("'\"")
        final_response = response.split("Answer:")[-1].strip()
        
        print(f"ID: {testcase_id} | Response: {final_response} | Ground Truth: {answer}")
        results.append({'ID':testcase_id, 'Response':final_response, 'Ground Truth':answer})
        if final_response == answer:
            acc += 1
        test_count += 1
        if test_count % 5 == 0:
            print("Sleeping for 1 minute to prevent rate limiting...")
            time.sleep(60)

    except Exception as e:
        print(f"Error processing {testcase_id}") 
        
print(f"number of correct response = {acc}")

Note: Due to limitation on input tokens and requests per day, I have run above code using different api keys, so i saved csv files like batches.

In [31]:
# uncomment to save processed data
# df = pd.DataFrame(results)
# df.to_csv("results_batch_5.csv", index=False, encoding='utf-8')
# print("CSV file saved successfully!")

# uncomment to merge all 
# merge_all_results('./')

##### Load Results for 100 tables.

Note: Here we have results for 98 tables, becuase two table has incorrect csv format so it fails.

In [65]:
all_results_df= pd.read_csv('./merged_results.csv')
print(f"Total no of rows : {all_results_df.shape[0]}")
all_results_df.head(20)

Total no of rows : 98


Unnamed: 0,ID,Response,Ground Truth
0,nt-2,Confey,Wolfe Tones
1,nt-9,Siim Ennemuist,Siim Ennemuist|Andri Aganits
2,nt-24,I need more information to answer this question.,Sweden
3,nt-36,Princeton,Princeton
4,nt-42,459640,459640
5,nt-43,East Preston,Seaford Town
6,nt-54,Theodis Tarver plays center.,Theodis Tarver
7,nt-72,Test driver,2
8,nt-75,18,18
9,nt-80,The Dominican Republic has won more medals tha...,less


#### Normalize results and get accuracy

In [33]:
accuracy=0
incorrect_ids=[]
for index, row in all_results_df.iterrows():
    y_pred= normalize_answer(row['Response'])
    y_true= normalize_answer(row['Ground Truth'])
    if y_pred == y_true:
        accuracy+=1
    else:
        print(f"ID: {row['ID']}, Expected: {y_true}, Predicted: {y_pred} ")
        incorrect_ids.append(row['ID'])

print("-"*40)
accuracy_percentage = (accuracy / all_results_df.shape[0]) * 100
print(f"Accuracy: {accuracy_percentage:.2f}%")

ID: nt-2, Expected: wolfe tones, Predicted: confey 
ID: nt-9, Expected: siim ennemuist|andri aganits, Predicted: siim ennemuist 
ID: nt-24, Expected: sweden, Predicted: i need more information to answer this question 
ID: nt-43, Expected: seaford town, Predicted: east preston 
ID: nt-54, Expected: theodis tarver, Predicted: theodis tarver plays center 
ID: nt-72, Expected: 2, Predicted: test driver 
ID: nt-80, Expected: less, Predicted: the dominican republic has won more medals than china 
ID: nt-86, Expected: serbian progressive party српска напредна странка  srpska napredna stranka, Predicted: serbian progressive party 
ID: nt-120, Expected: bahrain, Predicted: south korea 
ID: nt-122, Expected: 2009, Predicted: 2007 
ID: nt-123, Expected: tikamgarh, Predicted: chhatarpur 
ID: nt-138, Expected: switzerland, Predicted: france 
ID: nt-141, Expected: 18, Predicted: this table does not contain information about games played by senators 
ID: nt-153, Expected: 10, Predicted: 100 
ID: nt-1

Comments: By verifying the failures, there are 7 more correct answers, dues to larger or different format of prompts: so the the accuracy using WikiTQ dataset with model gemma2, using chain of tables is 60%

## Examine some Failed testcases

In [34]:
### Example with chain-of-table
def get_chain_of_table(table_text, wrap_input, llm, answer):
    
    proc_sample, dynamic_chain_log = dynamic_chain_exec_one_sample(
        sample=wrap_input, llm=llm
    )
    output_sample = simple_query(
        sample=proc_sample,
        table_info=get_table_info(proc_sample),
        llm=llm,
        use_demo=False,
        llm_options=llm.get_model_options(
            temperature=0.0, per_example_max_decode_steps=200, per_example_top_p=1.0
        ),
    )
    cotable_log = get_table_log(output_sample)
    
    print(f'Question: {output_sample["statement"]}\n')
    print(f'Table: {output_sample["table_caption"]}')
    print(f"{pd.DataFrame(table_text[1:], columns=table_text[0])}\n")
    
    for table_info in cotable_log:
        if table_info["act_chain"]:
            table_text = table_info["table_text"]
            table_action = table_info["act_chain"][-1]
            if "skip" in table_action:
                continue
            if "query" in table_action:
                result = table_info["cotable_result"]
                print(f"{result}")
            else:
                print(f"-> {table_action}\n{pd.DataFrame(table_text[1:], columns=table_text[0])}")
                if 'group_sub_table' in table_info:
                    group_column, group_info = table_info["group_sub_table"]
                    group_headers = ["Group ID", group_column, "Count"]
                    group_rows = []
                    for i, (v, count) in enumerate(group_info):
                        if v.strip() == "":
                            v = "[Empty Cell]"
                        group_rows.append([f"Group {i+1}", v, str(count)])
                    print(f"{pd.DataFrame(group_rows, columns=group_headers)}")
                print()

    print(f"Groundtruth: {answer}")

### Testcase nt-9

In [56]:
testcase_id = "nt-9"
df_path = wiki_tq_dir + test_cases.loc[testcase_id]["context"]
statement = test_cases.loc[testcase_id]["utterance"]
answer = test_cases.loc[testcase_id]["targetValue"] 
df = pd.read_csv(df_path)

table_caption = ""
table_text = convert_df_to_table_text(df)

demo_sample = wrap_input_for_demo(
    statement=statement, table_caption=table_caption, table_text=table_text
)

In [57]:
get_chain_of_table(table_text, demo_sample, gpt_llm, answer)

Question: which players played the same position as ardo kreek?

Table: 
   No.           Player                  Birth Date Weight Height  \
0    4       Ardo Kreek     August 7, 1986 (age 27)     96    203   
1    5      Kert Toobal       June 3, 1979 (age 35)     78    189   
2    6   Martti Juhkami       June 6, 1988 (age 26)     96    196   
3    7    Argo Meresaar   January 13, 1980 (age 34)    107    206   
4    8     Kusti Nõlvak   November 6, 1991 (age 22)     81    186   
5    9      Robert Täht    August 15, 1993 (age 20)     80    190   
6   11     Oliver Venno       May 23, 1990 (age 24)    105    210   
7   14     Rait Rikberg    August 30, 1982 (age 31)     80    174   
8   16  Edgar Järvekülg      June 12, 1988 (age 26)     77    186   
9   17   Siim Ennemuist   December 5, 1989 (age 24)     89    196   
10  18  Jaanus Nõmmsalu   January 19, 1981 (age 33)     94    200   
11  19    Andri Aganits  September 7, 1993 (age 20)     99    207   

          Position      Curre

Comments: The model selected the first row where position is same, instead of selecting all rows where position is same

##### Test with direct prompt

In [None]:

# Serialize the table (you can adjust formatting)
def serialize_table(df):
    return df.to_csv(index=False)

# Build the prompt
def build_prompt(df, question):
    serialized_table = serialize_table(df)
    prompt_str = f"""\
Here's a serialized table.

{serialized_table}

Please answer the question: {question}
Answer: """
    return prompt_str

# Query Groq API
def query_groq(prompt):
    client = Groq(api_key=api_key)
    completion = client.chat.completions.create(
        model="gemma2-9b-it",
        messages=prompt,
        temperature=0,
        max_completion_tokens=1024,
        top_p=1,
        stream=True,
        stop=None,
    )

    response = ""  # Store the response in a variable
    for chunk in completion:
        if chunk.choices[0].delta.content:
            response += chunk.choices[0].delta.content

    return response.strip()  


In [59]:
prompt = build_prompt(df, statement)
prompt_message=np.array([{"role": "assistant",
                 "content": prompt}])
query_groq(prompt_message)



Andri Aganits 





Comments: Through direct prompt model also have selected one correct name.

### Testcase nt-141

In [49]:
testcase_id = "nt-141"
df_path = wiki_tq_dir + test_cases.loc[testcase_id]["context"]
statement = test_cases.loc[testcase_id]["utterance"]
answer = test_cases.loc[testcase_id]["targetValue"]
df = pd.read_csv(df_path)

table_caption = ""
table_text = convert_df_to_table_text(df)

demo_sample = wrap_input_for_demo(
    statement=statement, table_caption=table_caption, table_text=table_text
)

get_chain_of_table(table_text, demo_sample, gpt_llm, answer)

Question: what is the number of games the senators have played?

Table: 
     #         Date             Visitor Score                Home Record Pts
0    1  December 21     Ottawa Senators   5–2  Montreal Canadiens  1–0–0   2
1    2  December 26      Toronto Arenas   2–5     Ottawa Senators  2–0–0   4
2    3  December 31     Ottawa Senators   2–4      Toronto Arenas  2–1–0   4
3    4    January 2  Montreal Canadiens   2–7     Ottawa Senators  3–1–0   6
4    5    January 4     Ottawa Senators   2–5  Montreal Canadiens  3–2–0   6
5    6    January 9      Toronto Arenas   2–4     Ottawa Senators  4–2–0   8
6    7   January 14     Ottawa Senators   2–5      Toronto Arenas  4–3–0   8
7    8   January 16  Montreal Canadiens  10–6     Ottawa Senators  4–4–0   8
8    9   January 18     Ottawa Senators   3–5  Montreal Canadiens  4–5–0   8
9   10   January 23      Toronto Arenas   2–3     Ottawa Senators  5–5–0  10
10  11   January 25     Ottawa Senators   1–0  Montreal Canadiens  1–0–0   2
11 

##### Test with direct prompt

In [50]:
prompt = build_prompt(df, statement)
prompt_message=np.array([{"role": "assistant",
                 "content": prompt}])
query_groq(prompt_message)


18


Comments: The incorrect answer was given by chain of tables, but got correct answer through direct prompt. Reason could be in chain of tables the mdoel has selected the '#' columns, which is just index.

### Test n-80

In [63]:
testcase_id = "nt-80"
df_path = wiki_tq_dir + test_cases.loc[testcase_id]["context"]
statement = test_cases.loc[testcase_id]["utterance"]
answer = test_cases.loc[testcase_id]["targetValue"] 
df = pd.read_csv(df_path)

table_caption = ""
table_text = convert_df_to_table_text(df)

demo_sample = wrap_input_for_demo(
    statement=statement, table_caption=table_caption, table_text=table_text
)

get_chain_of_table(table_text, demo_sample, gpt_llm, answer)

Question: has the dominican republic won more or less medals than china?

Table: 
   Rank                 Nation Gold Silver Bronze Total
0    1.          United States    8      7      1    16
1    2.                 Russia    7      7      5    19
2    3.                 France    3      3      2     8
3    4.               Ethiopia    3      2      2     7
4    5.                Belarus    3      1      3     7
5    6.                 Sweden    2      1      2     5
6    7.                  Kenya    2      1      1     4
7    7=           South Africa    2      1      1     4
8    9.                Morocco    2      1      0     3
9   10.                 Greece    1      1      2     4
10  11.                   Cuba    1      1      0     2
11  12.                  Italy    1      0      2     3
12  13.                 Canada    1      0      1     2
13  14.                Algeria    1      0      0     1
14  14=              Australia    1      0      0     1
15  14=     Dominican 

#### Test With direct prompt

In [64]:
prompt = build_prompt(df, statement)
prompt_message=np.array([{"role": "assistant",
                 "content": prompt}])
query_groq(prompt_message)



The Dominican Republic has won 1 medal. China has won 2 medals. 

Therefore, the Dominican Republic has won **less** medals than China.


Comments: Chain of tables gives wrong answer, while direct prompt gives the correct answer.

### Test  nt-122	

In [67]:
testcase_id = "nt-122"
df_path = wiki_tq_dir + test_cases.loc[testcase_id]["context"]
statement = test_cases.loc[testcase_id]["utterance"]
answer = test_cases.loc[testcase_id]["targetValue"] 
df = pd.read_csv(df_path)

table_caption = ""
table_text = convert_df_to_table_text(df)

demo_sample = wrap_input_for_demo(
    statement=statement, table_caption=table_caption, table_text=table_text
)

get_chain_of_table(table_text, demo_sample, gpt_llm, answer)

Question: the team's record in 2011 was the same was it's record in what year

Table: 
           Season   Division    W–L Finish   Home   Road     GF     GA  \
0            2006    Western   1–15    6th    0–8    1–7    150    202   
1            2007    Western   6–10    5th    4–4    2–6    160    189   
2            2008    Western   4–12    5th    3–5    1–7    141    197   
3            2009    Western   5–11    6th    4–4    1–7    159    200   
4            2010    Western   10–6    3rd    5–3    5–3    186    201   
5            2011    Western   5–11    5th    4–4    1–7    175    204   
6            2012    Western   6–10    4th    4–4    2–6    167    175   
7            2013    Western    9–7    3rd    2–6    7–1    203    170   
8           Total  8 seasons  46–82    nan  26–38  20–44  1,341  1,538   
9  Playoff Totals        nan    3–3    nan    0–0    3–3     73     54   

                              Coach                    Playoffs Avg Attendance  
0                

#### Test with direct prompt

In [68]:
prompt = build_prompt(df, statement)
prompt_message=np.array([{"role": "assistant",
                 "content": prompt}])
query_groq(prompt_message)

2009




Comments: Here again we get the correct answer with direct prompt

## Examine some Success testcases through direct prompt

### Test Case nt-36

In [61]:
testcase_id = "nt-36"
df_path = wiki_tq_dir + test_cases.loc[testcase_id]["context"]
statement = test_cases.loc[testcase_id]["utterance"]
answer = test_cases.loc[testcase_id]["targetValue"]
df = pd.read_csv(df_path)
print(f"Ground truth is: {answer}")
prompt = build_prompt(df, statement)
prompt_message=np.array([{"role": "assistant",
                 "content": prompt}])
query_groq(prompt_message)

Ground truth is: Princeton

Princeton





### Test Case nt-121

In [66]:
testcase_id = "nt-121"
df_path = wiki_tq_dir + test_cases.loc[testcase_id]["context"]
statement = test_cases.loc[testcase_id]["utterance"]
answer = test_cases.loc[testcase_id]["targetValue"]
df = pd.read_csv(df_path)
print(f"Ground truth is: {answer}")
prompt = build_prompt(df, statement)
prompt_message=np.array([{"role": "assistant",
                 "content": prompt}])
query_groq(prompt_message)

Ground truth is: Tensile Modulus

Tensile Modulus


### Since promopting is giving better results on some cases, Let's run the model for 100 test samples

In [70]:
results = []
for testcase_id in test_cases.index:
    # need to check if processed
    try:
        df_path = wiki_tq_dir + test_cases.loc[testcase_id]["context"]
        statement = test_cases.loc[testcase_id]["utterance"]
        answer = test_cases.loc[testcase_id]["targetValue"]
        df = pd.read_csv(df_path)
        
        prompt = build_prompt(df, statement)
        response = query_groq(np.array([{"role": "assistant",
                 "content": prompt}]))
        
        response = response.strip().strip("'\"")
        final_response = response.split("Answer:")[-1].strip()
        
        print(f"ID: {testcase_id} | Response: {final_response} | Ground Truth: {answer}")
        
        # Save to results list
        results.append({
            "ID": testcase_id,
            "Response": final_response,
            "Ground Truth": answer
        })
    except Exception as e:
        print(f"Testcase {testcase_id} fails with error: {str(e)}")
        continue

ID: nt-2 | Response: Wolfe Tones | Ground Truth: Wolfe Tones
ID: nt-9 | Response: Andri Aganits | Ground Truth: Siim Ennemuist|Andri Aganits
ID: nt-24 | Response: Sweden | Ground Truth: Sweden
ID: nt-36 | Response: Princeton | Ground Truth: Princeton
ID: nt-42 | Response: 459,640 | Ground Truth: 459,640
ID: nt-43 | Response: East Preston | Ground Truth: Seaford Town
ID: nt-54 | Response: Theodis Tarver plays center. | Ground Truth: Theodis Tarver
ID: nt-72 | Response: 0 | Ground Truth: 2
ID: nt-75 | Response: 10 | Ground Truth: 18
ID: nt-80 | Response: The Dominican Republic has won 1 medal. China has won 2 medals. 

Therefore, the Dominican Republic has won **less** medals than China. | Ground Truth: less
ID: nt-81 | Response: Chevrolet has the most vehicles in the roster other than Dodge. | Ground Truth: Chevrolet
ID: nt-84 | Response: 2005 | Ground Truth: 2005
ID: nt-85 | Response: John Wark | Ground Truth: George Burley
ID: nt-86 | Response: The Serbian Progressive Party has the mo

In [74]:
results_df_prompting = pd.DataFrame(results)
results_df_prompting = results_df_prompting.set_index("ID")
results_df_prompting.to_csv("direct_prompting_results.csv")
results_df_prompting.head()

Unnamed: 0_level_0,Response,Ground Truth
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
nt-2,Wolfe Tones,Wolfe Tones
nt-9,Andri Aganits,Siim Ennemuist|Andri Aganits
nt-24,Sweden,Sweden
nt-36,Princeton,Princeton
nt-42,459640,459640


In [75]:
accuracy=0
incorrect_ids=[]
for index, row in results_df_prompting.iterrows():
    y_pred= normalize_answer(row['Response'])
    y_true= normalize_answer(row['Ground Truth'])
    if y_pred == y_true:
        accuracy+=1
    else:
        print(f"ID: {index}, Expected: {y_true}, Predicted: {y_pred} ")
        incorrect_ids.append(index)

print("-"*40)
accuracy_percentage = (accuracy / results_df_prompting.shape[0]) * 100
print(f"Accuracy: {accuracy_percentage:.2f}%")

ID: nt-9, Expected: siim ennemuist|andri aganits, Predicted: andri aganits 
ID: nt-43, Expected: seaford town, Predicted: east preston 
ID: nt-54, Expected: theodis tarver, Predicted: theodis tarver plays center 
ID: nt-72, Expected: 2, Predicted: 0 
ID: nt-75, Expected: 18, Predicted: 10 
ID: nt-80, Expected: less, Predicted: the dominican republic has won 12 
ID: nt-81, Expected: chevrolet, Predicted: chevrolet has the most vehicles in the roster other than dodge 
ID: nt-85, Expected: george burley, Predicted: john wark 
ID: nt-86, Expected: serbian progressive party српска напредна странка  srpska napredna stranka, Predicted: the serbian progressive party has the most mps with 134 
ID: nt-120, Expected: bahrain, Predicted: south korea has the most wins against bahrain 
ID: nt-138, Expected: switzerland, Predicted:  
ID: nt-153, Expected: 10, Predicted: 100 
ID: nt-154, Expected: pan troglodytes|nomascus leucogenys, Predicted: chimpanzee|gibbon 
ID: nt-171, Expected: 2, Predicted: 7 

Comments: By verifying the failures, there are 9 more correct answers, dues to larger or different format of prompts: so the the accuracy using WikiTQ dataset with model gemma2, using chain of tables is ~55%