# Reproducing Zero-shot CoT on MultiArith & CommonsenseQA

We replicate Zero-Shot and Zero-Shot-CoT prompting using Gemini. The answer triggers and decoding settings used are:
- MultiArith answer trigger: **Therefore, the answer (arabic numerals) is**
- CommonsenseQA answer trigger: **Therefore, among A through E, the answer is**
- Temperature: **0.0**, greedy decoding
- Two-stage reasoning: rationale → answer extraction


In [1]:
import os
import sys

# Put the parent directory on the import path so that the `src.*` imports
# used by the following cells can be resolved from this notebook's location.
sys.path.append(os.path.abspath(os.pardir))


In [2]:
import os
import json
import csv
from src.dataset_loader import load_multiarith, load_commonsense
from src.pipeline import ZeroShotCOTPipeline, DirectAnswerPipeline
from src.gemini_client import GeminiClient

from src.dataset_loader import load_multiarith, load_commonsense
# The key is taken from the environment (None when the variable is unset)
# rather than being written into the notebook.
API_KEY = os.environ.get("GEMINI_API_KEY")
# Single client instance shared by every pipeline created below.
client = GeminiClient(api_key=API_KEY)


In [3]:
def save_results(results, out_dir="./results", filename="results"):
    """Write ``results`` to ``<out_dir>/<filename>.json`` as indented JSON.

    Args:
        results: JSON-serializable object to persist.
        out_dir: Destination directory; created if it does not exist yet.
        filename: Output file name without the ``.json`` extension.
    """
    os.makedirs(out_dir, exist_ok=True)

    target_path = os.path.join(out_dir, f"{filename}.json")
    with open(target_path, "w", encoding="utf-8") as handle:
        # ensure_ascii=False keeps non-ASCII characters readable in the file,
        # which is why the file is opened explicitly as UTF-8.
        json.dump(results, handle, ensure_ascii=False, indent=2)


## CommonsenseQA

In [4]:

# Both prompting strategies read the same CommonsenseQA dev split; the file
# is loaded once per strategy so each run works on its own sample list.
csqa_dev_path = r"../dataset/CommonsenseQA/dev_rand_split.jsonl"
csqa_direct = load_commonsense(csqa_dev_path)
csqa_cot = load_commonsense(csqa_dev_path)

pipeline_direct = DirectAnswerPipeline(client, task="commonsenseqa")
pipeline_cot = ZeroShotCOTPipeline(client, task="commonsenseqa")

# Run every sample through the direct-answer pipeline, then through the
# chain-of-thought pipeline.
results_direct = list(map(pipeline_direct.run_sample, csqa_direct))
results_cot = list(map(pipeline_cot.run_sample, csqa_cot))

# Persist both runs as JSON in save_results' default output directory.
save_results(results=results_direct, filename="csqa_zeroshot")
save_results(results=results_cot, filename="csqa_zeroshot_cot")


## MultiArith

In [None]:
# Both prompting strategies read the same MultiArith file; it is loaded once
# per strategy so each run works on its own sample list.
multiarith_path = r"../dataset/MultiArith/MultiArith.json"
multiarith_direct = load_multiarith(multiarith_path)
multiarith_cot = load_multiarith(multiarith_path)

pipeline_direct = DirectAnswerPipeline(client, task="multi_arith")
pipeline_cot = ZeroShotCOTPipeline(client, task="multi_arith")

# Direct-answer run first, then the chain-of-thought run.
results_direct = list(map(pipeline_direct.run_sample, multiarith_direct))
results_cot = list(map(pipeline_cot.run_sample, multiarith_cot))

# Persist both runs as JSON in save_results' default output directory.
save_results(results=results_direct, filename="multiarith_zeroshot")
save_results(results=results_cot, filename="multiarith_zeroshot_cot")


## Evaluation

In [8]:
from src.evaluate import evaluate_experiment

# Run the project's evaluation over the directory the cells above saved
# their JSON results into.
results_dir = "./results"
df = evaluate_experiment(results_dir)

print("Evaluation Table:")
# A bare trailing expression makes the notebook display the returned object.
df

Evaluation Table:


{}