### Environment Setup

In [1]:
import numpy as np
import os
import inspect
import random
from time import sleep
from typing import List, Callable
import jiwer
import openai
import pandas as pd
from datasets import Dataset
from sacrebleu import corpus_bleu
from nltk.translate.meteor_score import meteor_score
from bert_score import BERTScorer
import evaluate
from tqdm.notebook import tqdm
import logging
from transformers import AutoTokenizer
import torch
import time
import asyncio
import nest_asyncio
from dotenv import load_dotenv #Load the environment variables
load_dotenv()

import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')

import logging
logging.getLogger("transformers").setLevel(logging.ERROR)

[nltk_data] Downloading package wordnet to /h/omidv/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /h/omidv/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


### Helper Functions

In [2]:
def compute_wer(reference, hypothesis):
    return jiwer.wer(reference, hypothesis)

def compute_bleu(references, hypotheses):
    return corpus_bleu(hypotheses, references).score

def compute_meteor(references, hypotheses):
    scores = []
    for ref, hyp in zip(references, hypotheses):
        scores.append(meteor_score([ref.split()], hyp.split()))
    return sum(scores)/len(scores)

def compute_bertscore(references, hypotheses):
    scorer = BERTScorer(lang="en", rescale_with_baseline=True)
    p, r, f1 = scorer.score(hypotheses, references)
    bert_score = {'precision': p.mean().item(),
                  'recall': r.mean().item(),
                  'f1': f1.mean().item()}
    return bert_score

def compute_levenshtein_distance(s1: str, s2: str) -> int:
    """Compute the Levenshtein distance between two strings."""
    len_s1, len_s2 = len(s1), len(s2)
    dp = np.zeros((len_s1 + 1, len_s2 + 1), dtype=int)

    for i in range(len_s1 + 1):
        dp[i][0] = i
    for j in range(len_s2 + 1):
        dp[0][j] = j

    for i in range(1, len_s1 + 1):
        for j in range(1, len_s2 + 1):
            cost = 0 if s1[i - 1] == s2[j - 1] else 1
            dp[i][j] = min(dp[i - 1][j] + 1,      # Deletion
                           dp[i][j - 1] + 1,      # Insertion
                           dp[i - 1][j - 1] + cost)  # Substitution

    return dp[len_s1][len_s2]

def construct_input(question):
    prompt = [{"role": "user", "content": question}]
    return prompt

def extract_hypotheses(dataset, idx):
    if 'source' in dataset.features:
        hypotheses = [h.strip() for h in dataset['source'][idx].split('.') if h.strip()]
        references = dataset['target'][idx]
    else:
        hypotheses = dataset['input'][idx]
        references = dataset['output'][idx]
        
    return hypotheses, references

def save_results(dataset: Dataset, corrections: list, model_name: str, function_name: str, file_path: str):
    
    correction_column = f"corrected_by_{model_name}_{function_name}"
    
    if os.path.exists(file_path):
        existing_df = pd.read_json(file_path)
    else:
        existing_df = dataset.to_pandas()

    if correction_column not in existing_df.columns:
        existing_df[correction_column] = None

    existing_df[correction_column] = corrections
    existing_df.to_json(file_path, orient="records", indent=4)
    print(f"Results saved to {file_path}")

In [8]:
async def run_evaluation(dataset, model, client, generation_config, results_path):
    print("Evaluating Zero-shot Unconstrained:")
    metrics_zero_shot_unconstrained = await evaluate_model_parallel(dataset, model, client, zero_shot_unconstrained, generation_config, results_path)
    
    print("Evaluating Zero-shot Constrained:")
    metrics_zero_shot_constrained = await evaluate_model_parallel(dataset, model, client, zero_shot_constrained, generation_config, results_path)
    
    print("Evaluating Zero-shot Closest:")
    metrics_zero_shot_constrained = await evaluate_model_parallel(dataset, model, client, zero_shot_closest, generation_config, results_path)
    
    print("Evaluating Oracle:")
    metrics_get_oracle_hypothesis = await evaluate_model_parallel(dataset, model, client, get_oracle_hypothesis, generation_config, results_path)
    
    print("Evaluating Top 1:")
    metrics_get_top1_hypothesis = await evaluate_model_parallel(dataset, model, client, get_top1_hypothesis, generation_config, results_path)

    results_table = {
        "Top 1": metrics_get_top1_hypothesis,
        "Zero-shot Uncon": metrics_zero_shot_unconstrained,
        "Zero-shot Constr": metrics_zero_shot_constrained,
        "Zero-shot Closest": metrics_zero_shot_constrained,
        "Oracle": metrics_get_oracle_hypothesis,
    }
    results_table = pd.DataFrame.from_dict(results_table, orient='index')
    results_table = results_table[['WER', 'METEOR', 'BERT Precision', 'BERT Recall', 'BERT F1']]

    # Save as JSON
    csv_path = results_path.replace(".json", ".csv")
    results_table.to_csv(csv_path)
    print(f"Benchmark saved to {csv_path}")
    return results_table

### Asynchronous Evaluation

In [38]:
from openai import RateLimitError
nest_asyncio.apply()

async def call_openai_with_retry(messages, model, generation_config, client):
    """Handles API retries with exponential backoff."""
    
    retry_delay = 0.1  # Initial delay in seconds
    max_delay = 10
    while True:
        try:
            # Attempt to make the API call
            generation = await client.chat.completions.create(
                model=model,
                messages=messages,
                **generation_config
            )
            return generation

        except RateLimitError as e:
            wait_time = retry_delay
            if hasattr(e, "response") and e.response is not None:
                try:
                    error_data = e.response.json()
                    wait_time = float(error_data.get("detail", {}).get("wait_seconds", {}))
                except:
                    pass
            await asyncio.sleep(wait_time + 0.1)


        except Exception as e:
            await asyncio.sleep(retry_delay)
            retry_delay = min(retry_delay * 2, max_delay)  # Exponential backoff


async def get_prediction(client: openai.AsyncOpenAI, model: str, messages: List[dict], generation_config: dict) -> str:
    """Asynchronously fetch predictions from OpenAI API."""
    
    try:
        generation = await call_openai_with_retry(messages, model, generation_config, client)
        return generation.choices[0].message.content if generation else ""
    except Exception as e:
        print(f"Error: {e}")
        return ""


async def track_progress(tasks):
    """Tracks progress while tasks are running."""
    
    total_tasks = len(tasks)
    while not all(task.done() for task in tasks):
        completed = sum(task.done() for task in tasks)
        print(f"Progress: {completed}/{total_tasks} tests completed!", end="\r")
        await asyncio.sleep(0.1)
    print(f"Progress: Batch of {total_tasks} tests completed!", flush=True)
    return await asyncio.gather(*tasks)

    
async def process_batch(dataset: Dataset, indices: List[int], model: str, client: openai.AsyncOpenAI, postprocessing: Callable[[List[str]], str], 
    generation_config: dict) -> List[str]:
    """Processes the dataset asynchronously using OpenAI API with progress tracking."""
    
    tasks = []
    for idx in indices:
        hypotheses, reference = extract_hypotheses(dataset, idx)
        
        if inspect.iscoroutinefunction(postprocessing):
            tasks.append(asyncio.create_task(postprocessing(hypotheses, client, model, generation_config)))
        else:
            tasks.append(asyncio.create_task(asyncio.to_thread(postprocessing, hypotheses, reference)))
    
    results = await track_progress(tasks)
    return results
    
async def evaluate_model_parallel(dataset: Dataset, model: str, client: openai.AsyncOpenAI, postprocessing: Callable[[List[str]], str],
                            generation_config: dict, results_path: str, step=256):
    """Evaluates the model asynchronously with progress tracking, handling Jupyter compatibility."""
    
    total_rows = len(dataset)
    all_predictions = []
    
    for start in range(0, total_rows, step):
        end = min(start + step, total_rows)
        batch_indices = list(range(start, end))
        batch_predictions = await process_batch(dataset, batch_indices, model, client, postprocessing, generation_config)
        all_predictions.extend(batch_predictions)

    all_references = dataset['target'] if 'target' in dataset.features else dataset['output']
    
    # Normalize for evaluation
    all_predictions = [pred.lower() for pred in all_predictions] 
    all_references = [ref.lower() for ref in all_references]

    # Print 3 random results for manual review
    random_indices = random.sample(range(len(all_predictions)), 3)
    for idx in random_indices:
        print(f"Sample {idx + 1}")
        print(f"Target: {all_references[idx]}")
        print(f"Pred:   {all_predictions[idx]}")
        print("-" * 50)
    
    save_results(dataset, all_predictions, model, postprocessing.__name__, results_path)
        
    # Compute evaluation metrics
    wer_scores = np.array([jiwer.wer(ref, pred) for ref, pred in zip(all_references, all_predictions)])
    bertscore = compute_bertscore(all_predictions, all_references)
    metrics = {
        'WER': round(wer_scores.mean().item(), 3),
        'METEOR': round(compute_meteor(all_predictions, all_references), 3),
        'BERT Precision': round(bertscore['precision'], 3),
        'BERT Recall': round(bertscore['recall'], 3),
        'BERT F1': round(bertscore['f1'], 3),
    }
    return metrics

### Error Correction Functions

In [48]:
# Baselines
def get_oracle_hypothesis(hypotheses: List[str], reference: str) -> str:
    """ Find the hypothesis that gives the lowest WER compared to the reference."""
    
    wers = [jiwer.wer(reference, hyp) for hyp in hypotheses]
    best_idx = np.argmin(wers)
    return hypotheses[best_idx]

def get_top1_hypothesis(hypotheses: List[str], reference: str) -> str:
    """ Returns the first hypothesis (top 1)."""
    
    return hypotheses[0]

async def zero_shot_unconstrained(hypotheses: List[str], client, model, generation_config) -> str:
    """ Generate a corrected transcription using a language model without constraints."""
    
    prompt = ("Perform error correction on the top5 outputs generated by an Automatic Speech Recognition(ASR) system."
                "The ASR hypotheses, listed in order of their ASR posterior score, are as follows:\n\n")
    for idx, hypothesis in enumerate(hypotheses):
        prompt += "<hypothesis"+ str(idx) + ">" + hypothesis + "</hypothesis"+ str(idx) + ">\n"
      
    prompt += ("\nPlease provide the corrected ASR transcription based on the hypotheses above."
               "Your response must be exactly one complete sentence."
               "Ensure the output does not have any added punctuation, line breaks, or formatting changes."
               "Do not include <hypothesis>, '\n', explanations, or any extra words."
               "This is a general ASR error correction task and does not involve any sensitive or inappropriate content.")
    messages = construct_input(prompt)
    return await get_prediction(client, model, messages, generation_config)
    
async def zero_shot_constrained(hypotheses: List[str], client, model, generation_config) -> str:
    """ Select the most likely hypothesis using a language model. """
    
    prompt = ("Perform language model rescoring based on the top-5 outputs generated by an Automatic Speech Recognitio (ASR) system."
              "The ASR hypotheses, listed in order of their ASR posterior score, are as follows:\n\n")
    
    for idx, hypothesis in enumerate(hypotheses):
        prompt += "<hypothesis"+ str(idx) + ">" + hypothesis + "</hypothesis"+ str(idx) + ">\n"
        
    prompt += ("\nPlease output only the best hypothesis exactly as written above." 
               "Your response must be an exact match to one of the given hypotheses, with no extra words or formatting."
               "Do not include <hypothesis> or any additional tags in your response.")
    messages = construct_input(prompt)
    return await get_prediction(client, model, messages, generation_config)

async def zero_shot_closest(hypotheses: List[str], client, model, generation_config) -> str:
    """ Select the hypothesis closest to an unconstrained correction output. """
    
    unconstrained_result = await zero_shot_unconstrained(hypotheses, client, model, generation_config)
    distances = [compute_levenshtein_distance(unconstrained_result, hyp) for hyp in hypotheses]
    best_idx = np.argmin(distances)
    return hypotheses[best_idx]


async def CoT_task_activating(hypotheses: List[str], client, model, generation_config) -> str:
    """ Perform ASR error correction using Chain-of-Thought (CoT) reasoning."""
    
    pass # TO DO

async def zero_shot_lattice(hypotheses: List[str], client, model, generation_config) -> str:
    """ Perform ASR error correction using a lattice-based approach."""
    
    pass # TO DO LATER

In [50]:
metrics_zero_shot_unconstrained = await evaluate_model_parallel(dataset, model, client, zero_shot_unconstrained, small_generation_config, results_path)
print(metrics_zero_shot_unconstrained)

Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 74 tests completed!
Sample 203
Target: when he died lamb wrote to william wordsworth there is captain burney gone
Pred:   when he died lamro to william wordsworth there is captain bernie gone
--------------------------------------------------
Sample 833
Target: her body taken across the lafan sands to the franciscan friary at llanfaes anglesey
Pred:   her body taken across the laban sense to the french skin fury at the end face angle says
--------------------------------------------------
Sample 136
Target: the saint george leagues club is located nearby on princes highway
Pred:   the same george leagues club is located nearby on princess highway
--------------------------------------------------
Results saved to /fs01/home/omidv/ASR-Error-Correction/results/test_cv.json
{'WER': 0.167, 'METEOR': 0.863, 'BERT P

### Specify Experiment Settings

In [6]:
model = "Meta-Llama-3.1-8B-Instruct"
client = openai.AsyncOpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
small_generation_config = {"max_tokens": 20, "temperature": 0.9}
moderate_generation_config = {"max_tokens": 200, "temperature": 0.9}

# If model is not yet available, try again after some delay.
output = None
while output is None:
    try:
        output = await client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": "Please introduce yourself."}],
        )
    
    except openai.APIError as e:
        print(e)
        sleep(10)

print(output.choices[0].message.content)

I'm an artificial intelligence model known as Llama. Llama stands for "Large Language Model Meta AI."


# Common Voice Test Dataset

In [49]:
df = pd.read_csv("/fs01/home/omidv/ASR-Error-Correction/data/test_cv.csv")
results_path = "/fs01/home/omidv/ASR-Error-Correction/results/test_cv.json"
dataset = Dataset.from_pandas(df)
print(dataset)

Dataset({
    features: ['source', 'target', 'best_hypo'],
    num_rows: 1098
})


In [9]:
results_table = await run_evaluation(dataset, model, client, small_generation_config, results_path)
print(results_table)

Evaluating Zero-shot Unconstrained:
Progress: 1000 batch completed!ted!
Progress: 98 batch completed!ed!
Sample 666
Target: belmont holds the distinction of owning the world is only purpose built private subway car
Pred:   bellman holds the distinction of owning the world's only purpose-built private supercar
--------------------------------------------------
Sample 257
Target: the additional platform was on the south facing side of what is now platform two
Pred:   the additional platform was on the south facing side of what is now platform two
--------------------------------------------------
Sample 362
Target: the doctrine of indoor management is an exception to this rule
Pred:   the doctrine of indoor management is an exception to this rule
--------------------------------------------------
Results saved to /fs01/home/omidv/ASR-Error-Correction/results/test_cv.json
Evaluating Zero-shot Constrained:
Progress: 1000 batch completed!ted!
Progress: 98 batch completed!ed!
Sample 920
Targ

# Wall Street Journal Test Dataset


In [None]:
df = pd.read_csv("/fs01/home/omidv/ASR-Error-Correction/data/test_wsj_score.csv")
results_path = "/fs01/home/omidv/ASR-Error-Correction/results/test_wsj_score.json"
dataset = Dataset.from_pandas(df)
print(dataset)

In [None]:
results_table = await run_evaluation(dataset, model, client, small_generation_config, results_path)
print(results_table)

# SwitchBoard Test Dataset

In [None]:
df = pd.read_csv("/fs01/home/omidv/ASR-Error-Correction/data/test_swbd.csv")
results_path = "/fs01/home/omidv/ASR-Error-Correction/results/test_swbd.json"
dataset = Dataset.from_pandas(df)
print(dataset)

In [None]:
results_table = await run_evaluation(dataset, model, client, small_generation_config, results_path)
print(results_table)

# ATIS Test Dataset

In [None]:
df = pd.read_json("/fs01/home/omidv/ASR-Error-Correction/data/test_atis.json")
results_path = "/fs01/home/omidv/ASR-Error-Correction/results/test_atis.json"
dataset = Dataset.from_pandas(df)
print(dataset)

In [None]:
results_table = await run_evaluation(dataset, model, client, small_generation_config, results_path)
print(results_table)

# Tedlium-3 Test Dataset

In [None]:
df = pd.read_json("/fs01/home/omidv/ASR-Error-Correction/data/test_td3.json")
results_path = "/fs01/home/omidv/ASR-Error-Correction/results/test_td3.json"
dataset = Dataset.from_pandas(df)
print(dataset)

In [None]:
results_table = await run_evaluation(dataset, model, client, small_generation_config, results_path)
print(results_table)

# Librispeech Clean Test Dataset

In [None]:
df = pd.read_json("/fs01/home/omidv/ASR-Error-Correction/data/test_ls_clean.json")
results_path = "/fs01/home/omidv/ASR-Error-Correction/results/test_ls_clean.json"
dataset = Dataset.from_pandas(df)
print(dataset)

In [None]:
results_table = await run_evaluation(dataset, model, client, small_generation_config, results_path)
print(results_table)

# Librispeech Others Test Dataset

In [None]:
df = pd.read_json("/fs01/home/omidv/ASR-Error-Correction/data/test_ls_other.json")
results_path = "/fs01/home/omidv/ASR-Error-Correction/results/test_ls_other.json"
dataset = Dataset.from_pandas(df)
print(dataset)

In [None]:
results_table = await run_evaluation(dataset, model, client, small_generation_config, results_path)
print(results_table)

# LRS2 Clean Test Dataset

In [None]:
df = pd.read_json("/fs01/home/omidv/ASR-Error-Correction/data/test_lrs2.json")
results_path = "/fs01/home/omidv/ASR-Error-Correction/results/test_lrs2.json"
dataset = Dataset.from_pandas(df)
print(dataset)

In [None]:
results_table = await run_evaluation(dataset, model, client, small_generation_config, results_path)
print(results_table)

In [None]:
model="gemma-2-9b-it"
results_table = await run_evaluation(dataset, model, client, small_generation_config, results_path)
print(results_table)

In [None]:
model = "Mistral-7B-Instruct-v0.3"
results_table = await run_evaluation(dataset, model, client, small_generation_config, results_path)
print(results_table)

# Few-Shot Chain of Thought Prompting

We'll start by prompting the model to solve some word problems and build up to using the Few-Shot CoT method proposed in "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models"

First try "zero-shot prompting".

In [90]:
zero_shot_prompt = (
    "The cafeteria had 23 apples. If they used 20 to make lunch and bought 6 more, how many apples do they have? just give final answer with no explanation."
)
zero_shot_prompt = construct_input(zero_shot_prompt)

generation_example = client.chat.completions.create(model=model,messages = zero_shot_prompt, **small_generation_config)
print(generation_example.choices[0].message.content)

37


The correct answer is 9.

Now let's try standard few-shot prompt.

In [39]:
few_shot_prompt = (
    "Q: Roger has 5 tennis balls. He buys 2 more cans of tennis balls. Each can has 3 tennis balls. How many tennis "
    "balls does he have now?\nA: The answer is 11.\n\nQ: Benjamin is taking bottle inventory. He has two cases with "
    "15 bottles in each and one with 7. How many bottles are there in total?\nA: The answer is 37.\n\nQ: The "
    "cafeteria had 23 apples. If they used 20 to make lunch and bought 6 more, how many apples do they have?\nA: "
    "The answer is ...\nJust give the final answer to the last question with no explanation."
)
few_shot_message = construct_input(few_shot_prompt)

print(few_shot_prompt)

Q: Roger has 5 tennis balls. He buys 2 more cans of tennis balls. Each can has 3 tennis balls. How many tennis balls does he have now?
A: The answer is 11.

Q: Benjamin is taking bottle inventory. He has two cases with 15 bottles in each and one with 7. How many bottles are there in total?
A: The answer is 37.

Q: The cafeteria had 23 apples. If they used 20 to make lunch and bought 6 more, how many apples do they have?
A: The answer is ...
Just give the final answer to the last question with no explanation.


In [47]:
generation_example = client.chat.completions.create(model=model,messages = few_shot_message,  **small_generation_config)
print(generation_example.choices[0].message.content)

9


Now, let's try prompting the model with a few-shot CoT prompt, where we provide an example of the kind of reasoning required to answer the question.

In [48]:
few_shot_cot_prompt = (
    "Q: Roger has 5 tennis balls. He buys 2 more cans of tennis balls. Each can has 3 tennis balls. How many tennis "
    "balls does he have now?\nA: Roger started with 5 balls. 2 cans of 3 tennis balls each is 6 tennis balls. "
    "5 + 6 = 11. The answer is 11.\n\nQ: The cafeteria had 23 apples. If they used 20 to make lunch and bought 6 "
    "more, how many apples do they have?\nA:"
)

few_shot_cot_prompt_message = construct_input(few_shot_cot_prompt)
print(few_shot_cot_prompt)

Q: Roger has 5 tennis balls. He buys 2 more cans of tennis balls. Each can has 3 tennis balls. How many tennis balls does he have now?
A: Roger started with 5 balls. 2 cans of 3 tennis balls each is 6 tennis balls. 5 + 6 = 11. The answer is 11.

Q: The cafeteria had 23 apples. If they used 20 to make lunch and bought 6 more, how many apples do they have?
A:


In [54]:
generation_example = client.chat.completions.create(model=model,messages = few_shot_cot_prompt_message, **moderate_generation_config)
print(generation_example.choices[0].message.content)

To solve the problem, follow these steps:

1. Start with the initial number of apples: 23
2. Subtract the number of apples used for lunch: 23 - 20 = 3
3. Add the number of apples bought: 3 + 6 = 9

The cafeteria now has 9 apples.


## An example from the AQuA: Algebraic Word Problems task.

Let's try to compare few-shot prompting with few-shot CoT for slightly different kind of problem. This example is drawn from the AQuA: Algebraic Word Problems task.

In [55]:
few_shot_prompt = (
    "Q: John found that the average of 15 numbers is 40. If 10 is added to each number then the mean of "
    "the numbers is? Answer Choices: (a) 50 (b) 45 (c) 65 (d) 78 (e) 64\nA: The answer is (a).\n\nQ: The capacity of "
    "a tank of dimensions (8 m × 6 m × 2.5 m) is Answer Choices: (a) 120 litres (b) 1200 litres (c) 12000 litres (d) "
    "120000 litres (e) None of these\nA: \n what is the answer to the second Q with a,b,c,d,or e and don't say a single word except the final answer."
)
few_shot_prompt_message = construct_input(few_shot_prompt)
print(few_shot_prompt)

Q: John found that the average of 15 numbers is 40. If 10 is added to each number then the mean of the numbers is? Answer Choices: (a) 50 (b) 45 (c) 65 (d) 78 (e) 64
A: The answer is (a).

Q: The capacity of a tank of dimensions (8 m × 6 m × 2.5 m) is Answer Choices: (a) 120 litres (b) 1200 litres (c) 12000 litres (d) 120000 litres (e) None of these
A: 
 what is the answer to the second Q with a,b,c,d,or e and don't say a single word except the final answer.


In [66]:
generation_example = client.chat.completions.create(model=model,messages = few_shot_prompt_message, **small_generation_config)
print(generation_example.choices[0].message.content)

(c)


The correct choice for this problem is "d".

In [72]:
few_shot_cot_prompt = (
    "Q: John found that the average of 15 numbers is 40. If 10 is added to each number then the mean of the numbers "
    "is? Answer Choices: (a) 50 (b) 45 (c) 65 (d) 78 (e) 64\nA: If 10 is added to each number, then the mean of the "
    "numbers also increases by 10. So the new mean would be 50. The answer is (a).\n\nQ: The capacity of "
    "a tank of dimensions (8 m × 6 m × 2.5 m) is Answer Choices: (a) 120 litres (b) 1200 litres (c) 12000 litres (d) "
    "120000 litres (e) None of these \n what is the answer to the second Q with a,b,c,d,or e and don't say a single word except the final answer.:"
)
few_shot_cot_prompt_message = construct_input(few_shot_cot_prompt)
print(few_shot_cot_prompt)

Q: John found that the average of 15 numbers is 40. If 10 is added to each number then the mean of the numbers is? Answer Choices: (a) 50 (b) 45 (c) 65 (d) 78 (e) 64
A: If 10 is added to each number, then the mean of the numbers also increases by 10. So the new mean would be 50. The answer is (a).

Q: The capacity of a tank of dimensions (8 m × 6 m × 2.5 m) is Answer Choices: (a) 120 litres (b) 1200 litres (c) 12000 litres (d) 120000 litres (e) None of these 
 what is the answer to the second Q with a,b,c,d,or e and don't say a single word except the final answer.:


In [80]:
generation_example = client.chat.completions.create(model=model,messages = few_shot_cot_prompt_message, **moderate_generation_config)
print(generation_example.choices[0].message.content)

d


Sometimes the examples are not good enough.

# Zero-Shot Chain of Thought Prompting

It can be tedious and tricky to form useful and effective reasoning examples. Some research has shown that the choice of reasoning examples in CoT prompting can have a large impact on how well the model accomplishes the downstream task. So let's try a zero-shot CoT approach devised in "Large Language Models are Zero-Shot Reasoners"

In [81]:
few_shot_prompt = (
    "Q: Roger has 5 tennis balls. He buys 2 more cans of tennis balls. Each can has 3 tennis balls. How many tennis "
    "balls does he have now?\nA: The answer is 11.\n\nQ: There are 64 students trying out for the school's trivia "
    "teams. If 36 of them didn't get picked for the team and the rest were put into 4 groups, how many students would "
    "be in each group?\nA: \nJust give the final answer to the last question with no explanation."
)


few_shot_prompt_message = construct_input(few_shot_prompt)
print(few_shot_prompt)

Q: Roger has 5 tennis balls. He buys 2 more cans of tennis balls. Each can has 3 tennis balls. How many tennis balls does he have now?
A: The answer is 11.

Q: There are 64 students trying out for the school's trivia teams. If 36 of them didn't get picked for the team and the rest were put into 4 groups, how many students would be in each group?
A: 
Just give the final answer to the last question with no explanation.


In [82]:
generation_example = client.chat.completions.create(model=model,messages = few_shot_prompt_message, **small_generation_config)
print(generation_example.choices[0].message.content)

10


The correct answer to this problem is 7.

Could you get the correct answer with this example?


# TASK: 

Try to do CoT without adding examples.


Split into two stages:

1) Reasoning Generation   
2) Answer Extraction

In [113]:
question_prompt = ("Q: There are 64 students trying out for the school's trivia teams."
                   "If 36 of them didn't get picked for the team and the rest were put into 4 groups, how many"
                   "students would be in each group?\nA: \nLet's think step by step." )

question_prompt_message = construct_input(question_prompt)
print(question_prompt)

Q: There are 64 students trying out for the school's trivia teams.If 36 of them didn't get picked for the team and the rest were put into 4 groups, how manystudents would be in each group?
A: 
Let's think step by step.


In [116]:
generation_example = client.chat.completions.create(model=model, messages = question_prompt_message, **moderate_generation_config)
reasoning_extraction = generation_example.choices[0].message.content
print(reasoning_extraction)

Step 1: Determine the number of students who were picked for the team. There are 64 students total, and 36 didn’t get picked, which means that 64 - 36 = 28 students were picked for the team.

Step 2: Divide the number of students who were picked for the team by the number of groups they were put into to determine the number of students in each group. Since there are 4 groups, we need to divide 28 by 4.

28 ÷ 4 = 7

The final answer is: $\boxed{7}$


Try to get the correct answer (7) with no example.

In [18]:
# Hint: break down the problem into two steps. First, ask the model for reasoning,
# then, given the reasoning, ask for the final answer by appending "\nTherefore, the answer is" followed by the reasoning.

In [117]:
reasoning_prompt = ("Q: There are 64 students trying out for the school's trivia teams."
                   "If 36 of them didn't get picked for the team and the rest were put into 4 groups, how many"
                   "students would be in each group?\nA: Let's think step by step.\n\n" + reasoning_extraction +
                   "\n\nTherefore, what is the final answer in numerals? Don't say a single word except the final answer.")

reasoning_prompt_message = construct_input(reasoning_prompt)
print(reasoning_prompt)

Q: There are 64 students trying out for the school's trivia teams.If 36 of them didn't get picked for the team and the rest were put into 4 groups, how manystudents would be in each group?
A: Let's think step by step.

Step 1: Determine the number of students who were picked for the team. There are 64 students total, and 36 didn’t get picked, which means that 64 - 36 = 28 students were picked for the team.

Step 2: Divide the number of students who were picked for the team by the number of groups they were put into to determine the number of students in each group. Since there are 4 groups, we need to divide 28 by 4.

28 ÷ 4 = 7

The final answer is: $\boxed{7}$

Therefore, what is the final answer in numerals? Don't say a single word except the final answer.


In [125]:
generation_example = client.chat.completions.create(model=model, messages = reasoning_prompt_message, **moderate_generation_config)
reasoning_generation = generation_example.choices[0].message.content
print(reasoning_generation)

7
