### Environment Setup

In [1]:
import numpy as np
import os
from time import sleep
from typing import List, Callable
import jiwer
import openai
import pandas as pd
from datasets import Dataset
from sacrebleu import corpus_bleu
from nltk.translate.meteor_score import meteor_score
from bert_score import BERTScorer
import evaluate
from tqdm import tqdm
from transformers import AutoTokenizer
import torch
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')

#Load the environment variables
from dotenv import load_dotenv
load_dotenv() 

[nltk_data] Downloading package wordnet to /h/omidv/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /h/omidv/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

### Helper Functions

In [2]:
def compute_wer(reference, hypothesis):
    return jiwer.wer(reference, hypothesis)

def compute_bleu(references, hypotheses):
    return corpus_bleu(hypotheses, references).score

def compute_meteor(references, hypotheses):
    scores = []
    for ref, hyp in zip(references, hypotheses):
        scores.append(meteor_score([ref.split()], hyp.split()))
    return sum(scores)/len(scores)

def compute_bertscore(references, hypotheses):
    scorer = BERTScorer(lang="en", rescale_with_baseline=True)
    p, r, f1 = scorer.score(hypotheses, references)
    bert_score = {
            'precision': p.mean().item(),
            'recall': r.mean().item(),
            'f1': f1.mean().item()
        }
    return bert_score

def construct_input(question):
    prompt = [{"role": "user", "content": question}]
    return prompt

In [14]:
def evaluate_model(dataset:Dataset, model: str, client: openai.OpenAI, postprocessing: Callable[[List[str]], str], generation_config: dict, 
                   use_llm: bool = True, verbose: int = 0, step: int = 100) -> dict:
    """Evaluate model performance on the entire dataset."""
    all_predictions = []
    all_references = []
    running_wer = []
    for i in tqdm(range(len(dataset))):
        # Get hypotheses for current example
        hypotheses = [h.strip() for h in dataset['source'][i].split('.') if h.strip()]
        reference = dataset['target'][i]
        
        # Generate prompt
        if use_llm:
            llm_prompt = postprocessing(hypotheses)
            messages = construct_input(llm_prompt)

            try:
                generation = client.chat.completions.create(
                    model=model,
                    messages=messages,
                    **generation_config
                )
                prediction = generation.choices[0].message.content
            except Exception as e:
                print(f"Error processing example {i}: {e}")
                prediction = ""
        else:
            prediction = postprocessing(hypotheses, reference)
        reference, prediction = reference.lower(), prediction.lower()
        wer = jiwer.wer(reference, prediction)
        running_wer.append(wer)
        all_predictions.append(prediction)
        all_references.append(reference)
        
        # Print progress update for every %step examples
        if (i + 1) % step == 0:
            print(f"Current average WER: {round(np.mean(running_wer).item(), 3):.3f}")
            if verbose == 1:
                print('-----------------------------------------------------------')
                print("Corrected: %s\nTarget:    %s\n"%(prediction, reference))
                
    # Calculate metrics
    bertscore = compute_bertscore(all_predictions, all_references)
    metrics = {
        'WER': round(np.mean(running_wer).item(), 3),
        'METEOR': round(compute_meteor(all_predictions, all_references), 3),
        'BERT Precision': round(bertscore['precision'], 3),
        'BERT Recall': round(bertscore['recall'], 3),
        'BERT F1': round(bertscore['f1'], 3),
        #'BLEU': round(compute_bleu(all_predictions, all_references), 3),
    }
    return metrics

### Specify Experiment Settings

In [4]:
model="Meta-Llama-3.1-8B-Instruct"
client = openai.OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
small_generation_config = {"max_tokens": 20, "temperature": 0.9}
moderate_generation_config = {"max_tokens": 200, "temperature": 0.9}

# If model is not yet available, try again after some delay.
output = None
while output is None:
    try:
        output = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": "Please introduce yourself."}],
        )
    
    except openai.APIError as e:
        print(e)
        sleep(10)

print(output.choices[0].message.content)

I'm an artificial intelligence model known as Llama. Llama stands for "Large Language Model Meta AI."


In [5]:
prompt = construct_input("What is the capital of Djibouti?")
generation = client.chat.completions.create(model=model, messages = prompt, **small_generation_config)
print(generation.choices[0].message.content)

The capital of Djibouti is Djibouti City.


In [6]:
# Baselines
def get_oracle_hypothesis(hypotheses: List[str], reference: str) -> str:
    """
    Find the hypothesis that gives the lowest WER compared to the reference.
    Returns the best hypothesis and its WER.
    """
    wers = [jiwer.wer(reference, hyp) for hyp in hypotheses]
    best_idx = np.argmin(wers)
    return hypotheses[best_idx]

def get_top1_hypothesis(hypotheses: List[str], reference: str) -> str:
    """Get the first hypothesis (baseline)."""
    return hypotheses[0]

In [7]:
def zero_shot_unconstrained(hypotheses: List[str]) -> str:
    prompt = "Perform error correction on the top5 outputs generated by an Automatic Speech Recognition"
    "(ASR) system. The ASR hypotheses, listed in order of their ASR posterior score, are as follows:\n\n"
    for idx, hypothesis in enumerate(hypotheses):
        prompt += "<hypothesis"+ str(idx) + ">" + hypothesis + "</hypothesis"+ str(idx) + ">\n"
    return prompt + "Please provide the corrected top1 ASR transcription of the given utterance only, do not add any explanation or other words."

def zero_shot_constrained(hypotheses: List[str]) -> str:
    prompt = "Perform language model rescoring based on the top5 outputs generated by an Automatic Speech Recognition"
    "(ASR) system. The ASR hypotheses, listed in order of their ASR posterior score, are as follows:\n\n"
    for idx, hypothesis in enumerate(hypotheses):
        prompt += "<option"+ str(idx) + ">" + hypothesis + "</option"+ str(idx) + ">\n"
    return prompt + "Please output the selected top1 ASR transcription, do not add any explanation or <option> tags."

def zero_shot_closest(hypotheses: List[str]) -> str:
    # TO DO
    
def zero_shot_lattice(hypotheses: List[str]) -> str:
    # TO DO

# Common Voice Test Dataset

In [8]:
# Importing Dataset
df = pd.read_csv("/fs01/home/omidv/data/test_cv.csv")
dataset = Dataset.from_pandas(df)
print(dataset)
hypotheses = [h.strip() for h in dataset['source'][0].split('.') if h.strip()]
print(hypotheses)

Dataset({
    features: ['source', 'target', 'best_hypo'],
    num_rows: 1098
})
['transit road surveyed by joseph ellicott was named for an important surveying instrument', 'transit wrote surveyed by joseph ellicott was named for an important surveying instrument', 'transit road surveyed by joseph ellikot was named for an important surveying instrument', 'transit road surveyed by joseph ellicott was named for an important surveying instrument', 'transit road surveyed by joseph ellicate was named for an important surveying instrument']


In [12]:
metrics_zero_shot_unconstrained = evaluate_model(dataset, model, client, zero_shot_unconstrained,
                                                 small_generation_config, use_llm=True, verbose = True)

  9%|▉         | 100/1098 [00:57<10:01,  1.66it/s]

Current average WER: 0.189
-----------------------------------------------------------
Corrected: it is commonly used interchangeably with terms political islam or islamic fundamentalism
Target:    it is commonly used interchangeably with the terms political islam or islamic fundamentalism



 18%|█▊        | 200/1098 [01:58<09:11,  1.63it/s]

Current average WER: 0.207
-----------------------------------------------------------
Corrected: in some cases the purpose of this operation needs to curb excessive vomiting
Target:    in some cases the purpose of this operation is to correct excessive vomiting



 27%|██▋       | 300/1098 [03:01<08:11,  1.62it/s]

Current average WER: 0.202
-----------------------------------------------------------
Corrected: a woman sits in the snow with a young child on her lap
Target:    a woman sits in the snow with a young child on her lap



 36%|███▋      | 400/1098 [04:03<06:23,  1.82it/s]

Current average WER: 0.184
-----------------------------------------------------------
Corrected: she saw he was red and hurt and is fallen
Target:    she saw he was rather pleased and her anxiety all went



 46%|████▌     | 500/1098 [05:04<05:43,  1.74it/s]

Current average WER: 0.179
-----------------------------------------------------------
Corrected: on television and radio the point has fronted sport travel and rural affairs programs
Target:    on television and radio vipond has fronted sport travel and rural affairs programs



 55%|█████▍    | 600/1098 [06:05<04:37,  1.80it/s]

Current average WER: 0.179
-----------------------------------------------------------
Corrected: a major local employer is the wespac semis power plant
Target:    a major local employer is the w h sammis power plant



 64%|██████▍   | 700/1098 [07:07<04:10,  1.59it/s]

Current average WER: 0.185
-----------------------------------------------------------
Corrected: these and other indicators reveal a seriously declining performance standard in the city is
Target:    these and other indicators reveal a seriously declining performance standard in the city is schools



 73%|███████▎  | 800/1098 [08:05<02:38,  1.88it/s]

Current average WER: 0.184
-----------------------------------------------------------
Corrected: several shows have also been announced for north america in december
Target:    several shows have also been announced for north america in december



 82%|████████▏ | 900/1098 [09:07<01:49,  1.80it/s]

Current average WER: 0.185
-----------------------------------------------------------
Corrected: there is a second of brooding loneliness in existence
Target:    there is not a second of brooding loneliness in their existence



 91%|█████████ | 1000/1098 [10:08<01:03,  1.54it/s]

Current average WER: 0.187
-----------------------------------------------------------
Corrected: his first movie role was in roger corman's streetwalker playing a pimp named duke
Target:    his first movie role was in roger corman is streetwalkin playing a pimp named duke



100%|██████████| 1098/1098 [11:07<00:00,  1.65it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
metrics_zero_shot_constrained = evaluate_model(dataset, model, client, zero_shot_constrained,
                                                small_generation_config, use_llm=True, verbose = True)

  9%|▉         | 101/1098 [00:56<07:47,  2.13it/s]

Current average WER: 0.136
-----------------------------------------------------------
Corrected: it is commonly used interchangeably with terms political islam or islamic fundamentalism
Target:    it is commonly used interchangeably with the terms political islam or islamic fundamentalism



 18%|█▊        | 200/1098 [01:55<08:40,  1.73it/s]

Current average WER: 0.159
-----------------------------------------------------------
Corrected: in some cases the purpose of this operation needs to curb excessive vomiting
Target:    in some cases the purpose of this operation is to correct excessive vomiting



 27%|██▋       | 300/1098 [02:57<08:00,  1.66it/s]

Current average WER: 0.156
-----------------------------------------------------------
Corrected: a woman sits in the snow with a young child on her lap
Target:    a woman sits in the snow with a young child on her lap



 36%|███▋      | 400/1098 [03:59<06:24,  1.82it/s]

Current average WER: 0.145
-----------------------------------------------------------
Corrected: she saw he was red pleased and hurt and is fallen
Target:    she saw he was rather pleased and her anxiety all went



 46%|████▌     | 500/1098 [04:59<05:37,  1.77it/s]

Current average WER: 0.142
-----------------------------------------------------------
Corrected: on television and radio the point has fronted sport travel and rural affairs programs
Target:    on television and radio vipond has fronted sport travel and rural affairs programs



 55%|█████▍    | 600/1098 [05:59<04:33,  1.82it/s]

Current average WER: 0.145
-----------------------------------------------------------
Corrected: a major local employer is the who semis power plant
Target:    a major local employer is the w h sammis power plant



 64%|██████▍   | 700/1098 [07:00<03:55,  1.69it/s]

Current average WER: 0.147
-----------------------------------------------------------
Corrected: these and how the indicators reveal a seriously declining performance standard in the city is
Target:    these and other indicators reveal a seriously declining performance standard in the city is schools



 73%|███████▎  | 800/1098 [07:59<02:40,  1.86it/s]

Current average WER: 0.149
-----------------------------------------------------------
Corrected: several shows have also been announced for north america in december
Target:    several shows have also been announced for north america in december



 82%|████████▏ | 900/1098 [09:00<01:54,  1.73it/s]

Current average WER: 0.149
-----------------------------------------------------------
Corrected: there is in a second of brooding loneliness in the existence
Target:    there is not a second of brooding loneliness in their existence



 91%|█████████ | 1000/1098 [10:00<01:02,  1.57it/s]

Current average WER: 0.151
-----------------------------------------------------------
Corrected: his first movie role was in roger corman street walking playing a pimp named duke
Target:    his first movie role was in roger corman is streetwalkin playing a pimp named duke



100%|██████████| 1098/1098 [11:00<00:00,  1.66it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
metrics_get_oracle_hypothesis = evaluate_model(dataset, model, client, get_oracle_hypothesis,
                                               small_generation_config, use_llm=False)
metrics_get_top1_hypothesis = evaluate_model(dataset, model, client, get_top1_hypothesis,
                                             small_generation_config, use_llm=False)

 14%|█▍        | 154/1098 [00:00<00:03, 300.94it/s]

Current average WER: 0.102


 22%|██▏       | 243/1098 [00:00<00:03, 266.38it/s]

Current average WER: 0.122


 31%|███       | 336/1098 [00:01<00:02, 292.64it/s]

Current average WER: 0.121


 42%|████▏     | 460/1098 [00:01<00:02, 299.86it/s]

Current average WER: 0.110


 50%|█████     | 553/1098 [00:01<00:01, 302.17it/s]

Current average WER: 0.107


 59%|█████▊    | 644/1098 [00:02<00:01, 297.34it/s]

Current average WER: 0.108


 67%|██████▋   | 735/1098 [00:02<00:01, 300.37it/s]

Current average WER: 0.112


 78%|███████▊  | 859/1098 [00:02<00:00, 300.97it/s]

Current average WER: 0.109


 87%|████████▋ | 952/1098 [00:03<00:00, 299.21it/s]

Current average WER: 0.110


 95%|█████████▌| 1044/1098 [00:03<00:00, 300.10it/s]

Current average WER: 0.112


100%|██████████| 1098/1098 [00:03<00:00, 294.09it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 12%|█▏        | 133/1098 [00:00<00:03, 319.40it/s]

Current average WER: 0.137


 24%|██▍       | 265/1098 [00:00<00:02, 323.89it/s]

Current average WER: 0.156


 33%|███▎      | 366/1098 [00:01<00:02, 329.69it/s]

Current average WER: 0.155


 43%|████▎     | 467/1098 [00:01<00:01, 331.60it/s]

Current average WER: 0.146


 49%|████▊     | 535/1098 [00:01<00:01, 330.41it/s]

Current average WER: 0.142


 58%|█████▊    | 637/1098 [00:01<00:01, 331.49it/s]

Current average WER: 0.144


 67%|██████▋   | 737/1098 [00:02<00:01, 325.09it/s]

Current average WER: 0.147


 76%|███████▌  | 836/1098 [00:02<00:00, 322.51it/s]

Current average WER: 0.145


 85%|████████▌ | 936/1098 [00:02<00:00, 326.54it/s]

Current average WER: 0.146


 94%|█████████▍| 1035/1098 [00:03<00:00, 326.98it/s]

Current average WER: 0.148


100%|██████████| 1098/1098 [00:03<00:00, 326.16it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
results_table = {
    "Top 1": metrics_get_top1_hypothesis,
    "Zero-shot Uncon": metrics_zero_shot_unconstrained,
    "Zero-shot Constr": metrics_zero_shot_constrained,
    "Oracle": metrics_get_oracle_hypothesis,
}
df = pd.DataFrame.from_dict(results_table, orient='index')
print(df[['WER', 'METEOR', 'BERT Precision', 'BERT Recall', 'BERT F1']])

                    WER   BLEU  METEOR  BERT Precision  BERT Recall  BERT F1
Top 1             0.149  0.142   0.876           0.789        0.800    0.794
Zero-shot Uncon   0.189  0.621   0.847           0.771        0.788    0.780
Zero-shot Constr  0.152  0.621   0.872           0.786        0.799    0.793
Oracle            0.112  0.142   0.903           0.828        0.840    0.834


# Wall Street Journal Test Dataset


In [None]:
# Importing Dataset
df = pd.read_csv("/fs01/home/omidv/data/test_wsj_score.csv")
dataset = Dataset.from_pandas(df)
print(dataset)
hypotheses = [h.strip() for h in dataset['source'][0].split('.') if h.strip()]
print(hypotheses)

In [None]:
metrics_zero_shot_unconstrained = evaluate_model(dataset, model, client, zero_shot_unconstrained,
                                                 small_generation_config, use_llm=True, verbose = True)

In [None]:
metrics_zero_shot_constrained = evaluate_model(dataset, model, client, zero_shot_constrained,
                                                small_generation_config, use_llm=True, verbose = True)

In [None]:
metrics_get_oracle_hypothesis = evaluate_model(dataset, model, client, get_oracle_hypothesis,
                                               small_generation_config, use_llm=False)
metrics_get_top1_hypothesis = evaluate_model(dataset, model, client, get_top1_hypothesis,
                                             small_generation_config, use_llm=False)

In [None]:
results_table = {
    "Top 1": metrics_get_top1_hypothesis,
    "Zero-shot Uncon": metrics_zero_shot_unconstrained,
    "Zero-shot Constr": metrics_zero_shot_constrained,
    "Oracle": metrics_get_oracle_hypothesis,
}
df = pd.DataFrame.from_dict(results_table, orient='index')
df = df[['WER', 'BLEU', 'METEOR', 'BERT Precision', 'BERT Recall', 'BERT F1']]
print(df)

# SwitchBoard Test Dataset

In [None]:
# Importing Dataset
df = pd.read_csv("/fs01/home/omidv/data/test_swbd.csv")
dataset = Dataset.from_pandas(df)
print(dataset)
hypotheses = [h.strip() for h in dataset['source'][0].split('.') if h.strip()]
print(hypotheses)

In [None]:
metrics_zero_shot_unconstrained = evaluate_model(dataset, model, client, zero_shot_unconstrained,
                                                 small_generation_config, use_llm=True, verbose = True)

In [None]:
metrics_zero_shot_constrained = evaluate_model(dataset, model, client, zero_shot_constrained,
                                                small_generation_config, use_llm=True, verbose = True)

In [None]:
metrics_get_oracle_hypothesis = evaluate_model(dataset, model, client, get_oracle_hypothesis,
                                               small_generation_config, use_llm=False)
metrics_get_top1_hypothesis = evaluate_model(dataset, model, client, get_top1_hypothesis,
                                             small_generation_config, use_llm=False)

In [None]:
results_table = {
    "Top 1": metrics_get_top1_hypothesis,
    "Zero-shot Uncon": metrics_zero_shot_unconstrained,
    "Zero-shot Constr": metrics_zero_shot_constrained,
    "Oracle": metrics_get_oracle_hypothesis,
}
df = pd.DataFrame.from_dict(results_table, orient='index')
df = df[['WER', 'BLEU', 'METEOR', 'BERT Precision', 'BERT Recall', 'BERT F1']]
print(df)

# Few-Shot Chain of Thought Prompting

We'll start by prompting the model to solve some word problems and build up to using the Few-Shot CoT method proposed in "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models"

First try "zero-shot prompting".

In [90]:
zero_shot_prompt = (
    "The cafeteria had 23 apples. If they used 20 to make lunch and bought 6 more, how many apples do they have? just give final answer with no explanation."
)
zero_shot_prompt = construct_input(zero_shot_prompt)

generation_example = client.chat.completions.create(model=model,messages = zero_shot_prompt, **small_generation_config)
print(generation_example.choices[0].message.content)

37


The correct answer is 9.

Now let's try standard few-shot prompt.

In [39]:
few_shot_prompt = (
    "Q: Roger has 5 tennis balls. He buys 2 more cans of tennis balls. Each can has 3 tennis balls. How many tennis "
    "balls does he have now?\nA: The answer is 11.\n\nQ: Benjamin is taking bottle inventory. He has two cases with "
    "15 bottles in each and one with 7. How many bottles are there in total?\nA: The answer is 37.\n\nQ: The "
    "cafeteria had 23 apples. If they used 20 to make lunch and bought 6 more, how many apples do they have?\nA: "
    "The answer is ...\nJust give the final answer to the last question with no explanation."
)
few_shot_message = construct_input(few_shot_prompt)

print(few_shot_prompt)

Q: Roger has 5 tennis balls. He buys 2 more cans of tennis balls. Each can has 3 tennis balls. How many tennis balls does he have now?
A: The answer is 11.

Q: Benjamin is taking bottle inventory. He has two cases with 15 bottles in each and one with 7. How many bottles are there in total?
A: The answer is 37.

Q: The cafeteria had 23 apples. If they used 20 to make lunch and bought 6 more, how many apples do they have?
A: The answer is ...
Just give the final answer to the last question with no explanation.


In [47]:
generation_example = client.chat.completions.create(model=model,messages = few_shot_message,  **small_generation_config)
print(generation_example.choices[0].message.content)

9


Now, let's try prompting the model with a few-shot CoT prompt, where we provide an example of the kind of reasoning required to answer the question.

In [48]:
few_shot_cot_prompt = (
    "Q: Roger has 5 tennis balls. He buys 2 more cans of tennis balls. Each can has 3 tennis balls. How many tennis "
    "balls does he have now?\nA: Roger started with 5 balls. 2 cans of 3 tennis balls each is 6 tennis balls. "
    "5 + 6 = 11. The answer is 11.\n\nQ: The cafeteria had 23 apples. If they used 20 to make lunch and bought 6 "
    "more, how many apples do they have?\nA:"
)

few_shot_cot_prompt_message = construct_input(few_shot_cot_prompt)
print(few_shot_cot_prompt)

Q: Roger has 5 tennis balls. He buys 2 more cans of tennis balls. Each can has 3 tennis balls. How many tennis balls does he have now?
A: Roger started with 5 balls. 2 cans of 3 tennis balls each is 6 tennis balls. 5 + 6 = 11. The answer is 11.

Q: The cafeteria had 23 apples. If they used 20 to make lunch and bought 6 more, how many apples do they have?
A:


In [54]:
generation_example = client.chat.completions.create(model=model,messages = few_shot_cot_prompt_message, **moderate_generation_config)
print(generation_example.choices[0].message.content)

To solve the problem, follow these steps:

1. Start with the initial number of apples: 23
2. Subtract the number of apples used for lunch: 23 - 20 = 3
3. Add the number of apples bought: 3 + 6 = 9

The cafeteria now has 9 apples.


## An example from the AQuA: Algebraic Word Problems task.

Let's try to compare few-shot prompting with few-shot CoT for slightly different kind of problem. This example is drawn from the AQuA: Algebraic Word Problems task.

In [55]:
few_shot_prompt = (
    "Q: John found that the average of 15 numbers is 40. If 10 is added to each number then the mean of "
    "the numbers is? Answer Choices: (a) 50 (b) 45 (c) 65 (d) 78 (e) 64\nA: The answer is (a).\n\nQ: The capacity of "
    "a tank of dimensions (8 m × 6 m × 2.5 m) is Answer Choices: (a) 120 litres (b) 1200 litres (c) 12000 litres (d) "
    "120000 litres (e) None of these\nA: \n what is the answer to the second Q with a,b,c,d,or e and don't say a single word except the final answer."
)
few_shot_prompt_message = construct_input(few_shot_prompt)
print(few_shot_prompt)

Q: John found that the average of 15 numbers is 40. If 10 is added to each number then the mean of the numbers is? Answer Choices: (a) 50 (b) 45 (c) 65 (d) 78 (e) 64
A: The answer is (a).

Q: The capacity of a tank of dimensions (8 m × 6 m × 2.5 m) is Answer Choices: (a) 120 litres (b) 1200 litres (c) 12000 litres (d) 120000 litres (e) None of these
A: 
 what is the answer to the second Q with a,b,c,d,or e and don't say a single word except the final answer.


In [66]:
generation_example = client.chat.completions.create(model=model,messages = few_shot_prompt_message, **small_generation_config)
print(generation_example.choices[0].message.content)

(c)


The correct choice for this problem is "d".

In [72]:
few_shot_cot_prompt = (
    "Q: John found that the average of 15 numbers is 40. If 10 is added to each number then the mean of the numbers "
    "is? Answer Choices: (a) 50 (b) 45 (c) 65 (d) 78 (e) 64\nA: If 10 is added to each number, then the mean of the "
    "numbers also increases by 10. So the new mean would be 50. The answer is (a).\n\nQ: The capacity of "
    "a tank of dimensions (8 m × 6 m × 2.5 m) is Answer Choices: (a) 120 litres (b) 1200 litres (c) 12000 litres (d) "
    "120000 litres (e) None of these \n what is the answer to the second Q with a,b,c,d,or e and don't say a single word except the final answer.:"
)
few_shot_cot_prompt_message = construct_input(few_shot_cot_prompt)
print(few_shot_cot_prompt)

Q: John found that the average of 15 numbers is 40. If 10 is added to each number then the mean of the numbers is? Answer Choices: (a) 50 (b) 45 (c) 65 (d) 78 (e) 64
A: If 10 is added to each number, then the mean of the numbers also increases by 10. So the new mean would be 50. The answer is (a).

Q: The capacity of a tank of dimensions (8 m × 6 m × 2.5 m) is Answer Choices: (a) 120 litres (b) 1200 litres (c) 12000 litres (d) 120000 litres (e) None of these 
 what is the answer to the second Q with a,b,c,d,or e and don't say a single word except the final answer.:


In [80]:
generation_example = client.chat.completions.create(model=model,messages = few_shot_cot_prompt_message, **moderate_generation_config)
print(generation_example.choices[0].message.content)

d


Sometimes the examples are not good enough.

# Zero-Shot Chain of Thought Prompting

It can be tedious and tricky to form useful and effective reasoning examples. Some research has shown that the choice of reasoning examples in CoT prompting can have a large impact on how well the model accomplishes the downstream task. So let's try a zero-shot CoT approach devised in "Large Language Models are Zero-Shot Reasoners"

In [81]:
few_shot_prompt = (
    "Q: Roger has 5 tennis balls. He buys 2 more cans of tennis balls. Each can has 3 tennis balls. How many tennis "
    "balls does he have now?\nA: The answer is 11.\n\nQ: There are 64 students trying out for the school's trivia "
    "teams. If 36 of them didn't get picked for the team and the rest were put into 4 groups, how many students would "
    "be in each group?\nA: \nJust give the final answer to the last question with no explanation."
)


few_shot_prompt_message = construct_input(few_shot_prompt)
print(few_shot_prompt)

Q: Roger has 5 tennis balls. He buys 2 more cans of tennis balls. Each can has 3 tennis balls. How many tennis balls does he have now?
A: The answer is 11.

Q: There are 64 students trying out for the school's trivia teams. If 36 of them didn't get picked for the team and the rest were put into 4 groups, how many students would be in each group?
A: 
Just give the final answer to the last question with no explanation.


In [82]:
generation_example = client.chat.completions.create(model=model,messages = few_shot_prompt_message, **small_generation_config)
print(generation_example.choices[0].message.content)

10


The correct answer to this problem is 7.

Could you get the correct answer with this example?


# TASK: 

Try to do CoT without adding examples.


Split into two stages:

1) Reasoning Generation   
2) Answer Extraction

In [113]:
question_prompt = ("Q: There are 64 students trying out for the school's trivia teams."
                   "If 36 of them didn't get picked for the team and the rest were put into 4 groups, how many"
                   "students would be in each group?\nA: \nLet's think step by step." )

question_prompt_message = construct_input(question_prompt)
print(question_prompt)

Q: There are 64 students trying out for the school's trivia teams.If 36 of them didn't get picked for the team and the rest were put into 4 groups, how manystudents would be in each group?
A: 
Let's think step by step.


In [116]:
generation_example = client.chat.completions.create(model=model, messages = question_prompt_message, **moderate_generation_config)
reasoning_extraction = generation_example.choices[0].message.content
print(reasoning_extraction)

Step 1: Determine the number of students who were picked for the team. There are 64 students total, and 36 didn’t get picked, which means that 64 - 36 = 28 students were picked for the team.

Step 2: Divide the number of students who were picked for the team by the number of groups they were put into to determine the number of students in each group. Since there are 4 groups, we need to divide 28 by 4.

28 ÷ 4 = 7

The final answer is: $\boxed{7}$


Try to get the correct answer (7) with no example.

In [18]:
# Hint: break down the problem into two steps. First, ask the model for reasoning,
# then, given the reasoning, ask for the final answer by appending "\nTherefore, the answer is" followed by the reasoning.

In [117]:
reasoning_prompt = ("Q: There are 64 students trying out for the school's trivia teams."
                   "If 36 of them didn't get picked for the team and the rest were put into 4 groups, how many"
                   "students would be in each group?\nA: Let's think step by step.\n\n" + reasoning_extraction +
                   "\n\nTherefore, what is the final answer in numerals? Don't say a single word except the final answer.")

reasoning_prompt_message = construct_input(reasoning_prompt)
print(reasoning_prompt)

Q: There are 64 students trying out for the school's trivia teams.If 36 of them didn't get picked for the team and the rest were put into 4 groups, how manystudents would be in each group?
A: Let's think step by step.

Step 1: Determine the number of students who were picked for the team. There are 64 students total, and 36 didn’t get picked, which means that 64 - 36 = 28 students were picked for the team.

Step 2: Divide the number of students who were picked for the team by the number of groups they were put into to determine the number of students in each group. Since there are 4 groups, we need to divide 28 by 4.

28 ÷ 4 = 7

The final answer is: $\boxed{7}$

Therefore, what is the final answer in numerals? Don't say a single word except the final answer.


In [125]:
generation_example = client.chat.completions.create(model=model, messages = reasoning_prompt_message, **moderate_generation_config)
reasoning_generation = generation_example.choices[0].message.content
print(reasoning_generation)

7
