### Environment Setup

In [1]:
import numpy as np
import os
from time import sleep
from typing import List, Callable
import jiwer
import openai
import pandas as pd
from datasets import Dataset
from sacrebleu import corpus_bleu
from nltk.translate.meteor_score import meteor_score
from bert_score import BERTScorer
import evaluate
from tqdm import tqdm
from transformers import AutoTokenizer
import torch
import time
import asyncio
import nest_asyncio
from dotenv import load_dotenv #Load the environment variables
load_dotenv()

import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')

import logging
logging.getLogger("transformers").setLevel(logging.ERROR)

[nltk_data] Downloading package wordnet to /h/omidv/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /h/omidv/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


### Helper Functions

In [2]:
def compute_wer(reference, hypothesis):
    return jiwer.wer(reference, hypothesis)

def compute_bleu(references, hypotheses):
    return corpus_bleu(hypotheses, references).score

def compute_meteor(references, hypotheses):
    scores = []
    for ref, hyp in zip(references, hypotheses):
        scores.append(meteor_score([ref.split()], hyp.split()))
    return sum(scores)/len(scores)

def compute_bertscore(references, hypotheses):
    scorer = BERTScorer(lang="en", rescale_with_baseline=True)
    p, r, f1 = scorer.score(hypotheses, references)
    bert_score = {'precision': p.mean().item(),
                  'recall': r.mean().item(),
                  'f1': f1.mean().item()}
    return bert_score

def construct_input(question):
    prompt = [{"role": "user", "content": question}]
    return prompt

def extract_hypotheses(dataset, idx):
    if 'source' in dataset.features:
        hypotheses = [h.strip() for h in dataset['source'][idx].split('.') if h.strip()]
        references = dataset['target'][idx]
    else:
        hypotheses = dataset['input'][idx]
        references = dataset['output'][idx]
        
    return hypotheses, references
    

### Iterative Evaluation

In [3]:
def evaluate_model(dataset:Dataset, model: str, client: openai.OpenAI, postprocessing: Callable[[List[str]], str], generation_config: dict, 
                   use_llm: bool = True, verbose: int = 0, step: int = 100) -> dict:
    """Evaluate model performance on the entire dataset."""
    all_predictions = []
    all_references = []
    running_wer = []
    for idx in tqdm(range(len(dataset))):
        hypotheses = [h.strip() for h in dataset[hypothesis_column][idx].split('.') if h.strip()]
        reference = dataset[reference_column][idx]
        
        # Generate prompt
        if use_llm:
            llm_prompt = postprocessing(hypotheses)
            messages = construct_input(llm_prompt)
            try:
                generation = client.chat.completions.create(
                    model=model,
                    messages=messages,
                    **generation_config
                )
                prediction = generation.choices[0].message.content
            except Exception as e:
                print(f"Error processing example {idx}: {e}")
                prediction = ""
        else:
            prediction = postprocessing(hypotheses, reference)
        reference, prediction = reference.lower(), prediction.lower()
        wer = jiwer.wer(reference, prediction)
        running_wer.append(wer)
        all_predictions.append(prediction)
        all_references.append(reference)
        
        # Print progress update for every %step examples
        if (i + 1) % step == 0:
            print(f"Current average WER: {round(np.mean(running_wer).item(), 3):.3f}")
            if verbose == 1:
                print('-----------------------------------------------------------')
                print("Corrected: %s\nTarget:    %s\n"%(prediction, reference))
                
    # Calculate metrics
    bertscore = compute_bertscore(all_predictions, all_references)
    metrics = {
        'WER': round(np.mean(running_wer).item(), 3),
        'METEOR': round(compute_meteor(all_predictions, all_references), 3),
        'BERT Precision': round(bertscore['precision'], 3),
        'BERT Recall': round(bertscore['recall'], 3),
        'BERT F1': round(bertscore['f1'], 3),
        #'BLEU': round(compute_bleu(all_predictions, all_references), 3),
    }
    return metrics

### Asynchronous Evaluation

In [4]:
nest_asyncio.apply()

async def call_openai_with_retry(messages, model, generation_config, client):
    """Handles API retries with exponential backoff."""
    retry_delay = 0.1  # Initial delay in seconds
    while True:
        try:
            # Attempt to make the API call
            generation = await client.chat.completions.create(
                model=model,
                messages=messages,
                **generation_config
            )
            return generation

        except Exception as e:
            await asyncio.sleep(retry_delay)
            retry_delay = min(retry_delay * 2, 10)  # Exponential backoff up to 10s

async def get_prediction(client: openai.AsyncOpenAI, model: str, messages: List[dict], generation_config: dict) -> str:
    """Asynchronously fetch predictions from OpenAI API."""
    try:
        generation = await call_openai_with_retry(messages, model, generation_config, client)
        return generation.choices[0].message.content if generation else ""
    except Exception as e:
        print(f"Error: {e}")
        return ""

async def process_batch(dataset: Dataset, model: str, client: openai.AsyncOpenAI, postprocessing: Callable[[List[str]], str], 
    generation_config: dict, use_llm: bool) -> List[str]:
    """Processes the dataset asynchronously using OpenAI API with progress tracking."""
    tasks = []
    for idx in tqdm(range(len(dataset))):
        hypotheses, reference = extract_hypotheses(dataset, idx)
        if use_llm:
            llm_prompt = postprocessing(hypotheses)
            messages = construct_input(llm_prompt)
            task = asyncio.create_task(get_prediction(client, model, messages, generation_config))
        else:
            # Synchronous postprocessing for non-LLM mode
            task = asyncio.to_thread(postprocessing, hypotheses, reference)
        
        tasks.append(task)
    if use_llm:
        return await track_progress(tasks)
    else:
        return await asyncio.gather(*tasks) 

async def track_progress(tasks):
    """Tracks progress while tasks are running."""
    total_tasks = len(tasks)
    while True:
        completed = sum(task.done() for task in tasks)
        print(f"Progress: {completed}/{total_tasks} tasks completed", end="\r")
        
        if completed == total_tasks:
            print("\nAll tasks completed.")
            break

        await asyncio.sleep(1)  # Update every second

    return await asyncio.gather(*tasks)  # Collect results after completion

async def evaluate_model_parallel(dataset: Dataset, model: str, client: openai.AsyncOpenAI, postprocessing: Callable[[List[str]], str],
                            generation_config: dict, use_llm: bool = True, verbose=0, step=100):
    """Evaluates the model asynchronously with progress tracking, handling Jupyter compatibility."""
    all_predictions = await process_batch(dataset, model, client, postprocessing, generation_config, use_llm)
    all_predictions = [pred.lower() for pred in all_predictions]  # Normalize predictions
    
    reference_column = 'target' if 'target' in dataset.features else 'output'
    all_references = [ref.lower() for ref in dataset[reference_column]]

    # Compute evaluation metrics
    wer_scores = np.array([jiwer.wer(ref, pred) for ref, pred in zip(all_references, all_predictions)])
    bertscore = compute_bertscore(all_predictions, all_references)
    metrics = {
        'WER': round(wer_scores.mean().item(), 3),
        'METEOR': round(compute_meteor(all_predictions, all_references), 3),
        'BERT Precision': round(bertscore['precision'], 3),
        'BERT Recall': round(bertscore['recall'], 3),
        'BERT F1': round(bertscore['f1'], 3),
    }
    return metrics

### Error Correction Functions

In [5]:
# Baselines
def get_oracle_hypothesis(hypotheses: List[str], reference: str) -> str:
    """
    Find the hypothesis that gives the lowest WER compared to the reference.
    Returns the best hypothesis and its WER.
    """
    wers = [jiwer.wer(reference, hyp) for hyp in hypotheses]
    best_idx = np.argmin(wers)
    return hypotheses[best_idx]

def get_top1_hypothesis(hypotheses: List[str], reference: str) -> str:
    """Get the first hypothesis (baseline)."""
    return hypotheses[0]

def zero_shot_unconstrained(hypotheses: List[str]) -> str:
    prompt = "Perform error correction on the top5 outputs generated by an Automatic Speech Recognition"
    "(ASR) system. The ASR hypotheses, listed in order of their ASR posterior score, are as follows:\n\n"
    for idx, hypothesis in enumerate(hypotheses):
        prompt += "<hypothesis"+ str(idx) + ">" + hypothesis + "</hypothesis"+ str(idx) + ">\n"
    return prompt + "Please provide the corrected top1 ASR transcription of the given utterance only, do not add any explanation or other words."

def zero_shot_constrained(hypotheses: List[str]) -> str:
    prompt = "Perform language model rescoring based on the top5 outputs generated by an Automatic Speech Recognition"
    "(ASR) system. The ASR hypotheses, listed in order of their ASR posterior score, are as follows:\n\n"
    for idx, hypothesis in enumerate(hypotheses):
        prompt += "<option"+ str(idx) + ">" + hypothesis + "</option"+ str(idx) + ">\n"
    return prompt + "Please output the selected top1 ASR transcription, do not add any explanation or <option> tags."

def zero_shot_closest(hypotheses: List[str]) -> str:
    # TO DO
    pass
    
def zero_shot_lattice(hypotheses: List[str]) -> str:
    # TO DO LATER
    pass

def CoT_task_activating(hypotheses: List[str]) -> str:
    # TO DO
    pass

### Specify Experiment Settings

In [6]:
model = "Meta-Llama-3.1-8B-Instruct"
client = openai.AsyncOpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
small_generation_config = {"max_tokens": 20, "temperature": 0.9}
moderate_generation_config = {"max_tokens": 200, "temperature": 0.9}

# If model is not yet available, try again after some delay.
output = None
while output is None:
    try:
        output = await client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": "Please introduce yourself."}],
        )
    
    except openai.APIError as e:
        print(e)
        sleep(10)

print(output.choices[0].message.content)

Error code: 429 - {'detail': {'message': 'Rate limit exceeded', 'resource_key': 'Meta-Llama-3.1-8B-Instruct/rpm/omidv', 'limit': 128, 'wait_seconds': 24.107473850250244}}
Error code: 429 - {'detail': {'message': 'Rate limit exceeded', 'resource_key': 'Meta-Llama-3.1-8B-Instruct/rpm/omidv', 'limit': 128, 'wait_seconds': 12.737320899963379}}
Error code: 429 - {'detail': {'message': 'Rate limit exceeded', 'resource_key': 'Meta-Llama-3.1-8B-Instruct/rpm/omidv', 'limit': 128, 'wait_seconds': 1.3496170043945312}}
I'm an artificial intelligence model known as Llama. Llama stands for "Large Language Model Meta AI."


# Common Voice Test Dataset

In [7]:
# Importing Dataset
df = pd.read_csv("/fs01/home/omidv/ASR-Error-Correction/data/test_cv.csv")
dataset = Dataset.from_pandas(df)
print(dataset)
hypotheses = [h.strip() for h in dataset['source'][0].split('.') if h.strip()]
print(hypotheses)

Dataset({
    features: ['source', 'target', 'best_hypo'],
    num_rows: 1098
})
['transit road surveyed by joseph ellicott was named for an important surveying instrument', 'transit wrote surveyed by joseph ellicott was named for an important surveying instrument', 'transit road surveyed by joseph ellikot was named for an important surveying instrument', 'transit road surveyed by joseph ellicott was named for an important surveying instrument', 'transit road surveyed by joseph ellicate was named for an important surveying instrument']


In [8]:
metrics_zero_shot_unconstrained = await evaluate_model_parallel(dataset, model, client, zero_shot_unconstrained,
                                                                small_generation_config, use_llm=True)

100%|██████████| 1098/1098 [00:03<00:00, 326.74it/s]


Progress: 1098/1098 tasks completed
All tasks completed.


In [35]:
metrics_zero_shot_constrained = await evaluate_model_parallel(dataset, model, client, zero_shot_constrained,
                                                              small_generation_config, use_llm=True)

Progress: 1098/1098 tasks completed
All tasks completed.


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [37]:
metrics_get_oracle_hypothesis = await evaluate_model_parallel(dataset, model, client, get_oracle_hypothesis,
                                                              small_generation_config, use_llm=False)
metrics_get_top1_hypothesis = await evaluate_model_parallel(dataset, model, client, get_top1_hypothesis,
                                                              small_generation_config, use_llm=False)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [38]:
results_table = {
    "Top 1": metrics_get_top1_hypothesis,
    "Zero-shot Uncon": metrics_zero_shot_unconstrained,
    "Zero-shot Constr": metrics_zero_shot_constrained,
    "Oracle": metrics_get_oracle_hypothesis,
}
df = pd.DataFrame.from_dict(results_table, orient='index')
print(df[['WER', 'METEOR', 'BERT Precision', 'BERT Recall', 'BERT F1']])

                    WER  METEOR  BERT Precision  BERT Recall  BERT F1
Top 1             0.149   0.876           0.789        0.800    0.794
Zero-shot Uncon   0.185   0.853           0.774        0.795    0.784
Zero-shot Constr  0.153   0.872           0.786        0.801    0.794
Oracle            0.112   0.903           0.828        0.840    0.834


# Wall Street Journal Test Dataset


In [23]:
# Importing Dataset
df = pd.read_csv("/fs01/home/omidv/ASR-Error-Correction/data/test_wsj_score.csv")
dataset = Dataset.from_pandas(df)
print(dataset)
hypotheses = [h.strip() for h in dataset['source'][0].split('.') if h.strip()]
print(hypotheses)

Dataset({
    features: ['source', 'target', 'best_hypo', 'score'],
    num_rows: 836
})
['saatchi officials said the management restructuring might accelerate its efforts to persuade clients to use the firm as a one stop shop for business services', 'sachi officials said the management restructuring might accelerate its efforts to persuade clients to use the firm as a one stop shop for business services', 'saatchi officials said the management restructuring might accelerate its efforts to persuade clients to use the firm as a one stop shop for business services', 'sachi officials said the management restructuring might accelerate its efforts to persuade clients to use the firm as a one stop shop for business services', 'saatchi officials said the management restructuring might accelerate its efforts to persuade clients to use the firm as a one stop shop for business services']


In [40]:
metrics_zero_shot_unconstrained = await evaluate_model_parallel(dataset, model, client, zero_shot_unconstrained,
                                                                small_generation_config, use_llm=True)
metrics_zero_shot_constrained = await evaluate_model_parallel(dataset, model, client, zero_shot_constrained,
                                                              small_generation_config, use_llm=True)
metrics_get_oracle_hypothesis = await evaluate_model_parallel(dataset, model, client, get_oracle_hypothesis,
                                                              small_generation_config, use_llm=False)
metrics_get_top1_hypothesis = await evaluate_model_parallel(dataset, model, client, get_top1_hypothesis,
                                                              small_generation_config, use_llm=False)

Progress: 836/836 tasks completed
All tasks completed.


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Progress: 836/836 tasks completed
All tasks completed.


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [42]:
results_table = {
    "Top 1": metrics_get_top1_hypothesis,
    "Zero-shot Uncon": metrics_zero_shot_unconstrained,
    "Zero-shot Constr": metrics_zero_shot_constrained,
    "Oracle": metrics_get_oracle_hypothesis,
}
df = pd.DataFrame.from_dict(results_table, orient='index')
df = df[['WER', 'METEOR', 'BERT Precision', 'BERT Recall', 'BERT F1']]
print(df)

                    WER  METEOR  BERT Precision  BERT Recall  BERT F1
Top 1             0.056   0.959           0.942        0.941    0.941
Zero-shot Uncon   0.201   0.892           0.773        0.845    0.808
Zero-shot Constr  0.138   0.941           0.828        0.891    0.859
Oracle            0.041   0.970           0.959        0.956    0.957


# SwitchBoard Test Dataset

In [24]:
# Importing Dataset
df = pd.read_csv("/fs01/home/omidv/ASR-Error-Correction/data/test_swbd.csv")
dataset = Dataset.from_pandas(df)
print(dataset)
hypotheses = [h.strip() for h in dataset['source'][0].split('.') if h.strip()]
print(hypotheses)

Dataset({
    features: ['source', 'target', 'best_hypo'],
    num_rows: 1234
})
['you know that did not in the home by choice anymore', 'you know that did not in the home by choice anymore', 'that did not in the home by choice anymore', 'you know that that did not in the home by choice anymore', 'that they are not in the home by choice anymore']


In [46]:
metrics_zero_shot_unconstrained = await evaluate_model_parallel(dataset, model, client, zero_shot_unconstrained,
                                                                small_generation_config, use_llm=True)
metrics_zero_shot_constrained = await evaluate_model_parallel(dataset, model, client, zero_shot_constrained,
                                                              small_generation_config, use_llm=True)
metrics_get_oracle_hypothesis = await evaluate_model_parallel(dataset, model, client, get_oracle_hypothesis,
                                                              small_generation_config, use_llm=False)
metrics_get_top1_hypothesis = await evaluate_model_parallel(dataset, model, client, get_top1_hypothesis,
                                                              small_generation_config, use_llm=False)

Progress: 1234/1234 tasks completed
All tasks completed.
Progress: 1234/1234 tasks completed
All tasks completed.


In [47]:
results_table = {
    "Top 1": metrics_get_top1_hypothesis,
    "Zero-shot Uncon": metrics_zero_shot_unconstrained,
    "Zero-shot Constr": metrics_zero_shot_constrained,
    "Oracle": metrics_get_oracle_hypothesis,
}
df = pd.DataFrame.from_dict(results_table, orient='index')
df = df[['WER', 'METEOR', 'BERT Precision', 'BERT Recall', 'BERT F1']]
print(df)

                    WER  METEOR  BERT Precision  BERT Recall  BERT F1
Top 1             0.159   0.915           0.781        0.823    0.802
Zero-shot Uncon   0.355   0.815           0.586        0.726    0.655
Zero-shot Constr  0.264   0.888           0.650        0.772    0.710
Oracle            0.124   0.936           0.822        0.859    0.841


# ATIS Test Dataset

In [33]:
# Importing Dataset
df = pd.read_json("/fs01/home/omidv/ASR-Error-Correction/data/test_atis.json")
dataset = Dataset.from_pandas(df)
print(dataset)
hypotheses = dataset['input'][0]
print(hypotheses)

Dataset({
    features: ['id', 'input', 'output', 'am_score'],
    num_rows: 809
})
['list all us air flights from miami to cleveland leaving on sunday afternoon', 'list all us air flights from miami to cleveland leaving on sunday afternoon', 'list all us air flights from miami to cleveland leaving on sunday afternoon', 'list all us airflights from miami to cleveland leaving on sunday afternoon', 'list all us airflights from miami to cleveland leaving on sunday afternoon']


In [12]:
metrics_zero_shot_unconstrained = await evaluate_model_parallel(dataset, model, client, zero_shot_unconstrained,
                                                                small_generation_config, use_llm=True)
metrics_zero_shot_constrained = await evaluate_model_parallel(dataset, model, client, zero_shot_constrained,
                                                              small_generation_config, use_llm=True)
metrics_get_oracle_hypothesis = await evaluate_model_parallel(dataset, model, client, get_oracle_hypothesis,
                                                              small_generation_config, use_llm=False)
metrics_get_top1_hypothesis = await evaluate_model_parallel(dataset, model, client, get_top1_hypothesis,
                                                              small_generation_config, use_llm=False)

Progress: 1098/1098 tasks completed
All tasks completed.
Progress: 1098/1098 tasks completed
All tasks completed.


In [13]:
results_table = {
    "Top 1": metrics_get_top1_hypothesis,
    "Zero-shot Uncon": metrics_zero_shot_unconstrained,
    "Zero-shot Constr": metrics_zero_shot_constrained,
    "Oracle": metrics_get_oracle_hypothesis,
}
df = pd.DataFrame.from_dict(results_table, orient='index')
df = df[['WER', 'METEOR', 'BERT Precision', 'BERT Recall', 'BERT F1']]
print(df)

                    WER  METEOR  BERT Precision  BERT Recall  BERT F1
Top 1             0.149   0.876           0.789        0.800    0.794
Zero-shot Uncon   0.186   0.853           0.775        0.795    0.785
Zero-shot Constr  0.149   0.876           0.789        0.804    0.797
Oracle            0.112   0.903           0.828        0.840    0.834


# Tedlium-3 Test Dataset

In [14]:
# Importing Dataset
df = pd.read_json("/fs01/home/omidv/ASR-Error-Correction/data/test_td3.json")
dataset = Dataset.from_pandas(df)
print(dataset)
hypotheses = dataset['input'][0]
print(hypotheses)

Dataset({
    features: ['input', 'output', 'input1', 'input2'],
    num_rows: 1155
})
['i would like to share with you a discovery that i made a few months ago while writing an article for italian wired i always keep my thesaurus handy whenever i am writing anything', 'i would like to share with you a discovery that i made a few months ago while writing an article for italian wired i always keep my thesaurus handy whenever i am writing anything but .', 'i would like to share with you a discovery that i made a few months ago while writing an article for italian wired i always keep my thesaurus handy whenever i am writing anything but', 'i would like to share with you a discovery that i made a few months ago while writing an article for italianwired i always keep my thesaurus handy whenever i am writing anything', 'i would like to share with you a discovery that i made a few months ago while writing an article for italian wired i always keep my thesaurus handy whenever i am writing anyt

In [15]:
metrics_zero_shot_unconstrained = await evaluate_model_parallel(dataset, model, client, zero_shot_unconstrained,
                                                                small_generation_config, use_llm=True)
metrics_zero_shot_constrained = await evaluate_model_parallel(dataset, model, client, zero_shot_constrained,
                                                              small_generation_config, use_llm=True)
metrics_get_oracle_hypothesis = await evaluate_model_parallel(dataset, model, client, get_oracle_hypothesis,
                                                              small_generation_config, use_llm=False)
metrics_get_top1_hypothesis = await evaluate_model_parallel(dataset, model, client, get_top1_hypothesis,
                                                              small_generation_config, use_llm=False)

Progress: 1155/1155 tasks completed
All tasks completed.
Progress: 1155/1155 tasks completed
All tasks completed.


In [16]:
results_table = {
    "Top 1": metrics_get_top1_hypothesis,
    "Zero-shot Uncon": metrics_zero_shot_unconstrained,
    "Zero-shot Constr": metrics_zero_shot_constrained,
    "Oracle": metrics_get_oracle_hypothesis,
}
df = pd.DataFrame.from_dict(results_table, orient='index')
df = df[['WER', 'METEOR', 'BERT Precision', 'BERT Recall', 'BERT F1']]
print(df)

                    WER  METEOR  BERT Precision  BERT Recall  BERT F1
Top 1             0.048   0.972           0.943        0.948    0.945
Zero-shot Uncon   0.362   0.827           0.601        0.775    0.686
Zero-shot Constr  0.267   0.906           0.678        0.847    0.760
Oracle            0.030   0.981           0.962        0.966    0.964


# Librispeech Clean Test Dataset

In [18]:
# Importing Dataset
df = pd.read_json("/fs01/home/omidv/ASR-Error-Correction/data/test_ls_clean.json").iloc[:1000]
dataset = Dataset.from_pandas(df)
print(dataset)
hypotheses = dataset['input'][0]
print(hypotheses)

Dataset({
    features: ['instruction', 'input', 'output'],
    num_rows: 1000
})
['he hoped there would be stew for dinner turnips and carrots and bruised potatoes and fat mutton pieces to be ladled out in thick peppered flour fattened sauce', 'he hoped there would be stew for dinner turnips and carrots and bruised potatoes and fat mutton pieces to be ladled out in thick peppered flour flattened sauce', 'he hoped there would be stew for dinner turnips and carrots and bruise potatoes and fat mutton pieces to be ladled out in thick peppered flour fattened sauce', 'he hoped there would be stew for dinner turnips and carrots and bruised potatoes and fat mutton pieces to be ladled out in thick peppered flour fattening sauce', 'he hoped there would be stew for dinner turnips and carrots and bruised potatoes and fat mutton pieces to be ladled out with thick peppered flour fattened sauce']


In [20]:
metrics_zero_shot_unconstrained = await evaluate_model_parallel(dataset, model, client, zero_shot_unconstrained,
                                                                small_generation_config, use_llm=True)
metrics_zero_shot_constrained = await evaluate_model_parallel(dataset, model, client, zero_shot_constrained,
                                                              small_generation_config, use_llm=True)
metrics_get_oracle_hypothesis = await evaluate_model_parallel(dataset, model, client, get_oracle_hypothesis,
                                                              small_generation_config, use_llm=False)
metrics_get_top1_hypothesis = await evaluate_model_parallel(dataset, model, client, get_top1_hypothesis,
                                                              small_generation_config, use_llm=False)

100%|██████████| 1000/1000 [00:11<00:00, 89.68it/s]


Progress: 1000/1000 tasks completed
All tasks completed.


100%|██████████| 1000/1000 [00:11<00:00, 88.61it/s]


Progress: 1000/1000 tasks completed
All tasks completed.


100%|██████████| 1000/1000 [00:11<00:00, 88.32it/s]
100%|██████████| 1000/1000 [00:11<00:00, 83.43it/s]


In [21]:
results_table = {
    "Top 1": metrics_get_top1_hypothesis,
    "Zero-shot Uncon": metrics_zero_shot_unconstrained,
    "Zero-shot Constr": metrics_zero_shot_constrained,
    "Oracle": metrics_get_oracle_hypothesis,
}
df = pd.DataFrame.from_dict(results_table, orient='index')
df = df[['WER', 'METEOR', 'BERT Precision', 'BERT Recall', 'BERT F1']]
print(df)

                    WER  METEOR  BERT Precision  BERT Recall  BERT F1
Top 1             0.021   0.980           0.972        0.972    0.972
Zero-shot Uncon   0.241   0.893           0.704        0.848    0.774
Zero-shot Constr  0.214   0.913           0.725        0.869    0.795
Oracle            0.009   0.989           0.984        0.985    0.985


# Librispeech Others Test Dataset

In [15]:
# Importing Dataset
df = pd.read_json("/fs01/home/omidv/ASR-Error-Correction/data/test_ls_other.json").iloc[:1000]
dataset = Dataset.from_pandas(df)
print(dataset)
hypotheses = dataset['input'][0]
print(hypotheses)

Dataset({
    features: ['instruction', 'input', 'output'],
    num_rows: 1000
})
["there's iron they say in all our blood and a grain or two perhaps is good but his he makes me harshly feel has got a little too much of steel anon", "there's iron they say in all our blood and a grain or two perhaps is good but this he makes me harshly feel has got a little too much of steel anon", "there's iron they say in all our blood and a grain or two perhaps is good but he makes me harshly feel has got a little too much of steel anon", "there's iron they say in all our blood and a grain or two perhaps is good but as he makes me harshly feel has got a little too much of steel anon", "there's iron they say in all our blood an a grain or two perhaps is good but his he makes me harshly feel has got a little too much of steel anon"]


In [16]:
metrics_zero_shot_unconstrained = await evaluate_model_parallel(dataset, model, client, zero_shot_unconstrained,
                                                                small_generation_config, use_llm=True)
metrics_zero_shot_constrained = await evaluate_model_parallel(dataset, model, client, zero_shot_constrained,
                                                              small_generation_config, use_llm=True)
metrics_get_oracle_hypothesis = await evaluate_model_parallel(dataset, model, client, get_oracle_hypothesis,
                                                              small_generation_config, use_llm=False)
metrics_get_top1_hypothesis = await evaluate_model_parallel(dataset, model, client, get_top1_hypothesis,
                                                              small_generation_config, use_llm=False)

100%|██████████| 1000/1000 [00:10<00:00, 91.69it/s]


Progress: 1000/1000 tasks completed
All tasks completed.


100%|██████████| 1000/1000 [00:10<00:00, 91.52it/s]


Progress: 1000/1000 tasks completed
All tasks completed.


100%|██████████| 1000/1000 [00:11<00:00, 88.31it/s]
100%|██████████| 1000/1000 [00:11<00:00, 88.80it/s]


In [17]:
results_table = {
    "Top 1": metrics_get_top1_hypothesis,
    "Zero-shot Uncon": metrics_zero_shot_unconstrained,
    "Zero-shot Constr": metrics_zero_shot_constrained,
    "Oracle": metrics_get_oracle_hypothesis,
}
df = pd.DataFrame.from_dict(results_table, orient='index')
df = df[['WER', 'METEOR', 'BERT Precision', 'BERT Recall', 'BERT F1']]
print(df)

                    WER  METEOR  BERT Precision  BERT Recall  BERT F1
Top 1             0.057   0.948           0.914        0.916    0.915
Zero-shot Uncon   0.240   0.865           0.697        0.810    0.752
Zero-shot Constr  0.204   0.898           0.734        0.842    0.786
Oracle            0.036   0.964           0.936        0.940    0.938


# LRS2 Clean Test Dataset

In [12]:
# Importing Dataset
df = pd.read_json("/fs01/home/omidv/ASR-Error-Correction/data/test_lrs2.json").iloc[:1000]
dataset = Dataset.from_pandas(df)
print(dataset)
hypotheses = dataset['input'][0]
print(hypotheses)

Dataset({
    features: ['input', 'output', 'input1', 'input2'],
    num_rows: 1000
})
['but it really is a rolls royce version', 'but it really is a rolls royce version', 'but it really is a rolls royce version it is', 'but it really is a rolls royce version it is', 'it really is a rolls royce version']


In [13]:
metrics_zero_shot_unconstrained = await evaluate_model_parallel(dataset, model, client, zero_shot_unconstrained,
                                                                small_generation_config, use_llm=True)
metrics_zero_shot_constrained = await evaluate_model_parallel(dataset, model, client, zero_shot_constrained,
                                                              small_generation_config, use_llm=True)
metrics_get_oracle_hypothesis = await evaluate_model_parallel(dataset, model, client, get_oracle_hypothesis,
                                                              small_generation_config, use_llm=False)
metrics_get_top1_hypothesis = await evaluate_model_parallel(dataset, model, client, get_top1_hypothesis,
                                                              small_generation_config, use_llm=False)

100%|██████████| 1000/1000 [00:10<00:00, 95.10it/s]


Progress: 1000/1000 tasks completed
All tasks completed.


100%|██████████| 1000/1000 [00:10<00:00, 92.50it/s]


Progress: 1000/1000 tasks completed
All tasks completed.


100%|██████████| 1000/1000 [00:11<00:00, 84.45it/s]
100%|██████████| 1000/1000 [00:10<00:00, 93.30it/s]


In [14]:
results_table = {
    "Top 1": metrics_get_top1_hypothesis,
    "Zero-shot Uncon": metrics_zero_shot_unconstrained,
    "Zero-shot Constr": metrics_zero_shot_constrained,
    "Oracle": metrics_get_oracle_hypothesis,
}
df = pd.DataFrame.from_dict(results_table, orient='index')
df = df[['WER', 'METEOR', 'BERT Precision', 'BERT Recall', 'BERT F1']]
print(df)

                    WER  METEOR  BERT Precision  BERT Recall  BERT F1
Top 1             0.132   0.893           0.871        0.845    0.858
Zero-shot Uncon   0.212   0.840           0.821        0.806    0.813
Zero-shot Constr  0.134   0.895           0.870        0.851    0.860
Oracle            0.069   0.939           0.926        0.913    0.919


# Few-Shot Chain of Thought Prompting

We'll start by prompting the model to solve some word problems and build up to using the Few-Shot CoT method proposed in "Chain-of-Thought Prompting Elicits Reasoning in Large Language Models"

First try "zero-shot prompting".

In [90]:
zero_shot_prompt = (
    "The cafeteria had 23 apples. If they used 20 to make lunch and bought 6 more, how many apples do they have? just give final answer with no explanation."
)
zero_shot_prompt = construct_input(zero_shot_prompt)

generation_example = client.chat.completions.create(model=model,messages = zero_shot_prompt, **small_generation_config)
print(generation_example.choices[0].message.content)

37


The correct answer is 9.

Now let's try standard few-shot prompt.

In [39]:
few_shot_prompt = (
    "Q: Roger has 5 tennis balls. He buys 2 more cans of tennis balls. Each can has 3 tennis balls. How many tennis "
    "balls does he have now?\nA: The answer is 11.\n\nQ: Benjamin is taking bottle inventory. He has two cases with "
    "15 bottles in each and one with 7. How many bottles are there in total?\nA: The answer is 37.\n\nQ: The "
    "cafeteria had 23 apples. If they used 20 to make lunch and bought 6 more, how many apples do they have?\nA: "
    "The answer is ...\nJust give the final answer to the last question with no explanation."
)
few_shot_message = construct_input(few_shot_prompt)

print(few_shot_prompt)

Q: Roger has 5 tennis balls. He buys 2 more cans of tennis balls. Each can has 3 tennis balls. How many tennis balls does he have now?
A: The answer is 11.

Q: Benjamin is taking bottle inventory. He has two cases with 15 bottles in each and one with 7. How many bottles are there in total?
A: The answer is 37.

Q: The cafeteria had 23 apples. If they used 20 to make lunch and bought 6 more, how many apples do they have?
A: The answer is ...
Just give the final answer to the last question with no explanation.


In [47]:
generation_example = client.chat.completions.create(model=model,messages = few_shot_message,  **small_generation_config)
print(generation_example.choices[0].message.content)

9


Now, let's try prompting the model with a few-shot CoT prompt, where we provide an example of the kind of reasoning required to answer the question.

In [48]:
few_shot_cot_prompt = (
    "Q: Roger has 5 tennis balls. He buys 2 more cans of tennis balls. Each can has 3 tennis balls. How many tennis "
    "balls does he have now?\nA: Roger started with 5 balls. 2 cans of 3 tennis balls each is 6 tennis balls. "
    "5 + 6 = 11. The answer is 11.\n\nQ: The cafeteria had 23 apples. If they used 20 to make lunch and bought 6 "
    "more, how many apples do they have?\nA:"
)

few_shot_cot_prompt_message = construct_input(few_shot_cot_prompt)
print(few_shot_cot_prompt)

Q: Roger has 5 tennis balls. He buys 2 more cans of tennis balls. Each can has 3 tennis balls. How many tennis balls does he have now?
A: Roger started with 5 balls. 2 cans of 3 tennis balls each is 6 tennis balls. 5 + 6 = 11. The answer is 11.

Q: The cafeteria had 23 apples. If they used 20 to make lunch and bought 6 more, how many apples do they have?
A:


In [54]:
generation_example = client.chat.completions.create(model=model,messages = few_shot_cot_prompt_message, **moderate_generation_config)
print(generation_example.choices[0].message.content)

To solve the problem, follow these steps:

1. Start with the initial number of apples: 23
2. Subtract the number of apples used for lunch: 23 - 20 = 3
3. Add the number of apples bought: 3 + 6 = 9

The cafeteria now has 9 apples.


## An example from the AQuA: Algebraic Word Problems task.

Let's try to compare few-shot prompting with few-shot CoT for slightly different kind of problem. This example is drawn from the AQuA: Algebraic Word Problems task.

In [55]:
few_shot_prompt = (
    "Q: John found that the average of 15 numbers is 40. If 10 is added to each number then the mean of "
    "the numbers is? Answer Choices: (a) 50 (b) 45 (c) 65 (d) 78 (e) 64\nA: The answer is (a).\n\nQ: The capacity of "
    "a tank of dimensions (8 m × 6 m × 2.5 m) is Answer Choices: (a) 120 litres (b) 1200 litres (c) 12000 litres (d) "
    "120000 litres (e) None of these\nA: \n what is the answer to the second Q with a,b,c,d,or e and don't say a single word except the final answer."
)
few_shot_prompt_message = construct_input(few_shot_prompt)
print(few_shot_prompt)

Q: John found that the average of 15 numbers is 40. If 10 is added to each number then the mean of the numbers is? Answer Choices: (a) 50 (b) 45 (c) 65 (d) 78 (e) 64
A: The answer is (a).

Q: The capacity of a tank of dimensions (8 m × 6 m × 2.5 m) is Answer Choices: (a) 120 litres (b) 1200 litres (c) 12000 litres (d) 120000 litres (e) None of these
A: 
 what is the answer to the second Q with a,b,c,d,or e and don't say a single word except the final answer.


In [66]:
generation_example = client.chat.completions.create(model=model,messages = few_shot_prompt_message, **small_generation_config)
print(generation_example.choices[0].message.content)

(c)


The correct choice for this problem is "d".

In [72]:
few_shot_cot_prompt = (
    "Q: John found that the average of 15 numbers is 40. If 10 is added to each number then the mean of the numbers "
    "is? Answer Choices: (a) 50 (b) 45 (c) 65 (d) 78 (e) 64\nA: If 10 is added to each number, then the mean of the "
    "numbers also increases by 10. So the new mean would be 50. The answer is (a).\n\nQ: The capacity of "
    "a tank of dimensions (8 m × 6 m × 2.5 m) is Answer Choices: (a) 120 litres (b) 1200 litres (c) 12000 litres (d) "
    "120000 litres (e) None of these \n what is the answer to the second Q with a,b,c,d,or e and don't say a single word except the final answer.:"
)
few_shot_cot_prompt_message = construct_input(few_shot_cot_prompt)
print(few_shot_cot_prompt)

Q: John found that the average of 15 numbers is 40. If 10 is added to each number then the mean of the numbers is? Answer Choices: (a) 50 (b) 45 (c) 65 (d) 78 (e) 64
A: If 10 is added to each number, then the mean of the numbers also increases by 10. So the new mean would be 50. The answer is (a).

Q: The capacity of a tank of dimensions (8 m × 6 m × 2.5 m) is Answer Choices: (a) 120 litres (b) 1200 litres (c) 12000 litres (d) 120000 litres (e) None of these 
 what is the answer to the second Q with a,b,c,d,or e and don't say a single word except the final answer.:


In [80]:
generation_example = client.chat.completions.create(model=model,messages = few_shot_cot_prompt_message, **moderate_generation_config)
print(generation_example.choices[0].message.content)

d


Sometimes the examples are not good enough.

# Zero-Shot Chain of Thought Prompting

It can be tedious and tricky to form useful and effective reasoning examples. Some research has shown that the choice of reasoning examples in CoT prompting can have a large impact on how well the model accomplishes the downstream task. So let's try a zero-shot CoT approach devised in "Large Language Models are Zero-Shot Reasoners"

In [81]:
few_shot_prompt = (
    "Q: Roger has 5 tennis balls. He buys 2 more cans of tennis balls. Each can has 3 tennis balls. How many tennis "
    "balls does he have now?\nA: The answer is 11.\n\nQ: There are 64 students trying out for the school's trivia "
    "teams. If 36 of them didn't get picked for the team and the rest were put into 4 groups, how many students would "
    "be in each group?\nA: \nJust give the final answer to the last question with no explanation."
)


few_shot_prompt_message = construct_input(few_shot_prompt)
print(few_shot_prompt)

Q: Roger has 5 tennis balls. He buys 2 more cans of tennis balls. Each can has 3 tennis balls. How many tennis balls does he have now?
A: The answer is 11.

Q: There are 64 students trying out for the school's trivia teams. If 36 of them didn't get picked for the team and the rest were put into 4 groups, how many students would be in each group?
A: 
Just give the final answer to the last question with no explanation.


In [82]:
generation_example = client.chat.completions.create(model=model,messages = few_shot_prompt_message, **small_generation_config)
print(generation_example.choices[0].message.content)

10


The correct answer to this problem is 7.

Could you get the correct answer with this example?


# TASK: 

Try to do CoT without adding examples.


Split into two stages:

1) Reasoning Generation   
2) Answer Extraction

In [113]:
question_prompt = ("Q: There are 64 students trying out for the school's trivia teams."
                   "If 36 of them didn't get picked for the team and the rest were put into 4 groups, how many"
                   "students would be in each group?\nA: \nLet's think step by step." )

question_prompt_message = construct_input(question_prompt)
print(question_prompt)

Q: There are 64 students trying out for the school's trivia teams.If 36 of them didn't get picked for the team and the rest were put into 4 groups, how manystudents would be in each group?
A: 
Let's think step by step.


In [116]:
generation_example = client.chat.completions.create(model=model, messages = question_prompt_message, **moderate_generation_config)
reasoning_extraction = generation_example.choices[0].message.content
print(reasoning_extraction)

Step 1: Determine the number of students who were picked for the team. There are 64 students total, and 36 didn’t get picked, which means that 64 - 36 = 28 students were picked for the team.

Step 2: Divide the number of students who were picked for the team by the number of groups they were put into to determine the number of students in each group. Since there are 4 groups, we need to divide 28 by 4.

28 ÷ 4 = 7

The final answer is: $\boxed{7}$


Try to get the correct answer (7) with no example.

In [18]:
# Hint: break down the problem into two steps. First, ask the model for reasoning,
# then, given the reasoning, ask for the final answer by appending "\nTherefore, the answer is" followed by the reasoning.

In [117]:
reasoning_prompt = ("Q: There are 64 students trying out for the school's trivia teams."
                   "If 36 of them didn't get picked for the team and the rest were put into 4 groups, how many"
                   "students would be in each group?\nA: Let's think step by step.\n\n" + reasoning_extraction +
                   "\n\nTherefore, what is the final answer in numerals? Don't say a single word except the final answer.")

reasoning_prompt_message = construct_input(reasoning_prompt)
print(reasoning_prompt)

Q: There are 64 students trying out for the school's trivia teams.If 36 of them didn't get picked for the team and the rest were put into 4 groups, how manystudents would be in each group?
A: Let's think step by step.

Step 1: Determine the number of students who were picked for the team. There are 64 students total, and 36 didn’t get picked, which means that 64 - 36 = 28 students were picked for the team.

Step 2: Divide the number of students who were picked for the team by the number of groups they were put into to determine the number of students in each group. Since there are 4 groups, we need to divide 28 by 4.

28 ÷ 4 = 7

The final answer is: $\boxed{7}$

Therefore, what is the final answer in numerals? Don't say a single word except the final answer.


In [125]:
generation_example = client.chat.completions.create(model=model, messages = reasoning_prompt_message, **moderate_generation_config)
reasoning_generation = generation_example.choices[0].message.content
print(reasoning_generation)

7
