### Environment Setup

In [1]:
import numpy as np
import os
import inspect
import random
from time import sleep
from typing import List, Callable
import jiwer
from jiwer import wer
import re
import openai
import pandas as pd
from datasets import Dataset
from sacrebleu import corpus_bleu
from nltk.translate.meteor_score import meteor_score
from bert_score import BERTScorer
import evaluate
from tqdm.notebook import tqdm
import logging
from transformers import AutoTokenizer
import torch
import string
import time
import asyncio
import nest_asyncio
from dotenv import load_dotenv #Load the environment variables
load_dotenv()

import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /h/omidv/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /h/omidv/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

### Helper Functions

In [2]:
def compute_wer(reference, hypothesis):
    return jiwer.wer(reference, hypothesis)


def compute_bleu(references, hypotheses):
    return corpus_bleu(hypotheses, references).score


def compute_meteor(references, hypotheses):
    scores = []
    for ref, hyp in zip(references, hypotheses):
        scores.append(meteor_score([ref.split()], hyp.split()))
    return sum(scores)/len(scores)


def compute_bertscore(references, hypotheses):
    scorer = BERTScorer(lang="en", rescale_with_baseline=True)
    p, r, f1 = scorer.score(hypotheses, references)
    bert_score = {'precision': p.mean().item(),
                  'recall': r.mean().item(),
                  'f1': f1.mean().item()}
    return bert_score


def compute_levenshtein_distance(s1: str, s2: str) -> int:
    """Compute the Levenshtein distance between two strings."""
    
    len_s1, len_s2 = len(s1), len(s2)
    dp = np.zeros((len_s1 + 1, len_s2 + 1), dtype=int)

    for i in range(len_s1 + 1):
        dp[i][0] = i
    for j in range(len_s2 + 1):
        dp[0][j] = j

    for i in range(1, len_s1 + 1):
        for j in range(1, len_s2 + 1):
            cost = 0 if s1[i - 1] == s2[j - 1] else 1
            dp[i][j] = min(dp[i - 1][j] + 1,      # Deletion
                           dp[i][j - 1] + 1,      # Insertion
                           dp[i - 1][j - 1] + cost)  # Substitution

    return dp[len_s1][len_s2]


def construct_input(question):
    prompt = [{"role": "user", "content": question}]
    return prompt


def extract_hypotheses(dataset, idx):
    if 'source' in dataset.features:
        hypotheses = [h.strip() for h in dataset['source'][idx].split('.') if h.strip()]
        references = dataset['target'][idx]
    else:
        hypotheses = dataset['input'][idx]
        references = dataset['output'][idx]
        
    return hypotheses, references


def remove_punctuation(text):
    return text.translate(str.maketrans("", "", string.punctuation))


def clean_deepseek_output(text):
    return re.sub(r"<think>.*?</think>\s*", "", text, flags=re.DOTALL)


def clean_asr_output(text):
    text = re.split(r'\n+', text, maxsplit=1)[0]
    text = text.strip()
    return text


def preprocess(text):
    return clean_asr_output(remove_punctuation(text.lower()))


def save_results(dataset: Dataset, corrections: list, model_name: str, function_name: str, file_path: str):
    correction_column = f"corrected_by_{model_name}_{function_name}"
    if os.path.exists(file_path):
        existing_df = pd.read_json(file_path)
    else:
        existing_df = dataset.to_pandas()

    if correction_column not in existing_df.columns:
        existing_df[correction_column] = None

    existing_df[correction_column] = corrections
    existing_df.to_json(file_path, orient="records", indent=4)
    print(f"Results saved to {file_path}")
    
    
async def run_evaluation(dataset, model, client, generation_config, results_path, disable_zsun=False, disable_zsco=False, disable_zscl=False):
    metrics_zero_shot_unconstrained = None
    metrics_zero_shot_constrained = None
    metrics_zero_shot_closest = None
    if not disable_zsun:
        print("Evaluating Zero-shot Unconstrained:")
        metrics_zero_shot_unconstrained = await evaluate_model_parallel(dataset, model, client, zero_shot_unconstrained, generation_config, results_path)
    
    if not disable_zsco:
        print("Evaluating Zero-shot Constrained:")
        metrics_zero_shot_constrained = await evaluate_model_parallel(dataset, model, client, zero_shot_constrained, generation_config, results_path)
    
    if not disable_zscl:
        print("Evaluating Zero-shot Closest:")
        metrics_zero_shot_closest = await evaluate_model_parallel(dataset, model, client, zero_shot_closest, generation_config, results_path)
    
    print("Evaluating Oracle:")
    metrics_get_oracle_hypothesis = await evaluate_model_parallel(dataset, model, client, get_oracle_hypothesis, generation_config, results_path)
    
    print("Evaluating Top 1:")
    metrics_get_top1_hypothesis = await evaluate_model_parallel(dataset, model, client, get_top1_hypothesis, generation_config, results_path)

    results_table = {
        "Top 1": metrics_get_top1_hypothesis,
        "Oracle": metrics_get_oracle_hypothesis,
    }

    if metrics_zero_shot_unconstrained is not None:
        results_table["Zero-shot Uncon"] = metrics_zero_shot_unconstrained
    if metrics_zero_shot_constrained is not None:
        results_table["Zero-shot Constr"] = metrics_zero_shot_constrained
    if metrics_zero_shot_closest is not None:
        results_table["Zero-shot Closest"] = metrics_zero_shot_closest

    results_table = pd.DataFrame.from_dict(results_table, orient='index')
    results_table = results_table[['WER', 'METEOR', 'BERT Precision', 'BERT Recall', 'BERT F1']]

    # Save as JSON
    csv_path = results_path.replace(".json", f"_{model}.csv")
    results_table.to_csv(csv_path)
    print(f"Benchmark saved to {csv_path}")
    return results_table

### Asynchronous Evaluation

In [3]:
from openai import RateLimitError
nest_asyncio.apply()

async def call_openai_with_retry(messages, model, generation_config, client):
    """Handles API retries with exponential backoff."""
    
    retry_delay = 0.1  # Initial delay in seconds
    max_delay = 10
    while True:
        try:
            # Attempt to make the API call
            generation = await client.chat.completions.create(
                model=model,
                messages=messages,
                **generation_config
            )
            return generation

        except RateLimitError as e:
            wait_time = retry_delay
            if hasattr(e, "response") and e.response is not None:
                try:
                    error_data = e.response.json()
                    wait_time = float(error_data.get("detail", {}).get("wait_seconds", {}))
                except:
                    pass
            await asyncio.sleep(wait_time + 0.1)


        except Exception as e:
            await asyncio.sleep(retry_delay)
            retry_delay = min(retry_delay * 2, max_delay)  # Exponential backoff


async def get_prediction(client: openai.AsyncOpenAI, model: str, messages: List[dict], generation_config: dict) -> str:
    """Asynchronously fetch predictions from OpenAI API."""
    
    try:
        generation = await call_openai_with_retry(messages, model, generation_config, client)
        return generation.choices[0].message.content if generation else ""
    except Exception as e:
        print(f"Error: {e}")
        return ""


async def track_progress(tasks):
    """Tracks progress while tasks are running."""
    
    total_tasks = len(tasks)
    while not all(task.done() for task in tasks):
        completed = sum(task.done() for task in tasks)
        print(f"Progress: {completed}/{total_tasks} tests completed!", end="\r")
        await asyncio.sleep(0.1)
    print(f"Progress: Batch of {total_tasks} tests completed!", flush=True)
    return await asyncio.gather(*tasks)

    
async def process_batch(dataset: Dataset, indices: List[int], model: str, client: openai.AsyncOpenAI, postprocessing: Callable[[List[str]], str], 
    generation_config: dict) -> List[str]:
    """Processes the dataset asynchronously using OpenAI API with progress tracking."""
    
    tasks = []
    for idx in indices:
        hypotheses, reference = extract_hypotheses(dataset, idx)
        
        if inspect.iscoroutinefunction(postprocessing):
            tasks.append(asyncio.create_task(postprocessing(hypotheses, client, model, generation_config)))
        else:
            tasks.append(asyncio.create_task(asyncio.to_thread(postprocessing, hypotheses, reference)))
    
    results = await track_progress(tasks)
    return results
    
async def evaluate_model_parallel(dataset: Dataset, model: str, client: openai.AsyncOpenAI, postprocessing: Callable[[List[str]], str],
                            generation_config: dict, results_path: str, step: int=256, experimental=False):
    """Evaluates the model asynchronously with progress tracking, handling Jupyter compatibility."""
    
    total_rows = len(dataset)
    all_predictions = []
    
    for start in range(0, total_rows, step):
        end = min(start + step, total_rows)
        batch_indices = list(range(start, end))
        batch_predictions = await process_batch(dataset, batch_indices, model, client, postprocessing, generation_config)
        all_predictions.extend(batch_predictions)
    
    
    # Normalize for evaluation
    if 'DeepSeek' in model:
        all_predictions = [clean_deepseek_output(pred) for pred in all_predictions] 
    all_predictions = [clean_asr_output(remove_punctuation(pred.lower())) for pred in all_predictions]
    
    
    all_references = dataset['target'] if 'target' in dataset.features else dataset['output']
    all_references = [clean_asr_output(remove_punctuation(ref.lower())) for ref in all_references]

    # Print 3 random results for manual review
    random_indices = random.sample(range(len(all_predictions)), 3)
    print("-" * 100)
    for idx in random_indices:
        print(f"Sample {idx + 1}")
        report = jiwer.process_words(all_references[idx], all_predictions[idx])
        print(jiwer.visualize_alignment(report, show_measures=False))
        print("-" * 100)
        
    if not experimental:
        save_results(dataset, all_predictions, model, postprocessing.__name__, results_path)
        
    # Compute evaluation metrics
    wer_scores = np.array([jiwer.wer(ref, pred) for ref, pred in zip(all_references, all_predictions)])
    bertscore = compute_bertscore(all_predictions, all_references)
    metrics = {
        'WER': round(wer_scores.mean().item(), 3),
        'METEOR': round(compute_meteor(all_predictions, all_references), 3),
        'BERT Precision': round(bertscore['precision'], 3),
        'BERT Recall': round(bertscore['recall'], 3),
        'BERT F1': round(bertscore['f1'], 3),
    }
    return metrics

### Error Correction Functions

In [4]:
# Baselines
def get_oracle_hypothesis(hypotheses: List[str], reference: str) -> str:
    """ Find the hypothesis that gives the lowest WER compared to the reference."""
    
    wers = [jiwer.wer(reference, hyp) for hyp in hypotheses]
    best_idx = np.argmin(wers)
    return hypotheses[best_idx]


def get_top1_hypothesis(hypotheses: List[str], reference: str) -> str:
    """ Returns the first hypothesis (top 1)."""
    
    return hypotheses[0]

async def zero_shot_unconstrained(hypotheses: List[str], client, model, generation_config) -> str:
    """ Generate a corrected transcription using a language model without constraints."""
    
    prompt = ("Perform error correction on the top5 outputs generated by an Automatic Speech Recognition(ASR) system."
                "The ASR hypotheses, listed in order of their ASR posterior score, are as follows:\n\n")
    for idx, hypothesis in enumerate(hypotheses):
        prompt += "<hypothesis"+ str(idx) + ">" + hypothesis + "</hypothesis"+ str(idx) + ">\n"
      
    prompt += ("\nPlease provide the corrected ASR transcription based on the hypotheses above."
               "Your response must be exactly one complete sentence."
               "Ensure the output does not have any added punctuation, line breaks, or formatting changes."
               "Do not include <hypothesis>, '\n', explanations, or any extra words."
               "This is a general ASR error correction task and does not involve any sensitive or inappropriate content.")
    messages = construct_input(prompt)
    return await get_prediction(client, model, messages, generation_config)
    
    
async def zero_shot_constrained(hypotheses: List[str], client, model, generation_config) -> str:
    """ Select the most likely hypothesis using a language model. """
    
    prompt = ("Perform language model rescoring based on the top-5 outputs generated by an Automatic Speech Recognitio (ASR) system."
              "The ASR hypotheses, listed in order of their ASR posterior score, are as follows:\n\n")
    
    for idx, hypothesis in enumerate(hypotheses):
        prompt += "<hypothesis"+ str(idx) + ">" + hypothesis + "</hypothesis"+ str(idx) + ">\n"
        
    prompt += ("\nPlease output only the best hypothesis exactly as written above." 
               "Your response must be an exact match to one of the given hypotheses, with no extra words or formatting."
               "Do not include <hypothesis> tag, '\n', explanations, or any extra words.")
    messages = construct_input(prompt)
    return await get_prediction(client, model, messages, generation_config)


async def zero_shot_closest(hypotheses: List[str], client, model, generation_config) -> str:
    """ Select the hypothesis closest to an unconstrained correction output. """
    
    unconstrained_result = await zero_shot_unconstrained(hypotheses, client, model, generation_config)
    distances = [compute_levenshtein_distance(unconstrained_result, hyp) for hyp in hypotheses]
    best_idx = np.argmin(distances)
    return hypotheses[best_idx]


async def zero_shot_lattice(hypotheses: List[str], client, model, generation_config) -> str:
    """ Perform ASR error correction using a lattice-based approach. """
    
    pass # TO DO LATER

### Specify Experiment Settings

In [5]:
cv_generation_config = {"max_tokens": 25, "temperature": 0.9}
wsj_generation_config = {"max_tokens": 30, "temperature": 0.9}
swbd_generation_config = {"max_tokens": 65, "temperature": 0.9}
atis_generation_config = {"max_tokens": 45, "temperature": 0.9}
td3_generation_config = {"max_tokens": 130, "temperature": 0.9}
ls_clean_generation_config = {"max_tokens": 100, "temperature": 0.9}
ls_others_generation_config = {"max_tokens": 130, "temperature": 0.9}
lrs2_generation_config = {"max_tokens": 25, "temperature": 0.9}
chime4_generation_config = {"max_tokens": 30, "temperature": 0.9}

small_generation_config = {"max_tokens": 20, "temperature": 0.9}
moderate_generation_config = {"max_tokens": 200, "temperature": 0.9}

In [6]:
model = "Meta-Llama-3.1-8B-Instruct"
client = openai.AsyncOpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# If model is not yet available, try again after some delay.
output = None
while output is None:
    try:
        output = await client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": "Please introduce yourself."}],
        )
    
    except openai.APIError as e:
        print(e)
        sleep(10)

print(output.choices[0].message.content)

I'm an artificial intelligence model known as Llama. Llama stands for "Large Language Model Meta AI."


# Llama 8B

### Common Voice Test Dataset

In [42]:
df = pd.read_json("/fs01/home/omidv/ASR-Error-Correction/data/test_cv.json")
results_path = "/fs01/home/omidv/ASR-Error-Correction/results/test_cv.json"
dataset = Dataset.from_pandas(df)
print(dataset)

Dataset({
    features: ['input', 'output', 'input1', 'input2'],
    num_rows: 2000
})


In [43]:
results_table = await run_evaluation(dataset, model, client, cv_generation_config, results_path)
print(results_table)

Evaluating Zero-shot Unconstrained:
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 208 tests completed!
----------------------------------------------------------------------------------------------------
Sample 454
Target: it can be constructed as the intersection of all balanced sets containing s
Pred:   it can be constructed as the intersection of all balanced sets containing s
----------------------------------------------------------------------------------------------------
Sample 325
Target: the most contentious point was regarding the control of the island of bornholm
Pred:   the more sensitive point was regarding the control of the island of elba
------------------------------------------------------------------------------------

### Wall Street Journal Test Dataset

In [44]:
df = pd.read_json("/fs01/home/omidv/ASR-Error-Correction/data/test_wsj_score.json")
results_path = "/fs01/home/omidv/ASR-Error-Correction/results/test_wsj_score.json"
dataset = Dataset.from_pandas(df)
print(dataset)

Dataset({
    features: ['input', 'output', 'score'],
    num_rows: 836
})


In [45]:
results_table = await run_evaluation(dataset, model, client, wsj_generation_config, results_path)
print(results_table)

Evaluating Zero-shot Unconstrained:
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 68 tests completed!
----------------------------------------------------------------------------------------------------
Sample 587
Target: the launch had been planned for earlier this year but was scrubbed by the space agency about five times because of design and other delays
Pred:   the launch had been planned for earlier this year but was scrubbed by the space agency about five times because of design and other delays
----------------------------------------------------------------------------------------------------
Sample 385
Target: riches will come again to bimini
Pred:   riches will come again to bimini
----------------------------------------------------------------------------------------------------
Sample 574
Target: astronomers say that the earth is fate is sealed
Pred:   astronomers say that the ear

### SwitchBoard Test Dataset

In [46]:
df = pd.read_json("/fs01/home/omidv/ASR-Error-Correction/data/test_swbd.json")
results_path = "/fs01/home/omidv/ASR-Error-Correction/results/test_swbd.json"
dataset = Dataset.from_pandas(df)
print(dataset)

Dataset({
    features: ['input', 'output', 'input1', 'input2'],
    num_rows: 2000
})


In [47]:
results_table = await run_evaluation(dataset, model, client, swbd_generation_config, results_path)
print(results_table)

Evaluating Zero-shot Unconstrained:
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 208 tests completed!
----------------------------------------------------------------------------------------------------
Sample 1468
Target: well the hobbies that i pursue in my spare time are crafts
Pred:   well the hobbies that i pursue in my spare time are crafts
----------------------------------------------------------------------------------------------------
Sample 950
Target: and i do not know have there been any good b books published on that i know world war two my dad was a paratrooper in airborne one oh hundred and one but i do not think they have done anything good on viet nam
Pred:   and i do not know have there been any good books published 

### ATIS Test Dataset

In [48]:
df = pd.read_json("/fs01/home/omidv/ASR-Error-Correction/data/test_atis.json")
results_path = "/fs01/home/omidv/ASR-Error-Correction/results/test_atis.json"
dataset = Dataset.from_pandas(df)
print(dataset)

Dataset({
    features: ['id', 'input', 'output', 'am_score'],
    num_rows: 809
})


In [49]:
results_table = await run_evaluation(dataset, model, client, atis_generation_config, results_path)
print(results_table)

Evaluating Zero-shot Unconstrained:
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 41 tests completed!
----------------------------------------------------------------------------------------------------
Sample 538
Target: phoenix till to milwaukee on sunday
Pred:   phoenix is to milwaukee on sunday
----------------------------------------------------------------------------------------------------
Sample 404
Target: what airline is a a
Pred:   what airline is american airlines
----------------------------------------------------------------------------------------------------
Sample 578
Target: list all sunday flights from cleveland to nashville and their fares
Pred:   list all sunday flights from cleveland to nashville and their fares
----------------------------------------------------------------------------------------------------
Results saved to /fs01/home/omidv/ASR-Error-Correction/resul

### Tedlium-3 Test Dataset

In [50]:
df = pd.read_json("/fs01/home/omidv/ASR-Error-Correction/data/test_td3.json")
results_path = "/fs01/home/omidv/ASR-Error-Correction/results/test_td3.json"
dataset = Dataset.from_pandas(df)
print(dataset)

Dataset({
    features: ['input', 'output', 'input1', 'input2'],
    num_rows: 1155
})


In [51]:
results_table = await run_evaluation(dataset, model, client, td3_generation_config, results_path)
print(results_table)

Evaluating Zero-shot Unconstrained:
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 131 tests completed!
----------------------------------------------------------------------------------------------------
Sample 589
Target: than the one week vacation because there are no new memories added you have not changed the story
Pred:   than the one week vacation because there are no new memories added you have changed the story
----------------------------------------------------------------------------------------------------
Sample 905
Target: biggest wiki in the world second biggest wiki in the world with nearly eighty thousand articles is the world of warcraft wiki
Pred:   biggest wiki in the world second biggest wiki in the world with nearly eighty thousand articles is the world of warcraft wiki
----------------------------------------------------------------

### Librispeech Clean Test Dataset

In [52]:
df = pd.read_json("/fs01/home/omidv/ASR-Error-Correction/data/test_ls_clean.json")
results_path = "/fs01/home/omidv/ASR-Error-Correction/results/test_ls_clean.json"
dataset = Dataset.from_pandas(df)
print(dataset)

Dataset({
    features: ['instruction', 'input', 'output'],
    num_rows: 2620
})


In [53]:
results_table = await run_evaluation(dataset, model, client, ls_clean_generation_config, results_path)
print(results_table)

Evaluating Zero-shot Unconstrained:
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 60 tests completed!
----------------------------------------------------------------------------------------------------
Sample 914
Target: i will briefly describe them to you and you shall read the account of them at your leisure in the sacred registers
Pred:   i will briefly describe them to you and you shall read the account of them at your leisure in the sacred registers
----------------------------------------------------------------------------------------------------
Sample 192
Target: we have spoken of pearls rich and luxuriant bea

### Librispeech Others Test Dataset

In [54]:
df = pd.read_json("/fs01/home/omidv/ASR-Error-Correction/data/test_ls_other.json")
results_path = "/fs01/home/omidv/ASR-Error-Correction/results/test_ls_other.json"
dataset = Dataset.from_pandas(df)
print(dataset)

Dataset({
    features: ['instruction', 'input', 'output'],
    num_rows: 2939
})


In [55]:
results_table = await run_evaluation(dataset, model, client, ls_others_generation_config, results_path)
print(results_table)

Evaluating Zero-shot Unconstrained:
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 123 tests completed!
----------------------------------------------------------------------------------------------------
Sample 1510
Target: mister wicker waited patiently beside him for a few moments for chris to get up his courage
Pred:   mister wicker waited patiently beside him for a few moments for chris to get up his courage
----------------------------------------------------------------------------------------------------
Sample 2131
Target: it wouldnt do you know after that story came out f

Benchmark saved to /fs01/home/omidv/ASR-Error-Correction/results/test_ls_other_Meta-Llama-3.1-8B-Instruct.csv
                     WER  METEOR  BERT Precision  BERT Recall  BERT F1
Top 1              0.045   0.958           0.933        0.935    0.934
Zero-shot Uncon    0.091   0.932           0.896        0.902    0.899
Zero-shot Constr   0.070   0.940           0.909        0.915    0.912
Zero-shot Closest  0.066   0.944           0.912        0.919    0.916
Oracle             0.027   0.973           0.953        0.956    0.955


### LRS2 Clean Test Dataset

In [56]:
df = pd.read_json("/fs01/home/omidv/ASR-Error-Correction/data/test_lrs2.json")
results_path = "/fs01/home/omidv/ASR-Error-Correction/results/test_lrs2.json"
dataset = Dataset.from_pandas(df)
print(dataset)

Dataset({
    features: ['input', 'output', 'input1', 'input2'],
    num_rows: 2259
})


In [57]:
results_table = await run_evaluation(dataset, model, client, lrs2_generation_config, results_path)
print(results_table)

Evaluating Zero-shot Unconstrained:
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 211 tests completed!
----------------------------------------------------------------------------------------------------
Sample 362
Target: let us try reforming
Pred:   so let us try reforming
----------------------------------------------------------------------------------------------------
Sample 1305
Target: lovely little picture
Pred:   a lovely colorful picture
----------------------------------------------------------------------------------------------------
Sample 1552
Target: along with a baronetcy by no less a personage than king george iii
Pred:   along with a baronetcy by no lesser personage than king ge

### Chime4

In [12]:
df = pd.read_json("/fs01/home/omidv/ASR-Error-Correction/data/test_chime4.json")
results_path = "/fs01/home/omidv/ASR-Error-Correction/results/test_chime4.json"
dataset = Dataset.from_pandas(df)
print(dataset)

Dataset({
    features: ['input', 'output', 'input1', 'input2'],
    num_rows: 1320
})


In [13]:
results_table = await run_evaluation(dataset, model, client, chime4_generation_config, results_path)
print(results_table)

Evaluating Zero-shot Unconstrained:
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 40 tests completed!
----------------------------------------------------------------------------------------------------
Sample 573
Target: we are not prepared to be advocates for the k g b
Pred:   we are not prepared to be advocates for the kgb
----------------------------------------------------------------------------------------------------
Sample 445
Target: that is fine
Pred:   that is fine
----------------------------------------------------------------------------------------------------
Sample 953
Target: closely held times publishing also owns two washington based publications congressional quarterly which covers capitol hill and governing which covers state and local governments
Pred:   closely held times publishing also own

# Gemma2 9B

In [7]:
model="gemma-2-9b-it"
client = openai.AsyncOpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

output = None
while output is None:
    try:
        output = await client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": "Please introduce yourself."}],
        )
    
    except openai.APIError as e:
        print(e)
        sleep(10)

print(output.choices[0].message.content)

Hello! I am Gemma, an open-weights AI assistant. I'm a large language model, which means I've been trained on a massive amount of text data. This allows me to communicate and generate human-like text in response to a wide range of prompts and questions. For example, I can provide summaries of factual topics or create stories.

Because I am open-weights, my weights are publicly available. This means anyone can access and study how I work, which promotes transparency and collaboration in the AI community.

What can I do for you today?



In [21]:
df = pd.read_json("/fs01/home/omidv/ASR-Error-Correction/data/test_cv.json")
results_path = "/fs01/home/omidv/ASR-Error-Correction/results/test_cv.json"
dataset = Dataset.from_pandas(df)
print(dataset)
results_table = await run_evaluation(dataset, model, client, cv_generation_config, results_path)
print(results_table)

Dataset({
    features: ['input', 'output', 'input1', 'input2'],
    num_rows: 2000
})
Evaluating Zero-shot Unconstrained:
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 208 tests completed!
----------------------------------------------------------------------------------------------------
Sample 282
Target: the latvian team playing without them failed to qualify for olympics
Pred:   the latter band team playing without them felt a qualified form and dickson
----------------------------------------------------------------------------------------------------
Sample 934
Target: her right hand holds a patera which she is tipping onto a cylindrical altar
Pred:   her right hand holds a patera which she is tipping onto a cylindrical altar
----

In [7]:
df = pd.read_json("/fs01/home/omidv/ASR-Error-Correction/data/test_wsj_score.json")
results_path = "/fs01/home/omidv/ASR-Error-Correction/results/test_wsj_score.json"
dataset = Dataset.from_pandas(df)
print(dataset)
results_table = await run_evaluation(dataset, model, client, wsj_generation_config, results_path)
print(results_table)

Dataset({
    features: ['input', 'output', 'score'],
    num_rows: 836
})
Evaluating Zero-shot Unconstrained:
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 68 tests completed!
----------------------------------------------------------------------------------------------------
Sample 637
Target: british computer brands like sinclair and acorn have bloomed but quickly faded
Pred:   british computer brands like sinclair and acorn have bloomed but quickly faded 

----------------------------------------------------------------------------------------------------
Sample 514
Target: the warming trend may have melted the snow cover on some crops
Pred:   the warming trend may have melted the snow cover on some crops  

----------------------------------------------------------------------------------------------------
Sample 36
Target: in a minute the deal is closed
Pred:   in a minute the deal is clo

In [8]:
df = pd.read_json("/fs01/home/omidv/ASR-Error-Correction/data/test_swbd.json")
results_path = "/fs01/home/omidv/ASR-Error-Correction/results/test_swbd.json"
dataset = Dataset.from_pandas(df)
print(dataset)
results_table = await run_evaluation(dataset, model, client, swbd_generation_config, results_path)
print(results_table)

Dataset({
    features: ['input', 'output', 'input1', 'input2'],
    num_rows: 2000
})
Evaluating Zero-shot Unconstrained:
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 208 tests completed!
----------------------------------------------------------------------------------------------------
Sample 1964
Target: it is it is some sort of government program i have heard a lot of people say good things about it but i just do not know the details on it
Pred:   it is some sort of government program that i have heard a lot of people say good things about it but i just do not know the details on it 

----------------------------------------------------------------------------------------------------
Sample 1884
Target: yeah i guess the news just f

In [9]:
df = pd.read_json("/fs01/home/omidv/ASR-Error-Correction/data/test_atis.json")
results_path = "/fs01/home/omidv/ASR-Error-Correction/results/test_atis.json"
dataset = Dataset.from_pandas(df)
print(dataset)
results_table = await run_evaluation(dataset, model, client, atis_generation_config, results_path)
print(results_table)

Dataset({
    features: ['id', 'input', 'output', 'am_score'],
    num_rows: 809
})
Evaluating Zero-shot Unconstrained:
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 41 tests completed!
----------------------------------------------------------------------------------------------------
Sample 425
Target: which flights depart from tampa in the early evening and arrive in cincinnati
Pred:   which flights depart from tampa in the early evening and arrive in cincinnati
----------------------------------------------------------------------------------------------------
Sample 717
Target: on friday night i would like a flight from newark to los angeles
Pred:   on friday night i would like a flight from newark to los angeles 

----------------------------------------------------------------------------------------------------
Sample 358
Target: what are the flights between d c a and milwaukee
Pred:   

In [10]:
df = pd.read_json("/fs01/home/omidv/ASR-Error-Correction/data/test_td3.json")
results_path = "/fs01/home/omidv/ASR-Error-Correction/results/test_td3.json"
dataset = Dataset.from_pandas(df)
print(dataset)
results_table = await run_evaluation(dataset, model, client, td3_generation_config, results_path)
print(results_table)

Dataset({
    features: ['input', 'output', 'input1', 'input2'],
    num_rows: 1155
})
Evaluating Zero-shot Unconstrained:
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 131 tests completed!
----------------------------------------------------------------------------------------------------
Sample 570
Target: it is a bad story how do we know that because we asked these people after their colonoscopy and much later too how bad was the whole thing in total and it was much worse for a than for b in memory
Pred:   it is a bad story how do we know that because we asked these people after their colonoscopy and much later too how bad was the whole thing in total and it was much worse for a than for b in memory 

----------------------------------------------------------------------------------------------------
Sample 813
Target: i felt like i was physically pres

In [11]:
df = pd.read_json("/fs01/home/omidv/ASR-Error-Correction/data/test_ls_other.json")
results_path = "/fs01/home/omidv/ASR-Error-Correction/results/test_ls_other.json"
dataset = Dataset.from_pandas(df)
print(dataset)
results_table = await run_evaluation(dataset, model, client, ls_others_generation_config, results_path)
print(results_table)

Dataset({
    features: ['instruction', 'input', 'output'],
    num_rows: 2939
})
Evaluating Zero-shot Unconstrained:
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 123 tests completed!
----------------------------------------------------------------------------------------------------
Sample 2216
Target: when they saw i understood them not and made them no answer one of them came forward and said to me in arabic peace be with thee o my brother
Pred:   when they saw i understood them not and made them no answer one of them came forward and said to me in arabic peace be with thee o 

Results saved to /fs01/home/omidv/ASR-Error-Correction/results/test_ls_other.json
Benchmark saved to /fs01/home/omidv/ASR-Error-Correction/results/test_ls_other_gemma-2-9b-it.csv
                     WER  METEOR  BERT Precision  BERT Recall  BERT F1
Top 1              0.045   0.958           0.933        0.935    0.934
Zero-shot Uncon    0.078   0.936           0.897        0.909    0.903
Zero-shot Constr   0.079   0.940           0.897        0.909    0.903
Zero-shot Closest  0.073   0.939           0.904        0.912    0.908
Oracle             0.027   0.973           0.953        0.956    0.955


In [8]:
df = pd.read_json("/fs01/home/omidv/ASR-Error-Correction/data/test_lrs2.json")
results_path = "/fs01/home/omidv/ASR-Error-Correction/results/test_lrs2.json"
dataset = Dataset.from_pandas(df)
print(dataset)
results_table = await run_evaluation(dataset, model, client, lrs2_generation_config, results_path)
print(results_table)

Dataset({
    features: ['input', 'output', 'input1', 'input2'],
    num_rows: 2259
})
Evaluating Zero-shot Unconstrained:
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 211 tests completed!
----------------------------------------------------------------------------------------------------
Sample 1994
Target: so we have always known the structure of the ceiling is a bit suspect
Pred:   so we have always known the structure of the ceiling is a bit suspect
----------------------------------------------------------------------------------------------------
Sample 1919
Target: a working dog is an essential part of livestock farming
Pred:   a working dog is an essential part of livestock farming
-------

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluating Zero-shot Constrained:
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 211 tests completed!
----------------------------------------------------------------------------------------------------
Sample 1671
Target: there are perches everywhere
Pred:   there are purchase everywhere
----------------------------------------------------------------------------------------------------
Sample 372
Target: we do not want steam any more
Pred:   we do not want to stay anymore
----------------------------------------------------------------------------------------------------
Sample 1288
Target: she is terrified that she can not live with an innocent girl going to prison
Pred:   she is terrified that s

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluating Zero-shot Closest:
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 211 tests completed!
----------------------------------------------------------------------------------------------------
Sample 91
Target: as well is that
Pred:   as well as us
----------------------------------------------------------------------------------------------------
Sample 1510
Target: her mother was a member of the mercian royal family
Pred:   her mother was a member of the mercian royal family
----------------------------------------------------------------------------------------------------
Sample 1788
Target: by the time you read this
Pred:   by the time you read this
---------------------------------------

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluating Oracle:
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 211 tests completed!
----------------------------------------------------------------------------------------------------
Sample 386
Target: i was there in parliament with
Pred:   i was there in parliament with
----------------------------------------------------------------------------------------------------
Sample 265
Target: are we protecting that well enough
Pred:   will be protecting that well enough
----------------------------------------------------------------------------------------------------
Sample 828
Target: my legs will not be out
Pred:   my legs will not be out
--------------------------------------------------------

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Evaluating Top 1:
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 211 tests completed!
----------------------------------------------------------------------------------------------------
Sample 350
Target: public houses are closing down within the valleys
Pred:   public houses are closing down within the valleys
----------------------------------------------------------------------------------------------------
Sample 256
Target: into the center of london
Pred:   into the central london
----------------------------------------------------------------------------------------------------
Sample 204
Target: and my department feel the canteen staff is incompetent
Pred:   indeed and my department feel th

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Benchmark saved to /fs01/home/omidv/ASR-Error-Correction/results/test_lrs2_gemma-2-9b-it.csv
                     WER  METEOR  BERT Precision  BERT Recall  BERT F1
Top 1              0.127   0.895           0.875        0.853    0.864
Oracle             0.068   0.940           0.929        0.918    0.924
Zero-shot Uncon    0.138   0.894           0.874        0.848    0.861
Zero-shot Constr   0.120   0.904           0.883        0.864    0.873
Zero-shot Closest  0.135   0.894           0.877        0.849    0.863


In [23]:
df = pd.read_json("/fs01/home/omidv/ASR-Error-Correction/data/test_chime4.json")
results_path = "/fs01/home/omidv/ASR-Error-Correction/results/test_chime4.json"
dataset = Dataset.from_pandas(df)
print(dataset)
results_table = await run_evaluation(dataset, model, client, chime4_generation_config, results_path)
print(results_table)

Dataset({
    features: ['input', 'output', 'input1', 'input2'],
    num_rows: 1320
})
Evaluating Zero-shot Unconstrained:
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 40 tests completed!
----------------------------------------------------------------------------------------------------
Sample 1086
Target: the company which runs retail automotive stores told shearson lehman brothers its financial adviser to terminate discussions to sell the firm
Pred:   the company which runs the retail automotive stores told shearson lehman brothers its financial advisor to terminate discussions to sell the firm
----------------------------------------------------------------------------------------------------
Sample 241
Target: the unsold balance late yesterday was about thirty six point three million dollars according to shear

# Llama 70B

In [25]:
model = "Meta-Llama-3.1-70B-Instruct"
client = openai.AsyncOpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

output = None
while output is None:
    try:
        output = await client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": "Please introduce yourself."}],
        )
    
    except openai.APIError as e:
        print(e)
        sleep(10)

print(output.choices[0].message.content)

I'm an artificial intelligence model known as Llama. Llama stands for "Large Language Model Meta AI."


In [26]:
df = pd.read_json("/fs01/home/omidv/ASR-Error-Correction/data/test_cv.json")
results_path = "/fs01/home/omidv/ASR-Error-Correction/results/test_cv.json"
dataset = Dataset.from_pandas(df)
print(dataset)
results_table = await run_evaluation(dataset, model, client, cv_generation_config, results_path)
print(results_table)

Dataset({
    features: ['input', 'output', 'input1', 'input2'],
    num_rows: 2000
})
Evaluating Zero-shot Unconstrained:
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 208 tests completed!
----------------------------------------------------------------------------------------------------
Sample 904
Target: morobung just north of the city
Pred:   more urban just north of the city
----------------------------------------------------------------------------------------------------
Sample 1099
Target: they do a lot of good work
Pred:   they do a lot of good work
----------------------------------------------------------------------------------------------------
Sample 1864
Target: pandharpur is a holy and famous pilgrimage place dedicated 

In [27]:
df = pd.read_json("/fs01/home/omidv/ASR-Error-Correction/data/test_lrs2.json")
results_path = "/fs01/home/omidv/ASR-Error-Correction/results/test_lrs2.json"
dataset = Dataset.from_pandas(df)
print(dataset)
results_table = await run_evaluation(dataset, model, client, lrs2_generation_config, results_path)
print(results_table)

Dataset({
    features: ['input', 'output', 'input1', 'input2'],
    num_rows: 2259
})
Evaluating Zero-shot Unconstrained:
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 211 tests completed!
----------------------------------------------------------------------------------------------------
Sample 433
Target: when it comes down to livestock in the garden
Pred:   when it comes down to sort of livestock in the garden
----------------------------------------------------------------------------------------------------
Sample 480
Target: i can not believe it is to do with eyesight
Pred:   i just cannot believe it is anything to do with the eyesight
--------------------------------------------------------

In [28]:
df = pd.read_json("/fs01/home/omidv/ASR-Error-Correction/data/test_chime4.json")
results_path = "/fs01/home/omidv/ASR-Error-Correction/results/test_chime4.json"
dataset = Dataset.from_pandas(df)
print(dataset)
results_table = await run_evaluation(dataset, model, client, chime4_generation_config, results_path)
print(results_table)

Dataset({
    features: ['input', 'output', 'input1', 'input2'],
    num_rows: 1320
})
Evaluating Zero-shot Unconstrained:
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 40 tests completed!
----------------------------------------------------------------------------------------------------
Sample 565
Target: where else in the third world is there so much energy and progress as in china
Pred:   where else in the third world is there so much energy and progress as in china
----------------------------------------------------------------------------------------------------
Sample 330
Target: the low was one thousand two hundred seventy point one nine and the high was one thousand two hundred seventy three point eight eight
Pred:   the low was one thousand two hundred and seventy point nineteen and the high was one thous

# Playground

In [17]:
model="gemma-2-9b-it"
client = openai.AsyncOpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

output = None
while output is None:
    try:
        output = await client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": "Please introduce yourself."}],
        )
    
    except openai.APIError as e:
        print(e)
        sleep(10)

print(output.choices[0].message.content)

Hello! I am Gemma, an open-weights AI assistant developed by the Gemma team at Google DeepMind.

I am a large language model, which means I've been trained on a massive amount of text data. This allows me to understand and generate human-like text, engage in conversations, answer your questions, and even create stories, poems, or articles.

As an open-weights model, my weights are publicly available. This means anyone can access, study, and modify me, fostering transparency and collaboration in the AI community.



In [16]:
async def CoT_task_activating(hypotheses: List[str], client, model, generation_config) -> str:
    """ Perform ASR error correction using Chain-of-Thought (CoT) reasoning."""
    
    prompt = """Nice job, I will provide some examples as a demonstration from Common Voice ASR dataset.
                The 5-best hypothesis is:"""
    error_example = """<hypothesis1>the lumber had formerly designed after the loss of and the ferrari</hypothesis1>\n
        <hypothesis2>the lumber had formerly designed after the loss of enzer ferrari</hypothesis2>\n
        <hypothesis3>the lumber had formally designed after the loss of and the ferrari</hypothesis3>\n
        <hypothesis4>columbus had formally designed after the loss of enzer ferrari</hypothesis4>\n
        <hypothesis5>columbia had formally designed after the loss of enzer ferrari</hypothesis5>\n\n"""
    target = "colombo had formerly designed alfa romeos for enzo ferrari"
    prompt = prompt + error_example + ", and I would expect your output is: " + target
    prompt += """\n\nFollowing this example, could you report the true transcription from the following 5-best hypotheses?\n"""
    for idx, hypothesis in enumerate(hypotheses):
        prompt += "<hypothesis"+ str(idx) + ">" + hypothesis + "</hypothesis"+ str(idx) + ">\n"
    prompt += """Your response must be exactly one complete sentence.
               Ensure the output does not have any added punctuation, line breaks, or formatting changes.
               Do not include <hypothesis>, '\n', explanations, or any extra words."""
    messages = [
    {
        "role": "user",
        "content": "Do you know speech recognition?"
    },
    {
        "role": "assistant",
        "content": """Yes, I do! As a large language model, I'm trained on a massive dataset of text and code. This includes a significant amount of transcribed speech.

        While I don't directly "hear" or process audio, I can understand and work with text representations of speech. This means I can:

        * **Convert transcribed speech to text:** If you give me a text file containing a speech recording, I can analyze it and understand the spoken words.
        * **Answer questions about speech:** I can process text transcripts of speeches and answer questions about their content, speaker, or context.
        * **Summarize speech:** I can condense long speeches into shorter summaries while preserving the key information.
        * **Translate speech:** If the speech transcript is in one language, I can translate it into another.

        Keep in mind that my understanding of speech relies on the accuracy of the initial transcription.

        Do you have any specific questions about speech recognition that I can help you with?"""
    },
    {
        "role": "user",
        "content": "Do you know language model for speech recognition?"
    },
    {
        "role": "assistant",
        "content": """Yes, I do!

        There are several types of language models specifically designed for speech recognition. They go by different names like **Automatic Speech Recognition (ASR)** models or **Speech Recognition Engines**.  

        Here are some key points about language models for speech recognition:

        * **How they work:** These models are trained on massive datasets of paired audio and text. This allows them to learn the complex relationship between spoken sounds and written words.

        * **Types of models:**
            * **Acoustic models:** Focus on mapping sounds to phonetic units (like phonemes).
            * **Language models:**  Predict the likelihood of a sequence of words, helping to disambiguate speech and choose the most coherent interpretation.
            * **Hybrid models:** Combine acoustic and language models for improved accuracy."""
    },
    {
        "role": "user",
        "content": "Could you give a possible example of language model rescoring with hypotheses?"
    },
    {
        "role": "assistant",
        "content": """You bet! Let's imagine a scenario where a speech recognition engine has produced several possible transcripts (hypotheses) for a spoken utterance:

        **Spoken Utterance:** "The quick brown fox jumps over the lazy dog."

        **Initial Hypothesis (from ASR engine):**

        1. "The quick brown fox jumps over the lazy dig."
        2. "The quick brown fox jumps over thee lazy dog."
        3. "The quick brown fox jumps over the lazy gob."

        **Here's how language model rescoring could work:**

        1. **Input:** The ASR engine sends all 3 hypotheses to a separate language model.

        2. **Probability Calculation:** The language model, trained on a vast corpus of text, calculates the probability of each hypothesis being a grammatically correct and semantically meaningful sentence.

        3. **Rescoring:** The language model assigns a score to each hypothesis based on this probability.  

           *  The hypothesis  "The quick brown fox jumps over the lazy dog."  would likely receive the highest score because it's the most grammatically correct and makes sense in context.

        4. **Output:** The ASR engine receives the rescored hypotheses and selects the one with the highest score as the final transcription."""
    },
    {
        "role": "user",
        "content": prompt
    }
]
    return await get_prediction(client, model, messages, generation_config)

In [18]:
df = pd.read_json("/fs01/home/omidv/ASR-Error-Correction/data/test_cv.json")
results_path = "/fs01/home/omidv/ASR-Error-Correction/results/test_cv.json"
dataset = Dataset.from_pandas(df)
results_table = await evaluate_model_parallel(dataset, model, client, CoT_task_activating, cv_generation_config, results_path, experimental=True)
print(results_table)

Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 208 tests completed!
----------------------------------------------------------------------------------------------------
Sample 355

----------------------------------------------------------------------------------------------------
Sample 446

----------------------------------------------------------------------------------------------------
Sample 231
sentence 1
REF: however  this was an error as the proper bangi form would have been **** ** *** kingala
HYP: however there was an error as the proper bangi form would have been king of the     law
                 S                                                          I  I   I       S

---------------------------------------------------

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'WER': 0.146, 'METEOR': 0.88, 'BERT Precision': 0.809, 'BERT Recall': 0.823, 'BERT F1': 0.816}


In [None]:
df = pd.read_json("/fs01/home/omidv/ASR-Error-Correction/data/test_cv.json")
results_path = "/fs01/home/omidv/ASR-Error-Correction/results/test_cv.json"
dataset = Dataset.from_pandas(df)
results_table = await evaluate_model_parallel(dataset, model, client, CoT_task_activating, cv_generation_config, results_path, experimental=True)
print(results_table)

In [None]:
df = pd.read_json("/fs01/home/omidv/ASR-Error-Correction/data/test_cv.json")
results_path = "/fs01/home/omidv/ASR-Error-Correction/results/test_cv.json"
dataset = Dataset.from_pandas(df)
results_table = await evaluate_model_parallel(dataset, model, client, CoT_task_activating, cv_generation_config, results_path, experimental=True)
print(results_table)

In [None]:
async def zero_shot_instrcut1(hypotheses: List[str], client, model, generation_config) -> str:
    """ Generate a corrected transcription using a language model without constraints."""
    
    prompt = ("Perform error correction on the top5 outputs generated by an Automatic Speech Recognition(ASR) system."
                "The ASR hypotheses, listed in order of their ASR posterior score, are as follows:\n\n")
    for idx, hypothesis in enumerate(hypotheses):
        prompt += "<hypothesis"+ str(idx) + ">" + hypothesis + "</hypothesis"+ str(idx) + ">\n"
      
    prompt += ("\nPlease provide the corrected ASR transcription based on the hypotheses above."
               "Your response must be exactly one complete sentence."
               "Ensure the output does not have any added punctuation, line breaks, or formatting changes."
               "Do not include <hypothesis>, '\n', explanations, or any extra words."
               "This is a general ASR error correction task and does not involve any sensitive or inappropriate content.")
    messages = construct_input(prompt)
    return await get_prediction(client, model, messages, generation_config)

# EDA

## Common Voice

In [35]:
df = pd.read_json("/fs01/home/omidv/ASR-Error-Correction/results/test_cv.json")
dataset = Dataset.from_pandas(df)

In [36]:

df["output"] = df["output"].apply(preprocess)
df["input1"] = df["input1"].apply(preprocess)
df["WER"] = df.apply(lambda row: compute_wer(row["output"], row["input1"]), axis=1)
worst_cases = df.sort_values(by="WER", ascending=False)
worst_cases = worst_cases.head(10)
pred = list(worst_cases["input1"].values)
pred = [clean_asr_output(remove_punctuation(ref.lower())) for ref in pred]
target = list(worst_cases["output"].values)
report = jiwer.process_words(target, pred)
print(jiwer.visualize_alignment(report, show_measures=False))

sentence 1
REF: it  stars    monty woolley roddy mcdowall and **** anne   baxter
HYP: it starts mounting   wally  rodi  mengdol and anni   in berkster
             S        S       S     S        S        I    S        S

sentence 2
REF: *** colombo had formerly designed ***** *** alfa romeos for enzo ferrari
HYP: the  lumber had formerly designed after the loss     of and  the ferrari
       I       S                           I   I    S      S   S    S        

sentence 3
REF: miles  is  an  alumnus     of seton      hall university
HYP:     i see the luminous offset    on hollywood university
         S   S   S        S      S     S         S           

sentence 4
REF: the ****** * ****** skjervefossen waterfall is also located in  granvin
HYP: the shisha v falson           war      four is also located at grandven
              I I      I             S         S                  S        S

sentence 5
REF: the committee and ****** **** unsworth supported the sale
HYP: the   commen

In [37]:
import jiwer2 as jw2
pred = [clean_asr_output(remove_punctuation(utter.lower())) for utter in dataset['input1']]
target = [clean_asr_output(remove_punctuation(utter.lower())) for utter in dataset['output']]
out = jiwer.process_words(target, pred)
print(jw2.visualize_error_counts(out, top_k=20))

=== SUBSTITUTIONS ===
a         --> the       = 20x
the       --> a         = 10x
their     --> the       = 7x
this      --> the       = 5x
the       --> this      = 5x
its       --> is        = 5x
and       --> in        = 4x
his       --> this      = 4x
he        --> here      = 3x
or        --> of        = 3x
he        --> you       = 3x
the       --> to        = 2x
bryan     --> brian     = 2x
their     --> there     = 2x
doon      --> dune      = 2x
completed --> complete  = 2x
their     --> are       = 2x
its       --> it        = 2x
a         --> our       = 2x
an asian  --> a nation  = 2x

=== INSERTIONS ===
the   = 32x
a     = 11x
to    = 10x
it    = 8x
and   = 6x
in    = 5x
for   = 5x
is    = 5x
will  = 5x
new   = 4x
north = 4x
of    = 3x
they  = 3x
but   = 3x
co    = 3x
under = 3x
south = 2x
on    = 2x
here  = 2x
just  = 2x

=== DELETIONS ===
is        = 31x
the       = 17x
a         = 16x
in        = 4x
an        = 4x
it        = 2x
wrote     = 2x
of        = 2x
west      =

## LRS2

In [50]:
df = pd.read_json("/fs01/home/omidv/ASR-Error-Correction/results/test_lrs2.json")
dataset = Dataset.from_pandas(df)

In [51]:
df["output"] = df["output"].apply(preprocess)
df["input1"] = df["input1"].apply(preprocess)
df["WER"] = df.apply(lambda row: compute_wer(row["output"], row["input1"]), axis=1)
worst_cases = df.sort_values(by="WER", ascending=False)
worst_cases = worst_cases.head(20)
pred = list(worst_cases["input1"].values)
pred = [clean_asr_output(remove_punctuation(ref.lower())) for ref in pred]
target = list(worst_cases["output"].values)
report = jiwer.process_words(target, pred)
print(jiwer.visualize_alignment(report, show_measures=False))

sentence 1
REF: *** ****** ******* *** ** ******* so we are going to start on that next week
HYP: you bought another one at auction so we can ***** ** start on that next week
       I      I       I   I  I       I         S     D  D                        

sentence 2
REF: you ***** were    a soldier    far from  home
HYP: you would have sold    your father from homes
             I    S    S       S      S          S

sentence 3
REF: ** so overall you  are minus five
HYP: it is    over all your minus five
      I  S       S   S    S           

sentence 4
REF: **** what else could i say
HYP: well   it   is  good i say
        I    S    S     S      

sentence 5
REF: you  mean  i   am   older
HYP: you would be like boulder
             S  S    S       S

sentence 6
REF: ** if  you fall in love
HYP: it is your full in love
      I  S    S    S        

sentence 7
REF:  and those are the ones
HYP: that    is *** the  one
        S     S   D        S

sentence 8
REF: it is a priceless  ar

In [40]:
import jiwer2 as jw2
pred = [clean_asr_output(remove_punctuation(utter.lower())) for utter in dataset['input1']]
target = [clean_asr_output(remove_punctuation(utter.lower())) for utter in dataset['output']]
out = jiwer.process_words(target, pred)
print(jw2.visualize_error_counts(out, top_k=20))

=== SUBSTITUTIONS ===
cos      --> because  = 10x
a        --> the      = 8x
the      --> a        = 8x
was      --> is       = 6x
zero     --> thousand = 5x
the      --> that     = 4x
it       --> that     = 4x
had      --> have     = 3x
round    --> around   = 3x
there    --> that     = 3x
were     --> are      = 2x
and      --> or       = 2x
it       --> this     = 2x
say      --> see      = 2x
has      --> is       = 2x
under    --> to       = 2x
he       --> it       = 2x
triumph  --> triumphs = 2x
in       --> and      = 2x
her      --> us       = 2x

=== INSERTIONS ===
and      = 93x
so       = 26x
that     = 24x
but      = 20x
the      = 19x
now      = 19x
well     = 18x
a        = 14x
it       = 13x
because  = 12x
in       = 10x
thousand = 10x
to       = 10x
you know = 10x
have     = 9x
i        = 9x
as well  = 8x
of       = 8x
very     = 8x
for      = 8x

=== DELETIONS ===
is         = 9x
it         = 7x
and        = 6x
are        = 5x
to         = 5x
of         = 4x
have    

In [53]:
df = pd.read_json("/fs01/home/omidv/ASR-Error-Correction/results/test_chime4.json")
dataset = Dataset.from_pandas(df)

In [54]:
pred = [clean_asr_output(remove_punctuation(utter.lower())) for utter in dataset['input1']]
target = [clean_asr_output(remove_punctuation(utter.lower())) for utter in dataset['output']]
out = jiwer.process_words(target, pred)
print(jw2.visualize_error_counts(out, top_k=10))

=== SUBSTITUTIONS ===
nineteen eighty seven   --> hundred and eightyseven = 20x
a                       --> the                     = 18x
adviser                 --> advisor                 = 16x
m                       --> micc                    = 15x
seventy                 --> seventythree            = 12x
twenty five             --> and twentyfive          = 12x
forty                   --> fortyfive               = 10x
m                       --> mci                     = 10x
nineteen eighty six     --> hundred and eightysix   = 9x
t                       --> twa                     = 9x

=== INSERTIONS ===
and                       = 130x
one thousand nine         = 51x
the                       = 19x
one thousand nine hundred = 8x
of                        = 8x
on                        = 6x
elder                     = 4x
pre                       = 4x
to                        = 4x
hundred and sixtynine     = 4x

=== DELETIONS ===
dollars = 121x
six     = 22x
is      = 18x
two 

In [60]:
df = pd.read_json("/fs01/home/omidv/ASR-Error-Correction/results/test_ls_other.json")
dataset = Dataset.from_pandas(df)
import jiwer2 as jw2
pred = [clean_asr_output(remove_punctuation(utter[0].lower())) for utter in dataset['input']]
target = [clean_asr_output(remove_punctuation(utter.lower())) for utter in dataset['output']]
out = jiwer.process_words(target, pred)
print(jw2.visualize_error_counts(out, top_k=10))

=== SUBSTITUTIONS ===
a         --> the       = 13x
brahman   --> brahmin   = 13x
the       --> a         = 10x
murdoch   --> murdock   = 9x
hermon    --> hermann   = 8x
dickie    --> dicky     = 7x
cinderlad --> lad       = 7x
an        --> and       = 6x
anyone    --> one       = 6x
their     --> the       = 5x

=== INSERTIONS ===
the  = 10x
a    = 9x
any  = 8x
i    = 6x
and  = 5x
to   = 4x
it   = 4x
of   = 4x
they = 3x
in   = 3x

=== DELETIONS ===
and  = 7x
of   = 7x
a    = 5x
the  = 5x
it   = 4x
boy  = 3x
m    = 3x
is   = 3x
that = 3x
am   = 3x


In [56]:
df = pd.read_json("/fs01/home/omidv/ASR-Error-Correction/results/test_atis.json")
dataset = Dataset.from_pandas(df)
pred = [clean_asr_output(remove_punctuation(utter[0].lower())) for utter in dataset['input']]
target = [clean_asr_output(remove_punctuation(utter.lower())) for utter in dataset['output']]
out = jiwer.process_words(target, pred)
print(jw2.visualize_error_counts(out, top_k=10))

=== SUBSTITUTIONS ===
d       --> dc      = 35x
nonstop --> stop    = 19x
fare    --> fair    = 13x
u       --> us      = 10x
first   --> onest   = 7x
a       --> to      = 7x
t       --> twa     = 7x
j       --> jfk     = 6x
p       --> pm      = 6x
fares   --> fairs   = 6x

=== INSERTIONS ===
non               = 19x
hundred           = 12x
point             = 3x
new               = 3x
hundred and       = 3x
twentyeightth one = 2x
one thousand      = 2x
for               = 2x
bye               = 1x
it                = 1x

=== DELETIONS ===
c       = 35x
p m     = 19x
s       = 11x
m       = 11x
flights = 7x
a       = 7x
a m     = 7x
w a     = 6x
f k     = 6x
seventh = 4x


In [57]:
df = pd.read_json("/fs01/home/omidv/ASR-Error-Correction/results/test_swbd.json")
dataset = Dataset.from_pandas(df)
pred = [clean_asr_output(remove_punctuation(utter[0].lower())) for utter in dataset['input']]
target = [clean_asr_output(remove_punctuation(utter.lower())) for utter in dataset['output']]
out = jiwer.process_words(target, pred)
print(jw2.visualize_error_counts(out, top_k=10))

=== SUBSTITUTIONS ===
they --> i    = 14x
that --> it   = 13x
in   --> and  = 13x
the  --> a    = 11x
and  --> in   = 11x
a    --> the  = 8x
t    --> tv   = 7x
them --> him  = 7x
were --> are  = 6x
you  --> yeah = 6x

=== INSERTIONS ===
and  = 33x
it   = 27x
a    = 27x
that = 20x
the  = 16x
i    = 15x
have = 12x
yeah = 11x
you  = 10x
is   = 9x

=== DELETIONS ===
i        = 131x
and      = 118x
you know = 64x
the      = 54x
it       = 50x
a        = 48x
that     = 41x
it is    = 34x
so       = 26x
but      = 26x


In [58]:
df = pd.read_json("/fs01/home/omidv/ASR-Error-Correction/results/test_td3.json")
dataset = Dataset.from_pandas(df)
pred = [clean_asr_output(remove_punctuation(utter[0].lower())) for utter in dataset['input']]
target = [clean_asr_output(remove_punctuation(utter.lower())) for utter in dataset['output']]
out = jiwer.process_words(target, pred)
print(jw2.visualize_error_counts(out, top_k=10))

=== SUBSTITUTIONS ===
co        --> cotwo     = 15x
could     --> can       = 7x
this      --> the       = 6x
a         --> the       = 5x
ok        --> okay      = 5x
the       --> a         = 5x
twenty    --> twentyone = 5x
and       --> in        = 4x
a         --> one       = 4x
and       --> now       = 3x

=== INSERTIONS ===
and                       = 16x
a                         = 11x
that                      = 8x
it                        = 7x
one thousand nine hundred = 5x
but                       = 5x
of                        = 4x
so                        = 3x
one thousand nine         = 3x
you know                  = 3x

=== DELETIONS ===
and      = 47x
the      = 25x
i        = 18x
two      = 18x
a        = 17x
you know = 11x
is       = 10x
that     = 10x
now      = 8x
to       = 8x


In [59]:
df = pd.read_json("/fs01/home/omidv/ASR-Error-Correction/results/test_wsj_score.json")
dataset = Dataset.from_pandas(df)
pred = [clean_asr_output(remove_punctuation(utter[0].lower())) for utter in dataset['input']]
target = [clean_asr_output(remove_punctuation(utter.lower())) for utter in dataset['output']]
out = jiwer.process_words(target, pred)
print(jw2.visualize_error_counts(out, top_k=10))

=== SUBSTITUTIONS ===
u                     --> us                    = 8x
nineteen eighty six   --> hundred and eightysix = 6x
seventy               --> seventyfive           = 5x
incorporated          --> inc                   = 5x
barry                 --> berry                 = 4x
eighths               --> eight                 = 4x
u                     --> un                    = 3x
cease                 --> ceasefire             = 3x
up                    --> upfront               = 3x
i                     --> ibm                   = 3x

=== INSERTIONS ===
one thousand nine         = 17x
one                       = 5x
zero                      = 4x
hundred                   = 3x
and                       = 3x
end                       = 2x
one thousand nine hundred = 2x
hundred and eightynine    = 2x
a                         = 2x
if                        = 1x

=== DELETIONS ===
dollars = 35x
five    = 11x
and     = 9x
is      = 8x
s       = 8x
six     = 6x
fire    = 4x
four 

# Mistral 7B

In [38]:
model = "Mistral-7B-Instruct-v0.3"
client = openai.AsyncOpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

output = None
while output is None:
    try:
        output = await client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": "Please introduce yourself."}],
        )
    
    except openai.APIError as e:
        print(e)
        sleep(10)

print(output.choices[0].message.content)

 Hello! I am a model trained by Mistral AI, dedicated to providing informative and friendly responses. I am here to help answer your questions, provide explanations, or engage in interesting discussions. I strive to make our interactions enriching and enjoyable. How can I assist you today?


In [39]:
df = pd.read_json("/fs01/home/omidv/ASR-Error-Correction/data/test_cv.json")
results_path = "/fs01/home/omidv/ASR-Error-Correction/results/test_cv.json"
dataset = Dataset.from_pandas(df)
print(dataset)
results_table = await run_evaluation(dataset, model, client, cv_generation_config, results_path)
print(results_table)

Dataset({
    features: ['input', 'output', 'input1', 'input2'],
    num_rows: 2000
})
Evaluating Zero-shot Unconstrained:
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 208 tests completed!
----------------------------------------------------------------------------------------------------
Sample 654
Target: vandi accepted the role and made her first appearance on television
Pred:   they accepted the role and made her first appearance on television
----------------------------------------------------------------------------------------------------
Sample 1399
Target: the first step is to calculate the percentage of each group of the total
Pred:   the first step is to calculate the percentage of each group of the total data
--------------

In [40]:
df = pd.read_json("/fs01/home/omidv/ASR-Error-Correction/data/test_lrs2.json")
results_path = "/fs01/home/omidv/ASR-Error-Correction/results/test_lrs2.json"
dataset = Dataset.from_pandas(df)
print(dataset)
results_table = await run_evaluation(dataset, model, client, lrs2_generation_config, results_path)
print(results_table)

Dataset({
    features: ['input', 'output', 'input1', 'input2'],
    num_rows: 2259
})
Evaluating Zero-shot Unconstrained:
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 211 tests completed!
----------------------------------------------------------------------------------------------------
Sample 612
Target: in the meantime
Pred:   in the meantime
----------------------------------------------------------------------------------------------------
Sample 278
Target: you will have stable government and the strong economy on which our schools
Pred:   you will have a stable government and a strong economy on which our schools are built
-------------------------------------------------------------------

In [41]:
df = pd.read_json("/fs01/home/omidv/ASR-Error-Correction/data/test_chime4.json")
results_path = "/fs01/home/omidv/ASR-Error-Correction/results/test_chime4.json"
dataset = Dataset.from_pandas(df)
print(dataset)
results_table = await run_evaluation(dataset, model, client, chime4_generation_config, results_path)
print(results_table)

Dataset({
    features: ['input', 'output', 'input1', 'input2'],
    num_rows: 1320
})
Evaluating Zero-shot Unconstrained:
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 40 tests completed!
----------------------------------------------------------------------------------------------------
Sample 407
Target: he also said that the company for the first time was developing drugs specifically for the over the counter consumer health care market
Pred:   he also said the company for the first time is developing drugs specifically for the over the counter consumer health care market
----------------------------------------------------------------------------------------------------
Sample 599
Target: among men fifty six percent said the u s was doing too little in space exploration only a quarter of women agreed
Pred:   am

# Deepseek R1 Qwen

In [29]:
deepseek_generation_config = {"max_tokens": 2500, "temperature": 0.9}

In [30]:
model = "DeepSeek-R1-Distill-Qwen-1.5B"
client = openai.AsyncOpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

output = None
while output is None:
    try:
        output = await client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": "Please introduce yourself."}],
        )
    
    except openai.APIError as e:
        print(e)
        sleep(10)

print(output.choices[0].message.content)

<think>

</think>

Greetings! I'm DeepSeek-R1, an artificial intelligence assistant created by DeepSeek. I'm at your service and would be delighted to assist you with any inquiries or tasks you may have.


In [31]:
df = pd.read_json("/fs01/home/omidv/ASR-Error-Correction/data/test_cv.json")
results_path = "/fs01/home/omidv/ASR-Error-Correction/results/test_cv.json"
dataset = Dataset.from_pandas(df)
print(dataset)
results_table = await run_evaluation(dataset, model, client, deepseek_generation_config, results_path, disable_zsco=True, disable_zsun=True)
print(results_table)

Dataset({
    features: ['input', 'output', 'input1', 'input2'],
    num_rows: 2000
})
Evaluating Zero-shot Closest:
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 208 tests completed!
----------------------------------------------------------------------------------------------------
Sample 463
Target: its lead single work topped the charts in canada france and the united states
Pred:   it is lead single work topped the charts in canada france and the united states
----------------------------------------------------------------------------------------------------
Sample 346
Target: he is currently filming the killing season for foxtel
Pred:   he is currently filming the killing season for foxtail
----------------------------------------

In [32]:
df = pd.read_json("/fs01/home/omidv/ASR-Error-Correction/data/test_lrs2.json")
results_path = "/fs01/home/omidv/ASR-Error-Correction/results/test_lrs2.json"
dataset = Dataset.from_pandas(df)
print(dataset)
results_table = await run_evaluation(dataset, model, client, deepseek_generation_config, results_path, disable_zsco=True, disable_zsun=True)
print(results_table)

Dataset({
    features: ['input', 'output', 'input1', 'input2'],
    num_rows: 2259
})
Evaluating Zero-shot Closest:
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 211 tests completed!
----------------------------------------------------------------------------------------------------
Sample 1281
Target: chicken and eggs
Pred:   chicken and eggs and
----------------------------------------------------------------------------------------------------
Sample 2013
Target: ladies and gentlemen
Pred:   ladies and gentlemen
----------------------------------------------------------------------------------------------------
Sample 1954
Target: how much have you spent
Pred:   how much have you spent
--------

In [33]:
df = pd.read_json("/fs01/home/omidv/ASR-Error-Correction/data/test_chime4.json")
results_path = "/fs01/home/omidv/ASR-Error-Correction/results/test_chime4.json"
dataset = Dataset.from_pandas(df)
print(dataset)
results_table = await run_evaluation(dataset, model, client, deepseek_generation_config, results_path, disable_zsco=True, disable_zsun=True)
print(results_table)

Dataset({
    features: ['input', 'output', 'input1', 'input2'],
    num_rows: 1320
})
Evaluating Zero-shot Closest:
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 40 tests completed!
----------------------------------------------------------------------------------------------------
Sample 6
Target: two other issues began trading recently on the big board
Pred:   and two other issues began trading recently on the big board
----------------------------------------------------------------------------------------------------
Sample 549
Target: lately computer retailing has been tough on everybody
Pred:   lately computer retailing has been tough on everybody
----------------------------------------------------------------------------------------------------
Sample 658
Target: shearson lehman hutton incorporated is index

# Deepseek R1 Llama

In [34]:
model = "DeepSeek-R1-Distill-Llama-8B"
client = openai.AsyncOpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

output = None
while output is None:
    try:
        output = await client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": "Please introduce yourself."}],
        )
    
    except openai.APIError as e:
        print(e)
        sleep(10)

print(output.choices[0].message.content)

<think>

</think>

Greetings! I'm DeepSeek-R1, an artificial intelligence assistant created by DeepSeek. I'm at your service and would be delighted to assist you with any inquiries or tasks you may have.


In [35]:
df = pd.read_json("/fs01/home/omidv/ASR-Error-Correction/data/test_cv.json")
results_path = "/fs01/home/omidv/ASR-Error-Correction/results/test_cv.json"
dataset = Dataset.from_pandas(df)
print(dataset)
results_table = await run_evaluation(dataset, model, client, deepseek_generation_config, results_path, disable_zsco=True, disable_zsun=True)
print(results_table)

Dataset({
    features: ['input', 'output', 'input1', 'input2'],
    num_rows: 2000
})
Evaluating Zero-shot Closest:
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 208 tests completed!
----------------------------------------------------------------------------------------------------
Sample 494
Target: an important implication of marginal propensity to save is measurement of the multiplier
Pred:   an important implication of margin of propensity to save is the measurement of the multiplier
----------------------------------------------------------------------------------------------------
Sample 296
Target: an asian man and an asian woman are blowing bubbles from pink plastic hoops
Pred:   a nation man and a nation woman of blind bubbles

In [36]:
df = pd.read_json("/fs01/home/omidv/ASR-Error-Correction/data/test_lrs2.json")
results_path = "/fs01/home/omidv/ASR-Error-Correction/results/test_lrs2.json"
dataset = Dataset.from_pandas(df)
print(dataset)
results_table = await run_evaluation(dataset, model, client, deepseek_generation_config, results_path, disable_zsco=True, disable_zsun=True)
print(results_table)

Dataset({
    features: ['input', 'output', 'input1', 'input2'],
    num_rows: 2259
})
Evaluating Zero-shot Closest:
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 211 tests completed!
----------------------------------------------------------------------------------------------------
Sample 1504
Target: and not cast iron
Pred:   and not cast iron with them
----------------------------------------------------------------------------------------------------
Sample 1892
Target: they were not so keen on praying for long times
Pred:   so they were not so keen on praying for long times
----------------------------------------------------------------------------------------------------
Sample 1428
Target:

In [37]:
df = pd.read_json("/fs01/home/omidv/ASR-Error-Correction/data/test_chime4.json")
results_path = "/fs01/home/omidv/ASR-Error-Correction/results/test_chime4.json"
dataset = Dataset.from_pandas(df)
print(dataset)
results_table = await run_evaluation(dataset, model, client, deepseek_generation_config, results_path, disable_zsco=True, disable_zsun=True)
print(results_table)

Dataset({
    features: ['input', 'output', 'input1', 'input2'],
    num_rows: 1320
})
Evaluating Zero-shot Closest:
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 256 tests completed!
Progress: Batch of 40 tests completed!
----------------------------------------------------------------------------------------------------
Sample 1256
Target: but the investigation could make some lenders wary
Pred:   but the investigation could make some lenders worried
----------------------------------------------------------------------------------------------------
Sample 587
Target: under the proposed transaction the los angeles group would acquire the k h j license and then sell itself to disney
Pred:   under the proposed transaction the los angeles group would acquire the khj license and then sell itself to disney
----------------------------------------------------