# Installing Necessary Libraries
## Installation of Required Libraries
This cell installs the essential libraries (transformers, datasets, torch, and nltk) that are required to evaluate the NLP model on the test set.

In [15]:
# Install necessary libraries
!pip install transformers datasets torch nltk



# Importing Libraries and Mounting Google Drive
This cell imports all the necessary Python libraries required for NLP model evaluation, such as torch, transformers, and nltk. Additionally, it mounts Google Drive to access the trained model.

In [16]:
import os
import random
import torch
import json
import requests
import numpy as np
from transformers import BartTokenizer, BartForConditionalGeneration
from datasets import Dataset, load_metric
from nltk.translate.gleu_score import sentence_gleu

In [3]:
# Colab-only: mount Google Drive at /content/drive so files stored there can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Downloading the test set
## Downloading the Disfl QA test set
This cell defines a function that downloads the test set from the GitHub repository and stores it locally.

In [5]:
# Download datasets
def download_file(url, filename, timeout=30):
    """Download ``url`` and write the response body to ``filename``.

    Args:
        url: Address of the file to fetch.
        filename: Local path the downloaded bytes are written to; an existing
            file at that path is overwritten.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on an
            unresponsive host.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # Fail before touching the disk on HTTP errors
    with open(filename, 'wb') as f:
        f.write(response.content)

# Split name -> raw file in the Disfl-QA GitHub repository.
urls = dict(
    test="https://raw.githubusercontent.com/google-research-datasets/Disfl-QA/main/test.json",
)

# Every split is saved in the working directory as "<split name>.json".
for split_name in urls:
    download_file(urls[split_name], f"{split_name}.json")

print("Files downloaded successfully.")

Files downloaded successfully.


# Loading the test set
## Loading Disfl QA test set
This cell defines a function to load the JSON test set and extracts required fields ("original" and "disfluent").

In [6]:
# Load data
def load_disflqa_data(file_path):
    """Read a Disfl-QA JSON file and return its question pairs.

    The file is expected to hold one JSON object whose values are themselves
    objects with at least the keys ``"original"`` and ``"disfluent"``; any
    other keys are ignored.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        A list of ``{"original": ..., "disfluent": ...}`` dicts, one per entry,
        in the order the entries appear in the file.

    Raises:
        KeyError: If an entry lacks ``"original"`` or ``"disfluent"``.
    """
    # Read as UTF-8 explicitly so the result does not depend on the
    # platform's default text encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return [{"original": v["original"], "disfluent": v["disfluent"]} for v in data.values()]

# Parse the downloaded test split into question-pair records, then wrap the
# records in a `Dataset` for the evaluation cells below.
test_data = load_disflqa_data(file_path='test.json')
test_dataset = Dataset.from_list(test_data)

# Loading the best model
This cell loads the best "BART" model that is already saved for testing.

In [7]:
# Define paths
# The model and tokenizer are read from local directories; the Google Drive
# locations are kept as comments for runs that load from a mounted Drive instead.
#model_path_bart = '/content/drive/MyDrive/DisflQA_Models/BART'
model_path_bart = './models/DisflQA_Models/BART'

#tokenizer_path_bart = '/content/drive/MyDrive/DisflQA_Tokenizers/BART'
tokenizer_path_bart = './models/DisflQA_Tokenizers/BART'

# Load models and tokenizers
# Maps a display name to a (model, tokenizer) pair; the model is moved to `device`.
models_and_tokenizers = {
    "BART": (
        BartForConditionalGeneration.from_pretrained(model_path_bart).to(device),
        BartTokenizer.from_pretrained(tokenizer_path_bart),
    ),
}

# Tokenizer Setup
## Ensuring a Padding Token
This cell checks that each tokenizer defines a padding token, which is needed to pad batched inputs to a common length. If a tokenizer has none, a `[PAD]` token is added, the model's embedding matrix is resized to the new vocabulary size, and the model's `pad_token_id` is set to the end-of-sequence token id.

In [8]:
# Make sure every tokenizer can pad a batch to a common length.
for name in list(models_and_tokenizers):
    model, tokenizer = models_and_tokenizers[name]
    needs_pad_token = tokenizer.pad_token is None
    if needs_pad_token:
        # Register a dedicated padding token and grow the embedding matrix to
        # cover the enlarged vocabulary.
        tokenizer.add_special_tokens({'pad_token': '[PAD]'})
        model.resize_token_embeddings(len(tokenizer))
        # NOTE(review): the config is pointed at the EOS id rather than at the
        # id of the '[PAD]' token added above — confirm this is intended.
        model.config.pad_token_id = tokenizer.eos_token_id
    # Store the pair back under the same name.
    models_and_tokenizers[name] = (model, tokenizer)

# Predicting in Batches
## Batch Prediction with Trained Model
This cell defines a function to generate predictions from the trained
"BART" model in batches. It processes inputs and decodes the outputs to generate predictions for each input in the test dataset.

In [13]:
# Predict in batches
def predict_in_batches(model, dataset, tokenizer, batch_size=8, num_beams=8, max_new_tokens=100):
    """Generate one prediction per row of ``dataset`` using beam search.

    Rows are processed in consecutive slices of ``batch_size``; the final slice
    is shorter when the dataset size is not a multiple of ``batch_size``. Any
    error raised while selecting, tokenizing or generating a batch propagates
    to the caller, so the returned list always lines up one-to-one with the
    dataset rows.

    Args:
        model: Sequence-to-sequence model exposing ``eval()``, ``device`` and
            ``generate(...)``.
        dataset: Object supporting ``len()``, ``select(indices)`` and column
            access via ``["disfluent"]``.
        tokenizer: Callable that encodes a list of strings and provides
            ``batch_decode``.
        batch_size: Number of rows sent to the model at once (must be >= 1).
        num_beams: Beam width passed to ``model.generate``.
        max_new_tokens: Upper bound on the number of generated tokens per row.

    Returns:
        A list of decoded prediction strings, in dataset order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    total_rows = len(dataset)
    predictions = []
    # Evaluation mode does not change between batches, so set it once up front.
    model.eval()
    for start in range(0, total_rows, batch_size):
        # Clamp the upper bound so the last (possibly shorter) batch never asks
        # for indices past the end of the dataset.
        stop = min(start + batch_size, total_rows)
        batch = dataset.select(range(start, stop))
        inputs = tokenizer(batch['disfluent'], return_tensors='pt', padding=True, truncation=True).to(model.device)
        with torch.no_grad():
            # max_new_tokens bounds the length of the generated output.
            outputs = model.generate(**inputs,
                                     max_new_tokens=max_new_tokens,
                                     num_beams=num_beams,
                                     early_stopping=True)
        predictions.extend(tokenizer.batch_decode(outputs, skip_special_tokens=True))
    return predictions

# Computing Evaluation Metrics
## Computing Evaluation Metrics (BLEU, GLEU, Accuracy)
This cell defines functions to compute various evaluation metrics, including BLEU, GLEU, and accuracy, based on the model predictions and the ground truth labels.
### BLEU (Bilingual Evaluation Understudy Score):
Measures the precision of n-grams between the generated and reference texts.
Higher scores indicate better performance in generating text similar to reference texts.
### GLEU (Google-BLEU):
A sentence-level variant of BLEU, computed here with NLTK's `sentence_gleu`.
It takes the minimum of n-gram precision and n-gram recall, so a prediction is penalised both for adding spurious words and for dropping words that appear in the reference.
### Accuracy:
Measures the proportion of predictions that match the reference text exactly (string equality).

In [17]:
# Compute metrics
def compute_gleu(predictions, references):
    """Return the mean sentence-level GLEU over paired predictions and references.

    Each prediction and its reference are split on whitespace, and the
    reference is passed to ``sentence_gleu`` as the only reference for that
    prediction.
    """
    scores = []
    for hypothesis, reference in zip(predictions, references):
        scores.append(sentence_gleu([reference.split()], hypothesis.split()))
    return np.mean(scores)

def compute_accuracy(predictions, references):
    """Return the share of references whose paired prediction is identical to it.

    Predictions and references are paired positionally; the count of equal
    pairs is divided by the number of references.
    """
    exact_matches = 0
    for predicted, expected in zip(predictions, references):
        if predicted == expected:
            exact_matches += 1
    return exact_matches / len(references)

def compute_metrics(predictions, labels):
    """Score predictions against reference strings with BLEU, GLEU and accuracy.

    Args:
        predictions: Predicted strings, one per example.
        labels: Reference strings, in the same order as ``predictions``.

    Returns:
        A dict with the keys ``"bleu"``, ``"gleu"`` and ``"accuracy"``.

    Raises:
        ValueError: If ``predictions`` and ``labels`` differ in length. The
            per-pair metrics iterate with ``zip``, which would otherwise
            silently score a truncated or misaligned set of pairs.
    """
    if len(predictions) != len(labels):
        raise ValueError(
            f"Got {len(predictions)} predictions for {len(labels)} labels; "
            "metrics would be computed on misaligned pairs."
        )
    # The BLEU metric takes whitespace-tokenized predictions and, for each
    # prediction, a list of tokenized references (a single one here).
    formatted_preds = [pred.split() for pred in predictions]
    formatted_refs = [[ref.split()] for ref in labels]
    bleu_metric = load_metric('bleu')
    bleu = bleu_metric.compute(predictions=formatted_preds, references=formatted_refs)
    gleu = compute_gleu(predictions, labels)
    accuracy = compute_accuracy(predictions, labels)
    return {"bleu": bleu["bleu"], "gleu": gleu, "accuracy": accuracy}

# Evaluate and save predictions
# The reference questions are the same for every model, so fetch them once.
labels = test_dataset["original"]
predictions = {}
for name in models_and_tokenizers:
    eval_model, eval_tokenizer = models_and_tokenizers[name]
    predictions[name] = predict_in_batches(eval_model, test_dataset, eval_tokenizer)
    test_results = compute_metrics(predictions[name], labels)
    print(f"{name} Evaluation Results on Test Set: {test_results}")
    # Persist the scores in the working directory as "<model name>_test_results.json".
    with open(f"{name}_test_results.json", "w") as f:
        json.dump(test_results, f)

BART Evaluation Results on Test Set: {'bleu': 0.8800623261064311, 'gleu': 0.8724521491785246, 'accuracy': 0.6662091682679111}


The BART model performed strongly on the test set, with a BLEU score of 0.8801 and a GLEU score of 0.8725, indicating that the generated questions closely match the reference (fluent) questions. The exact-match accuracy of 66.62% means that about two-thirds of the predictions are identical to their reference; the remaining predictions differ in at least one character, even though the high n-gram overlap scores suggest most of them are still close. Overall, these results reflect the BART model's effectiveness at removing disfluencies, with high-quality outputs across the different evaluation metrics.

In [19]:
# Pick one test example at random and show the reference, the disfluent input
# and the BART prediction side by side.
random_index = random.randrange(len(test_dataset))
sample = test_dataset[random_index]

print("Original:", sample["original"])
print("Disfluent:", sample["disfluent"])
print("BART:", predictions["BART"][random_index])

Original: Who upon arriving gave the original viking settlers a common identity?
Disfluent: What or rather who upon arriving gave the original viking settlers a common identity?
BART: Who upon arriving gave the original viking settlers a common identity?
