In [1]:
# Install required libraries
%pip install nltk sacrebleu

Note: you may need to restart the kernel to use updated packages.


In [2]:
import math
import sacrebleu
import nltk
from nltk.translate.bleu_score import sentence_bleu
from nltk import ngrams
from nltk.stem import PorterStemmer
from nltk.corpus import wordnet
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nsadi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Example data
reference_sentence = "she was interested in world history because she read the book."
hypothesis_sentence = "she read the book because she was interested in world history."

1. Bilingual Evaluation Understudy (BLEU): BLEU score is calculated by measuring the overlap of n-grams between the candidate translation and the references. The calculation could be represented as:

In [4]:
#Higher is better
def calculate_bleu(hypothesis: str, reference: str) -> float:
    """
    Calculate BLEU score between a hypothesis and a reference.

    Parameters:
    - hypothesis (str): The generated hypothesis or translation.
    - reference (str): The reference or ground truth translation.

    Returns:
    - float: The BLEU score, where higher values indicate better similarity to the reference - 0.0 (no similarity) and 1.0 (perfect similarity to the reference).
    """
    hypothesis_split = hypothesis.split()
    reference_split = reference.split()

    # Using NLTK's sentence_bleu function to calculate BLEU score
    bleu_score = sentence_bleu([reference_split], hypothesis_split)
   
    return bleu_score


In [5]:
bleu_score = calculate_bleu(hypothesis_sentence, reference_sentence)

print(f"BLEU Score: {bleu_score:.3f}")

BLEU Score: 0.502


2. Translation Edit Rate Plus (TERp): TERp is calculated by measuring the matching flaw between machine-generated translations and human-created translation. The calculation could be represented as:

In [6]:
#Higher is better
def calculate_terp(hypothesis: str, reference: str, phrase_table: dict = None, edit_costs: dict = None) -> float:
    """
    Calculate TERp (Translation Edit Rate with partial credit) score between a hypothesis and a reference.

    Parameters:
    - hypothesis (str): The generated hypothesis or translation.
    - reference (str): The reference or ground truth translation.
    - phrase_table (dict, optional): A dictionary representing a phrase table for paraphrase information.
    - edit_costs (dict, optional): A dictionary containing weights for edit operations in the TERp calculation.

    Returns:
    - float: The TERp score, a value indicating the similarity between the hypothesis and the reference,
             where higher values are better.
    """
    hypothesis_tokens = hypothesis.split()
    reference_tokens = reference.split()

    # TERp by Stem Matches, Synonym Matches, and Phrase Substitutions
    stem_matches = calculate_stem_matches(hypothesis_tokens, reference_tokens)
    synonym_matches = calculate_synonym_matches(hypothesis_tokens, reference_tokens)

    phrase_substitutions = calculate_phrase_substitutions(hypothesis_tokens, reference_tokens, phrase_table, edit_costs) if phrase_table is not None and edit_costs is not None else 0


    # Calculate TERp score
    terp_score = (stem_matches + synonym_matches + phrase_substitutions) / (2*len(hypothesis_tokens))

    return terp_score

def calculate_stem_matches(hypothesis_tokens: list, reference_tokens: list) -> int:
    """
    Calculate the number of stem matches between two tokenized sequences.

    Parameters:
    - hypothesis_tokens (list): List of tokens in the hypothesis.
    - reference_tokens (list): List of tokens in the reference.

    Returns:
    - int: The number of stem matches.
    """
    stemmer = PorterStemmer()
    stem_matches = sum(1 for hyp_token, ref_token in zip(hypothesis_tokens, reference_tokens)
                      if stemmer.stem(hyp_token.lower()) == stemmer.stem(ref_token.lower()))
    return stem_matches

def calculate_synonym_matches(hypothesis_tokens: list, reference_tokens: list) -> int:
    """
    Calculate the number of synonym matches between two tokenized sequences.

    Parameters:
    - hypothesis_tokens (list): List of tokens in the hypothesis.
    - reference_tokens (list): List of tokens in the reference.

    Returns:
    - int: The number of synonym matches.
    """
    synonym_matches = sum(1 for hyp_token, ref_token in zip(hypothesis_tokens, reference_tokens)
                          if are_synonyms(hyp_token.lower(), ref_token.lower()))
    return synonym_matches

def are_synonyms(word1: str, word2: str) -> bool:
    """
    Check if two words are synonyms.

    Parameters:
    - word1 (str): The first word.
    - word2 (str): The second word.

    Returns:
    - bool: True if the words are synonyms, False otherwise.
    """
    synsets1 = wordnet.synsets(word1)
    synsets2 = wordnet.synsets(word2)
    
    return any(set1.wup_similarity(set2) > 0.7 for set1 in synsets1 for set2 in synsets2)

def calculate_phrase_substitutions(hypothesis_tokens: list, reference_tokens: list, phrase_table: dict, edit_costs: dict) -> float:
    """
    Calculate the cost of phrase substitutions between two tokenized sequences.

    Parameters:
    - hypothesis_tokens (list): List of tokens in the hypothesis.
    - reference_tokens (list): List of tokens in the reference.
    - phrase_table (dict): A dictionary representing a phrase table for paraphrase information.
    - edit_costs (dict): A dictionary containing weights for edit operations in the TERp calculation.

    Returns:
    - float: The total cost of phrase substitutions.
    """
    substitution_cost = 0

    for i in range(len(hypothesis_tokens)):
        for j in range(len(reference_tokens)):
            if (hypothesis_tokens[i], reference_tokens[j]) in phrase_table:
                # Retrieve paraphrase information from the phrase table
                paraphrase_info = phrase_table[(hypothesis_tokens[i], reference_tokens[j])]
                
                # Calculate the cost using the provided formula
                cost = (
                    edit_costs['w1'] +
                    edit_costs['w2'] * paraphrase_info['edit'] * math.log(paraphrase_info['probability']) +
                    edit_costs['w3'] * paraphrase_info['edit'] * paraphrase_info['probability'] +
                    edit_costs['w4'] * paraphrase_info['edit']
                )

                # Ensure the substitution cost is not negative
                substitution_cost += max(0, cost)

    return substitution_cost

def terp_alignment(hypothesis: str, reference: str, phrase_table: dict = None, edit_costs: dict = None) -> list:
    """
    Generate a word-level alignment between a hypothesis and a reference.

    Parameters:
    - hypothesis (str): The generated hypothesis or translation.
    - reference (str): The reference or ground truth translation.
    - phrase_table (dict, optional): A dictionary representing a phrase table for paraphrase information.
    - edit_costs (dict, optional): A dictionary containing weights for edit operations in the TERp calculation.

    Returns:
    - list of tuples: A list of tuples representing the word-level alignment, each tuple contains
                      (hypothesis_token, reference_token, alignment_type).
    """
    alignment = []

    for hyp_token, ref_token in zip(hypothesis.split(), reference.split()):
        if hyp_token == ref_token:
            alignment.append((hyp_token, ref_token, "Exact Match"))
        else:
            alignment.append((hyp_token, ref_token, "Mismatch"))

    return alignment

In [14]:
# Example usage:
hypothesis_sentence = "This is an example sentence."
reference_sentence = "This is an example sentence."

# Calculate TERp score
terp_score = calculate_terp(hypothesis_sentence, reference_sentence)
print(f"TERp Score: {terp_score}")

# Generate alignment
alignment = terp_alignment(hypothesis_sentence, reference_sentence)
print("Alignment:", alignment)

TERp Score: 0.8
Alignment: [('This', 'This', 'Exact Match'), ('is', 'is', 'Exact Match'), ('an', 'an', 'Exact Match'), ('example', 'example', 'Exact Match'), ('sentence.', 'sentence.', 'Exact Match')]


In [8]:
# Lower is better
def calculate_ter(hypothesis: str, reference: str) -> float:
    """
    Calculate TER (Translation Edit Rate) score between a hypothesis and a reference.

    Parameters:
    - hypothesis (str): The generated hypothesis or translation.
    - reference (str): The reference or ground truth translation.

    Returns:
    - float: The TER score, where lower values indicate better similarity to the reference.
    """
    # Tokenize the input sentences into lists of words
    hypothesis_tokens = hypothesis.split()
    reference_tokens = reference.split()

    # Compute Levenshtein distance between hypothesis and reference
    distance = levenshtein_distance(hypothesis_tokens, reference_tokens)

    # Compute TER score
    ter_score = distance / len(reference_tokens)
    
    return ter_score

def levenshtein_distance(s1: list, s2: list) -> int:
    """
    Compute the Levenshtein distance between two sequences.

    Parameters:
    - s1 (list): List of tokens in the first sequence.
    - s2 (list): List of tokens in the second sequence.

    Returns:
    - int: The Levenshtein distance between the two sequences.
    """
    # Initialize a matrix to store the distances
    matrix = [[0] * (len(s2) + 1) for _ in range(len(s1) + 1)]

    # Initialize the first row and column
    for i in range(len(s1) + 1):
        matrix[i][0] = i
    for j in range(len(s2) + 1):
        matrix[0][j] = j

    # Fill in the matrix
    for i in range(1, len(s1) + 1):
        for j in range(1, len(s2) + 1):
            cost = 0 if s1[i - 1] == s2[j - 1] else 1
            matrix[i][j] = min(
                matrix[i - 1][j] + 1,  # Deletion
                matrix[i][j - 1] + 1,  # Insertion
                matrix[i - 1][j - 1] + cost  # Substitution
            )

    # Return the final edit distance
    return matrix[len(s1)][len(s2)]


In [9]:
ter_score = calculate_ter(hypothesis_sentence, reference_sentence)

print(f"TER Score: {ter_score}")

TER Score: 0.4


3. Paraphrase In N-gram Changes (PINC): PINC is calculated by computing the percentage of n-grams that appear in the candidate sentence but not in the source sentence. The calculation could be represented as:

In [10]:
# Lower is better
def calculate_pinc(hypothesis: str, reference: str, n: int) -> float:
    """
    Calculate PINC (Precision-based n-gram Inclusion Count) score between a hypothesis and a reference.

    Parameters:
    - hypothesis (str): The generated hypothesis or translation.
    - reference (str): The reference or ground truth translation.
    - n (int): The size of n-grams for which to calculate the PINC score.

    Returns:
    - float: The PINC score, where lower values indicate better similarity to the reference.
    """
    hypothesis_split = hypothesis.split()
    reference_split = reference.split()

    hypothesis_ngrams = set(ngrams(hypothesis_split, n))
    reference_ngrams = set(ngrams(reference_split, n))
    new_ngrams = hypothesis_ngrams - reference_ngrams
    hypo_ngram_len = len(hypothesis_ngrams)
    pinc_score = len(new_ngrams) / hypo_ngram_len if hypo_ngram_len != 0 else 0

    return pinc_score

In [11]:
pinc_score = calculate_pinc(hypothesis_sentence, reference_sentence, 2)

print(f"PINC Score: {pinc_score}")

PINC Score: 0.75
