Evaluate Translation Quality with METEOR Score
Medium
NLP

Develop a function to compute the METEOR score for evaluating machine translation quality. Given a reference translation and a candidate translation, calculate the score based on unigram matches, precision, recall, F-mean, and a penalty for word order fragmentation.

Example:
Input:
meteor_score('Rain falls gently from the sky', 'Gentle rain drops from the sky')
Output:
0.625
Reasoning:
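The function identifies 4 unigram matches ('rain', 'from', 'the', 'sky'; 'gently' and 'gentle' are not counted as a match), computes precision (4/6) and recall (4/6), calculates the F-mean (≈ 0.667), and then applies a small penalty for the two chunks (0.5 × (2/4)³ = 0.0625), giving 0.667 × 0.9375 = 0.625.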



In [None]:
import re
from collections import defaultdict

def _stem(w: str) -> str:
    # very light stemmer good enough for common cases (gently→gentle, falls→fall, drops→drop)
    if len(w) > 4 and w.endswith("ing"): w = w[:-3]
    elif len(w) > 3 and w.endswith("ed"): w = w[:-2]
    elif len(w) > 3 and w.endswith("ly"): w = w[:-2]
    if len(w) > 3 and w.endswith("s"): w = w[:-1]
    return w

def _tokens(s: str):
    """Return the stemmed, lower-cased word tokens of *s* as a list.

    The string is lower-cased, split into maximal runs of word characters
    (so punctuation and whitespace are discarded), and every run is passed
    through ``_stem``.
    """
    words = re.findall(r"\w+", s.lower())
    return list(map(_stem, words))

def meteor_score(reference, candidate, alpha=0.9, beta=3, gamma=0.5, round_to=3):
    """Compute a simplified METEOR score for ``candidate`` against ``reference``.

    Both strings are tokenized and stemmed with ``_tokens``; candidate tokens
    are then aligned one-to-one with equal reference tokens. The score is
    ``Fmean * (1 - penalty)`` where

    * ``Fmean = P * R / (alpha * P + (1 - alpha) * R)`` is the weighted
      harmonic mean of precision ``P`` and recall ``R`` (``alpha`` close to 1
      weights recall more heavily), and
    * ``penalty = gamma * (chunks / matches) ** beta`` penalizes fragmented
      word order.

    Args:
        reference: Reference translation (string).
        candidate: Candidate translation to evaluate (string).
        alpha: Recall weight in the F-mean.
        beta: Exponent applied to the fragmentation ratio.
        gamma: Maximum penalty (reached when every match is its own chunk).
        round_to: Number of decimal places passed to ``round``.

    Returns:
        The rounded score, or ``0.0`` when either side has no tokens or no
        token could be aligned.
    """
    ref = _tokens(reference)
    cand = _tokens(candidate)
    if not ref or not cand:
        return 0.0

    # Positions of every reference token, in ascending order.
    ref_pos = defaultdict(list)
    for i, w in enumerate(ref):
        ref_pos[w].append(i)

    # Greedy one-to-one alignment in candidate order. Prefer the first unused
    # reference position after the previous match (keeps runs monotonic);
    # otherwise fall back to the first unused occurrence anywhere.
    used = set()
    last = -1
    alignment = []  # (candidate index, reference index) pairs
    for j, w in enumerate(cand):
        if w not in ref_pos:
            continue
        free = [i for i in ref_pos[w] if i not in used]
        if not free:
            continue
        i = next((k for k in free if k > last), free[0])
        used.add(i)
        last = i
        alignment.append((j, i))

    m = len(alignment)
    if m == 0:
        return 0.0

    P = m / len(cand)
    R = m / len(ref)
    # Recall-weighted harmonic mean: alpha multiplies P in the denominator,
    # which is what makes recall dominate for alpha > 0.5.
    Fmean = (P * R) / (alpha * P + (1 - alpha) * R)

    # A chunk is a run of matches that is contiguous in BOTH strings: a gap
    # on the candidate side (an unmatched word in between) or on the
    # reference side starts a new chunk.
    chunks = 1
    for (prev_c, prev_r), (cur_c, cur_r) in zip(alignment, alignment[1:]):
        if cur_c != prev_c + 1 or cur_r != prev_r + 1:
            chunks += 1
    frag = chunks / m
    penalty = gamma * (frag ** beta)

    score = Fmean * (1 - penalty)
    return round(score, round_to)

# Sanity check: the worked example from the problem statement prints 0.625.
print(
    meteor_score(
        "Rain falls gently from the sky",
        "Gentle rain drops from the sky",
    )
)