# BLEU, Short for “Bilingual Evaluation Understudy”

# NIST, National Institute of Standards and Technology

In [1]:
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.nist_score import sentence_nist

# BLEU uses N-gram precision technique + brevity penalty + others
# Drawbacks of BLEU
# 1. It doesn't consider meaning (semantic)
# 2. It doesn't directly consider sentence structure (syntax)
# 3. It doesn't handle morphologically rich languages well
# 4. It doesn't map well to human judgements

# NIST uses N-gram technique + brevity penalty + information weighting (rarer n-grams count more, common n-grams count less)

In [2]:
import warnings
warnings.simplefilter('ignore', UserWarning)

## Comparison

In [3]:
# Baseline: the hypothesis is token-for-token identical to the reference.
reference_sentence = 'i ate three hazelnuts with others'.split()
hypothesis_sentence = 'i ate three hazelnuts with others'.split()

# Both metrics take a list of tokenized references and one tokenized hypothesis.
bleu_score = sentence_bleu(references=[reference_sentence], hypothesis=hypothesis_sentence)
nist_score = sentence_nist(references=[reference_sentence], hypothesis=hypothesis_sentence)

print('bleu_score:', bleu_score)
print('nist_score:', nist_score)

bleu_score: 1.0
nist_score: 2.584962500721156


In [4]:
# Truncation: the hypothesis drops the final token ('others'), so it is
# shorter than the reference while every n-gram it contains still matches.
reference_sentence = 'i ate three hazelnuts with others'.split()
hypothesis_sentence = 'i ate three hazelnuts with'.split()

bleu_score = sentence_bleu(references=[reference_sentence], hypothesis=hypothesis_sentence)
nist_score = sentence_nist(references=[reference_sentence], hypothesis=hypothesis_sentence)

print('bleu_score:', bleu_score)
print('nist_score:', nist_score)

bleu_score: 0.8187307530779819
nist_score: 2.2469203412969425


In [5]:
# Last-word substitution: 'others' -> 'them' (close in meaning).
reference_sentence = 'i ate three hazelnuts with others'.split()
hypothesis_sentence = 'i ate three hazelnuts with them'.split()

bleu_score = sentence_bleu(references=[reference_sentence], hypothesis=hypothesis_sentence)
nist_score = sentence_nist(references=[reference_sentence], hypothesis=hypothesis_sentence)

print('bleu_score:', bleu_score)
print('nist_score:', nist_score)

bleu_score: 0.7598356856515925
nist_score: 2.15413541726763


In [6]:
# Last-word substitution: 'others' -> 'cat' (unrelated word, same position).
reference_sentence = 'i ate three hazelnuts with others'.split()
hypothesis_sentence = 'i ate three hazelnuts with cat'.split()

bleu_score = sentence_bleu(references=[reference_sentence], hypothesis=hypothesis_sentence)
nist_score = sentence_nist(references=[reference_sentence], hypothesis=hypothesis_sentence)

print('bleu_score:', bleu_score)
print('nist_score:', nist_score)

bleu_score: 0.7598356856515925
nist_score: 2.15413541726763


In [7]:
# Last-word substitution: 'others' -> 'me' (changes the meaning, same position).
reference_sentence = 'i ate three hazelnuts with others'.split()
hypothesis_sentence = 'i ate three hazelnuts with me'.split()

bleu_score = sentence_bleu(references=[reference_sentence], hypothesis=hypothesis_sentence)
nist_score = sentence_nist(references=[reference_sentence], hypothesis=hypothesis_sentence)

print('bleu_score:', bleu_score)
print('nist_score:', nist_score)

bleu_score: 0.7598356856515925
nist_score: 2.15413541726763


In [8]:
# Synonym swap near the start of the sentence: 'ate' -> 'consumed'.
reference_sentence = 'i ate three hazelnuts with others'.split()
hypothesis_sentence = 'i consumed three hazelnuts with others'.split()

# BLEU is penalized more than for a last-word mismatch: a mismatch inside the
# sentence breaks more of the overlapping n-grams.
bleu_score = sentence_bleu(references=[reference_sentence], hypothesis=hypothesis_sentence)
# NIST is below the exact-match score too, but no lower than for a last-word mismatch.
nist_score = sentence_nist(references=[reference_sentence], hypothesis=hypothesis_sentence)

print('bleu_score:', bleu_score)
print('nist_score:', nist_score)

bleu_score: 0.537284965911771
nist_score: 2.15413541726763


In [9]:
# Synonym swap in the middle of the sentence: 'hazelnuts' -> 'filberts'.
reference_sentence = 'i ate three hazelnuts with others'.split()
hypothesis_sentence = 'i ate three filberts with others'.split()

# BLEU is heavily penalized: every 4-gram of the hypothesis contains 'filberts',
# so none of them occurs in the reference, and no smoothing function is passed.
bleu_score = sentence_bleu(references=[reference_sentence], hypothesis=hypothesis_sentence)
nist_score = sentence_nist(references=[reference_sentence], hypothesis=hypothesis_sentence)

print('bleu_score:', bleu_score)
print('nist_score:', nist_score)

bleu_score: 7.262123179505913e-78
nist_score: 2.15413541726763


In [10]:
# Non-synonym swap in the middle of the sentence: 'hazelnuts' -> 'eggs'.
reference_sentence = 'i ate three hazelnuts with others'.split()
hypothesis_sentence = 'i ate three eggs with others'.split()

# BLEU is heavily penalized: no 4-gram of the hypothesis occurs in the reference
# and no smoothing function is passed.
bleu_score = sentence_bleu(references=[reference_sentence], hypothesis=hypothesis_sentence)
# NIST is not penalized any further than for a last-word mismatch; it is still
# below the exact-match score.
nist_score = sentence_nist(references=[reference_sentence], hypothesis=hypothesis_sentence)

print('bleu_score:', bleu_score)
print('nist_score:', nist_score)

bleu_score: 7.262123179505913e-78
nist_score: 2.15413541726763


## Others

In [11]:
# Exact match: the output equals the single reference token-for-token.
# Note that reference_sentence is already the list of references expected by sentence_bleu.
reference_sentence = ['i m not a morning person'.split()]
output_sentence = 'i m not a morning person'.split()

bleu_score = sentence_bleu(references=reference_sentence, hypothesis=output_sentence)
bleu_score

1.0

In [12]:
# Truncation: the output drops the final token ('person'); every remaining
# n-gram still matches, so only the shorter length differs from the reference.
reference_sentence = ['i m not a morning person'.split()]
output_sentence = 'i m not a morning'.split()

bleu_score = sentence_bleu(references=reference_sentence, hypothesis=output_sentence)
bleu_score

0.8187307530779819

In [13]:
# Deletion inside the sentence: 'morning' is missing, which both shortens the
# output and breaks the n-grams that spanned the removed word.
reference_sentence = ['i m not a morning person'.split()]
output_sentence = 'i m not a person'.split()

bleu_score = sentence_bleu(references=reference_sentence, hypothesis=output_sentence)
bleu_score

0.5789300674674098

In [14]:
# Deletion of 'not' (the meaning is inverted). Neither 4-gram of the output
# occurs in the reference and no smoothing function is passed.
reference_sentence = ['i m not a morning person'.split()]
output_sentence = 'i m a morning person'.split()

bleu_score = sentence_bleu(references=reference_sentence, hypothesis=output_sentence)
bleu_score

7.070696784820904e-78

In [15]:
# Exact match on a sentence where the final '.' is its own token.
reference_sentence = ['you re doing it right .'.split()]
output_sentence = 'you re doing it right .'.split()

bleu_score = sentence_bleu(references=reference_sentence, hypothesis=output_sentence)
bleu_score

1.0

In [16]:
# Single-word substitution near the end: 'church' -> 'inconvenience'.
reference_sentence = ['they are collecting contributions for the church .'.split()]
output_sentence = 'they are collecting contributions for the inconvenience .'.split()

bleu_score = sentence_bleu(references=reference_sentence, hypothesis=output_sentence)
bleu_score

0.7071067811865475

In [17]:
# Single-word substitution in the middle: 'of' -> 'about'.
reference_sentence = ['she is always complaining of her job .'.split()]
output_sentence = 'she is always complaining about her job .'.split()

bleu_score = sentence_bleu(references=reference_sentence, hypothesis=output_sentence)
bleu_score

0.5

---