<a href="https://colab.research.google.com/github/mobarakol/tutorial_notebooks/blob/main/NLP_Evaluation_Metrics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Using nltk library

In [None]:
!pip -q install nltk==3.5

In [8]:
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.chrf_score import chrf_precision_recall_fscore_support
from nltk.translate.meteor_score import single_meteor_score
import warnings
warnings.filterwarnings("ignore", category=UserWarning)


label = 'this is a small test'.split()
prediction    = 'this is test'.split()
BLEU_1 = sentence_bleu([label], prediction, weights=(1, 0, 0, 0))
BLEU_2 = sentence_bleu([label], prediction, weights=(1, 1, 0, 0))
BLEU_3 = sentence_bleu([label], prediction, weights=(1, 1, 1, 0))
BLEU_4 = sentence_bleu([label], prediction, weights=(1, 1, 1, 1))
print('BLEU_1:%.4f, BLEU_2:%.4f, BLEU_3:%.4f, BLEU_4:%.4f'%(BLEU_1, BLEU_2, BLEU_3, BLEU_4))

label = 'this is a small test'.split()
prediction    = 'this is test'.split()
prec, rec, f1, tp = chrf_precision_recall_fscore_support(label, prediction, n=1) # where n = n-gram
print('prec:%.4f, rec:%.4f, f1:%.4f, tp:%.4f'%(prec, rec, f1, tp))


label = 'this is a small test'
prediction    = 'this is test'
meteor = single_meteor_score(label, prediction)
print('Meteor:%.4f'%meteor)

BLEU_1:0.5134, BLEU_2:0.2567, BLEU_3:0.0000, BLEU_4:0.0000
prec:1.0000, rec:0.6000, f1:0.6250, tp:3.0000
Meteor:0.5324


# Using python coco captioning library

In [9]:
!pip -q install pycocoevalcap

[K     |████████████████████████████████| 104.3 MB 75 kB/s 
[?25h

In [148]:
from pycocoevalcap.bleu.bleu import Bleu
from pycocoevalcap.cider.cider import Cider
from pycocoevalcap.meteor.meteor import Meteor
from pycocoevalcap.rouge.rouge import Rouge
from pycocoevalcap.spice.spice import Spice

import numpy as np

labels = [['this is a small test'],['a large jetliner flying over a traffic filled street']]
predictions = [['this is test'],['plane is flying through the sky']]

labels = dict(zip(np.arange(len(labels)).astype(np.float), labels))
predictions = dict(zip(np.arange(len(predictions)).astype(np.float), predictions))

(bleu1_avg, bleu1_per_sentence) = Bleu(n=1).compute_score(labels, predictions) # n = n-gram
(bleu4_avg, bleu4_per_sentence) = Bleu(n=4).compute_score(labels, predictions)
(cider_avg, cider_per_sentence) = Cider().compute_score(labels, predictions)
(meteor_avg, meteor_per_sentence) = Meteor().compute_score(labels, predictions)
(rouge_avg, rouge_per_sentence) = Rouge().compute_score(labels, predictions)

(spice_avg, cider_per_sentence) = Spice().compute_score(labels, predictions)

print('BLEU_1:%.4f, CIDEr:%.4f, METEOR:%.4f, ROUGE:%.4f, SPICE:%.4f'
        %(bleu1_avg[0], cider_avg, meteor_avg, rouge_avg, spice_avg))



{'testlen': 9, 'reflen': 14, 'guess': [9], 'correct': [4]}
ratio: 0.6428571428112246
{'testlen': 9, 'reflen': 14, 'guess': [9, 7, 5, 3], 'correct': [4, 1, 0, 0]}
ratio: 0.6428571428112246
BLEU_1:0.2550, CIDEr:1.6123, METEOR:0.1570, ROUGE:0.4232, SPICE:0.3333
