## Imports

In [1]:
import sys
sys.path.append('..')

from torchmetrics import BLEUScore, WordErrorRate

import torch

from math import log, exp, e

import numpy as np
from src.utils import Metrics
from src import DataLoader

## Loading hypothesis and reference files

### Data Loaders

In [2]:
hyp1_file = '../data/data_v1/newstest.hyp1'
hyp2_file = '../data/data_v1/newstest.hyp2'
hyp3_file = '../data/data_v1/newstest.hyp3'
ref_file = '../data/data_v1/newstest.en'

In [3]:
hyp1_data_loader = DataLoader(hyp1_file)
hyp2_data_loader = DataLoader(hyp2_file)
hyp3_data_loader = DataLoader(hyp3_file)
ref_data_loader = DataLoader(ref_file)

In [4]:
hyp1_tokenized = hyp1_data_loader.tokenize(mode="lines_words")
hyp2_tokenized = hyp2_data_loader.tokenize(mode="lines_words")
hyp3_tokenized = hyp3_data_loader.tokenize(mode="lines_words")
ref_tokenized = ref_data_loader.tokenize(mode="lines_words")

hyp1_sentences = hyp1_data_loader.tokenize(mode="sentences")
hyp2_sentences = hyp2_data_loader.tokenize(mode="sentences")
hyp3_sentences = hyp3_data_loader.tokenize(mode="sentences")
ref_sentences = ref_data_loader.tokenize(mode="sentences")

In [5]:
metrics = Metrics()

## WER

### Hypothesis 1

In [6]:
# our implementation
metrics.WER(hyp1_sentences, ref_sentences)

0.3644912405408207

In [7]:
# reference implementation
WER1 = WordErrorRate()
#WER1(hyp1_data_loader.load_data(), [ref_data_loader.load_data()])
WER1.update(preds=hyp1_sentences, target=ref_sentences)
WER1.compute()

tensor(0.3645)

### Hypothesis 2

In [8]:
# our implementation
metrics.WER(hyp2_sentences, ref_sentences)

0.37188087727871816

In [9]:
# reference implementation
WER2 = WordErrorRate()
WER2.update(preds=hyp2_sentences, target=ref_sentences)
WER2.compute()

tensor(0.3719)

### Hypothesis 3

In [10]:
# our implementation
metrics.WER(hyp3_sentences, ref_sentences)

0.6282968294163815

In [11]:
# reference implementation
WER3 = WordErrorRate()
WER3.update(preds=hyp3_sentences, target=ref_sentences)
WER3.compute()

tensor(0.6283)

## PER

### Hypothesis 1

In [12]:
metrics.PER(hyp1_sentences, ref_sentences)

0.25061086676440536

### Hypothesis 2

In [13]:
metrics.PER(hyp2_sentences, ref_sentences)

0.2557643609222977

### Hypothesis 3

In [14]:
metrics.PER(hyp3_sentences, ref_sentences)

0.4657988656389296

## BLEU

In [15]:
bscore = BLEUScore(4)

### Hypothesis 1

In [16]:
# our implementation
metrics.bleu_score(4,hyp1_sentences, ref_sentences)

0.4850221157121662

In [17]:
# reference implementation
bscore([hyp1_data_loader.load_data()], [[ref_data_loader.load_data()]])

tensor(0.5428)

### Hypothesis 2

In [18]:
# our implementation
metrics.bleu_score(4, hyp2_sentences,ref_sentences)

0.47679649673421626

In [19]:
# reference implementation
bscore([hyp2_data_loader.load_data()], [[ref_data_loader.load_data()]])

tensor(0.5361)

### Hypothesis 3

In [20]:
# our implementation
metrics.bleu_score(4, hyp3_sentences, ref_sentences)

0.18564978528059253

In [21]:
# reference implementation
bscore([hyp3_data_loader.load_data()],[[ref_data_loader.load_data()]])

tensor(0.2517)