In [None]:
# ML NLP Metrics Calculation
# This notebook contains four examples from the IU Chest X-ray dataset - https://openi.nlm.nih.gov/faq#collection

In [None]:
# Required packages - uncomment to install
# !python -m pip install datasets nltk sacrebleu rouge_score

In [None]:
import glob
from warnings import simplefilter

import nltk
import pandas as pd
from datasets import load_metric
from nltk.translate.bleu_score import corpus_bleu
from nltk.translate.meteor_score import meteor_score

# Install a global "ignore" filter: every warning raised after this point is
# suppressed for the rest of the session (presumably to keep the metric output
# below readable -- NOTE(review): this also hides deprecation warnings).
simplefilter("ignore")

## Test Cases

In [None]:
# Pair every reference report with its generated counterpart. Both listings are
# sorted so that ref_N lines up with gen_N by filename order.
ref_paths = sorted(glob.glob("./txt/ref_*.txt"))
gen_paths = sorted(glob.glob("./txt/gen_*.txt"))

# zip() stops at the shorter input, so a missing file would silently drop (or
# mis-align) test cases. Fail loudly instead.
if len(ref_paths) != len(gen_paths):
    raise ValueError(
        f"Found {len(ref_paths)} reference file(s) but {len(gen_paths)} "
        "generated file(s) in ./txt; each ref_*.txt needs a matching gen_*.txt"
    )

test_cases = list(zip(ref_paths, gen_paths))

In [None]:
def _read_report(path):
    """Read a report file and return its text, lower-cased.

    The lines are joined with "." exactly as read (line endings are kept).
    Reference and generated reports both go through this helper so that the
    case-sensitive metrics below compare identically normalised text.
    """
    with open(path, "r", encoding="utf-8") as f:
        return ".".join(f.readlines()).lower()


# n-gram weight vectors for nltk's corpus_bleu.
# "Individual" scores look at a single n-gram order; "cumulative" scores are
# the geometric mean over orders 1..n, so each vector must sum to 1.
_INDIVIDUAL_WEIGHTS = ((1, 0, 0, 0), (0, 1, 0, 0), (0, 0, 1, 0), (0, 0, 0, 1))
_CUMULATIVE_WEIGHTS = (
    (1, 0, 0, 0),
    (0.5, 0.5, 0, 0),
    (1 / 3, 1 / 3, 1 / 3, 0),
    (0.25, 0.25, 0.25, 0.25),
)

# Load each Hugging Face metric once; loading is independent of the test case.
# NOTE(review): `datasets.load_metric` is deprecated in recent `datasets`
# releases in favour of the separate `evaluate` package -- confirm the pinned
# version before upgrading.
bleu = load_metric("bleu")
sacrebleu = load_metric("sacrebleu")
rouge = load_metric("rouge")
meteor = load_metric("meteor")

for n, (ref, gen) in enumerate(test_cases):
    print(f"Test Case {n+1}")
    print(f"(Ref: {ref} + Gen: {gen})")
    print()

    ref_case = _read_report(ref)
    ref_case_tokens = nltk.word_tokenize(ref_case)

    print("Radiologist report:")
    print(ref_case)
    print()

    gen_case = _read_report(gen)
    gen_case_tokens = nltk.word_tokenize(gen_case)

    print("Generated report:")
    print(gen_case)
    print("---------\n")

    # corpus_bleu takes one list of references per hypothesis; here there is a
    # single hypothesis with a single reference.
    bleu_references = [[ref_case_tokens]]
    bleu_hypotheses = [gen_case_tokens]

    print(f"Raw BLEU: {round(corpus_bleu(bleu_references, bleu_hypotheses), 4)}")
    print()

    for order, weights in enumerate(_INDIVIDUAL_WEIGHTS, start=1):
        score = corpus_bleu(bleu_references, bleu_hypotheses, weights=weights)
        print("Individual %d-gram: %f" % (order, score))
    print()

    print("Adjusted weighting")
    for order, weights in enumerate(_CUMULATIVE_WEIGHTS, start=1):
        score = corpus_bleu(bleu_references, bleu_hypotheses, weights=weights)
        print("Cumulative %d-gram: %f" % (order, score))
    print("---------\n")

    # BLEU: predictions are token lists, references are lists of token lists.
    dataset_bleu = bleu.compute(
        predictions=[gen_case_tokens], references=[[ref_case_tokens]]
    )

    print(dataset_bleu)
    print()

    # SacreBLEU operates on raw text, not tokens: predictions are strings,
    # references are lists of strings (one list per prediction).
    dataset_sacrebleu = sacrebleu.compute(
        predictions=[gen_case], references=[[ref_case]]
    )

    print(dataset_sacrebleu)
    print()

    # ROUGE: one raw-text prediction paired with one raw-text reference.
    dataset_rouge = rouge.compute(predictions=[gen_case], references=[ref_case])

    print(dataset_rouge)
    print()

    print(
        f"nltk - Meteor: {round(meteor_score([ref_case_tokens], gen_case_tokens), 4)}"
    )
    print()

    # METEOR (Hugging Face wrapper): same flat string-per-example format as ROUGE.
    dataset_meteor = meteor.compute(predictions=[gen_case], references=[ref_case])

    print(dataset_meteor)
    print("======\n")

In [None]:
# end of script