In [1]:
# pip install rouge-metric

In [2]:
### I downloaded some different rogue code from the official implementation and it seems to be working
### https://github.com/andersjo/pyrouge/tree/master/tools/ROUGE-1.5.5
from rouge_metric import PyRouge

### Load summary results
# hypotheses = [
#     'how are you\ni am fine',  # document 1: hypothesis
#     'it is fine today\nwe won the football game',  # document 2: hypothesis
# ]
# references = [[
#     'how do you do\nfine thanks',  # document 1: reference 1
#     'how old are you\ni am three',  # document 1: reference 2
# ], [
#     'it is sunny today\nlet us go for a walk',  # document 2: reference 1
#     'it is a terrible day\nwe lost the game',  # document 2: reference 2
# ]]

hypotheses = [" Tokyo is the one of the biggest city in the world."]
references = [["The capital of Japan, Tokyo, is the center of Japanese economy."]]


# Evaluate document-wise ROUGE scores
rouge = PyRouge(rouge_n=(1, 2, 4), rouge_l=True, rouge_w=True,
                rouge_w_weight=1.2, rouge_s=True, rouge_su=True, skip_gap=4)
scores = rouge.evaluate(hypotheses, references)
scores

{'rouge-1': {'r': 0.2727272727272727,
  'p': 0.2727272727272727,
  'f': 0.2727272727272727},
 'rouge-2': {'r': 0.1, 'p': 0.1, 'f': 0.10000000000000002},
 'rouge-4': {'r': 0.0, 'p': 0.0, 'f': 0.0},
 'rouge-l': {'r': 0.2727272727272727,
  'p': 0.2727272727272727,
  'f': 0.2727272727272727},
 'rouge-w-1.2': {'r': 0.15210311608148122,
  'p': 0.24570650158950905,
  'f': 0.18789251377101182},
 'rouge-s4': {'r': 0.1, 'p': 0.1, 'f': 0.10000000000000002},
 'rouge-su4': {'r': 0.14, 'p': 0.14, 'f': 0.14}}

In [3]:
from rouge import Rouge

def calculate_rouge_scores(hypothesis, references, n=1):
    rouge = Rouge()
    scores = rouge.get_scores(hypothesis, references, avg=True)
    return scores

def print_rouge_scores(scores):
    print("ROUGE-1 (R-1): ", scores['rouge-1']['f'])
    print("ROUGE-2 (R-2): ", scores['rouge-2']['f'])
    print("ROUGE-L (R-L): ", scores['rouge-l']['f'])

# Read the contents of the files
reference_file = "D:\\Desktop\\master_scientific_computing\\second_semester\\problems with data\\Problems-With-Data--Summer-2024\\exercise-2\\references.txt"
with open(reference_file, "r", encoding="utf-8") as f:
    reference_texts = f.readlines()

schumann_summary_file = "D:\\Desktop\\master_scientific_computing\\second_semester\\problems with data\\Problems-With-Data--Summer-2024\\exercise-2\\task4\\summaries_schumannetal_8.txt"
with open(schumann_summary_file, "r", encoding="utf-8") as f:
    summary_texts = f.readlines()

# Calculate ROUGE scores
scores = calculate_rouge_scores(summary_texts, reference_texts)
print_rouge_scores(scores)



ROUGE-1 (R-1):  0.23873000499352504
ROUGE-2 (R-2):  0.08073371898649771
ROUGE-L (R-L):  0.22251439787273927


In [4]:
summary_texts[0]

'nec corp. and UNK said to join forces\n'

Let's try a few things first. Let's compare Schumann's algorithm with text using NLI and get a small subset of faithful/unfaithful examples. We considered first 100 examples

In [5]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from tqdm import tqdm

In [6]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from tqdm import tqdm

# Load the NLI model
def load_model():
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    model_name = "MoritzLaurer/DeBERTa-v3-large-mnli-fever-anli-ling-wanli"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)
    return tokenizer, model, device

# Classify NLI
def classify_nli(tokenizer, model, device, premise, hypothesis):
    inputs = tokenizer(premise, hypothesis, truncation=True, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    prediction = torch.softmax(outputs.logits, dim=-1)[0]
    entailment_score = prediction[0]
    contradiction_score = prediction[2]
    if entailment_score > contradiction_score:
        predicted_label = "entailment"
    else:
        predicted_label = "non-entailment"
    return predicted_label

# Add numeric IDs to each line of the input files
def add_numeric_ids(input_file, output_file):
    with open(input_file, "r") as infile, open(output_file, "w") as outfile:
        lines = infile.readlines()
        for idx, line in enumerate(lines, start=1):
            outfile.write(f"{idx}\t{line.strip()}\n")

In [16]:
# Process the generated summaries and classify them
def classify_summaries(input_file, summaries_file, entailed_file, non_entailed_file):
    tokenizer, model, device = load_model()
    
    entailed_ids = []
    non_entailed_ids = []
    
    with open(input_file, "r") as infile, \
         open(summaries_file, "r") as sumfile, \
         open(entailed_file, "w") as entailed_outfile, \
         open(non_entailed_file, "w") as non_entailed_outfile:
        
        input_lines = infile.readlines()
        summary_lines = sumfile.readlines()
        
        for input_line, summary_line in tqdm(zip(input_lines, summary_lines), desc="Classifying summaries"):
            input_id, input_text = input_line.strip().split("\t", 1)
            summary_id, summary_text = summary_line.strip().split("\t", 1)
            
            if input_id != summary_id:
                print(f"IDs do not match for input ID {input_id} and summary ID {summary_id}")
                continue
            
            predicted_label = classify_nli(tokenizer, model, device, input_text, summary_text)
            
            if predicted_label == "entailment":
                entailed_outfile.write(f"{summary_text}\n")  # Write the summary text
                entailed_ids.append(input_id)
            else:
                non_entailed_outfile.write(f"{summary_text}\n")  # Write the summary text
                non_entailed_ids.append(input_id)
    
    return entailed_ids, non_entailed_ids


# Main function
if __name__ == "__main__":

    input_file = "input_with_id.txt"
    summaries_file = "summaries_schumannetal_8_with_id.txt"
    
    # Step 2: Classify summaries
    entailed_file = "entailed_summaries.txt"
    non_entailed_file = "non_entailed_summaries.txt"
    
    entailed_ids, non_entailed_ids = classify_summaries(input_file, summaries_file, entailed_file, non_entailed_file)
    
    # Step 3: Create reference files
    with open("references_E.txt", "w") as ref_e_file, open("references_with_id.txt", "r") as ref_file:
        ref_lines = ref_file.readlines()
        for id in entailed_ids:
            for line in ref_lines:
                ref_id, ref_statement = line.strip().split("\t", 1)
                if id == ref_id:
                    ref_e_file.write(f"{ref_statement}\n")
                    break
    
    with open("references_NE.txt", "w") as ref_ne_file, open("references_with_id.txt", "r") as ref_file:
        ref_lines = ref_file.readlines()
        for id in non_entailed_ids:
            for line in ref_lines:
                ref_id, ref_statement = line.strip().split("\t", 1)
                if id == ref_id:
                    ref_ne_file.write(f"{ref_statement}\n")
                    break
    
    print("Classification completed.")


Classifying summaries: 1951it [08:08,  3.99it/s]


Classification completed.


Now let's compare the rouge score using faithful examples (labeled faithful by the NLI model) we found in the code-cell above:

In [17]:
# Read the contents of the files
reference_file = "references_E.txt"
with open(reference_file, "r", encoding="utf-8") as f:
    reference_texts = f.readlines()

schumann_summary_file = "entailed_summaries.txt"
with open(schumann_summary_file, "r", encoding="utf-8") as f:
    summary_texts = f.readlines()

# Calculate ROUGE scores
scores = calculate_rouge_scores(summary_texts, reference_texts)
print_rouge_scores(scores)

ROUGE-1 (R-1):  0.2487153056897773
ROUGE-2 (R-2):  0.0877460622278349
ROUGE-L (R-L):  0.23179858449498694


Now let's evaluate the rogue score using the unfaithful examples (labeled so by the NLI model).

In [18]:
# Read the contents of the files
reference_file = "references_NE.txt"
with open(reference_file, "r", encoding="utf-8") as f:
    reference_texts = f.readlines()

schumann_summary_file = "non_entailed_summaries.txt"
with open(schumann_summary_file, "r", encoding="utf-8") as f:
    summary_texts = f.readlines()

# Calculate ROUGE scores
scores = calculate_rouge_scores(summary_texts, reference_texts)
print_rouge_scores(scores)

ROUGE-1 (R-1):  0.21202731763255223
ROUGE-2 (R-2):  0.06198131333169726
ROUGE-L (R-L):  0.19768662950439753


We found that the rouge scores increases when we only consider the subset of faithful examples.