<a href="https://colab.research.google.com/github/ndeschmann/COI-sum/blob/main/Automated_Summarization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Install libraries
%pip install transformers
%pip install datasets
%pip install torch
%pip install pandas
%pip install scikit-learn
%pip install rouge
%pip install beautifulsoup4
%pip install sentencepiece
%pip install accelerate -U

In [None]:
#Import libraries
import transformers
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
import json
from bs4 import BeautifulSoup
import re
import sentencepiece

In [None]:
# Mount Google Drive
# Colab-only step: exposes the user's Drive under /content/drive, which is the
# prefix of the /content/drive/MyDrive/ACCORD_Summarisation/... paths used below.
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


# Cleaning

In [None]:
def decode_json(s, _w=json.decoder.WHITESPACE.match):
    """Yield every top-level JSON value found in the string ``s``.

    The input may hold several JSON documents back to back (for example one
    object per line). Decoding is best-effort: text that cannot be parsed is
    skipped and decoding resumes at the next ``{``.

    Args:
        s: Text containing zero or more concatenated JSON values.
        _w: Whitespace matcher called as ``_w(s, pos)``; it must return a match
            object whose ``end()`` is the index of the first non-whitespace
            character at or after ``pos``.

    Yields:
        Each decoded value (dict, list, ...) in order of appearance.
    """
    decoder = json.JSONDecoder()
    pos = 0
    end = len(s)
    while True:
        # raw_decode() does not skip leading whitespace itself, so the
        # separators between documents have to be consumed explicitly.
        pos = _w(s, pos).end()
        if pos >= end:
            break
        try:
            obj, pos = decoder.raw_decode(s, pos)
        except json.JSONDecodeError:
            # Search from pos + 1 so a malformed value that itself starts with
            # '{' cannot trap the loop, and resume AT the brace (not after it)
            # so the next object is decoded as a whole.
            brace_pos = s.find('{', pos + 1)
            if brace_pos == -1:
                break
            pos = brace_pos
            continue
        yield obj

def extract_info(entry):
    """Reduce one decoded record to its cleaned body text and English description.

    Args:
        entry: A decoded JSON object. The keys ``content`` (HTML) and
            ``description_en`` are read; both are optional.

    Returns:
        A dict with ``content_text`` (HTML stripped, backslash sequences
        removed, whitespace collapsed) and ``description_en`` (unchanged), or
        ``None`` when ``entry`` is not a dict so callers can skip it.
    """
    # Best-effort decoding can hand over strings or numbers; those have no
    # .get() and previously raised AttributeError.
    if not isinstance(entry, dict):
        print(f"Skipping entry of unexpected type: {type(entry).__name__}")
        return None

    # `or ''` also covers an explicit JSON null, which BeautifulSoup rejects.
    content_html = entry.get('content') or ''
    description_en = entry.get('description_en', '')

    # Removing HTML tags and formatting from the content
    content_text = BeautifulSoup(content_html, 'html.parser').get_text()

    # Removing backslash tags (a backslash plus the following non-space run)
    content_text = re.sub(r'\\[^\s]+', '', content_text)
    # Collapsing every whitespace run into a single space
    content_text = re.sub(r'\s+', ' ', content_text).strip()

    return {
        "content_text": content_text,
        "description_en": description_en
    }

# Read the raw export, clean every record and write the result back to Drive.
extracted_data = []

with open('/content/drive/MyDrive/ACCORD_Summarisation/docv_10k_post202208.json', 'r', encoding='utf-8') as json_file:
    for decoded in decode_json(json_file.read()):
        # Empty containers / null values carry no record
        if not decoded:
            continue

        # Treat a single object like a one-element list so both shapes
        # share the same processing path
        batch = decoded if isinstance(decoded, list) else [decoded]
        for record in batch:
            cleaned = extract_info(record)
            if cleaned:
                extracted_data.append(cleaned)

with open('/content/drive/MyDrive/ACCORD_Summarisation/docv_10k_clean.json', 'w', encoding='utf-8') as output_file:
    json.dump(extracted_data, output_file, ensure_ascii=False, indent=2)


# Preprocessing and Split

In [None]:
from datasets import load_dataset, DatasetDict

file_path = "/content/drive/MyDrive/ACCORD_Summarisation/docv_10k_clean.json"

# Fixed seed so that re-running this cell reproduces the same split.
SPLIT_SEED = 42

dataset = load_dataset('json', data_files=file_path, split='train')

def preprocess_function(examples):
    """Expose the cleaned fields under the column names used for training.

    Args:
        examples: One record with ``content_text`` and ``description_en``.

    Returns:
        A dict adding ``input_text`` (document body) and ``target_text``
        (reference summary).
    """
    return {
        'input_text': examples['content_text'],
        'target_text': examples['description_en']
    }

dataset = dataset.map(preprocess_function)

# 80 % train; the remaining 20 % is halved into validation and test.
# The intermediate results are NOT named `train_test_split`, because that
# would shadow the sklearn function imported at the top of the notebook.
train_holdout = dataset.train_test_split(test_size=0.2, seed=SPLIT_SEED)
val_test = train_holdout['test'].train_test_split(test_size=0.5, seed=SPLIT_SEED)

split_datasets = DatasetDict({
    'train': train_holdout['train'],
    'validation': val_test['train'],
    'test': val_test['test']
})

split_datasets.save_to_disk("/content/drive/MyDrive/ACCORD_Summarisation/split_datasets_EN_10k")

# First Model Training:
# **Fine-tuning BART CNN**

https://huggingface.co/facebook/bart-large-cnn

In [None]:
from datasets import load_from_disk
from transformers import BartTokenizer, BartForConditionalGeneration, TrainingArguments, Trainer, DataCollatorForSeq2Seq

split_datasets = load_from_disk("/content/drive/MyDrive/ACCORD_Summarisation/split_datasets_EN_10k")

model_name = 'facebook/bart-large-cnn'
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

# Tokenizing the datasets
def tokenize_function(examples):
    """Tokenize a batch of documents and their reference summaries.

    Inputs are truncated to 1024 tokens and targets to 128 tokens. Nothing is
    padded here: DataCollatorForSeq2Seq pads each batch at training time and
    fills label padding with -100, so padding positions are ignored by the
    loss. Padding the labels to max_length here would leave pad token ids in
    the labels and the model would be trained to predict padding.

    Args:
        examples: Batch with ``input_text`` and ``target_text`` lists.

    Returns:
        The tokenizer output for the inputs with an added ``labels`` entry.
    """
    model_inputs = tokenizer(examples['input_text'], max_length=1024, truncation=True)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples['target_text'], max_length=128, truncation=True)

    model_inputs['labels'] = labels['input_ids']
    return model_inputs

tokenized_datasets = split_datasets.map(tokenize_function, batched=True)

# Defining the training arguments
training_args = TrainingArguments(
    # Absolute path: the former './content/...' resolved relative to the working
    # directory, so checkpoints were not written to the mounted Drive folder.
    output_dir='/content/drive/MyDrive/ACCORD_Summarisation/bart-summarization-model_EN_10k',
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='/content/drive/MyDrive/ACCORD_Summarisation/bart-summarization-model_EN_10k/logs',
    evaluation_strategy="epoch",
    # 2 samples per step x 4 accumulation steps = effective batch of 8 per device
    gradient_accumulation_steps=4
)

# Initializing the Data Collator (dynamic padding; labels padded with -100)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Initializing the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    data_collator=data_collator
)

trainer.train()

trainer.save_model('/content/drive/MyDrive/ACCORD_Summarisation/bart-summarization-model_EN_10k')

# Evaluating the model on the test set
results = trainer.evaluate(tokenized_datasets['test'])
print(results)

In [None]:
# Saving the tokenizer
from datasets import load_from_disk
from transformers import BartTokenizer, BartForConditionalGeneration

model_name = 'facebook/bart-large-cnn'
tokenizer = BartTokenizer.from_pretrained(model_name)

# Only the tokenizer is needed here. Loading the base checkpoint into `model`
# as well would replace the fine-tuned model still held in that notebook
# variable after the training cell.
tokenizer.save_pretrained('/content/drive/MyDrive/ACCORD_Summarisation/tokenizer')


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

('/content/drive/MyDrive/ACCORD_Summarisation/tokenizer/tokenizer_config.json',
 '/content/drive/MyDrive/ACCORD_Summarisation/tokenizer/special_tokens_map.json',
 '/content/drive/MyDrive/ACCORD_Summarisation/tokenizer/vocab.json',
 '/content/drive/MyDrive/ACCORD_Summarisation/tokenizer/merges.txt',
 '/content/drive/MyDrive/ACCORD_Summarisation/tokenizer/added_tokens.json')

## Testing the Summary

In [None]:
from transformers import BartTokenizer, BartForConditionalGeneration
from transformers import pipeline
from datasets import load_from_disk

# Locations of the fine-tuned checkpoint and of the saved tokenizer files
tokenizer_path = '/content/drive/MyDrive/ACCORD_Summarisation/tokenizer'
model_path = '/content/drive/MyDrive/ACCORD_Summarisation/bart-summarization-model_EN_10k'

tokenizer = BartTokenizer.from_pretrained(tokenizer_path)
model = BartForConditionalGeneration.from_pretrained(model_path)
summarizer = pipeline(
    "summarization",
    model=model,
    tokenizer=tokenizer,
)

# First five records of the held-out test split serve as a quick preview
test_datasets = load_from_disk("/content/drive/MyDrive/ACCORD_Summarisation/split_datasets_EN_10k")
test_samples = test_datasets['test'].select(range(5))

def extract_text(sample, field_name):
    """Return the value stored under ``field_name`` in ``sample``."""
    value = sample[field_name]
    return value

def decode_text(sample, max_sequence_length=1024):
    """Return ``(input_text, target_text)`` for one dataset record.

    The input is clipped to ``max_sequence_length`` items as measured by
    ``len()``; for a string that means characters, not model tokens.
    The target is returned unchanged.
    """
    source = extract_text(sample, 'input_text')
    reference = extract_text(sample, 'target_text')

    # Only over-long inputs are cut; shorter ones pass through untouched
    clipped = source[:max_sequence_length] if len(source) > max_sequence_length else source
    return clipped, reference

# Generation settings shared by every preview sample
generation_options = dict(max_length=50, min_length=30, length_penalty=2.0, num_beams=4)
divider = "\n" + "-"*50 + "\n"

for sample in test_samples:
    original_text, actual_summary = decode_text(sample)
    pipeline_output = summarizer(original_text, **generation_options)
    generated_summary = pipeline_output[0]['summary_text']

    # Show source, reference and model output for a side-by-side read
    print(f"Original Text: {original_text}")
    print(f"Actual Summary: {actual_summary}")
    print(f"Generated Summary: {generated_summary}")
    print(divider)


Original Text: Colombia’s Path to “Total Peace”. President Gustavo Petro cannot fall back on the FARC blueprint NO. 54 SEPTEMBER 2022 Introduction Colombia’s Path to “Total Peace” President Gustavo Petro cannot fall back on the FARC blueprint Günther Maihold With their joint announcement about the desire to resume peace talks, Colombia’s new president and the country’s second-largest guerrilla group, the ELN (Ejército de Liberación Nacional), have sent a clear political signal. The pacification of the ELN is to take place under the aegis of a “leftist” government and be accompanied by a com- prehensive and ambitious reform project. This is a renewed attempt to end the civil war following the conclusion of a peace agreement with the FARC rebels in 2016. How- ever, the agreement with the FARC can serve as a blueprint only to a limited extent, not just because of the different historical origins of the two guerrilla groups but also owing to the strongly decentralized internal structure of

## Testing on given links

In [None]:
# Load the trained BART model from the specified directory
# NOTE: `trained_model` is a notebook-level variable. The "Summarization from a
# link" cell calls trained_model.generate(...) without loading a model itself,
# so this cell has to be run first.
from transformers import BartForConditionalGeneration, BartTokenizer
trained_model = BartForConditionalGeneration.from_pretrained('/content/drive/MyDrive/ACCORD_Summarisation/bart-summarization-model_EN_10k')


In [None]:
#Summarization from a link
%pip install readability-lxml

In [None]:
#Summarization from a link
from readability import Document
import requests
from transformers import BartForConditionalGeneration, BartTokenizer

def fetch_article_content(url, timeout=30):
    """Download a web page and return the plain text of its main article.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before ``requests`` raises,
            so an unresponsive host cannot hang the notebook.

    Returns:
        The article text with HTML tags removed and whitespace collapsed, or
        the literal string ``"Error: Unable to fetch article."`` when the
        response status is not 200. Callers should check for that string
        before summarizing the result.
    """
    # Local import keeps this cell independent of the notebook's import cell.
    from bs4 import BeautifulSoup

    response = requests.get(url, timeout=timeout)
    print(response.status_code)
    if response.status_code != 200:
        return "Error: Unable to fetch article."

    # readability's summary() yields the article body as HTML; strip the
    # markup so the tokenizer receives plain text like the cleaned training data.
    article_html = Document(response.text).summary()
    article_text = BeautifulSoup(article_html, 'html.parser').get_text(separator=' ')
    return ' '.join(article_text.split())
tokenizer_path = '/content/drive/MyDrive/ACCORD_Summarisation/bart-summarization-model_EN/tokenizer'
tokenizer = BartTokenizer.from_pretrained(tokenizer_path)

article_url = "https://www.ecoi.net/de/dokument/2100746.html"
article_content = fetch_article_content(article_url)

# Encode at most 1024 tokens of the article
inputs = tokenizer(
    article_content,
    return_tensors="pt",
    max_length=1024,
    truncation=True,
)

# Beam search with the notebook-level `trained_model`
summary_ids = trained_model.generate(
    inputs['input_ids'],
    max_length=60,
    min_length=10,
    length_penalty=2.0,
    num_beams=4,
    early_stopping=True,
)
best_sequence = summary_ids[0]
summary = tokenizer.decode(best_sequence, skip_special_tokens=True)

print("Generated Summary:", summary)


200
Generated Summary: Yevgenia Kara-Murza says her husband's placement in solitary confinement is "torture" The 42-year-old was initially arrested in April 2022 after returning to


In [None]:
# Summarization from a file
from transformers import BartTokenizer, BartForConditionalGeneration

tokenizer_path = '/content/drive/MyDrive/ACCORD_Summarisation/bart-summarization-model_EN/tokenizer'
trained_model_path = '/content/drive/MyDrive/ACCORD_Summarisation/bart-summarization-model_EN'
tokenizer = BartTokenizer.from_pretrained(tokenizer_path)
trained_model = BartForConditionalGeneration.from_pretrained(trained_model_path)

# Read content from the provided text file
file_path = '/content/CORE_Pakistan-Afghanistan.txt'
with open(file_path, 'r', encoding='utf-8') as file:
    article_content = file.read()

# Encode at most 1024 tokens of the document
inputs = tokenizer(
    article_content,
    return_tensors="pt",
    max_length=1024,
    truncation=True,
)

summary_ids = trained_model.generate(
    inputs['input_ids'],
    max_length=60,
    min_length=10,
    length_penalty=2.0,
    num_beams=4,
    early_stopping=True,
)
best_sequence = summary_ids[0]
summary = tokenizer.decode(best_sequence, skip_special_tokens=True)

print("Generated Summary:", summary)

Generated Summary: Infographic on the return of asylum seekers and returnees to Afghanistan (as of 24 November 2023)


In [None]:
#summarization from PDF
%pip install PyMuPDF

In [None]:
#summarization from PDF
from transformers import BartTokenizer, BartForConditionalGeneration
import fitz

tokenizer_path = '/content/drive/MyDrive/ACCORD_Summarisation/bart-summarization-model_EN/tokenizer'
tokenizer = BartTokenizer.from_pretrained(tokenizer_path)
trained_model_path = '/content/drive/MyDrive/ACCORD_Summarisation/bart-summarization-model_EN_1k'
trained_model = BartForConditionalGeneration.from_pretrained(trained_model_path)

pdf_file_path = '/content/SyriaLearningPaper-June2022.pdf'

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        One string holding the text of all pages.
    """
    # The context manager closes the document (and its file handle) even if
    # text extraction fails; the original never closed it.
    with fitz.open(pdf_path) as doc:
        return "".join(page.get_text() for page in doc)

article_content = extract_text_from_pdf(pdf_file_path)

# Encode at most 1024 tokens of the extracted PDF text
inputs = tokenizer(
    article_content,
    return_tensors="pt",
    max_length=1024,
    truncation=True,
)

summary_ids = trained_model.generate(
    inputs['input_ids'],
    max_length=60,
    min_length=10,
    length_penalty=2.0,
    num_beams=4,
    early_stopping=True,
)
best_sequence = summary_ids[0]
summary = tokenizer.decode(best_sequence, skip_special_tokens=True)

print("Generated Summary:", summary)


Generated Summary: Northwestern Syria: Report on the health care situation


## Generate Summaries for Rouge Score

In [None]:
# Generate summaries from the test set
# `pipeline` is imported here as well; this cell previously relied on an
# earlier cell having imported it and failed with NameError on a fresh kernel.
from transformers import BartTokenizer, BartForConditionalGeneration, pipeline
from datasets import load_from_disk

model_path = '/content/drive/MyDrive/ACCORD_Summarisation/bart-summarization-model_EN_1k'
tokenizer_path = '/content/drive/MyDrive/ACCORD_Summarisation/tokenizer'

tokenizer = BartTokenizer.from_pretrained(tokenizer_path)
model = BartForConditionalGeneration.from_pretrained(model_path)

test_datasets = load_from_disk("/content/drive/MyDrive/ACCORD_Summarisation/split_datasets_EN_1k")

summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)

def extract_text(sample, field_name):
    """Fetch a single field from a dataset record."""
    field_value = sample[field_name]
    return field_value

def decode_text(sample, max_sequence_length=1024):
    """Split a record into ``(input_text, target_text)``.

    Inputs longer than ``max_sequence_length`` (measured with ``len()``, i.e.
    characters for a string) are cut to that length; the target is untouched.
    """
    document = extract_text(sample, 'input_text')
    reference = extract_text(sample, 'target_text')

    if len(document) <= max_sequence_length:
        return document, reference
    return document[:max_sequence_length], reference

output_file_path = '/content/drive/MyDrive/ACCORD_Summarisation/generated_summaries_1k.txt'
with open(output_file_path, 'w', encoding='utf-8') as output_file:
    for sample in test_datasets['test']:
        # Only the (length-limited) source text is needed for generation
        original_text = decode_text(sample)[0]
        generated_summary = summarizer(original_text, max_length=50, min_length=10, length_penalty=2.0, num_beams=4)[0]['summary_text']
        # The ROUGE cell reads this file back one summary per line, so a line
        # break inside a summary would shift every later summary against its
        # reference. Collapse all whitespace to single spaces.
        single_line_summary = ' '.join(generated_summary.split())
        output_file.write(f"{single_line_summary}\n")

print(f"Summaries have been written to: {output_file_path}")


Summaries have been written to: /content/drive/MyDrive/ACCORD_Summarisation/generated_summaries_1k.txt


In [None]:
# Baseline run with the stock facebook/bart-large-cnn checkpoint
# (German original comment: "BART-CNN API translations")
from transformers import pipeline, BartTokenizer, BartForConditionalGeneration
from datasets import load_from_disk

model_name = "facebook/bart-large-cnn"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)
summarizer = pipeline(
    "summarization",
    model=model,
    tokenizer=tokenizer,
)

test_datasets = load_from_disk("/content/drive/MyDrive/ACCORD_Summarisation/split_datasets_EN")

def extract_text(sample, field_name):
    """Return ``sample[field_name]``."""
    content = sample[field_name]
    return content

def decode_text(sample, max_sequence_length=1024):
    """Return the record's input and target text as a pair.

    The input is shortened to ``max_sequence_length`` when its ``len()``
    exceeds that limit (characters for strings); the target is returned as is.
    """
    body = extract_text(sample, 'input_text')
    summary_reference = extract_text(sample, 'target_text')

    too_long = len(body) > max_sequence_length
    return (body[:max_sequence_length] if too_long else body), summary_reference

output_file_path = '/content/drive/MyDrive/ACCORD_Summarisation/API_generated_summaries.txt'
with open(output_file_path, 'w', encoding='utf-8') as output_file:

    for sample in test_datasets['test']:
        # Only the (length-limited) source text is needed for generation
        original_text = decode_text(sample)[0]
        generated_summary = summarizer(original_text, max_length=50, min_length=10, length_penalty=2.0, num_beams=4)[0]['summary_text']
        # One summary per line: a line break inside a summary would misalign
        # the file with the references when it is read back line by line.
        single_line_summary = ' '.join(generated_summary.split())
        output_file.write(f"{single_line_summary}\n")

print(f"Summaries have been written to: {output_file_path}")



vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Summaries have been written to: /content/drive/MyDrive/ACCORD_Summarisation/API_generated_summaries.txt


## Compute Rouge Score

In [None]:
from rouge import Rouge
from datasets import load_from_disk
import json

def read_generated_summaries(txt_file):
    """Load generated summaries from a UTF-8 text file, one per line.

    Leading and trailing whitespace (including the line break) is stripped
    from every line; empty lines are kept as empty strings.
    """
    with open(txt_file, 'r', encoding='utf-8') as handle:
        return list(map(str.strip, handle))

def compute_rouge_scores(original_summaries, generated_summaries):
    """Score generated summaries against their references with ``rouge``.

    Returns whatever ``Rouge.get_scores`` produces for ``avg=True``.
    """
    # Argument order matters: generated texts first, references second
    return Rouge().get_scores(generated_summaries, original_summaries, avg=True)

if __name__ == "__main__":
    # References come from the saved test split, hypotheses from the text file
    # written by the generation cell
    test_dataset = load_from_disk("/content/drive/MyDrive/ACCORD_Summarisation/split_datasets_EN_1k/test")
    original_summaries = test_dataset["target_text"]
    generated_summaries = read_generated_summaries('/content/drive/MyDrive/ACCORD_Summarisation/generated_summaries_1k.txt')

    # Compute ROUGE scores
    rouge_scores = compute_rouge_scores(
        original_summaries,
        generated_summaries,
    )

    print("ROUGE Scores:")
    print(rouge_scores)


ROUGE Scores:
{'rouge-1': {'r': 0.5577604240189546, 'p': 0.5968643454498719, 'f': 0.5629759507293809}, 'rouge-2': {'r': 0.42006011755737765, 'p': 0.44845884964837845, 'f': 0.4230178199871783}, 'rouge-l': {'r': 0.5355822147166035, 'p': 0.5731556141227195, 'f': 0.5408601846384697}}


# Second Model Training:
# **Fine-tuning mT5 small**

https://huggingface.co/google/mt5-small

In [None]:
from transformers import MT5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments, DataCollatorForSeq2Seq
from datasets import Dataset, load_from_disk

split_datasets = load_from_disk("/content/drive/MyDrive/ACCORD_Summarisation/split_datasets_EN_10k")

model = MT5ForConditionalGeneration.from_pretrained('google/mt5-small')
tokenizer = T5Tokenizer.from_pretrained('google/mt5-small')

def tokenize_function(examples):
    """Tokenize a batch of documents and their reference summaries.

    Inputs are truncated to 1024 tokens and targets to 60 tokens. Nothing is
    padded here: DataCollatorForSeq2Seq pads each batch at training time and
    fills label padding with -100, so padding positions are ignored by the
    loss. Padding the labels to max_length here would leave pad token ids in
    the labels and the model would be trained to predict padding.

    Args:
        examples: Batch with ``input_text`` and ``target_text`` lists.

    Returns:
        The tokenizer output for the inputs with an added ``labels`` entry.
    """
    model_inputs = tokenizer(examples["input_text"], max_length=1024, truncation=True)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["target_text"], max_length=60, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = split_datasets.map(tokenize_function, batched=True)

training_args = TrainingArguments(
    output_dir='/content/drive/MyDrive/ACCORD_Summarisation/mt5_summarization_10k',
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='/content/drive/MyDrive/ACCORD_Summarisation/mt5_summarization_10k/logs',
    evaluation_strategy="epoch",
    # 2 samples per step x 4 accumulation steps = effective batch of 8 per device
    gradient_accumulation_steps=4,
    # Keep at most five checkpoints in output_dir
    save_total_limit=5
)

# Dynamic per-batch padding; label padding is filled with -100
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    data_collator=data_collator
)

trainer.train()

trainer.save_model('/content/drive/MyDrive/ACCORD_Summarisation/mt5_summarization_10k')

# Evaluating the model on the test set
results = trainer.evaluate(tokenized_datasets['test'])
print(results)


In [None]:
# Download the mT5 tokenizer and store it next to the fine-tuned model
from datasets import load_from_disk
from transformers import MT5ForConditionalGeneration, T5Tokenizer

model_name = 'google/mt5-small'
tokenizer = T5Tokenizer.from_pretrained(model_name)

# Only the tokenizer is needed here. Loading the base checkpoint into `model`
# as well would replace the fine-tuned model still held in that notebook
# variable after the training cell.
tokenizer.save_pretrained('/content/drive/MyDrive/ACCORD_Summarisation/mt5_summarization_10k/tokenizer')


In [None]:
# Testing the output
from datasets import load_from_disk
from transformers import MT5ForConditionalGeneration, T5Tokenizer
from transformers import pipeline

# Fine-tuned checkpoint and the saved tokenizer files
tokenizer_path = '/content/drive/MyDrive/ACCORD_Summarisation/mt5_summarization_10k/tokenizer'
model_path = '/content/drive/MyDrive/ACCORD_Summarisation/mt5_summarization_1k'

tokenizer = T5Tokenizer.from_pretrained(tokenizer_path)
model = MT5ForConditionalGeneration.from_pretrained(model_path)
summarizer = pipeline(
    "summarization",
    model=model,
    tokenizer=tokenizer,
)

# First five records of the held-out test split serve as a quick preview
test_datasets = load_from_disk("/content/drive/MyDrive/ACCORD_Summarisation/split_datasets_EN_1k")
test_samples = test_datasets['test'].select(range(5))

def extract_text(sample, field_name):
    """Read one named field out of a dataset record."""
    entry = sample[field_name]
    return entry

def decode_text(sample, max_sequence_length=1024):
    """Produce ``(input_text, target_text)`` from a record.

    An input whose ``len()`` exceeds ``max_sequence_length`` is sliced down to
    that many items (characters for a string); the target is left unchanged.
    """
    article = extract_text(sample, 'input_text')
    gold_summary = extract_text(sample, 'target_text')

    needs_cut = len(article) > max_sequence_length
    if needs_cut:
        article = article[:max_sequence_length]
    return article, gold_summary

# Generation settings shared by every preview sample
generation_options = dict(max_length=50, min_length=30, length_penalty=2.0, num_beams=4)
divider = "\n" + "-"*50 + "\n"

for sample in test_samples:
    original_text, actual_summary = decode_text(sample)
    pipeline_output = summarizer(original_text, **generation_options)
    generated_summary = pipeline_output[0]['summary_text']

    # Show source, reference and model output for a side-by-side read
    print(f"Original Text: {original_text}")
    print(f"Actual Summary: {actual_summary}")
    print(f"Generated Summary: {generated_summary}")
    print(divider)


Original Text: Peace R esearch Institute O slo (PR IO ) PO Box 9229 G rønland, N O -0134 O slo, N orw ay V isiting A ddress: H ausm anns gate 3 w w w .prio.org Facebook: PR IO .org Tw itter: PR IO U pdates ISBN : 978-82-343-0350-0 (print) 978-82-343-0351-7 (online) C over: President K iir and M achar discuss political issues. Photo: U N M ISS via Flickr / C C BY-N C -N D Eli Stamnes Norwegian Institute of International Affairs (NUPI) Cedric de Coning Norwegian Institute of International Affairs (NUPI) Peace R esearch Institute O slo (PR IO ) PO Box 9229 G rønland, N O -0134 O slo, N orw ay V isiting A ddress: H ausm anns gate 3 w w w .prio.org Facebook: PR IO .org Tw itter: PR IO U pdates FAIR CASE BRIEF 06 The Revitalised Agreement on the Resolution of the Conflict in the Republic of South Sudan (R-ARCSS) Peace Research Institute Oslo (PRIO) Hausmanns gate 3 PO Box 9229 Grønland NO-0134 Oslo, Norway Tel +47 22 54 77 00 www.prio.org The Peace Research Institute Oslo (PRIO) is a non- pr

## Generate Summaries for Rouge Score

In [None]:
# Generate summaries from the test set
# `pipeline` is imported here as well; this cell previously relied on an
# earlier cell having imported it and failed with NameError on a fresh kernel.
from transformers import MT5ForConditionalGeneration, T5Tokenizer, pipeline
from datasets import load_from_disk

model_name = "/content/drive/MyDrive/ACCORD_Summarisation/mt5_summarization_10k"
tokenizer_name = "/content/drive/MyDrive/ACCORD_Summarisation/mt5_summarization_10k/tokenizer"
tokenizer = T5Tokenizer.from_pretrained(tokenizer_name)
model = MT5ForConditionalGeneration.from_pretrained(model_name)

test_datasets = load_from_disk("/content/drive/MyDrive/ACCORD_Summarisation/split_datasets_EN_10k")

summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)

def extract_text(sample, field_name):
    """Return the ``field_name`` entry of ``sample``."""
    picked = sample[field_name]
    return picked

def decode_text(sample, max_sequence_length=1024):
    """Return the input/target pair of a record, with the input length-limited.

    The limit is applied with ``len()`` and slicing, so for strings it counts
    characters. The target text is passed through unchanged.
    """
    text_in = extract_text(sample, 'input_text')
    text_out = extract_text(sample, 'target_text')

    if not len(text_in) > max_sequence_length:
        return text_in, text_out
    return text_in[:max_sequence_length], text_out

output_file_path = '/content/drive/MyDrive/ACCORD_Summarisation/MT5_generated_summaries_10k.txt'
with open(output_file_path, 'w', encoding='utf-8') as output_file:

    for sample in test_datasets['test']:
        # Only the (length-limited) source text is needed for generation
        original_text = decode_text(sample)[0]

        generated_summary = summarizer(original_text, max_length=50, min_length=10, length_penalty=2.0, num_beams=4)[0]['summary_text']

        # The ROUGE cell reads this file back one summary per line, so a line
        # break inside a summary would shift every later summary against its
        # reference. Collapse all whitespace to single spaces.
        single_line_summary = ' '.join(generated_summary.split())
        output_file.write(f"{single_line_summary}\n")

print(f"Summaries have been written to: {output_file_path}")

## Compute Rouge Scores

In [None]:
from rouge import Rouge
from datasets import load_from_disk
import json

def read_generated_summaries(txt_file):
    """Return the lines of a UTF-8 text file as a list of stripped strings.

    Each line is expected to hold one generated summary; surrounding
    whitespace and the trailing line break are removed.
    """
    summaries = []
    with open(txt_file, 'r', encoding='utf-8') as source:
        for raw_line in source:
            summaries.append(raw_line.strip())
    return summaries

def compute_rouge_scores(original_summaries, generated_summaries):
    """Compare generated summaries with their references using ``rouge``.

    Returns the result of ``Rouge.get_scores`` called with ``avg=True``.
    """
    scorer = Rouge()
    # Generated texts are passed first, references second
    averaged = scorer.get_scores(generated_summaries, original_summaries, avg=True)
    return averaged

if __name__ == "__main__":
    # References come from the saved test split, hypotheses from the text file
    # written by the generation cell
    test_dataset = load_from_disk("/content/drive/MyDrive/ACCORD_Summarisation/split_datasets_EN_10k/test")
    original_summaries = test_dataset["target_text"]
    generated_summaries = read_generated_summaries('/content/drive/MyDrive/ACCORD_Summarisation/MT5_generated_summaries_10k.txt')

    rouge_scores = compute_rouge_scores(
        original_summaries,
        generated_summaries,
    )

    print("ROUGE Scores:")
    print(rouge_scores)


ROUGE Scores:
{'rouge-1': {'r': 0.11530526005268565, 'p': 0.13208413513133194, 'f': 0.11751626333883382}, 'rouge-2': {'r': 0.024128702079940473, 'p': 0.02333621444202798, 'f': 0.022826149352379324}, 'rouge-l': {'r': 0.10732885722845674, 'p': 0.12337510382447453, 'f': 0.10936835437763527}}
