In [12]:
import csv
import os

import pandas as pd
from datasets import load_dataset
from rouge_score import rouge_scorer
from summarizer import Summarizer

In [13]:
# Load the "allenai/multi_lexsum" dataset (configuration "v20230518") and
# materialise its "validation" split as a plain Python list so the lookup
# helpers can iterate over it repeatedly.
multi_lexsum = load_dataset("allenai/multi_lexsum", name="v20230518")
validation = list(multi_lexsum["validation"])

In [14]:
# Instantiate the summarizer once with its default settings; the same
# instance is reused for every case that is processed.
model = Summarizer()


In [15]:
# Load the previously generated ROUGE score table and take its "Case_ID"
# column as the list of cases to process.
generated_score = pd.read_csv("rogue_score_openai.csv")
case_list = generated_score["Case_ID"]

In [16]:
# Get the reference (long) summary for a case ID
def get_summary_long(target_id):
    """Look up the long reference summary of a case in ``validation``.

    Scans the module-level ``validation`` list in order and returns the
    ``'summary/long'`` value of the first entry whose ``'id'`` equals
    ``target_id``.

    Returns:
        The matching entry's ``'summary/long'`` value, or the placeholder
        string ``"No summary found for the given ID."`` when no entry matches
        (no exception is raised for an unknown ID).
    """
    # Lazy generator: stops at the first match, exactly like an early return.
    matching_summaries = (
        record['summary/long']
        for record in validation
        if record['id'] == target_id
    )
    return next(matching_summaries, "No summary found for the given ID.")

In [17]:
# Get the full source text for a case ID
def get_full_text(target_id):
    """Look up the source material of a case in ``validation``.

    Scans the module-level ``validation`` list in order and returns the
    ``'sources'`` value of the first entry whose ``'id'`` equals
    ``target_id``.

    Returns:
        The matching entry's ``'sources'`` value, or the placeholder string
        ``"No summary found for the given ID."`` when no entry matches
        (no exception is raised for an unknown ID).
    """
    # Lazy generator: stops at the first match, exactly like an early return.
    matching_sources = (
        record['sources']
        for record in validation
        if record['id'] == target_id
    )
    return next(matching_sources, "No summary found for the given ID.")

In [18]:
def process(iteration, csv_file_path='bert_summary_rogue.csv',
            max_chars=800000, num_sentences=50):
    """Summarise one case, score it with ROUGE and append the result to a CSV.

    Args:
        iteration: Key into the module-level ``case_list`` selecting the case
            ID to process.
        csv_file_path: Path of the CSV file the result row is appended to.
        max_chars: Source texts longer than this many characters are not
            summarised; a row of ``'Limit_Exceeded'`` placeholders is written
            for them instead.
        num_sentences: Number of sentences requested from the summarizer.

    Side effects:
        Appends exactly one row to ``csv_file_path``. A header row is written
        first when the file does not exist yet or is empty, so the output can
        be read back by column name.
    """
    fieldnames = [
        'Case_ID', 'refrence_summary', 'generated_summary',
        'ROUGE-1_Precision', 'ROUGE-1_Recall', 'ROUGE-1_F1',
        'ROUGE-2_Precision', 'ROUGE-2_Recall', 'ROUGE-2_F1',
        'ROUGE-L_Precision', 'ROUGE-L_Recall', 'ROUGE-L_F1',
    ]

    case_id = case_list[iteration]
    sources = get_full_text(case_id)
    if isinstance(sources, (list, tuple)):
        # str() on a list yields its Python repr (brackets, quotes, escaped
        # newlines), which would leak into the summary. Join the documents
        # instead. Assumes each element is one source document's text --
        # TODO confirm against the dataset card.
        full_text = "\n\n".join(str(document) for document in sources)
    else:
        full_text = str(sources)

    if len(full_text) > max_chars:
        # Text too long to summarise: flag every column except the case ID.
        data = dict.fromkeys(fieldnames, 'Limit_Exceeded')
        data['Case_ID'] = case_id
    else:
        refrence_summary = get_summary_long(case_id)
        # Generate summary using BERT
        generated_summary = model(full_text, num_sentences=num_sentences)

        scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
        # RougeScorer.score takes (target, prediction): the reference must come
        # first, otherwise precision and recall are swapped.
        score = scorer.score(refrence_summary, generated_summary)

        data = {
            'Case_ID': case_id,
            'refrence_summary': refrence_summary,
            'generated_summary': generated_summary,
        }
        for label, rouge_key in (('ROUGE-1', 'rouge1'),
                                 ('ROUGE-2', 'rouge2'),
                                 ('ROUGE-L', 'rougeL')):
            result = score[rouge_key]
            data[label + '_Precision'] = round(result.precision, 4)
            data[label + '_Recall'] = round(result.recall, 4)
            data[label + '_F1'] = round(result.fmeasure, 4)

    # Only a brand-new (or empty) file gets a header; an existing file is
    # appended to unchanged so earlier rows stay consistent.
    needs_header = (not os.path.isfile(csv_file_path)
                    or os.path.getsize(csv_file_path) == 0)

    # Explicit UTF-8 so non-ASCII characters in the summaries do not depend on
    # the platform's default encoding.
    with open(csv_file_path, 'a', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        if needs_header:
            writer.writeheader()
        writer.writerow(data)

In [9]:
# process(57)

In [19]:
# Spot-check: display the case ID stored under key 189 of case_list.
case_list[189]

'EE-CA-0352'

In [None]:
# Run the pipeline once for every entry in case_list, in order; the printed
# index serves as a simple progress marker.
for i in range(len(case_list)):
    print(i)
    process(i)
