In [None]:
import torch
from transformers import BartForConditionalGeneration, BartTokenizer, Trainer, TrainingArguments
from datasets import load_dataset, Dataset

import pandas as pd
import numpy as np
import warnings
from tqdm import tqdm
warnings.filterwarnings("ignore")

In [None]:
from rouge_score import rouge_scorer

# Run on the GPU when torch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# ROUGE-1 / ROUGE-2 / ROUGE-L scorer with stemming enabled.
scorer = rouge_scorer.RougeScorer(
    ['rouge1', 'rouge2', 'rougeL'],
    use_stemmer=True,
)

In [None]:
# Tokenizer for the 'facebook/bart-large' checkpoint; used by
# preprocess_function and by every evaluation loop in this notebook.
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')

# Preprocess the input and output text
def preprocess_function(examples):
    """Tokenize source texts and reference summaries into training features.

    Args:
        examples: Mapping with a 'text' entry and a 'summary' entry. Both are
            passed straight to the module-level ``tokenizer``, so each may be
            a single string or a list of strings.

    Returns:
        The tokenizer output for 'text' (truncated/padded to 1024 tokens) with
        an extra 'labels' entry holding the summary token ids
        (truncated/padded to 128 tokens). Padded label positions are set to
        -100 so the training loss ignores them instead of learning to
        predict padding.
    """
    model_inputs = tokenizer(examples['text'], max_length=1024, truncation=True, padding='max_length')
    targets = tokenizer(examples['summary'], max_length=128, truncation=True, padding='max_length')

    def _mask_padding(token_ids, attention_mask):
        # attention_mask is 0 exactly where the tokenizer inserted padding.
        return [tok if keep else -100 for tok, keep in zip(token_ids, attention_mask)]

    label_ids = targets['input_ids']
    label_mask = targets['attention_mask']

    # Batched input yields a list of id lists; a single example yields a flat list.
    if label_ids and isinstance(label_ids[0], (list, tuple)):
        labels = [_mask_padding(ids, mask) for ids, mask in zip(label_ids, label_mask)]
    else:
        labels = _mask_padding(label_ids, label_mask)

    # Set labels for training
    model_inputs['labels'] = labels
    return model_inputs

### BART Baseline Evaluation

In [None]:
# Load the pretrained 'facebook/bart-large' weights, then move the model to
# the device selected earlier in the notebook.
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large')
model = model.to(device)

In [None]:
# ROUGE evaluation of the pretrained BART model on the entertainment test set.
# `tokenizer`, `model`, `device` and `scorer` come from the setup cells above;
# the scorer is not rebuilt here because its configuration never changes.
test_df = pd.read_csv('./entertainment_test.csv')

rouge_scores = []

for idx, row in tqdm(test_df.iterrows(), total=test_df.shape[0]):
    inputs = tokenizer(row['text'], return_tensors="pt", truncation=True, max_length=1024).to(device)
    # Generate summary. The attention mask is passed explicitly so generate()
    # does not have to infer it from the input ids.
    summary_ids = model.generate(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_length=150,
        min_length=40,
        length_penalty=2.0,
        num_beams=4,
    )
    predicted_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    # Calculate ROUGE scores (reference first, prediction second)
    score = scorer.score(row['summary'], predicted_summary)
    rouge_scores.append(score)

# Fail with a clear message instead of an IndexError when the CSV has no rows.
if not rouge_scores:
    raise ValueError("No rows found in './entertainment_test.csv'; nothing to score.")

# Mean F-measure per ROUGE metric. Stored under a domain-specific name so it
# does not collide with the architecture results.
average_scores_entertainment = {
    metric: sum(score[metric].fmeasure for score in rouge_scores) / len(rouge_scores)
    for metric in rouge_scores[0]
}
average_scores_entertainment

100%|██████████| 6668/6668 [1:31:04<00:00,  1.22it/s]


{'rouge1': 0.25456955875129933,
 'rouge2': 0.16404162263024455,
 'rougeL': 0.2139045876625417}

In [None]:
# ROUGE evaluation of the pretrained BART model on the architecture test set.
# `tokenizer`, `model`, `device` and `scorer` come from the setup cells above;
# the scorer is not rebuilt here because its configuration never changes.
test_df = pd.read_csv('./architecture_test.csv')

rouge_scores = []

for idx, row in tqdm(test_df.iterrows(), total=test_df.shape[0]):
    inputs = tokenizer(row['text'], return_tensors="pt", truncation=True, max_length=1024).to(device)
    # Generate summary. The attention mask is passed explicitly so generate()
    # does not have to infer it from the input ids.
    summary_ids = model.generate(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_length=150,
        min_length=40,
        length_penalty=2.0,
        num_beams=4,
    )
    predicted_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    # Calculate ROUGE scores (reference first, prediction second)
    score = scorer.score(row['summary'], predicted_summary)
    rouge_scores.append(score)

# Fail with a clear message instead of an IndexError when the CSV has no rows.
if not rouge_scores:
    raise ValueError("No rows found in './architecture_test.csv'; nothing to score.")

# Mean F-measure per ROUGE metric across the whole test set.
average_scores_arch = {
    metric: sum(score[metric].fmeasure for score in rouge_scores) / len(rouge_scores)
    for metric in rouge_scores[0]
}
average_scores_arch

100%|██████████| 419/419 [05:42<00:00,  1.22it/s]


{'rouge1': 0.24576866517263188,
 'rouge2': 0.1547942113323618,
 'rougeL': 0.2059425862827461}

In [None]:
# ROUGE evaluation of the pretrained BART model on the food test set.
# `tokenizer`, `model`, `device` and `scorer` come from the setup cells above;
# the scorer is not rebuilt here because its configuration never changes.
test_df = pd.read_csv('./food_test.csv')

rouge_scores = []

for idx, row in tqdm(test_df.iterrows(), total=test_df.shape[0]):
    inputs = tokenizer(row['text'], return_tensors="pt", truncation=True, max_length=1024).to(device)
    # Generate summary. The attention mask is passed explicitly so generate()
    # does not have to infer it from the input ids.
    summary_ids = model.generate(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_length=150,
        min_length=40,
        length_penalty=2.0,
        num_beams=4,
    )
    predicted_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    # Calculate ROUGE scores (reference first, prediction second)
    score = scorer.score(row['summary'], predicted_summary)
    rouge_scores.append(score)

# Fail with a clear message instead of an IndexError when the CSV has no rows.
if not rouge_scores:
    raise ValueError("No rows found in './food_test.csv'; nothing to score.")

# Mean F-measure per ROUGE metric. Stored under a domain-specific name so it
# does not overwrite the architecture results.
average_scores_food = {
    metric: sum(score[metric].fmeasure for score in rouge_scores) / len(rouge_scores)
    for metric in rouge_scores[0]
}
average_scores_food

100%|██████████| 1482/1482 [20:15<00:00,  1.22it/s]


{'rouge1': 0.2333310144247373,
 'rouge2': 0.14703136749240425,
 'rougeL': 0.1979435435894947}

In [None]:
# ROUGE evaluation of the pretrained BART model on the sports test set.
# `tokenizer`, `model`, `device` and `scorer` come from the setup cells above;
# the scorer is not rebuilt here because its configuration never changes.
test_df = pd.read_csv('./sports_test.csv')

rouge_scores = []

for idx, row in tqdm(test_df.iterrows(), total=test_df.shape[0]):
    inputs = tokenizer(row['text'], return_tensors="pt", truncation=True, max_length=1024).to(device)
    # Generate summary. The attention mask is passed explicitly so generate()
    # does not have to infer it from the input ids.
    summary_ids = model.generate(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_length=150,
        min_length=40,
        length_penalty=2.0,
        num_beams=4,
    )
    predicted_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    # Calculate ROUGE scores (reference first, prediction second)
    score = scorer.score(row['summary'], predicted_summary)
    rouge_scores.append(score)

# Fail with a clear message instead of an IndexError when the CSV has no rows.
if not rouge_scores:
    raise ValueError("No rows found in './sports_test.csv'; nothing to score.")

# Mean F-measure per ROUGE metric. Stored under a domain-specific name so it
# does not overwrite the architecture results.
average_scores_sports = {
    metric: sum(score[metric].fmeasure for score in rouge_scores) / len(rouge_scores)
    for metric in rouge_scores[0]
}
average_scores_sports

100%|██████████| 6163/6163 [1:24:47<00:00,  1.21it/s]


{'rouge1': 0.23407144187032236,
 'rouge2': 0.14129025812317933,
 'rougeL': 0.19308868959917938}

In [None]:
# ROUGE evaluation of the pretrained BART model on the technology test set.
# `tokenizer`, `model`, `device` and `scorer` come from the setup cells above;
# the scorer is not rebuilt here because its configuration never changes.
test_df = pd.read_csv('./technology_test.csv')

rouge_scores = []

for idx, row in tqdm(test_df.iterrows(), total=test_df.shape[0]):
    inputs = tokenizer(row['text'], return_tensors="pt", truncation=True, max_length=1024).to(device)
    # Generate summary. The attention mask is passed explicitly so generate()
    # does not have to infer it from the input ids.
    summary_ids = model.generate(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_length=150,
        min_length=40,
        length_penalty=2.0,
        num_beams=4,
    )
    predicted_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    # Calculate ROUGE scores (reference first, prediction second)
    score = scorer.score(row['summary'], predicted_summary)
    rouge_scores.append(score)

# Fail with a clear message instead of an IndexError when the CSV has no rows.
if not rouge_scores:
    raise ValueError("No rows found in './technology_test.csv'; nothing to score.")

# Mean F-measure per ROUGE metric across the whole test set.
average_scores_tech = {
    metric: sum(score[metric].fmeasure for score in rouge_scores) / len(rouge_scores)
    for metric in rouge_scores[0]
}
average_scores_tech

100%|██████████| 5448/5448 [1:14:13<00:00,  1.22it/s]


{'rouge1': 0.25317686616376717,
 'rouge2': 0.16564610536302368,
 'rougeL': 0.21556226523124777}