In [None]:
import torch
from transformers import BartForConditionalGeneration, BartTokenizer, Trainer, TrainingArguments
from datasets import load_dataset, Dataset

import pandas as pd
import numpy as np
import warnings
from tqdm import tqdm
warnings.filterwarnings("ignore")

In [None]:
# Read the merged article/summary corpus that the model is fine-tuned on.
data_path = './data_merge.csv'
df = pd.read_csv(data_path)

# Show the first rows as a quick sanity check of the available columns.
df.head()

Unnamed: 0.1,Unnamed: 0,url,archive,title,date,text,summary,compression,coverage,density,compression_bin,coverage_bin,density_bin,category
0,0,http://www.forbes.com/sites/steveschaefer/2016...,https://web.archive.org/web/2016120719id_/http...,Biotech Stocks Smacked After Trump Boasts He'l...,1970-01-01 00:33:36.120719,The post-election rebound in biotech and healt...,It turns out President Trump may be just as bi...,32.052632,0.894737,1.315789,medium,medium,abstractive,technology
1,1,http://www.9news.com.au/national/2015/08/21/15...,https://web.archive.org/web/2015082119id_/http...,Joe Hockey confirms GST will apply to online p...,1970-01-01 00:33:35.082119,Treasurer Joe Hockey has announced a ten perce...,Treasurer Joe Hockey has announced a ten perce...,10.35,1.0,20.0,low,high,extractive,technology
2,2,http://www.foxnews.com/tech/2014/04/14/review-...,https://web.archive.org/web/2014041419id_/http...,Review: Siri-like Cortana fills Windows phone gap,1970-01-01 00:33:34.041419,Microsoft corporate vice president Joe Belfior...,"With the new Cortana virtual assistant, Window...",59.217391,1.0,23.0,high,high,extractive,technology
3,3,http://www.theguardian.com/technology/2014/nov...,https://web.archive.org/web/2014112119id_/http...,Now e-cigarettes can give you malware,1970-01-01 00:33:34.112119,E-cigarettes may be better for your health tha...,"Better for your lungs, worse for your hard dri...",22.041667,0.75,1.75,medium,low,mixed,technology
4,4,http://www.foxnews.com/scitech/2010/09/29/chin...,https://web.archive.org/web/2010092919id_/http...,China's Super Train Trounces Speed Records,1970-01-01 00:33:30.092919,258 miles per hour. That's how fast China's la...,A new Chinese high-speed train broke a world s...,11.111111,0.925926,5.296296,low,medium,mixed,technology


In [None]:
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')


def preprocess_function(examples):
    """Tokenize a batch of articles and their reference summaries.

    Args:
        examples: Mapping with a 'text' entry (source articles) and a
            'summary' entry (target summaries), as passed by ``Dataset.map``.

    Returns:
        The tokenizer output for the articles (truncated/padded to 1024
        tokens) with an added 'labels' entry holding the summary token ids
        (truncated/padded to 128 tokens). Padded label positions are set to
        -100 so the loss skips them.
    """
    inputs = tokenizer(examples['text'], max_length=1024, truncation=True, padding='max_length')
    labels = tokenizer(examples['summary'], max_length=128, truncation=True, padding='max_length')

    # Padding positions (attention_mask == 0) must not contribute to the loss.
    # The model's cross-entropy ignores label value -100, whereas a raw pad
    # token id would be treated as a real target the model has to predict.
    inputs['labels'] = [
        [token_id if keep else -100 for token_id, keep in zip(label_ids, label_mask)]
        for label_ids, label_mask in zip(labels['input_ids'], labels['attention_mask'])
    ]
    return inputs

In [None]:
from datasets import Dataset

# Only the article body and its reference summary are needed for training;
# wrap those two columns in a Hugging Face Dataset.
text_summary_df = df[['text', 'summary']]
hf_dataset = Dataset.from_pandas(text_summary_df)

# Tokenize in batches and drop the raw string columns from the result.
tokenized_datasets = hf_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=["text", "summary"],
)


Map: 100%|██████████| 53978/53978 [02:26<00:00, 367.63 examples/s]


In [None]:
# Hold out 20% of the tokenized examples for validation.
# The split uses the Dataset's own train_test_split method; a fixed seed keeps
# the train/eval partition identical across kernel restarts so that validation
# losses from different runs are comparable.
SPLIT_SEED = 42
dataset_splits = tokenized_datasets.train_test_split(test_size=0.2, seed=SPLIT_SEED)
train_dataset = dataset_splits['train']
eval_dataset = dataset_splits['test']

In [None]:
# Start from the pretrained BART-large weights.
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large')

# Fine-tuning configuration.
training_args = TrainingArguments(
    output_dir="./results_injected",
    # Optimisation schedule
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    # Batch sizes per device
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Run validation once per epoch
    evaluation_strategy="epoch",
)


In [None]:
# Bundle the model, hyperparameters and both data splits into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=eval_dataset,
    train_dataset=train_dataset,
)

# Launch fine-tuning (the Trainer renders its own progress bar).
trainer.train()


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


Epoch,Training Loss,Validation Loss
1,0.3632,0.340811
2,0.3151,0.333827
3,0.289,0.332534


### Injected BART Evaluation

In [None]:
# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Reload the fine-tuned weights from the saved checkpoint directory.
path = "./results_injected/checkpoint-8097"
model = BartForConditionalGeneration.from_pretrained(path)
model.to(device)

In [None]:
from rouge_score import rouge_scorer

# ROUGE evaluation of the fine-tuned model on the technology test set.
test_df = pd.read_csv('./technology_test.csv')
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

rouge_scores = []
for idx, row in tqdm(test_df.iterrows(), total=len(test_df)):
    # Encode one article at a time, cutting it off at 1024 tokens.
    inputs = tokenizer(row['text'], return_tensors="pt", truncation=True, max_length=1024).to(device)

    # Beam-search a summary and decode it back to plain text.
    summary_ids = model.generate(
        inputs['input_ids'],
        num_beams=4,
        min_length=40,
        max_length=150,
        length_penalty=2.0,
    )
    predicted_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    # Compare the generated summary against the reference one.
    score = scorer.score(row['summary'], predicted_summary)
    rouge_scores.append(score)

# Mean F-measure per ROUGE variant over all test articles.
num_examples = len(rouge_scores)
average_scores_tech = {}
for metric in rouge_scores[0]:
    f_total = sum(example_scores[metric].fmeasure for example_scores in rouge_scores)
    average_scores_tech[metric] = f_total / num_examples
average_scores_tech

100%|██████████| 5448/5448 [33:00<00:00,  2.75it/s]


{'rouge1': 0.4070902623642649,
 'rouge2': 0.2892701118525887,
 'rougeL': 0.36235780344345825}

In [None]:
# ROUGE evaluation of the fine-tuned model on the sports test set.
test_df = pd.read_csv('./sports_test.csv')
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

rouge_scores = []

for idx, row in tqdm(test_df.iterrows(), total=test_df.shape[0]):
    # Encode one article at a time, truncated to 1024 tokens.
    inputs = tokenizer(row['text'], return_tensors="pt", truncation=True, max_length=1024).to(device)
    # Generate summary
    summary_ids = model.generate(inputs['input_ids'], max_length=150, min_length=40, length_penalty=2.0, num_beams=4)
    predicted_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    # Calculate ROUGE scores of the prediction against the reference summary
    score = scorer.score(row['summary'], predicted_summary)
    rouge_scores.append(score)

# Mean F-measure per ROUGE variant. Stored under a sports-specific name so the
# result is not confused with (or overwritten by) the architecture scores.
average_scores_sports = {metric: sum([score[metric].fmeasure for score in rouge_scores]) / len(rouge_scores) for metric in rouge_scores[0]}
average_scores_sports

100%|██████████| 6163/6163 [36:55<00:00,  2.78it/s]


{'rouge1': 0.3831651536464214,
 'rouge2': 0.2509568424128598,
 'rougeL': 0.3301597491044689}

In [None]:
# ROUGE evaluation of the fine-tuned model on the food test set.
test_df = pd.read_csv('./food_test.csv')
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

rouge_scores = []

for idx, row in tqdm(test_df.iterrows(), total=test_df.shape[0]):
    # Encode one article at a time, truncated to 1024 tokens.
    inputs = tokenizer(row['text'], return_tensors="pt", truncation=True, max_length=1024).to(device)
    # Generate summary
    summary_ids = model.generate(inputs['input_ids'], max_length=150, min_length=40, length_penalty=2.0, num_beams=4)
    predicted_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    # Calculate ROUGE scores of the prediction against the reference summary
    score = scorer.score(row['summary'], predicted_summary)
    rouge_scores.append(score)

# Mean F-measure per ROUGE variant. Stored under a food-specific name so the
# result is not confused with (or overwritten by) the architecture scores.
average_scores_food = {metric: sum([score[metric].fmeasure for score in rouge_scores]) / len(rouge_scores) for metric in rouge_scores[0]}
average_scores_food

100%|██████████| 1482/1482 [09:11<00:00,  2.69it/s]


{'rouge1': 0.35844792246022766,
 'rouge2': 0.2439043821636203,
 'rougeL': 0.31600797529629693}

In [None]:
# ROUGE evaluation of the fine-tuned model on the architecture test set.
test_df = pd.read_csv('./architecture_test.csv')
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

rouge_scores = []
for idx, row in tqdm(test_df.iterrows(), total=len(test_df)):
    # Encode one article at a time, cutting it off at 1024 tokens.
    inputs = tokenizer(row['text'], return_tensors="pt", truncation=True, max_length=1024).to(device)

    # Beam-search a summary and decode it back to plain text.
    summary_ids = model.generate(
        inputs['input_ids'],
        num_beams=4,
        min_length=40,
        max_length=150,
        length_penalty=2.0,
    )
    predicted_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    # Compare the generated summary against the reference one.
    score = scorer.score(row['summary'], predicted_summary)
    rouge_scores.append(score)

# Mean F-measure per ROUGE variant over all test articles.
num_examples = len(rouge_scores)
average_scores_arch = {}
for metric in rouge_scores[0]:
    f_total = sum(example_scores[metric].fmeasure for example_scores in rouge_scores)
    average_scores_arch[metric] = f_total / num_examples
average_scores_arch

100%|██████████| 419/419 [02:32<00:00,  2.75it/s]


{'rouge1': 0.37207757497468597,
 'rouge2': 0.2431804538260892,
 'rougeL': 0.3216438117869889}

In [None]:
# ROUGE evaluation of the fine-tuned model on the entertainment test set.
test_df = pd.read_csv('./entertainment_test.csv')
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

rouge_scores = []

for idx, row in tqdm(test_df.iterrows(), total=test_df.shape[0]):
    # Encode one article at a time, truncated to 1024 tokens.
    inputs = tokenizer(row['text'], return_tensors="pt", truncation=True, max_length=1024).to(device)
    # Generate summary
    summary_ids = model.generate(inputs['input_ids'], max_length=150, min_length=40, length_penalty=2.0, num_beams=4)
    predicted_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    # Calculate ROUGE scores of the prediction against the reference summary
    score = scorer.score(row['summary'], predicted_summary)
    rouge_scores.append(score)

# Mean F-measure per ROUGE variant. Stored under an entertainment-specific name
# so it does not overwrite the architecture scores held in average_scores_arch.
average_scores_ent = {metric: sum([score[metric].fmeasure for score in rouge_scores]) / len(rouge_scores) for metric in rouge_scores[0]}
average_scores_ent

100%|██████████| 6668/6668 [40:20<00:00,  2.75it/s] 


{'rouge1': 0.40024860762551817,
 'rouge2': 0.28355068747929124,
 'rougeL': 0.3558658323663727}