### Import Libraries

In [1]:
import evaluate
import numpy as np
import pandas as pd
import torch
from datasets import Dataset, DatasetDict
from transformers import BartTokenizer, BartForConditionalGeneration, pipeline, DataCollatorForSeq2Seq
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

  from .autonotebook import tqdm as notebook_tqdm
  torch.utils._pytree._register_pytree_node(


### Loading the Dataset Locally

In [None]:
## Directory that holds the CSV splits, relative to the notebook.
## All three splits are read from this one location so the cell does not depend
## on a single machine's absolute paths; change this value if the files live elsewhere.
DATA_DIR = 'Data'

## Loading the training dataset (raw, still including the 'id' column)
df_train1 = pd.read_csv(f'{DATA_DIR}/train.csv')

## Define a prefix string to be added to each article
prefix = 'summarize: '

## Build the training frame from df_train1: drop 'id', keep the first 1000 rows,
## then prepend the prefix to every article.
## `.assign` returns a new dataframe, so the prefix is never written into a
## slice of another frame (which is what triggers SettingWithCopyWarning).
df_train = (
    df_train1
    .drop(columns=['id'])
    .iloc[:1000, :]
    .assign(article=lambda frame: prefix + frame['article'])
)

## NOTE(review): the prefix is applied to the training articles only; the
## validation and test articles below are left unprefixed, exactly as before.
## Confirm whether evaluation/inference inputs should carry the same prefix.

## Loading the testing dataset and dropping its 'id' column
df_test = pd.read_csv(f'{DATA_DIR}/test.csv').drop(columns=['id'])

## Loading the validation dataset: first 200 rows, 'id' column dropped
df_val = (
    pd.read_csv(f'{DATA_DIR}/validation.csv')
    .iloc[:200, :]
    .drop(columns=['id'])
)

## Report each split under its own label
print("train shape:", df_train.shape, "val shape:", df_val.shape, "test shape:", df_test.shape)

FileNotFoundError: [Errno 2] No such file or directory: '/Users/user/Desktop/GitHub/NLP_project/cnn_dailymail/train.csv'

### Loading Data from HuggingFace

In [3]:
from datasets import load_dataset

# Fetch the CNN/DailyMail corpus (configuration "3.0.0") from the Hugging Face Hub
dataset = load_dataset(
    "cnn_dailymail",
    "3.0.0",
)

# Show which splits came back, with their columns and row counts
print(dataset)


README.md:   0%|          | 0.00/15.6k [00:00<?, ?B/s]

train-00000-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00001-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00002-of-00003.parquet:   0%|          | 0.00/259M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 11490
    })
})


In [10]:
# Bare expression so the notebook displays the class of the object returned by load_dataset.
type(dataset) # container type of `dataset`

datasets.dataset_dict.DatasetDict

In [4]:
# Take the first training example and print its two text fields, each under a heading.
sample = dataset["train"][0]
for heading, field in (("Article:\n", "article"), ("\nSummary:\n", "highlights")):
    print(heading, sample[field])


Article:
 LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don't plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don't think I'll be particularly extravagant. "The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs." At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film "Hostel: Part II," currently six places below his number one movie on the UK box office chart. Deta

### Modeling with BART

In [None]:
## Loading the BART tokenizer and model for summarization
model_name = 'facebook/bart-large-cnn'

# Choose the device at runtime instead of hard-coding "mps": Apple-Silicon GPU
# first (the original behaviour), then CUDA, then CPU, so the cell also runs
# on machines without MPS.
if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the BART tokenizer and model, and move the model to the chosen device
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name).to(device)

# Create a summarization pipeline that reuses the already-loaded model and tokenizer
summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)

# Count model parameters
def get_model_parameters(model):
    """Return ``(total, trainable)`` parameter counts for ``model``.

    ``model`` must expose ``parameters()`` yielding objects that have a
    ``numel()`` method and a ``requires_grad`` attribute. A parameter counts
    as trainable when its ``requires_grad`` is truthy.
    """
    total_params = 0
    trainable_params = 0
    for param in model.parameters():
        size = param.numel()
        total_params += size
        if param.requires_grad:
            trainable_params += size
    return total_params, trainable_params

# Report the size of the loaded model: all parameters vs. those with requires_grad set.
total_params, trainable_params = get_model_parameters(model)
print(
    f"Total Parameters: {total_params}",
    f"Trainable Parameters: {trainable_params}",
    sep="\n",
)


In [None]:
# Wrap the pandas frames as Hugging Face `Dataset` objects
train_df = Dataset.from_pandas(df_train)
val_df = Dataset.from_pandas(df_val)

# Keep handles on the pandas frames for later model-inference cells.
# Note that `global_test_df` refers to the validation frame (df_val), not df_test.
global_train_df, global_test_df = df_train, df_val

In [None]:
# Bundle the splits into one DatasetDict keyed by split name.
# Only 'train' and 'val' are included here.
splits = {'train': train_df, 'val': val_df}
train_val_test_dataset = DatasetDict(splits)

print(type(train_val_test_dataset))
train_val_test_dataset

## BART fine-tuning

In [None]:
def prepare_dataset(data):
    """Tokenize a batch of examples for sequence-to-sequence training.

    Reads ``data["article"]`` as the model input and ``data["highlights"]`` as
    the target text, using the notebook-level ``tokenizer``. Inputs are
    truncated to 512 tokens and targets to 128 tokens.

    Returns the tokenizer output for the articles with an extra ``"labels"``
    entry holding the target token ids.
    """
    # Encode the source articles
    encoded = tokenizer(data["article"], max_length=512, truncation=True)

    # Encode the reference summaries as targets; only their token ids are kept
    target_encoding = tokenizer(
        text_target=data["highlights"],
        max_length=128,
        truncation=True,
    )
    encoded["labels"] = target_encoding["input_ids"]
    return encoded

In [None]:
# Apply `prepare_dataset` to every split, passing batches of rows rather than single rows.
tokenized_data = train_val_test_dataset.map(
    prepare_dataset,
    batched=True,
)

In [None]:
# Bare expression so the notebook displays the tokenized training split.
tokenized_data["train"]

In [None]:
# Padding: the collator pads each batch dynamically.
# `model` must be the model object, not its name string: the collator looks for
# `prepare_decoder_input_ids_from_labels` on it, which a plain string never has.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [None]:
# Load the ROUGE metric via the `evaluate` library; `compute_metrics` below calls `rouge.compute`.
rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    """Compute ROUGE between generated summaries and reference summaries.

    Args:
        eval_pred: A ``(predictions, labels)`` pair of token-id arrays.
            ``predictions`` may also arrive as a tuple whose first element
            holds the token ids.

    Returns:
        The dictionary produced by ``rouge.compute`` (stemming enabled).
    """
    predictions, labels = eval_pred

    # Some models return extra outputs alongside the generated ids; keep the ids only.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is a padding/ignore marker, not a real token id, and cannot be decoded.
    # Replace it with the pad token in BOTH arrays before decoding.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    return result

In [None]:
# Hyper-parameters for fine-tuning, grouped by purpose
training_args = Seq2SeqTrainingArguments(
    # Output location and how many checkpoints to keep
    output_dir="bart-news",
    save_total_limit=2,
    # Optimisation
    num_train_epochs=4,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate at the end of every epoch, using generate() for predictions
    evaluation_strategy="epoch",
    predict_with_generate=True,
    # Runtime: no experiment-tracker reporting, CUDA disabled
    report_to="none",
    no_cuda=True,
)

In [None]:
# Assemble the trainer from the model, the arguments, the tokenized splits,
# the padding collator and the ROUGE metric function.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["val"],
)

In [None]:
# Run fine-tuning using the `trainer` object built in the previous cell.
trainer.train()

In [None]:
# Write the fine-tuned model and then the tokenizer into the same directory
model_path = "bart-news"
for save_to in (trainer.save_model, tokenizer.save_pretrained):
    save_to(model_path)