# BART/BART Proof of Concept

In [1]:
import pandas as pd
import numpy as np

import torch
import evaluate

from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TrainingArguments, Trainer

train_df = pd.read_csv('samsum_csv_data/train.csv')
validate_df = pd.read_csv('samsum_csv_data/validation.csv')

In [2]:

poc_train_df = train_df.head(100)
poc_validate_df = validate_df.head(20)

# Convert pandas DataFrames to 🤗 HF Dataset objects (ref based)
hf_train_dataset = Dataset.from_pandas(poc_train_df)
hf_validate_dataset = Dataset.from_pandas(poc_validate_df)

print(f"\nHugging Face Train Dataset size: {len(hf_train_dataset)}")
print(f"Hugging Face Validation Dataset size: {len(hf_validate_dataset)}")

MODEL_NAME = "facebook/bart-base"

print(f"\nLoading tokenizer for model: {MODEL_NAME}...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
print(f"Tokenizer vocabulary size: {len(tokenizer)}")


Hugging Face Train Dataset size: 100
Hugging Face Validation Dataset size: 20

Loading tokenizer for model: facebook/bart-base...
Tokenizer vocabulary size: 50265


In [3]:
# In token counts (effectively word)
MAX_INPUT_LENGTH = 1024 # Some dialogs will likely exceed this
MAX_TARGET_LENGTH = 128 # If our summary is trying to go longer than this it's wrong

def preprocess_function(examples):
    model_inputs = tokenizer(
        examples["dialogue"],
        max_length=MAX_INPUT_LENGTH,
        truncation=True, # Cut off longer dialogs foot
        padding="max_length" # Pad to max_length for consistent input shapes
    )

    labels = tokenizer(
        text_target=examples["summary"], # Our target is to match the provided summary
        max_length=MAX_TARGET_LENGTH,
        truncation=True,
        padding="max_length"
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

print(f"\nPreprocessing training/validation data (tokenizing and aligning lengths)...")
tokenized_hf_train_dataset = hf_train_dataset.map(
    preprocess_function,
    batched=True, # Process examples in batches for speed
    remove_columns=['id', 'dialogue', 'summary'] # Remove original text columns
)

tokenized_hf_validate_dataset = hf_validate_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=['id', 'dialogue', 'summary']
)


Preprocessing training/validation data (tokenizing and aligning lengths)...


Map: 100%|██████████| 100/100 [00:00<00:00, 2194.22 examples/s]
Map: 100%|██████████| 20/20 [00:00<00:00, 2640.09 examples/s]


In [4]:
tokenized_hf_train_dataset.set_format("torch")
tokenized_hf_validate_dataset.set_format("torch")

print(f"\nLoading BART model for sequence-to-sequence: {MODEL_NAME}...")
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
print("Model loaded successfully.")

output_dir = "./bart_samsum_poc_results"
print(f"\nSetting up Training Arguments. Output directory: {output_dir}")
# Includes some default values like num_train_epochs for future clarity
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=3,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=2,
    warmup_steps=10,
    weight_decay=0.01,
    logging_steps=5,
    eval_strategy="steps",
    eval_steps=10,
    save_steps=10,
    report_to="none",
    gradient_accumulation_steps=4,
    load_best_model_at_end=True,
    metric_for_best_model="rouge1",      # Look for higher ROUGE-1 scores 
    greater_is_better=True
)

print("\nTraining Arguments configured.")


Loading BART model for sequence-to-sequence: facebook/bart-base...
Model loaded successfully.

Setting up Training Arguments. Output directory: ./bart_samsum_poc_results

Training Arguments configured.


In [6]:
print("\nLoading ROUGE metric...")
rouge_metric = evaluate.load("rouge")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred

    predictions = predictions[0]

    # Now, check the shape of predictions. If it's 3D, it's likely logits.
    # We need to convert logits to token IDs by taking the argmax.

    predictions = np.argmax(predictions, axis=-1)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Post-process for ROUGE: remove extra whitespace and newlines
    decoded_preds = ["\n".join(pred.strip().split()) for pred in decoded_preds]
    decoded_labels = ["\n".join(label.strip().split()) for label in decoded_labels]

    result = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    result = {k: round(v * 100, 4) for k, v in result.items()}
    return result

print("Metric computation function defined.")

print("\nInitializing the Trainer...")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_hf_train_dataset,
    eval_dataset=tokenized_hf_validate_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

print("Trainer initialized successfully. Ready for training.")


Loading ROUGE metric...
Metric computation function defined.

Initializing the Trainer...
Trainer initialized successfully. Ready for training.


  trainer = Trainer(


In [7]:
#step6

# Start the training process
print("\nStarting model training...")
train_result = trainer.train()
print("Training complete.")

# Save the trained model and tokenizer
trainer.save_model() # Saves the model and tokenizer to the output_dir specified in TrainingArguments
# For good measure, you can also save the tokenizer explicitly if desired (though save_model usually handles it)
# tokenizer.save_pretrained(training_args.output_dir)

# Save training metrics (optional, but good for tracking progress)
metrics = train_result.metrics
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()

print(f"\nModel and tokenizer saved to: {training_args.output_dir}")
print("Training metrics logged and saved.")

# Optionally, you can also run a final evaluation on the validation set after training
print("\nRunning final evaluation on the validation set...")
eval_metrics = trainer.evaluate(eval_dataset=tokenized_hf_validate_dataset)
trainer.log_metrics("eval", eval_metrics)
trainer.save_metrics("eval", eval_metrics)
print("Final evaluation complete.")
print(eval_metrics) # Print the evaluation results


Starting model training...


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
10,10.9661,10.114227,50.883,23.5127,46.9729,51.0212
20,7.7293,5.987974,33.2256,16.5336,31.9803,33.2915
30,4.7681,3.903649,53.146,26.1281,51.6966,53.3488
40,3.8212,3.087858,56.0596,29.9701,55.0727,56.2348
50,3.2176,2.567768,57.7593,33.3636,56.9496,57.9557
60,2.7972,2.245227,57.7993,33.2497,57.1127,58.0464
70,2.5582,2.091244,58.3633,33.9742,57.3543,58.5695


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


Training complete.
***** train metrics *****
  epoch                    =        3.0
  total_flos               =   170358GF
  train_loss               =     5.3673
  train_runtime            = 0:02:04.88
  train_samples_per_second =      2.402
  train_steps_per_second   =      0.601

Model and tokenizer saved to: ./bart_samsum_poc_results
Training metrics logged and saved.

Running final evaluation on the validation set...


***** eval metrics *****
  epoch                   =        3.0
  eval_loss               =     2.0912
  eval_rouge1             =    58.3633
  eval_rouge2             =    33.9742
  eval_rougeL             =    57.3543
  eval_rougeLsum          =    58.5695
  eval_runtime            = 0:00:01.81
  eval_samples_per_second =     11.019
  eval_steps_per_second   =       5.51
Final evaluation complete.
{'eval_loss': 2.0912435054779053, 'eval_rouge1': 58.3633, 'eval_rouge2': 33.9742, 'eval_rougeL': 57.3543, 'eval_rougeLsum': 58.5695, 'eval_runtime': 1.815, 'eval_samples_per_second': 11.019, 'eval_steps_per_second': 5.51, 'epoch': 3.0}


In [8]:


# Define the output directory where your model was saved
output_dir = "./bart_samsum_poc_results"
MODEL_NAME = "facebook/bart-base" # Ensure this matches the model used for training

# Load the fine-tuned model and tokenizer
print(f"\nLoading fine-tuned model and tokenizer from {output_dir}...")
# Map to CPU if no GPU is available, or load directly to GPU if one is now present.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Load tokenizer
loaded_tokenizer = AutoTokenizer.from_pretrained(output_dir)
# Load model. If model was saved on CPU, it will load to CPU by default.
# If saved on GPU but now running on CPU, PyTorch will handle mapping to CPU.
loaded_model = AutoModelForSeq2SeqLM.from_pretrained(output_dir).to(device)

print("Model and tokenizer loaded successfully for inference.")

# --- Prepare example dialogues for inference ---
# We'll use examples from the original validate_df that were not directly used for training.
# Make sure poc_validate_df is still available in your environment from previous steps.
# If not, you might need to re-run Step 1 and the pandas to Dataset conversion.
# For robust example pulling, let's grab directly from the `validate_df` loaded in Step 1.

# Assuming validate_df is available from earlier steps.
# If you closed and restarted your environment, you might need to re-run
# the first code block (Step 1) to load `validate_df`.

if 'validate_df' not in locals():
    print("\n'validate_df' not found. Please ensure you have run Step 1 to load the dataframes.")
    print("For demonstration, using dummy examples.")
    example_data = [
        {"dialogue": "Speaker A: I'm really tired. I worked all night. Speaker B: You should get some rest. Speaker A: I wish I could, but I have a deadline.",
         "summary": "Speaker A is tired from work but has a deadline."},
        {"dialogue": "Participant 1: Did you remember to buy milk? Participant 2: Oh no! I completely forgot. I'll go back to the store now. Participant 1: Thanks!",
         "summary": "Participant 2 forgot to buy milk and will go back to the store."},
    ]
else:
    # Use actual examples from the validation set
    # Let's pick a couple of diverse examples, not just the very first ones if possible.
    # We can use .sample(n) for random examples, or .iloc[] for specific indices.
    num_examples_to_show = 3
    if len(validate_df) >= num_examples_to_show:
        example_data = validate_df.sample(n=num_examples_to_show, random_state=42).to_dict('records')
    else: # If validation set is too small, just use what's available
        example_data = validate_df.to_dict('records')
        print(f"Not enough examples in validate_df to sample {num_examples_to_show}. Showing all {len(validate_df)} examples.")


# --- Perform inference for each example ---
print("\n--- Model Inference Examples ---")
for i, example in enumerate(example_data):
    dialogue = example['dialogue']
    reference_summary = example['summary']

    # Tokenize the input dialogue
    inputs = loaded_tokenizer(
        dialogue,
        return_tensors="pt", # Return PyTorch tensors
        max_length=MAX_INPUT_LENGTH, # Use the same max length as training
        truncation=True
    ).to(device) # Move inputs to the correct device (CPU/GPU)

    # Generate summary
    # Using parameters that typically work well for summarization
    summary_ids = loaded_model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        num_beams=4,        # Use beam search for better quality summaries
        max_length=MAX_TARGET_LENGTH, # Max length of generated summary
        min_length=30,      # Minimum length to encourage more detailed summaries
        early_stopping=True, # Stop generation when all beam hypotheses are complete
        length_penalty=2.0  # Encourage longer summaries (common for abstractive summarization)
    )

    # Decode the generated summary IDs back to text
    generated_summary = loaded_tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    print(f"\n--- Example {i+1} ---")
    print(f"Original Dialogue:\n{dialogue}")
    print(f"\nReference Summary:\n{reference_summary}")
    print(f"\nGenerated Summary:\n{generated_summary}")
    print("-" * 30)


Loading fine-tuned model and tokenizer from ./bart_samsum_poc_results...
Using device: cuda
Model and tokenizer loaded successfully for inference.

--- Model Inference Examples ---

--- Example 1 ---
Original Dialogue:
Edd: wow, did you hear that they're transferring us to a different department?
Rose: whaaaaat :o
Rose: no! where'd you hear that?
Edd: well, it's quite official
Edd: Anderson just told us
Rose: and do you know what it changes for us?
Edd: they won't change the professors
Edd: but i know the paperwork will get trickier
Rose: and i guess that is a move that is supposed to make everything easier
Edd: yeah, guess so
Edd: they have a funny way of understanding 'to make things easier'

Reference Summary:
Rose and Edd will be transferred to a new department. Their professors will not change but paperwork will become more difficult.

Generated Summary:
Edd and Rose are transferring to a different department. Rose is upset because she heard about the transfer. Rose and Eddie are