# title

In [1]:
import pandas as pd

train_df = pd.read_csv('samsum_csv_data/train.csv')
validate_df = pd.read_csv('samsum_csv_data/validation.csv')

In [2]:
print(f"Original train_df shape: {train_df.shape}")
print(f"Original validate_df shape: {validate_df.shape}")

# Inspect columns to ensure 'dialogue' and 'summary' are present
print("\nTrain DataFrame columns:", train_df.columns.tolist())
print("Validation DataFrame columns:", validate_df.columns.tolist())

# Select a small subset for POC
# Let's aim for 100 training samples and 20 validation samples.
poc_train_df = train_df.head(100)
poc_validate_df = validate_df.head(20)

print(f"\nPOC train_df shape: {poc_train_df.shape}")
print(f"POC validate_df shape: {poc_validate_df.shape}")

# Display a sample from the POC train DataFrame to confirm content
print("\nSample from POC Train DataFrame:")
print(poc_train_df[['dialogue', 'summary']].iloc[0])

Original train_df shape: (14732, 3)
Original validate_df shape: (818, 3)

Train DataFrame columns: ['id', 'dialogue', 'summary']
Validation DataFrame columns: ['id', 'dialogue', 'summary']

POC train_df shape: (100, 3)
POC validate_df shape: (20, 3)

Sample from POC Train DataFrame:
dialogue    Amanda: I baked  cookies. Do you want some?\nJ...
summary     Amanda baked cookies and will bring Jerry some...
Name: 0, dtype: object


In [3]:
from datasets import Dataset # Import the Dataset class
from transformers import AutoTokenizer

# Convert pandas DataFrames to Hugging Face Dataset objects
# This is a lightweight conversion, doesn't copy data unnecessarily
hf_train_dataset = Dataset.from_pandas(poc_train_df)
hf_validate_dataset = Dataset.from_pandas(poc_validate_df)

print(f"\nHugging Face Train Dataset size: {len(hf_train_dataset)}")
print(f"Hugging Face Validation Dataset size: {len(hf_validate_dataset)}")

# Define the model name for the tokenizer (using 'bart-base' for efficiency)
MODEL_NAME = "facebook/bart-base"

# Load the tokenizer. This downloads the vocabulary and tokenization rules.
print(f"\nLoading tokenizer for model: {MODEL_NAME}...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

print("\nTokenizer loaded successfully.")
print(f"Tokenizer vocabulary size: {len(tokenizer)}")
print(f"Example tokenization: {tokenizer.encode('Hello world!')}")

  from .autonotebook import tqdm as notebook_tqdm



Hugging Face Train Dataset size: 100
Hugging Face Validation Dataset size: 20

Loading tokenizer for model: facebook/bart-base...

Tokenizer loaded successfully.
Tokenizer vocabulary size: 50265
Example tokenization: [0, 31414, 232, 328, 2]


In [4]:
# Configuration for maximum lengths (important for handling long texts)
MAX_INPUT_LENGTH = 1024 # BART's typical maximum input length
MAX_TARGET_LENGTH = 128 # Reasonable maximum length for summaries

# Define the preprocessing function
def preprocess_function(examples):
    # Tokenize the dialogue (input sequence)
    # 'truncation=True' will cut off texts longer than MAX_INPUT_LENGTH
    model_inputs = tokenizer(
        examples["dialogue"],
        max_length=MAX_INPUT_LENGTH,
        truncation=True,
        padding="max_length" # Pad to max_length for consistent input shapes
    )

    # Tokenize the summary (target sequence)
    # This will be used as the 'labels' for the decoder
    labels = tokenizer(
        text_target=examples["summary"], # Use text_target for the target sequence
        max_length=MAX_TARGET_LENGTH,
        truncation=True,
        padding="max_length" # Pad to max_length for consistent label shapes
    )

    # Assign the tokenized labels to the 'labels' key, which the Trainer expects
    model_inputs["labels"] = labels["input_ids"]

    return model_inputs

print(f"\nPreprocessing training data (tokenizing and aligning lengths)...")
tokenized_hf_train_dataset = hf_train_dataset.map(
    preprocess_function,
    batched=True, # Process examples in batches for speed
    remove_columns=['id', 'dialogue', 'summary'] # Remove original text columns
)

print(f"Preprocessing validation data...")
tokenized_hf_validate_dataset = hf_validate_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=['id', 'dialogue', 'summary']
)

print("\nData preprocessing complete.")
print("Sample of tokenized training data structure:")
print(tokenized_hf_train_dataset[0].keys()) # Show what keys are now in the dataset
print(f"Input IDs length: {len(tokenized_hf_train_dataset[0]['input_ids'])}")
print(f"Labels length: {len(tokenized_hf_train_dataset[0]['labels'])}")


Preprocessing training data (tokenizing and aligning lengths)...


Map: 100%|██████████| 100/100 [00:00<00:00, 3330.80 examples/s]


Preprocessing validation data...


Map: 100%|██████████| 20/20 [00:00<00:00, 2620.05 examples/s]


Data preprocessing complete.
Sample of tokenized training data structure:
dict_keys(['input_ids', 'attention_mask', 'labels'])
Input IDs length: 1024
Labels length: 128





In [5]:
import torch
from transformers import AutoModelForSeq2SeqLM, TrainingArguments, Trainer
# step 4
# Ensure the dataset format is set to PyTorch tensors
# This is crucial before passing to the Trainer
tokenized_hf_train_dataset.set_format("torch")
tokenized_hf_validate_dataset.set_format("torch")

# Load the pre-trained BART model for sequence-to-sequence tasks
# This downloads the model weights and architecture for 'bart-base'
MODEL_NAME = "facebook/bart-base" # Using the same model name as for tokenizer
print(f"\nLoading BART model for sequence-to-sequence: {MODEL_NAME}...")
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
print("Model loaded successfully.")

# Configure Training Arguments
# These define how the training will proceed (epochs, batch size, logging, etc.)
output_dir = "./bart_samsum_poc_results" # Directory to save model checkpoints and logs
print(f"\nSetting up Training Arguments. Output directory: {output_dir}")
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=3,                  # Number of training epochs (small for POC)
    per_device_train_batch_size=1,       # Batch size per GPU/CPU for training
    per_device_eval_batch_size=2,        # Batch size per GPU/CPU for evaluation
    warmup_steps=10,                     # Number of steps for learning rate warmup
    weight_decay=0.01,                   # L2 regularization to prevent overfitting
    logging_dir=f"{output_dir}/logs",   # Directory for TensorBoard logs
    logging_steps=5,                     # Log training metrics every N steps
    eval_strategy="steps",               # Evaluate every N steps 
    eval_steps=10,                       # How often to run evaluation 
    save_steps=10,                       # How often to save model checkpoints 
    report_to="none",                    # Do not report to external services like Weights & Biases 
    gradient_accumulation_steps=4,       # Accumulate gradients over N steps 
    # predict_with_generate is NOT a TrainingArguments parameter in your version,
    # it will be passed directly to the Trainer.
    load_best_model_at_end=True,         # Load the best model found during training 
    metric_for_best_model="rouge1",      # Metric to monitor for best model selection 
    greater_is_better=True,              # Higher ROUGE-1 is better 
)

print("\nTraining Arguments configured.")


Loading BART model for sequence-to-sequence: facebook/bart-base...
Model loaded successfully.

Setting up Training Arguments. Output directory: ./bart_samsum_poc_results

Training Arguments configured.


In [None]:
# step 5
import evaluate
import numpy as np

# Load the ROUGE metric (if not already loaded in the current kernel session)
print("\nLoading ROUGE metric...")
rouge_metric = evaluate.load("rouge")

# Define the compute_metrics function
def compute_metrics(eval_pred):
    predictions, labels = eval_pred

    # --- IMPORTANT FIX HERE: Handle tuple output from Trainer ---
    # If predictions is a tuple, assume the actual logits/generated IDs are the first element.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    # --- END IMPORTANT FIX ---

    # Now, check the shape of predictions. If it's 3D, it's likely logits.
    # We need to convert logits to token IDs by taking the argmax.
    if predictions.ndim == 3:
        predictions = np.argmax(predictions, axis=-1)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Post-process for ROUGE: remove extra whitespace and newlines
    decoded_preds = ["\n".join(pred.strip().split()) for pred in decoded_preds]
    decoded_labels = ["\n".join(label.strip().split()) for label in decoded_labels]

    result = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    result = {k: round(v * 100, 4) for k, v in result.items()}
    return result

print("Metric computation function defined.")

# Initialize the Hugging Face Trainer (this part remains the same as it worked last time)
print("\nInitializing the Trainer...")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_hf_train_dataset,
    eval_dataset=tokenized_hf_validate_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

print("Trainer initialized successfully. Ready for training.")


Loading ROUGE metric...
Metric computation function defined.

Initializing the Trainer...


  trainer = Trainer(


Trainer initialized successfully. Ready for training.


In [9]:
#step6

# Start the training process
print("\nStarting model training...")
train_result = trainer.train()
print("Training complete.")

# Save the trained model and tokenizer
trainer.save_model() # Saves the model and tokenizer to the output_dir specified in TrainingArguments
# For good measure, you can also save the tokenizer explicitly if desired (though save_model usually handles it)
# tokenizer.save_pretrained(training_args.output_dir)

# Save training metrics (optional, but good for tracking progress)
metrics = train_result.metrics
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()

print(f"\nModel and tokenizer saved to: {training_args.output_dir}")
print("Training metrics logged and saved.")

# Optionally, you can also run a final evaluation on the validation set after training
print("\nRunning final evaluation on the validation set...")
eval_metrics = trainer.evaluate(eval_dataset=tokenized_hf_validate_dataset)
trainer.log_metrics("eval", eval_metrics)
trainer.save_metrics("eval", eval_metrics)
print("Final evaluation complete.")
print(eval_metrics) # Print the evaluation results


Starting model training...


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
10,8.6494,6.668404,26.0871,13.7898,25.4993,26.1613
20,4.8245,3.923614,42.4987,23.2755,41.6729,42.4235
30,3.6496,3.007563,55.8386,31.0021,54.8978,56.047
40,2.9961,2.388318,58.6209,33.5694,57.7354,58.8227
50,2.4964,1.963917,58.9971,35.2336,58.1688,59.1249
60,2.1393,1.701622,58.8888,35.2061,57.907,58.868
70,1.9374,1.573702,58.9578,35.3647,58.2867,58.9761


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


Training complete.
***** train metrics *****
  epoch                    =        3.0
  total_flos               =   170358GF
  train_loss               =     3.9697
  train_runtime            = 0:01:42.78
  train_samples_per_second =      2.919
  train_steps_per_second   =       0.73

Model and tokenizer saved to: ./bart_samsum_poc_results
Training metrics logged and saved.

Running final evaluation on the validation set...


***** eval metrics *****
  epoch                   =        3.0
  eval_loss               =     1.9639
  eval_rouge1             =    58.9971
  eval_rouge2             =    35.2336
  eval_rougeL             =    58.1688
  eval_rougeLsum          =    59.1249
  eval_runtime            = 0:00:01.83
  eval_samples_per_second =     10.921
  eval_steps_per_second   =       5.46
Final evaluation complete.
{'eval_loss': 1.9639171361923218, 'eval_rouge1': 58.9971, 'eval_rouge2': 35.2336, 'eval_rougeL': 58.1688, 'eval_rougeLsum': 59.1249, 'eval_runtime': 1.8314, 'eval_samples_per_second': 10.921, 'eval_steps_per_second': 5.46, 'epoch': 3.0}


In [10]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Define the output directory where your model was saved
output_dir = "./bart_samsum_poc_results"
MODEL_NAME = "facebook/bart-base" # Ensure this matches the model used for training

# Load the fine-tuned model and tokenizer
print(f"\nLoading fine-tuned model and tokenizer from {output_dir}...")
# Map to CPU if no GPU is available, or load directly to GPU if one is now present.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Load tokenizer
loaded_tokenizer = AutoTokenizer.from_pretrained(output_dir)
# Load model. If model was saved on CPU, it will load to CPU by default.
# If saved on GPU but now running on CPU, PyTorch will handle mapping to CPU.
loaded_model = AutoModelForSeq2SeqLM.from_pretrained(output_dir).to(device)

print("Model and tokenizer loaded successfully for inference.")

# --- Prepare example dialogues for inference ---
# We'll use examples from the original validate_df that were not directly used for training.
# Make sure poc_validate_df is still available in your environment from previous steps.
# If not, you might need to re-run Step 1 and the pandas to Dataset conversion.
# For robust example pulling, let's grab directly from the `validate_df` loaded in Step 1.

# Assuming validate_df is available from earlier steps.
# If you closed and restarted your environment, you might need to re-run
# the first code block (Step 1) to load `validate_df`.

if 'validate_df' not in locals():
    print("\n'validate_df' not found. Please ensure you have run Step 1 to load the dataframes.")
    print("For demonstration, using dummy examples.")
    example_data = [
        {"dialogue": "Speaker A: I'm really tired. I worked all night. Speaker B: You should get some rest. Speaker A: I wish I could, but I have a deadline.",
         "summary": "Speaker A is tired from work but has a deadline."},
        {"dialogue": "Participant 1: Did you remember to buy milk? Participant 2: Oh no! I completely forgot. I'll go back to the store now. Participant 1: Thanks!",
         "summary": "Participant 2 forgot to buy milk and will go back to the store."},
    ]
else:
    # Use actual examples from the validation set
    # Let's pick a couple of diverse examples, not just the very first ones if possible.
    # We can use .sample(n) for random examples, or .iloc[] for specific indices.
    num_examples_to_show = 3
    if len(validate_df) >= num_examples_to_show:
        example_data = validate_df.sample(n=num_examples_to_show, random_state=42).to_dict('records')
    else: # If validation set is too small, just use what's available
        example_data = validate_df.to_dict('records')
        print(f"Not enough examples in validate_df to sample {num_examples_to_show}. Showing all {len(validate_df)} examples.")


# --- Perform inference for each example ---
print("\n--- Model Inference Examples ---")
for i, example in enumerate(example_data):
    dialogue = example['dialogue']
    reference_summary = example['summary']

    # Tokenize the input dialogue
    inputs = loaded_tokenizer(
        dialogue,
        return_tensors="pt", # Return PyTorch tensors
        max_length=MAX_INPUT_LENGTH, # Use the same max length as training
        truncation=True
    ).to(device) # Move inputs to the correct device (CPU/GPU)

    # Generate summary
    # Using parameters that typically work well for summarization
    summary_ids = loaded_model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        num_beams=4,        # Use beam search for better quality summaries
        max_length=MAX_TARGET_LENGTH, # Max length of generated summary
        min_length=30,      # Minimum length to encourage more detailed summaries
        early_stopping=True, # Stop generation when all beam hypotheses are complete
        length_penalty=2.0  # Encourage longer summaries (common for abstractive summarization)
    )

    # Decode the generated summary IDs back to text
    generated_summary = loaded_tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    print(f"\n--- Example {i+1} ---")
    print(f"Original Dialogue:\n{dialogue}")
    print(f"\nReference Summary:\n{reference_summary}")
    print(f"\nGenerated Summary:\n{generated_summary}")
    print("-" * 30)


Loading fine-tuned model and tokenizer from ./bart_samsum_poc_results...
Using device: cuda
Model and tokenizer loaded successfully for inference.

--- Model Inference Examples ---

--- Example 1 ---
Original Dialogue:
Edd: wow, did you hear that they're transferring us to a different department?
Rose: whaaaaat :o
Rose: no! where'd you hear that?
Edd: well, it's quite official
Edd: Anderson just told us
Rose: and do you know what it changes for us?
Edd: they won't change the professors
Edd: but i know the paperwork will get trickier
Rose: and i guess that is a move that is supposed to make everything easier
Edd: yeah, guess so
Edd: they have a funny way of understanding 'to make things easier'

Reference Summary:
Rose and Edd will be transferred to a new department. Their professors will not change but paperwork will become more difficult.

Generated Summary:
Edd will transfer the professors to a different department. Rose and Eddie will be transferred to the same department, but they