In [1]:
import torch
import pandas as pd
from transformers import BartForConditionalGeneration, BartTokenizer, Trainer, TrainingArguments
from datasets import Dataset

In [2]:
def preprocess_text(text):
    """Preprocess text by removing unnecessary tokens."""
    return text.strip()

In [3]:
df = pd.read_csv('data.csv')
# Select the ones you want
df = df[['text','summary']]
df = df.sample(frac=1/3, random_state=42) # taking only 1/3 because whole set is taking about 48hours to train
df.head()

Unnamed: 0,text,summary
146201,"MISSISSAUGA, Ontario, June 4 -- Qayyum Abdul J...","MISSISSAUGA, Ontario, June 4 -- Qayyum Abdul J..."
468184,"Welcome to Apartment Life, an online discussio...","Welcome to Apartment Life, an online discussio..."
109069,The biggest traffic jam likely to occur during...,The biggest traffic jam likely to occur during...
267027,1. Conventional wisdom has taken such a beatin...,By Staff Writer Dan Balz What will it take to ...
239711,"Bravo's ""Project Runway,"" which begins its thi...","Search Washington, DC area TV schedules and re..."


In [47]:
# Step 2: Tokenization and Dataset Preparation
model_name = "facebook/bart-large-cnn"
tokenizer = BartTokenizer.from_pretrained(model_name)

# Tokenize dataset
def tokenize_function(examples):
    model_inputs = tokenizer(
        examples["text"],
        max_length=1024,
        truncation=True,
        padding="max_length"
    )
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            examples["summary"],
            max_length=128,
            truncation=True,
            padding="max_length"
        )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Convert to Hugging Face Dataset
dataset = Dataset.from_pandas(df)
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Split dataset
train_test_split = tokenized_dataset.train_test_split(test_size=0.2)
train_dataset = train_test_split["train"]
eval_dataset = train_test_split["test"]


Map:   0%|          | 0/156260 [00:00<?, ? examples/s]



In [48]:
tokenized_dataset.save_to_disk("tokenized_data")

Saving the dataset (0/5 shards):   0%|          | 0/156260 [00:00<?, ? examples/s]

In [49]:
tokenized_dataset = Dataset.load_from_disk("tokenized_data")
train_test_split = tokenized_dataset.train_test_split(test_size=0.2)
train_dataset = train_test_split["train"]
eval_dataset = train_test_split["test"]

In [50]:
model = BartForConditionalGeneration.from_pretrained(model_name)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
model = model.to(device)

cuda


In [61]:
# Step 4: Fine-Tuning the Model

training_args = TrainingArguments(
    output_dir="./results",          # Output directory
    eval_strategy="epoch",    # Evaluation strategy
    learning_rate=5e-5,              # Learning rate
    per_device_train_batch_size=12,   # Batch size
    per_device_eval_batch_size=12,    # Evaluation batch size
    num_train_epochs=3,              # Number of epochs
    weight_decay=0.01,               # Weight decay
    save_total_limit=1,              # Limit on saved checkpoints
    logging_dir="./logs",           # Directory for logs
    logging_steps=10,
    evaluation_strategy="epoch",
    fp16=True,  # Enable mixed precision
)



In [62]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
)

# Train the model
trainer.train()

# Step 5: Save the Fine-Tuned Model
model.save_pretrained("./fine_tuned_bart")
tokenizer.save_pretrained("./fine_tuned_bart")

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,0.0079,0.062466
2,0.0006,0.044154
3,0.0,0.040199




('./fine_tuned_bart\\tokenizer_config.json',
 './fine_tuned_bart\\special_tokens_map.json',
 './fine_tuned_bart\\vocab.json',
 './fine_tuned_bart\\merges.txt',
 './fine_tuned_bart\\added_tokens.json')

In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
def summarize_text(text, trained_model):
    """Generate a summary for the given text."""
    inputs = tokenizer(text, max_length=1024, return_tensors="pt", truncation=True).to(device)
    summary_ids = trained_model.generate(inputs["input_ids"], max_length=128, min_length=30, length_penalty=2.0, num_beams=4, early_stopping=True)
    print("Generated summary IDs:", summary_ids)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

model_name = "facebook/bart-large-cnn"
tokenizer = BartTokenizer.from_pretrained("./fine_tuned_bart")
model = BartForConditionalGeneration.from_pretrained("./fine_tuned_bart", use_safetensors=True).to(device)

sample = df.sample(1).iloc[0]
sample_text = sample["text"]

generated_summary = summarize_text(sample.text, model)
print("Generated Summary:", generated_summary)
print("Original summary:", sample['summary'])



Generated summary IDs: tensor([[    2,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0, 25887,  9752,
          1238,  3710,  4585,     9,    45,   519,    22, 18062,    62,     7,
            49,  9061,     7,  1157,   162,     7,   109, 24821,   173,    89,
           113,    25,     5,  1546,   585,    39,  5824,    71,    10,  3550,
            12,   180,   756,     4,     2]], device='cuda:0')
Generated Summary: Dan Rather accused CBS executives of not having "lived up to their obligation to allow me to do substantive work there" as the network announced his departure after a 44-year career.
Original summary: Dan Rather accused CBS executives of not having "lived up to their obligation to allow me to do substantive work there" as the network announ