In [1]:
!pip install tensorflow torch transformers datasets nltk rouge-score scikit-learn

Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... done
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-2.21.0-py3-none-any.whl (527 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.3/527.3 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m 

In [None]:
from datasets import load_dataset
from transformers import BartTokenizer

# CNN/DailyMail corpus, config 3.0.0 (provides the 'article' and 'highlights' columns used below)
dataset = load_dataset(path='cnn_dailymail', name='3.0.0')

# Tokenizer belonging to the same BART checkpoint that is fine-tuned later in the notebook
tokenizer = BartTokenizer.from_pretrained(
    pretrained_model_name_or_path='facebook/bart-large-cnn',
)

# Preprocessing function
def preprocess_function(examples):
    """Tokenize a batch of articles and attach the tokenized highlights as labels.

    Args:
        examples: batch mapping with 'article' and 'highlights' entries.

    Returns:
        The tokenizer output for the articles (truncated to 1024 tokens), with an
        extra 'labels' entry holding the highlight token ids (truncated to 150).
    """
    article_encoding = tokenizer(examples['article'], truncation=True, max_length=1024)
    highlight_encoding = tokenizer(examples['highlights'], truncation=True, max_length=150)

    # The summary token ids are the training targets for the articles.
    article_encoding['labels'] = highlight_encoding['input_ids']
    return article_encoding

# Tokenize the dataset; batched=True hands preprocess_function whole batches of examples
processed_dataset = dataset.map(function=preprocess_function, batched=True)


In [None]:
from transformers import BartForConditionalGeneration, Trainer, TrainingArguments

# Pre-trained BART sequence-to-sequence weights that the Trainer below fine-tunes
model = BartForConditionalGeneration.from_pretrained(
    pretrained_model_name_or_path='facebook/bart-large-cnn',
)

# Hyperparameters and bookkeeping for the fine-tuning run
training_args = TrainingArguments(
    # Output locations and checkpoint retention
    output_dir='./results',
    logging_dir='./logs',
    save_total_limit=1,
    # Schedule: three epochs, evaluate after each one, log every 100 steps
    num_train_epochs=3,
    evaluation_strategy='epoch',
    logging_steps=100,
    # Per-device batch sizes
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
)

# Trainer
# The examples were tokenized without padding, so their lengths differ. A seq2seq
# collator pads inputs and labels per batch; without it, the default collator
# cannot stack sequences of unequal length into tensors.
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=processed_dataset['train'],
    eval_dataset=processed_dataset['validation'],
    data_collator=data_collator,
)

# Train the model
trainer.train()


In [None]:
from datasets import load_metric

# ROUGE scorer consumed by compute_metrics below
rouge = load_metric(path='rouge')

def compute_metrics(eval_pred):
    """Decode predictions and references, then score them with ROUGE.

    Args:
        eval_pred: ``(predictions, labels)`` pair. ``predictions`` may be raw
            logits with a trailing vocabulary axis, a tuple whose first element
            is those logits, or a 2-D array of token ids. ``labels`` holds the
            reference token ids.

    Returns:
        dict mapping each ROUGE key to a plain number. Aggregate scores (objects
        with a ``mid`` attribute) are reduced to their mid F-measure.
    """
    import numpy as np  # local import: this cell has no numpy import of its own

    predictions, labels = eval_pred

    # A model that returns several outputs yields a tuple; the logits come first.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.asarray(predictions)

    # Raw logits carry a vocabulary axis: collapse it to the most likely token id.
    if predictions.ndim == 3:
        predictions = predictions.argmax(axis=-1)

    # Negative ids (such as the -100 ignore index used for padding) are not part
    # of the vocabulary and cannot be decoded, so swap them for the pad token.
    pad_id = tokenizer.pad_token_id
    predictions = np.where(predictions < 0, pad_id, predictions)
    labels = np.asarray(labels)
    labels = np.where(labels < 0, pad_id, labels)

    decoded_predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_references = tokenizer.batch_decode(labels, skip_special_tokens=True)
    scores = rouge.compute(predictions=decoded_predictions, references=decoded_references)

    # Report plain numbers so the metrics can be logged like any other scalar.
    return {
        name: (score.mid.fmeasure if hasattr(score, 'mid') else score)
        for name, score in scores.items()
    }

def _logits_to_token_ids(logits, labels):
    """Reduce one evaluation batch of model outputs to greedy token ids."""
    # A model that returns several outputs yields a tuple; the logits come first.
    if isinstance(logits, tuple):
        logits = logits[0]
    token_ids = logits.argmax(dim=-1)
    # Where the reference is padding (negative ignore index) the prediction is meaningless.
    if labels is not None and labels.shape == token_ids.shape:
        token_ids = token_ids.masked_fill(labels < 0, tokenizer.pad_token_id)
    return token_ids

# Without this hook the Trainer accumulates the full-vocabulary logits of every
# evaluation batch, which is far too large to hold for a whole validation split.
trainer.preprocess_logits_for_metrics = _logits_to_token_ids
trainer.compute_metrics = compute_metrics
eval_results = trainer.evaluate()
print(eval_results)


In [None]:
from datasets import load_metric

# ROUGE scorer consumed by compute_metrics below
rouge = load_metric(path='rouge')

def compute_metrics(eval_pred):
    """Decode predictions and references, then score them with ROUGE.

    Args:
        eval_pred: ``(predictions, labels)`` pair. ``predictions`` may be raw
            logits with a trailing vocabulary axis, a tuple whose first element
            is those logits, or a 2-D array of token ids. ``labels`` holds the
            reference token ids.

    Returns:
        dict mapping each ROUGE key to a plain number. Aggregate scores (objects
        with a ``mid`` attribute) are reduced to their mid F-measure.
    """
    import numpy as np  # local import: this cell has no numpy import of its own

    predictions, labels = eval_pred

    # A model that returns several outputs yields a tuple; the logits come first.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.asarray(predictions)

    # Raw logits carry a vocabulary axis: collapse it to the most likely token id.
    if predictions.ndim == 3:
        predictions = predictions.argmax(axis=-1)

    # Negative ids (such as the -100 ignore index used for padding) are not part
    # of the vocabulary and cannot be decoded, so swap them for the pad token.
    pad_id = tokenizer.pad_token_id
    predictions = np.where(predictions < 0, pad_id, predictions)
    labels = np.asarray(labels)
    labels = np.where(labels < 0, pad_id, labels)

    decoded_predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_references = tokenizer.batch_decode(labels, skip_special_tokens=True)
    scores = rouge.compute(predictions=decoded_predictions, references=decoded_references)

    # Report plain numbers so the metrics can be logged like any other scalar.
    return {
        name: (score.mid.fmeasure if hasattr(score, 'mid') else score)
        for name, score in scores.items()
    }

# NOTE(review): this cell re-runs the same evaluation as the previous cell and can
# likely be removed.
def _logits_to_token_ids(logits, labels):
    """Reduce one evaluation batch of model outputs to greedy token ids."""
    # A model that returns several outputs yields a tuple; the logits come first.
    if isinstance(logits, tuple):
        logits = logits[0]
    token_ids = logits.argmax(dim=-1)
    # Where the reference is padding (negative ignore index) the prediction is meaningless.
    if labels is not None and labels.shape == token_ids.shape:
        token_ids = token_ids.masked_fill(labels < 0, tokenizer.pad_token_id)
    return token_ids

# Without this hook the Trainer accumulates the full-vocabulary logits of every
# evaluation batch, which is far too large to hold for a whole validation split.
trainer.preprocess_logits_for_metrics = _logits_to_token_ids
trainer.compute_metrics = compute_metrics
eval_results = trainer.evaluate()
print(eval_results)
