# BART

https://huggingface.co/facebook/bart-base

## Import libraries

In [None]:
!pip install evaluate
!pip install rouge
!pip install rouge-score

Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-2.16.1-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
Collecting dill (from evaluate)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19 (from evaluate)
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Installing collected packages: dill, responses, mu

In [None]:
import nltk # Imports the library
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import warnings
import os

import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import TensorDataset, DataLoader
from transformers import BartTokenizer, BartForConditionalGeneration, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from rouge_score import rouge_scorer
from torch.cuda.amp import autocast, GradScaler
from torch.nn.utils import clip_grad_norm_

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

## Read dataframe from drive folder

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
df = pd.read_csv('/content/drive/MyDrive/Text Mining Project/raw_data.csv')

In [None]:
df.head()

Unnamed: 0,category,article_id,text,summary
0,business,71,Japan turns to beer alternatives\n \n Japanese...,Asahi is predicting profits to rise 50% in 200...
1,business,394,US Airways staff agree to pay cut\n \n A union...,The seventh largest carrier in the US sought b...
2,business,129,Iraq to invite phone licence bids\n \n Iraq is...,The ministry said that it wanted to increase I...
3,business,463,US economy still growing says Fed\n \n Most ar...,Most areas of the US saw their economy continu...
4,business,177,Optimism remains over UK housing\n \n The UK p...,"Wimpey said the UK housing market had proved ""..."


## Tokenize and preprocess the text data

In [None]:
# Define the device for GPU usage (if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Tokenize and preprocess the text data
tokenizer = AutoTokenizer.from_pretrained('facebook/bart-base')
max_length = 512  # Maximum sequence length

def tokenize_text(text):
    inputs = tokenizer.encode("summarize: " + text, return_tensors="pt", max_length=512, truncation=True, padding='max_length', return_attention_mask=True)
    return inputs.to(device)  # Move the tokenized inputs to the GPU

def tokenize_summary(text):
    inputs = tokenizer.encode(text, return_tensors="pt", max_length=280, truncation=True, padding='max_length', return_attention_mask=True)
    return inputs.to(device)  # Move the tokenized summaries to the GPU


df['TokenizedText'] = df['text'].apply(tokenize_text)
df['TokenizedSummary'] = df['summary'].apply(tokenize_summary)

# Split your data into train and test sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Convert tokenized data to PyTorch tensors
X_train = torch.stack([seq.squeeze() for seq in train_df['TokenizedText']])
Y_train = torch.stack([seq.squeeze() for seq in train_df['TokenizedSummary']])
X_test = torch.stack([seq.squeeze() for seq in test_df['TokenizedText']])
Y_test = torch.stack([seq.squeeze() for seq in test_df['TokenizedSummary']])

# Define a DataLoader for batching data
train_dataset = TensorDataset(X_train, Y_train)
train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)
test_dataset = TensorDataset(X_test, Y_test)
test_dataloader = DataLoader(test_dataset, batch_size=4)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

## BART model

In [None]:
# Define the BART model
model = AutoModelForSeq2SeqLM.from_pretrained('facebook/bart-base')

# Create a GradScaler for mixed-precision training
scaler = GradScaler()

# Define hyperparameters
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)  # Move the model to the GPU
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=50, num_training_steps=len(train_dataloader) * 10)  # Add learning rate scheduler
early_stopping_rounds = 2
best_rouge_score = -1
current_round = 0

# Define gradient accumulation steps
accumulation_steps = 20

def train(model, dataloader, optimizer, scheduler):
    model.train()
    total_loss = 0.0
    optimizer.zero_grad()

    for step, batch in enumerate(tqdm(dataloader, desc="Training")):
        inputs = batch[0].to(device)  # Move the input batch to the GPU
        attention_mask = (inputs != 0).float().to(device)  # Create attention mask
        targets = batch[1].to(device)  # Move the target batch to the GPU

        with autocast():
            outputs = model(input_ids=inputs, attention_mask=attention_mask, decoder_input_ids=targets, labels=targets)
            loss = outputs.loss

        # Perform gradient accumulation
        loss = loss / accumulation_steps
        scaler.scale(loss).backward()

        if (step + 1) % accumulation_steps == 0:
            # Update gradients and optimizer once every accumulation_steps
            clip_grad_norm_(model.parameters(), max_norm=1.0)  # Optional gradient clipping
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()

        total_loss += loss.item()

    return total_loss / len(dataloader)

def calculate_rouge1_precision(logits, targets):
    scorer = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True)
    rouge1_precision = 0.0
    num_samples = len(logits)

    for i in range(num_samples):
        predicted_ids = logits[i].cpu().numpy()
        target_ids = targets[i].cpu().numpy()

        # Convert token IDs to strings
        predicted_text = tokenizer.decode(predicted_ids, skip_special_tokens=True)
        target_text = tokenizer.decode(target_ids, skip_special_tokens=True)

        # Calculate ROUGE-1 precision
        scores = scorer.score(predicted_text, target_text)
        rouge1_precision += scores['rouge1'].precision

    return rouge1_precision / num_samples

# Training loop
for epoch in range(10):  # Change the number of epochs as needed
    train_loss = train(model, train_dataloader, optimizer, scheduler)
    print(f"Epoch {epoch+1}/{9}, Train Loss: {train_loss:.4f}")

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

Training: 100%|██████████| 445/445 [01:22<00:00,  5.40it/s]


Epoch 1/9, Train Loss: 0.6415


Training: 100%|██████████| 445/445 [01:23<00:00,  5.35it/s]


Epoch 2/9, Train Loss: 0.6414


Training: 100%|██████████| 445/445 [01:23<00:00,  5.35it/s]


Epoch 3/9, Train Loss: 0.6396


Training: 100%|██████████| 445/445 [01:23<00:00,  5.34it/s]


Epoch 4/9, Train Loss: 0.6409


Training: 100%|██████████| 445/445 [01:23<00:00,  5.34it/s]


Epoch 5/9, Train Loss: 0.6403


Training: 100%|██████████| 445/445 [01:23<00:00,  5.35it/s]


Epoch 6/9, Train Loss: 0.6399


Training: 100%|██████████| 445/445 [01:23<00:00,  5.34it/s]


Epoch 7/9, Train Loss: 0.6392


Training: 100%|██████████| 445/445 [01:23<00:00,  5.34it/s]


Epoch 8/9, Train Loss: 0.6398


Training: 100%|██████████| 445/445 [01:23<00:00,  5.35it/s]


Epoch 9/9, Train Loss: 0.6410


Training: 100%|██████████| 445/445 [01:23<00:00,  5.35it/s]

Epoch 10/9, Train Loss: 0.6398





## Evaluation

In [None]:
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer

In [None]:
def evaluate(model, dataloader):
    model.eval()

    test_articles = []
    actual_summaries = []
    predicted_summaries = []
    rouge1_precision_scores = []

    scorer = rouge_scorer.RougeScorer(['rouge1'])

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating Test"):
            inputs = batch[0].to(device)
            attention_mask = (inputs != 0).float().to(device)
            targets = batch[1].to(device)
            outputs = model.generate(input_ids=inputs, attention_mask=attention_mask, max_length=150, num_beams=17, length_penalty=2.0, early_stopping=False)

            for output, target, input_text in zip(outputs, targets, inputs):
                # Calculate ROUGE-1 precision for each sample
                output_text = tokenizer.decode(output, skip_special_tokens=True)
                target_text = tokenizer.decode(target, skip_special_tokens=True)
                rouge_scores = scorer.score(output_text, target_text)
                rouge1_precision_scores.append(rouge_scores['rouge1'].precision)

                # Append tokenized text, actual summaries, and predicted summaries
                test_articles.append(tokenizer.decode(input_text, skip_special_tokens=True))
                actual_summaries.append(target_text)
                predicted_summaries.append(output_text)

    return test_articles, actual_summaries, predicted_summaries, rouge1_precision_scores

# Evaluate the model
test_articles, actual_summaries, predicted_summaries, rouge1_precision_scores = evaluate(model, test_dataloader)

# Create a dictionary with the extracted data
data = {
    'Article': test_articles,
    'Actual Summary': actual_summaries,
    'Predicted Summary': predicted_summaries,
    'ROUGE-1 Precision': rouge1_precision_scores,
}

# Create a Pandas DataFrame from the dictionary
results_df = pd.DataFrame(data)

# Display the DataFrame
pd.set_option('display.max_colwidth', None)
results_df.head(5)

Evaluating Test:  47%|████▋     | 53/112 [05:11<05:47,  5.90s/it]

In [None]:
pd.set_option('display.max_colwidth', 100)

results_df.head(20)

In [None]:
# Evaluation using BLEU and Rouge
references = results_df['Actual Summary'].tolist()
hypotheses = results_df['Predicted Summary'].tolist()

# BLEU score
bleu_scores = [sentence_bleu([ref.split()], hyp.split()) for ref, hyp in zip(references, hypotheses)]
average_bleu = sum(bleu_scores) / len(bleu_scores)

# Rouge score
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
rouge_scores = [scorer.score(ref, hyp) for ref, hyp in zip(references, hypotheses)]
average_rouge = {
    'rouge1': sum(score['rouge1'].fmeasure for score in rouge_scores) / len(rouge_scores),
    'rouge2': sum(score['rouge2'].fmeasure for score in rouge_scores) / len(rouge_scores),
    'rougeL': sum(score['rougeL'].fmeasure for score in rouge_scores) / len(rouge_scores),
}

# Print or use the scores as needed
print(f"Average BLEU Score: {average_bleu}")
print(f"Average Rouge Scores: {average_rouge}")

We save the dataframe with the summaries generated by pre-trained BART model.

In [14]:
results_df.to_csv('BART_summarized.csv', index=False)