<a href="https://colab.research.google.com/github/phuonganh412/summarisation-model/blob/main/Summarization_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Text Summarization with BART using PyTorch and Hugging Face Transformers


## 1. Libraries and Dependencies

In [23]:
!pip install transformers torch




In [24]:
!pip install transformers sentencepiece pandas



In [25]:
!pip install transformers[torch] accelerate==0.20.1




In [26]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BartTokenizer, BartForConditionalGeneration
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from tqdm import tqdm
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader

## 2. Set device

In [27]:
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


## 3. Load Data

In [28]:
import pandas as pd

# Load the dataset from a CSV file containing 'original_text' and 'summary' columns
df = pd.read_csv('sample_data/all_v1.csv')
train_texts = df['original_text'].tolist()
train_summaries = df['reference_summary'].tolist()


## 4. Split Data

In [29]:
# Split the data into training and validation sets
train_data, val_data = train_test_split(df, test_size=0.1, random_state=42)


## 5. Define Custom Dataset

In [30]:
# Create a custom dataset class for text summarization
class SummarizationDataset(Dataset):
    def __init__(self, data, tokenizer, max_input_length=512, max_output_length=150):
        self.data = data
        self.tokenizer = tokenizer
        self.max_input_length = max_input_length
        self.max_output_length = max_output_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        # Retrieve original text and corresponding summary
        original_text = self.data.iloc[index]['original_text']
        summary = self.data.iloc[index]['reference_summary']

        # Tokenize the input and output sequences
        inputs = self.tokenizer(original_text, max_length=self.max_input_length, return_tensors='pt', truncation=True)
        targets = self.tokenizer(summary, max_length=self.max_output_length, return_tensors='pt', truncation=True)

        # Padding for variable-length sequences
        padding_length = max(self.max_input_length - len(inputs['input_ids'][0]), 0)
        inputs['input_ids'] = torch.nn.functional.pad(inputs['input_ids'], (0, padding_length), value=tokenizer.pad_token_id)
        inputs['attention_mask'] = torch.nn.functional.pad(inputs['attention_mask'], (0, padding_length), value=0)

        padding_length = max(self.max_output_length - len(targets['input_ids'][0]), 0)
        targets['input_ids'] = torch.nn.functional.pad(targets['input_ids'], (0, padding_length), value=tokenizer.pad_token_id)

        return {
            'input_ids': inputs['input_ids'].squeeze(),
            'attention_mask': inputs['attention_mask'].squeeze(),
            'labels': targets['input_ids'].squeeze()
        }


## 6. Load BART Tokenizer and Model

In [31]:
# Load the BART tokenizer and model from Hugging Face Transformers
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn').to(device)


## 7. Create Datasets and Data Loaders

In [32]:
# Create datasets and data loaders for training and validation
train_dataset = SummarizationDataset(train_data, tokenizer)
val_dataset = SummarizationDataset(val_data, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=2, shuffle=False)


## 8. Training Parameters

In [33]:
# Define training parameters
epochs = 3
learning_rate = 1e-5
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)


## 9. Training Loop

In [34]:
# Training loop to train the text summarization model
for epoch in range(epochs):
    model.train()
    total_loss = 0.0
    for batch in tqdm(train_loader, desc=f'Epoch {epoch + 1}', unit='batch'):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        # Forward pass through the model
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Backward pass and optimization step
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    average_loss = total_loss / len(train_loader)
    print(f'Epoch {epoch + 1}, Loss: {average_loss}')

Epoch 1: 100%|██████████| 201/201 [02:14<00:00,  1.50batch/s]


Epoch 1, Loss: 1.4555568475628373


Epoch 2: 100%|██████████| 201/201 [02:07<00:00,  1.57batch/s]


Epoch 2, Loss: 0.2852138769789715


Epoch 3: 100%|██████████| 201/201 [02:07<00:00,  1.57batch/s]

Epoch 3, Loss: 0.1866249622804905





## 10. Save Trained Model

In [35]:
# Save the trained text summarization model
model.save_pretrained('summarization_model')
tokenizer.save_pretrained('summarization_model')


('summarization_model/tokenizer_config.json',
 'summarization_model/special_tokens_map.json',
 'summarization_model/vocab.json',
 'summarization_model/merges.txt',
 'summarization_model/added_tokens.json')

## 11. Inference

In [36]:
# Inference to generate summaries for the validation dataset
model.eval()
generated_summaries = []


In [37]:
# Inference
model.eval()
generated_summaries = []

for batch in tqdm(val_loader, desc='Inference', unit='batch'):
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)

    # Generate summaries using the trained model
    summary_ids = model.generate(input_ids, attention_mask=attention_mask, max_length=150, num_beams=2, length_penalty=2.0)
    summaries = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids]
    generated_summaries.extend(summaries)

Inference: 100%|██████████| 23/23 [00:34<00:00,  1.51s/batch]


## 12. Evaluate the Model

In [38]:
pip install rouge




In [39]:
from rouge import Rouge

rouge = Rouge()
references = val_data['reference_summary'].tolist()
scores = rouge.get_scores(generated_summaries, references, avg=True)

print("ROUGE Scores:", scores)

ROUGE Scores: {'rouge-1': {'r': 0.4827886689764848, 'p': 0.21254628859046049, 'f': 0.2785428388827907}, 'rouge-2': {'r': 0.2565510633846612, 'p': 0.08235111170360826, 'f': 0.11482782443106589}, 'rouge-l': {'r': 0.44957968849054586, 'p': 0.19777037585099388, 'f': 0.25940470837828683}}


Here's how to interpret these scores:

Recall (r): The fraction of the reference summary that is correctly captured by the generated summary. For example, in ROUGE-1, approximately 49% of the unigrams in the reference summary are also present in the generated summary.

Precision (p): The fraction of the generated summary that correctly corresponds to the reference summary. In ROUGE-1, about 22% of the unigrams in the generated summary are also present in the reference summary.

F1 Score (f): The harmonic mean of recall and precision. It is a balanced measure that considers both false positives and false negatives.

These scores range from 0 to 1, where a higher score indicates a better match between the generated and reference summaries.


## 13. Run Demo

In [43]:
# Load the saved model and tokenizer
loaded_model = BartForConditionalGeneration.from_pretrained('summarization_model')
loaded_tokenizer = BartTokenizer.from_pretrained('summarization_model', model_max_length=512)


In [44]:
# Example input text of policy for inference
input_text = "the app permits the purchase of virtual currency virtual money and use of that virtual money to purchase virtual items or services that we expressly make available for use in the app virtual goods. the purchase of virtual money and virtual goods is limited to account holders who are either a 18 years of age or older or b under the age of 18 and have the consent of a parent to make the purchase. parents of children under the age of 18 can consult the ios or google play settings for their app to restrict in app purchases but should also monitor their children s accounts for unexpected activity including the purchase of virtual money or virtual goods. purchases of virtual money and virtual goodsvirtual money is a category of content so the purchase of virtual money grants you only a limited nontransferable non sublicensable revocable license to use such virtual money to access and purchase virtual goods in conjunction with your personal noncommercial use of the services. you acknowledge that you do not acquire any ownership rights in or to the virtual money virtual goods or other content any balance of virtual goods or virtual money does not reflect any stored value. you agree that virtual money and virtual goods have no monetary value and do not constitute actual currency or property of any type. virtual money may be redeemed only for virtual goods and can never be sold transferred or exchanged for real money real goods or real services from us or anyone else. you also agree that you will only obtain virtual money and or virtual goods from us and through means provided by us and not from any third party platform exchange broker or other mechanism unless expressly authorized. once you acquire a license to virtual money or virtual goods you may not trade or transfer the virtual money or virtual goods to another individual or account unless such functionality is provided to you by us by way of a feature or service whether inside the app or through some other method e g our website. we may cancel any virtual money or virtual goods sold transferred or exchanged in violation of these terms. any such sale transfer or exchange or attempt to do so is prohibited and may result in the termination of your account. during the term of your license to your virtual money you have the right to redeem your virtual money for selected virtual goods. if you are the parent and you are accepting these terms on behalf of your child you accept and acknowledge that your child has your consent to exercise this right independently. pricing and availability of virtual money and virtual goods are subject to change without notice. we reserve the right at any time to change and update our pricing and inventory of virtual money and virtual goods. as set forth below all virtual money virtual goods and other content is provided as is without any warranty. you agree that all sales by us to you of virtual money and virtual goods are final and that we will not permit exchanges or refunds for any unused virtual money or virtual goods once the transaction has been made. purchases by end users outside the u s virtual money and virtual goods may only be purchased and held by legal residents of countries where access to and use of the services are permitted. if you live in the european union you have certain rights to withdraw from online purchases. however please note that once you download virtual money from us your right of withdrawal ends. you agree that a purchase of virtual money involves immediate download of such content and b you lose your right of withdrawal once your purchase is complete. if you live in the european union we will provide you with a vat invoice when we are required to do so by law. you agree that these invoices may be electronic in format. we reserve the right to control regulate change or remove any virtual money or virtual goods without any liability to you."

# Tokenize and generate summary
input_ids = loaded_tokenizer(input_text, max_length=512, return_tensors='pt', truncation=True).input_ids
attention_mask = loaded_tokenizer(input_text, max_length=512, return_tensors='pt', truncation=True).attention_mask

summary_ids = loaded_model.generate(input_ids, attention_mask=attention_mask, max_length=100, num_beams=2, length_penalty=2.0)
generated_summary = loaded_tokenizer.decode(summary_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)

# Print the results
print("Input Text:", input_text)
print("Generated Summary:", generated_summary)


Input Text: the app permits the purchase of virtual currency virtual money and use of that virtual money to purchase virtual items or services that we expressly make available for use in the app virtual goods. the purchase of virtual money and virtual goods is limited to account holders who are either a 18 years of age or older or b under the age of 18 and have the consent of a parent to make the purchase. parents of children under the age of 18 can consult the ios or google play settings for their app to restrict in app purchases but should also monitor their children s accounts for unexpected activity including the purchase of virtual money or virtual goods. purchases of virtual money and virtual goodsvirtual money is a category of content so the purchase of virtual money grants you only a limited nontransferable non sublicensable revocable license to use such virtual money to access and purchase virtual goods in conjunction with your personal noncommercial use of the services. you a