In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Install Required Libraries


In [2]:
# Explicit %pip magic: does not depend on IPython automagic resolving a bare `pip`.
%pip install -q transformers datasets

Note: you may need to restart the kernel to use updated packages.


# Import Libraries

In [3]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import load_dataset

# Load the Dataset

Summarization data (CNN/DailyMail) loaded from the Hugging Face Hub.

In [11]:
# Load a small slice (split "train[:1%]") of CNN/DailyMail, config "3.0.0"
dataset = load_dataset(
    path="cnn_dailymail",
    name="3.0.0",
    split="train[:1%]",
)

# Preprocess function
def preprocess_data(examples):
    """Tokenize a batch of articles and their reference summaries.

    Reads the module-level ``tokenizer``.

    Args:
        examples: Batch mapping with an "article" list (source texts) and a
            "highlights" list (target summaries).

    Returns:
        The tokenizer output for the prefixed articles with an added
        "labels" entry holding the target token ids, where padding positions
        are replaced by -100.
    """
    # T5 is prompted with a task prefix, so prepend it to every article.
    prefixed_articles = []
    for article in examples["article"]:
        prefixed_articles.append("summarize: " + article)

    # Sources are padded/truncated to exactly 512 tokens.
    encoded = tokenizer(prefixed_articles, max_length=512, padding="max_length", truncation=True)

    # Targets are padded/truncated to exactly 128 tokens.
    target_ids = tokenizer(
        examples["highlights"], max_length=128, padding="max_length", truncation=True
    ).input_ids

    # Swap pad ids for -100 so padded positions are ignored in the loss calculation.
    pad_id = tokenizer.pad_token_id
    masked_targets = []
    for sequence in target_ids:
        masked_targets.append([-100 if token == pad_id else token for token in sequence])

    encoded["labels"] = masked_targets
    return encoded


# Tokenizer for the "t5-small" checkpoint; preprocess_data reads it as a global
tokenizer = T5Tokenizer.from_pretrained("t5-small")

# Run preprocess_data over the dataset in batches of examples
dataset = dataset.map(
    function=preprocess_data,
    batched=True,
)


Map:   0%|          | 0/2871 [00:00<?, ? examples/s]

# Display a Dataset Sample

In [20]:
# Load the raw dataset for display (if not already loaded)
from datasets import load_dataset

# Keep the raw data under its own name. Reassigning `dataset` here would replace
# the tokenized dataset with untokenized rows, and a top-to-bottom run would then
# hand the Trainer data that has no input_ids/labels.
raw_dataset = load_dataset("cnn_dailymail", "3.0.0", split="train[:1%]")

# Select the first 5 samples
dataset_sample = raw_dataset.select(range(5))

# Convert to a pandas DataFrame for easy viewing
dataset_sample_df = dataset_sample.to_pandas()

# Show the sample
dataset_sample_df


Unnamed: 0,article,highlights,id
0,"LONDON, England (Reuters) -- Harry Potter star...",Harry Potter star Daniel Radcliffe gets £20M f...,42c027e4ff9730fbb3de84c1af0d2c506e41c3e4
1,Editor's note: In our Behind the Scenes series...,Mentally ill inmates in Miami are housed on th...,ee8871b15c50d0db17b0179a6d2beab35065f1e9
2,"MINNEAPOLIS, Minnesota (CNN) -- Drivers who we...","NEW: ""I thought I was going to die,"" driver sa...",06352019a19ae31e527f37f7571c6dd7f0c5da37
3,WASHINGTON (CNN) -- Doctors removed five small...,"Five small polyps found during procedure; ""non...",24521a2abb2e1f5e34e6824e0f9e56904a2b0e88
4,(CNN) -- The National Football League has ind...,"NEW: NFL chief, Atlanta Falcons owner critical...",7fe70cc8b12fab2d0a258fababf7d9c6b5e1262a


# Initialize the Model

You can choose a T5 model variant based on your compute resources, such as `t5-small`, `t5-base`, or `t5-large`.

In [12]:
# Load the pre-trained "t5-small" checkpoint; this is the model that gets fine-tuned below.
model = T5ForConditionalGeneration.from_pretrained("t5-small")

# Define Training Arguments

Set up the hyperparameters for training. Adjust values like learning_rate, num_train_epochs, and per_device_train_batch_size as needed.

In [16]:
# Hyperparameters for the fine-tuning run, grouped by purpose.
training_args = TrainingArguments(
    # Output locations; report_to="none" selects no reporting integration.
    output_dir="./results",
    logging_dir="./logs",
    report_to="none",
    # Optimisation settings.
    num_train_epochs=2,
    learning_rate=3e-4,
    weight_decay=0.01,
    # Per-device batch sizes for training and evaluation.
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
)

# Create the Trainer

The `Trainer` class manages the training loop, making it easy to handle the model, arguments, datasets, and other elements.

In [17]:
# Bundle the model, tokenizer, data and hyperparameters into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    # NOTE: evaluation reuses the training data here; use a separate eval dataset in practice.
    eval_dataset=dataset,
)


# Train the Model
Fine-tune the model using the dataset and arguments defined above.

In [None]:
# Run the fine-tuning loop; as the cell's last expression, the returned value is displayed.
trainer.train()

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Step,Training Loss
500,1.8461


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


# Evaluate the Model

In [21]:
# Compute metrics on the Trainer's eval_dataset (in this notebook, the same data used for training).
trainer.evaluate()

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


{'eval_loss': 1.5671461820602417,
 'eval_runtime': 36.6089,
 'eval_samples_per_second': 78.424,
 'eval_steps_per_second': 9.806,
 'epoch': 3.0}

# Save the Model

In [22]:
# Save the model and the tokenizer into the same directory so both can be
# reloaded later from "./fine-tuned-t5".
model.save_pretrained("./fine-tuned-t5")
tokenizer.save_pretrained("./fine-tuned-t5")

('./fine-tuned-t5/tokenizer_config.json',
 './fine-tuned-t5/special_tokens_map.json',
 './fine-tuned-t5/spiece.model',
 './fine-tuned-t5/added_tokens.json')

# Generate Predictions

In [24]:
# Prefer the GPU when one is present, otherwise run on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# The model and the input tensor passed to generate() must be on the same device
model.to(device)

# Passage to summarize
text = """
Artificial intelligence (AI) has revolutionized various industries by automating complex tasks and analyzing vast amounts of data with precision. From healthcare to finance, AI-driven solutions provide valuable insights and streamline processes. 
For instance, in healthcare, AI aids in early disease detection through image analysis, while in finance, it predicts market trends and detects fraud. However, the integration of AI raises ethical considerations, such as job displacement and privacy concerns. 
Organizations and policymakers must balance innovation with responsibility to ensure AI benefits society at large.
"""

# Same task prefix that preprocess_data adds to the training articles
input_text = f"summarize: {text}"

# Encode as PyTorch tensors (truncated to at most 512 tokens) and move the ids to the device
inputs = (
    tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
    .input_ids
    .to(device)
)

# Beam search with 2 beams, max_length of 50
outputs = model.generate(inputs, max_length=50, num_beams=2, early_stopping=True)

# Turn the first generated sequence back into text, dropping special tokens
summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Summary:", summary)


Summary: AI-driven solutions provide valuable insights and streamline processes. In healthcare, AI aids in early disease detection through image analysis. In finance, AI predicts market trends and detects fraud. Organizations and policymakers must balance innovation


In [25]:
# Whitespace-delimited word counts for the source passage and its generated summary
original_word_count, summary_word_count = (
    len(passage.split()) for passage in (text, summary)
)

# Report both counts
print("Original Text Word Count:", original_word_count)
print("Summary Word Count:", summary_word_count)

Original Text Word Count: 84
Summary Word Count: 34


# Compare Summaries Before and After Fine-Tuning

In [27]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration


def _summarize(model, tokenizer, input_ids, max_length=50, num_beams=2):
    """Generate a summary for already-encoded input and decode it to text.

    Shared by both models so they are compared with identical generation
    settings.

    Args:
        model: Model exposing ``generate`` (here a T5ForConditionalGeneration).
        tokenizer: Tokenizer used to decode the generated ids.
        input_ids: Encoded input tensor, already on the model's device.
        max_length: ``max_length`` passed to ``generate``.
        num_beams: ``num_beams`` passed to ``generate``.

    Returns:
        A ``(summary_ids, summary_text)`` tuple: the raw output of
        ``generate`` and the first sequence decoded with special tokens
        removed.
    """
    with torch.no_grad():
        summary_ids = model.generate(
            input_ids, max_length=max_length, num_beams=num_beams, early_stopping=True
        )
        summary_text = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary_ids, summary_text


# Load the tokenizer of the base checkpoint
tokenizer = T5Tokenizer.from_pretrained("t5-small")

# Load the pre-trained T5 model (before fine-tuning)
pretrained_model = T5ForConditionalGeneration.from_pretrained("t5-small")

# Load the fine-tuned T5 model from the directory written by save_pretrained
fine_tuned_model = T5ForConditionalGeneration.from_pretrained("./fine-tuned-t5")

# Check if a GPU is available and set the device accordingly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move both models to the appropriate device
pretrained_model.to(device)
fine_tuned_model.to(device)

# Sample text to summarize
text = """
Artificial intelligence (AI) has revolutionized various industries by automating complex tasks and analyzing vast amounts of data with precision. From healthcare to finance, AI-driven solutions provide valuable insights and streamline processes. 
For instance, in healthcare, AI aids in early disease detection through image analysis, while in finance, it predicts market trends and detects fraud. However, the integration of AI raises ethical considerations, such as job displacement and privacy concerns. 
Organizations and policymakers must balance innovation with responsibility to ensure AI benefits society at large.
"""

# Prepare the input text for the model (task prefix + passage), encoded once and
# reused for both models
input_text = "summarize: " + text
inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True).input_ids.to(device)

# Generate the summary using the pre-trained T5 model
pretrained_summary_ids, pretrained_summary = _summarize(pretrained_model, tokenizer, inputs)

# Generate the summary using the fine-tuned T5 model
fine_tuned_summary_ids, fine_tuned_summary = _summarize(fine_tuned_model, tokenizer, inputs)

# Print and compare the summaries
print("Original Text Word Count:", len(text.split()))
print("\nPre-trained Model Summary:", pretrained_summary)
print("Pre-trained Summary Word Count:", len(pretrained_summary.split()))

print("\nFine-tuned Model Summary:", fine_tuned_summary)
print("Fine-tuned Summary Word Count:", len(fine_tuned_summary.split()))


Original Text Word Count: 84

Pre-trained Model Summary: AI-driven solutions provide valuable insights and streamline processes. in healthcare, AI aids in early disease detection through image analysis. in finance, it predicts market trends and detects fraud.
Pre-trained Summary Word Count: 28

Fine-tuned Model Summary: AI-driven solutions provide valuable insights and streamline processes. In healthcare, AI aids in early disease detection through image analysis. In finance, AI predicts market trends and detects fraud. Organizations and policymakers must balance innovation
Fine-tuned Summary Word Count: 34
