In [1]:
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TrainingArguments, Trainer
from torch.utils.data import Dataset
from rouge_score import rouge_scorer

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Check if CUDA is available and which device is being used
device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
print("Device:", device)

Device: cuda:1


In [2]:
# Load the dataset
reviews = pd.read_csv("Reviews.csv")
reviews = reviews.dropna().reset_index(drop = True)
reviews = reviews[['Text', 'Summary']]
print(reviews.shape)
reviews = reviews.drop_duplicates()
print(reviews.shape)
reviews = reviews[0:10000].reset_index(drop = True)
print(reviews.shape)

(568411, 2)
(394956, 2)
(10000, 2)


In [17]:
import re
import unicodedata
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import nltk
from nltk.corpus import stopwords

lemmatizer = WordNetLemmatizer()
def cleaning(data):
    corpus = []
    for i in range(0, len(data)):
        sentence = re.sub('[^a-zA-Z]', ' ', str(data[i])) # d. Removing Special Characters
        sentence = sentence.lower() # Lowering the text
        sentence = sentence.split() # Tokenization
        
        # Remove blank space tokens
        sentence = [word for word in sentence if word.strip()]  # Removes empty strings
        
        all_stopwords = stopwords.words('english') # Removing the stopwords
        #all_stopwords.remove('not')
        # e. Lemmatization
        sentence = [lemmatizer.lemmatize(word) for word in sentence if not word in set(all_stopwords)]
        sentence = ' '.join(sentence)
        corpus.append(sentence)
      
    return corpus


reviews['cleaned_text'] =  cleaning(reviews['Text'])
reviews.head()

Unnamed: 0,Text,Summary,cleaned_text
0,I have bought several of the Vitality canned d...,Good Quality Dog Food,bought several vitality canned dog food produc...
1,Product arrived labeled as Jumbo Salted Peanut...,Not as Advertised,product arrived labeled jumbo salted peanut pe...
2,This is a confection that has been around a fe...,"""Delight"" says it all",confection around century light pillowy citrus...
3,If you are looking for the secret ingredient i...,Cough Medicine,looking secret ingredient robitussin believe f...
4,Great taffy at a great price. There was a wid...,Great taffy,great taffy great price wide assortment yummy ...


In [4]:
# Split the dataset into training and testing sets
train_data, test_data = train_test_split(reviews, test_size=0.25, random_state=42)
print(train_data.shape)
print(test_data.shape)

(7500, 2)
(2500, 2)


In [5]:
# Initialize the GPT-2 tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2-medium")
model = GPT2LMHeadModel.from_pretrained("gpt2-medium")

# Set pad token to end of sequence token
tokenizer.pad_token = tokenizer.eos_token

In [6]:
# Define a custom dataset class
class CustomDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        item = self.data.iloc[idx]
        input_text = item['Text']
        target_text = item['Summary']
        
        input_encoding = self.tokenizer(input_text, max_length=self.max_length, padding="max_length", truncation=True, return_tensors="pt")
        target_encoding = self.tokenizer(target_text, max_length=self.max_length, padding="max_length", truncation=True, return_tensors="pt")
        
        return {
            'input_ids': input_encoding['input_ids'].squeeze(),
            'attention_mask': input_encoding['attention_mask'].squeeze(),
            'labels': target_encoding['input_ids'].squeeze()
        }

# Create instances of the custom dataset class for training and testing
train_dataset = CustomDataset(train_data, tokenizer)
test_dataset = CustomDataset(test_data, tokenizer)

In [22]:
! pip install transformers[torch]

Defaulting to user installation because normal site-packages is not writeable
[33mDEPRECATION: distro-info 1.1build1 has a non-standard version number. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of distro-info or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0m[33mDEPRECATION: python-debian 0.1.43ubuntu1 has a non-standard version number. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of python-debian or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0m

In [21]:
# Define training arguments
training_args = TrainingArguments(
    per_device_train_batch_size=8,
    num_train_epochs=20,
    learning_rate= 5e-5,  #5e-5,
    output_dir="./output",
    logging_dir="./logs",
    logging_steps=500,
)

# Define the evaluation function
def compute_metrics(pred):
    labels_ids = pred.label_ids
    pred_ids = pred.predictions

    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    labels_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    # Initialize ROUGE scorer
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    rouge_scores = {'rouge1': [], 'rouge2': [], 'rougeL': []}
    for hypothesis, reference in zip(pred_str, labels_str):
        scores = scorer.score(reference, hypothesis)

        for metric in rouge_scores.keys():
            rouge_scores[metric].append(scores[metric].fmeasure)

    rouge_avg_scores = {metric: np.mean(scores) for metric, scores in rouge_scores.items()}
    return rouge_avg_scores

In [8]:
# Define the Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

# Train the model
trainer.train()




Step,Training Loss


TrainOutput(global_step=469, training_loss=0.11479628670698544, metrics={'train_runtime': 227.3539, 'train_samples_per_second': 32.988, 'train_steps_per_second': 2.063, 'total_flos': 1959690240000000.0, 'train_loss': 0.11479628670698544, 'epoch': 1.0})

In [6]:
# Define the directory path where you want to save the model
output_dir = "./saved_model"

# Save the trained model and tokenizer
model.save_pretrained(output_dir)
print("Model saved successfully at:", output_dir)

Model saved successfully at: ./saved_model


In [7]:
saved_model_dir = "./saved_model"

# Load the model and tokenizer
loaded_model = GPT2LMHeadModel.from_pretrained(saved_model_dir)

In [None]:
# Define the input text and summary
from datasets import load_metric
o = 5
input_text = list(test_data['Text'][o:o+1])[0]
reference_summary = list(test_data['Summary'][o:o+1])[0]

# Encode input text
input_ids = tokenizer.encode(input_text, return_tensors="pt")
input_ids = input_ids.to(loaded_model.device)

# Generate summary
summary_ids = loaded_model.generate(input_ids, max_length=150, num_return_sequences=1, early_stopping=True)

# Decode the generated summary
generated_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Compute ROUGE score
rouge_metric = load_metric("rouge")
predictions = [generated_summary]
references = [reference_summary]
rouge_results = rouge_metric.compute(predictions=predictions, references=references)

# Print the input text, reference summary, generated summary, and ROUGE scores
print("Given Review Text:")
print(input_text)
print("\nGiven Summary:")
print(reference_summary)
print("\nGenerated Summary:")
print(generated_summary)
print("\nROUGE Scores:")
print(f"ROUGE-1: Precision: {rouge_results['rouge1'].mid.precision:.2f}, Recall: {rouge_results['rouge1'].mid.recall:.2f}, F1-Score: {rouge_results['rouge1'].mid.fmeasure:.2f}")
print(f"ROUGE-2: Precision: {rouge_results['rouge2'].mid.precision:.2f}, Recall: {rouge_results['rouge2'].mid.recall:.2f}, F1-Score: {rouge_results['rouge2'].mid.fmeasure:.2f}")
print(f"ROUGE-L: Precision: {rouge_results['rougeL'].mid.precision:.2f}, Recall: {rouge_results['rougeL'].mid.recall:.2f}, F1-Score: {rouge_results['rougeL'].mid.fmeasure:.2f}")



The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [56]:
test_data = test_data.reset_index(drop =True)
test_data.head()

Unnamed: 0,Text,Summary
0,I'm always on the lookout for soda's that are ...,Soda Substitute that tastes Similar to the Ori...
1,PopChips are the best potato chips I have ever...,The Barbecue are my favorite chips!!!
2,I ordered these after hearing about them from ...,VERY cool
3,This is the first K-cup and Tea product from T...,Twinings Earl Grey Tea K-Cups
4,My dog loves these and I feel good about givin...,Great organic dog treat


In [37]:
from datasets import load_metric

# Load ROUGE metric
rouge_metric = load_metric("rouge")

# Initialize lists to store ROUGE scores
rouge1_scores = []
rouge2_scores = []
rougeL_scores = []

# Iterate through test dataset
for idx in range(len(test_data[0:100])):
    example = test_data['Text'][idx]
    
    input_ids = tokenizer.encode(input_text, return_tensors="pt")
    input_ids = input_ids.to(loaded_model.device)

# Generate summary
    summary_ids = model.generate(input_ids, max_length=100, num_return_sequences=1, early_stopping=True)

# Decode the generated summary
    generated_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    
    # Compute ROUGE scores
    reference_summary = test_data['Summary'][idx]
    rouge_results = rouge_metric.compute(predictions=[generated_summary], references=[reference_summary])

    # Store ROUGE scores
    rouge1_scores.append(rouge_results['rouge1'].mid.fmeasure)
    rouge2_scores.append(rouge_results['rouge2'].mid.fmeasure)
    rougeL_scores.append(rouge_results['rougeL'].mid.fmeasure)

# Compute average ROUGE scores
avg_rouge1 = sum(rouge1_scores) / len(rouge1_scores)
avg_rouge2 = sum(rouge2_scores) / len(rouge2_scores)
avg_rougeL = sum(rougeL_scores) / len(rougeL_scores)

# Print average ROUGE scores
print("Average ROUGE-1 Score:", avg_rouge1)
print("Average ROUGE-2 Score:", avg_rouge2)
print("Average ROUGE-L Score:", avg_rougeL)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Average ROUGE-1 Score: 0.018875613807350887
Average ROUGE-2 Score: 0.0010336532287751798
Average ROUGE-L Score: 0.01862561380735089


Validating using the summarization using the Pre-Trained GPT-2 without Fine-Tuning

In [20]:
"""
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from datasets import load_metric
import torch

# Load the pre-trained model and tokenizer
model = GPT2LMHeadModel.from_pretrained("gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Define the input text
input_text = "My dog loves these and I feel good about giving them to her since they're organic. When I first opened the package, I thought the treats were a little too large for my small dog, but was happy to see that they're scored and fairly easy to break in two. If I had any complaints, it would be that the treats are extremely hard and are a little bit difficult for my 15 year old dog to eat."

# Tokenize the input text
input_ids = tokenizer.encode(input_text, return_tensors="pt")

# Generate summary
summary_ids = model.generate(input_ids, max_length=150, num_return_sequences=1, early_stopping=True)

# Decode the generated summary
generated_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print("Input Text:")
print(input_text)
# Print the generated summary
print("Generated Summary:")
print(generated_summary)

# Compute ROUGE score
rouge_metric = load_metric("rouge")

# Define the reference and predicted summaries
references = [generated_summary]
predictions = [input_text]

# Compute ROUGE score
rouge_results = rouge_metric.compute(predictions=predictions, references=references)

# Print the ROUGE score
print("\nROUGE Score:")
print(rouge_results)
"""

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Input Text:
My dog loves these and I feel good about giving them to her since they're organic. When I first opened the package, I thought the treats were a little too large for my small dog, but was happy to see that they're scored and fairly easy to break in two. If I had any complaints, it would be that the treats are extremely hard and are a little bit difficult for my 15 year old dog to eat.
Generated Summary:
My dog loves these and I feel good about giving them to her since they're organic. When I first opened the package, I thought the treats were a little too large for my small dog, but was happy to see that they're scored and fairly easy to break in two. If I had any complaints, it would be that the treats are extremely hard and are a little bit difficult for my 15 year old dog to eat. I'm not sure if I would have bought them if I had a larger dog, but I'm sure they would have been a little bit easier to chew on. I'm not sure if I would have bought them if I had a larger dog, b

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
