In [3]:
import pandas as pd
from pathlib import Path
import torch
from transformers import PegasusTokenizer, PegasusForConditionalGeneration

# Load the dataset
data_path = Path('..') / 'data' / 'data.csv'

if not data_path.exists():
    raise FileNotFoundError(f"The file at {data_path} does not exist.")

data = pd.read_csv(data_path)

# Select relevant columns and drop rows with missing values in any of them.
# Building a fresh frame (instead of calling dropna(inplace=True) on a column
# slice) keeps the column assignments below from triggering pandas'
# SettingWithCopyWarning.
required_columns = ["primaryCategories", "reviews.text", "reviews.rating"]
data = data[required_columns].dropna(subset=required_columns).copy()


def _rating_label(value):
    """Render a rating as a string, writing whole numbers without a trailing '.0'."""
    try:
        number = float(value)
    except (TypeError, ValueError):
        # Not numeric at all: fall back to the plain string form.
        return str(value)
    return str(int(number)) if number.is_integer() else str(value)


# Convert ratings to string to ensure consistent grouping. A float-typed column
# would otherwise stringify 5 as "5.0", which never equals the "5" used below.
data['reviews.rating'] = data['reviews.rating'].map(_rating_label)

# Force review text to str so the ' '.join below cannot fail on a non-string cell
data['reviews.text'] = data['reviews.text'].astype(str)

# Filter the data for a specific category and 5-star rating
category = "Electronics"  # Example category, change as needed
rating = "5"
filtered_data = data[(data['primaryCategories'] == category) & (data['reviews.rating'] == rating)]

# For testing, use a smaller subset
filtered_data = filtered_data.head(10)  # Use only the first 10 rows for testing

# Concatenate all reviews within the selected group
grouped_reviews = (
    filtered_data
    .groupby(['primaryCategories', 'reviews.rating'])['reviews.text']
    .agg(' '.join)
    .reset_index()
)

# Load the PEGASUS model and tokenizer
tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-large")
model = PegasusForConditionalGeneration.from_pretrained("google/pegasus-large")

# Check if GPU is available and use it
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Move the model to the device
model.to(device)
print(f"Using device: {device}")

# Function to split text into chunks based on maximum length
def split_into_chunks(text, max_length, count_tokens=None):
    """Greedily pack the whitespace-separated words of `text` into chunks.

    Words are appended to the current chunk while the running token count stays
    at or below `max_length`; the chunk is then closed and a new one started.
    A single word whose own token count exceeds `max_length` cannot be split
    here, so it is emitted as a chunk of its own.

    Args:
        text: The text to split.
        max_length: Token budget per chunk.
        count_tokens: Optional callable mapping a word to its token count.
            When omitted, words are measured with the module-level `tokenizer`
            (special tokens excluded).

    Returns:
        A list of chunk strings. Empty when `text` contains no words; never
        contains an empty string.
    """
    if count_tokens is None:
        def measure(word):
            return len(tokenizer.encode(word, add_special_tokens=False))
    else:
        measure = count_tokens

    chunks = []
    current_chunk = []
    current_length = 0
    for word in text.split():
        word_length = measure(word)
        # Close the running chunk only if it actually holds words; otherwise an
        # oversized first word would push an empty "" chunk into the result.
        if current_chunk and current_length + word_length > max_length:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_length = 0
        current_chunk.append(word)
        current_length += word_length
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks

# Function to summarize text by chunking with batch processing
def summarize_text(text, max_chunk_length=512, summary_max_length=150, summary_min_length=30, batch_size=2):  # Reduced batch size for testing
    """Summarize long text by chunking it, summarizing each chunk, and recursing.

    The text is split with `split_into_chunks`, the chunks are summarized in
    batches with the module-level `model`/`tokenizer` on `device`, and the
    chunk summaries are joined with spaces. If the joined summary still exceeds
    `max_chunk_length` tokens it is summarized again.

    Args:
        text: The text to summarize.
        max_chunk_length: Token budget per chunk and tokenizer truncation length.
        summary_max_length: `max_length` passed to `model.generate`.
        summary_min_length: `min_length` passed to `model.generate`.
        batch_size: Number of chunks summarized per `generate` call.

    Returns:
        The combined summary string (empty when `text` has no words).
    """
    chunks = split_into_chunks(text, max_chunk_length)
    print(f"Text split into {len(chunks)} chunks.")

    chunk_summaries = []
    for i in range(0, len(chunks), batch_size):
        batch = chunks[i:i + batch_size]
        print(f"Processing batch {i//batch_size + 1} with {len(batch)} chunks.")
        inputs = tokenizer(batch, return_tensors='pt', truncation=True, padding=True, max_length=max_chunk_length)
        inputs = inputs.to(device)

        # Generate summaries for the batch
        summary_ids = model.generate(
            inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            max_length=summary_max_length,
            min_length=summary_min_length,
            length_penalty=1.0,  # Adjust length penalty
            num_beams=6,  # Increase number of beams
            no_repeat_ngram_size=3,  # Prevent repetition of phrases
            early_stopping=True
        )

        # Decode the summaries
        summaries = [tokenizer.decode(g, skip_special_tokens=True) for g in summary_ids]
        print(f"Batch summaries: {summaries}")
        chunk_summaries.extend(summaries)

    # Combine the summaries; tokenize once and reuse the count
    combined_summary = ' '.join(chunk_summaries)
    combined_length = len(tokenizer.encode(combined_summary))
    print(f"Combined summary length: {combined_length}")
    if combined_length <= max_chunk_length:
        return combined_summary

    # Only recurse while the text is strictly shrinking (in characters);
    # otherwise parameters such as a large summary_min_length could make the
    # recursion run forever.
    if len(combined_summary) >= len(text):
        print("Combined summary is not shorter than its input; stopping recursive summarization.")
        return combined_summary

    return summarize_text(combined_summary, max_chunk_length, summary_max_length, summary_min_length, batch_size)

# Function to remove repetitive phrases
def remove_repetitive_phrases(text):
    """Drop repeated sentences from `text`, keeping the first occurrence of each.

    Sentences are obtained by splitting on '. '. They are compared after
    stripping surrounding whitespace and trailing periods, so the final
    sentence (which keeps its closing '.' after the split) is still recognised
    as a duplicate of an earlier one.

    Args:
        text: The text to de-duplicate.

    Returns:
        The remaining sentences joined with '. '. If `text` ended with a period
        and de-duplication removed the sentence carrying it, the period is
        restored.
    """
    unique_sentences = []
    seen_sentences = set()

    for sentence in text.split('. '):
        # Comparison key ignores the trailing '.' and stray whitespace
        key = sentence.strip().rstrip('.').rstrip()
        if key not in seen_sentences:
            seen_sentences.add(key)
            unique_sentences.append(sentence)

    result = '. '.join(unique_sentences)
    if text.rstrip().endswith('.') and not result.rstrip().endswith('.'):
        result += '.'
    return result

# Summarize each group's concatenated reviews, then strip repeated sentences
grouped_reviews['summary'] = [
    remove_repetitive_phrases(summarize_text(review_blob))
    for review_blob in grouped_reviews['reviews.text']
]

# First report: category, rating and summary for every group
for idx, row in grouped_reviews.iterrows():
    print(
        f"Primary Category: {row['primaryCategories']}, Star Rating: {row['reviews.rating']}",
        f"Summary: {row['summary']}\n",
        sep="\n",
    )

# Second report: same header, plus the first 1000 characters of the source
# reviews so the summary can be compared against its input
for idx, row in grouped_reviews.iterrows():
    print(
        f"Primary Category: {row['primaryCategories']}, Star Rating: {row['reviews.rating']}",
        f"Original Reviews: {row['reviews.text'][:1000]}...",
        f"Summary: {row['summary']}\n",
        sep="\n",
    )



Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using device: cuda
Text split into 2 chunks.
Processing batch 1 with 2 chunks.
Batch summaries: ['After discarding and getting rid of broken cd cases, broken cds, and selecting those ones we really like, this binder turned up to be an excellent option to store our favourite cds and dvds and keep them in a small space at our living room, giving us the choice to donate or get rid of those cds towers that took a lot of room, despite looking nice. Cons: May be not trendy looking as newer, but still sets well on my kitchen island Pros: I love the Alexa series so when this one went on sale I had to get it with a camera. Pros: Works good like Alexia Cons: setting it up is a, HASLE had to call verizon three times Other Thoughts: I WOULD RECOMMEND', 'Pros: I WOULD recommend anyone who was going to purchase anything, newegg is the BEST PLACE TO LOOK! Cons: No compaints']
Combined summary length: 177
Primary Category: Electronics, Star Rating: 5
Summary: After discarding and getting rid of broken

## Fine-tuning Pegasus model

### 1-hour fine-tuning

In [None]:
import pandas as pd
import torch
from pathlib import Path
from transformers import PegasusTokenizer, PegasusForConditionalGeneration, Trainer, TrainingArguments, DataCollatorForSeq2Seq
from datasets import Dataset, DatasetDict

# Load and preprocess the dataset
data_path = Path('..') / 'data' / 'data.csv'

# Fail early with a clear message, matching the check in the first cell
if not data_path.exists():
    raise FileNotFoundError(f"The file at {data_path} does not exist.")

data = pd.read_csv(data_path)

# Select and preprocess relevant columns (.copy() gives an independent frame
# so the column assignments below are unambiguous)
data = data[["reviews.text"]].dropna().copy()
data['reviews.text'] = data['reviews.text'].astype(str)

# Assume summaries are not available; generate synthetic summaries for demonstration
# In practice, you would use actual summaries if available
# NOTE(review): the target is a prefix of the input, so the loss measures how
# well the model copies the start of a review, not summary quality.
data['summary'] = data['reviews.text'].str[:512]  # Example: Use the first 512 characters as a summary

# Convert to Hugging Face Dataset; preserve_index=False keeps the pandas index
# left over from dropna() from becoming an extra dataset column
dataset = Dataset.from_pandas(data, preserve_index=False)

# Split the dataset into train and validation sets (seeded so reruns get the
# same split)
dataset = dataset.train_test_split(test_size=0.1, seed=42)
dataset_dict = DatasetDict({
    'train': dataset['train'],
    'test': dataset['test']
})

# Load tokenizer and model
tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-large")
model = PegasusForConditionalGeneration.from_pretrained("google/pegasus-large")

# Tokenize the input text and summaries
def tokenize_function(examples):
    """Tokenize a batch of reviews and attach the tokenized summaries as labels.

    Args:
        examples: A batch with 'reviews.text' and 'summary' entries.

    Returns:
        The tokenized reviews (truncated to 512 tokens) with a 'labels' entry
        holding the summary token ids (truncated to 128 tokens).
    """
    encoded = tokenizer(examples['reviews.text'], truncation=True, max_length=512)
    # Targets are tokenized inside the tokenizer's target-mode context
    with tokenizer.as_target_tokenizer():
        target_ids = tokenizer(examples['summary'], truncation=True, max_length=128)['input_ids']
    encoded['labels'] = target_ids
    return encoded

# Tokenize both splits batch-wise
tokenized_datasets = dataset_dict.map(tokenize_function, batched=True)

# Training configuration; the step budget is the author's estimate for a
# run of approximately 1 hour
training_args = TrainingArguments(
    # where checkpoints and logs go
    output_dir="./results",
    logging_dir="./logs",
    # optimisation
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    max_steps=756,
    # evaluation / logging / checkpoint cadence
    evaluation_strategy="steps",
    eval_steps=100,
    logging_steps=10,
    save_steps=500,
    save_total_limit=2,
)

# Collator that batches the tokenized examples for the seq2seq model
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Wire model, data and configuration together
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
)

# Run fine-tuning
trainer.train()

# Persist the fine-tuned model first, then its tokenizer, to the same directory
for artifact in (model, tokenizer):
    artifact.save_pretrained("./fine-tuned-pegasus")

# Summarize the reviews
def summarize_text(text, max_length=256, num_beams=8, length_penalty=0.8):
    """Generate a summary of `text` with the module-level `model` and `tokenizer`.

    Args:
        text: The text to summarize (truncated by the tokenizer if too long).
        max_length: Maximum length of the generated summary, in tokens.
        num_beams: Beam width used for beam search.
        length_penalty: Length penalty passed to `model.generate`.

    Returns:
        The decoded summary of the first (only) input sequence.
    """
    # Put the inputs on the model's device: after training the model may sit on
    # the GPU while the tokenizer returns CPU tensors.
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding="longest").to(model.device)
    # The model may still be in training mode after trainer.train(); switch to
    # eval mode so dropout does not perturb generation.
    if model.training:
        model.eval()
    summary_ids = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=max_length,
        num_beams=num_beams,
        length_penalty=length_penalty,
        forced_eos_token_id=tokenizer.eos_token_id
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Generate a summary for every review, replacing the 'summary' column, then
# print each review (first 500 characters) next to its summary.
# NOTE(review): this runs generation once per row of `data` and prints every
# row; consider a small sample while iterating.
data['summary'] = data['reviews.text'].map(summarize_text)
for idx, row in data.iterrows():
    print(
        f"Original Review: {row['reviews.text'][:500]}...",
        f"Summary: {row['summary']}\n",
        sep="\n",
    )






Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/25498 [00:00<?, ? examples/s]



Map:   0%|          | 0/2834 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss,Validation Loss
100,0.0253,0.042816
200,0.0272,0.021437
300,0.0158,0.019021


### 8-hour fine-tuning

In [None]:
import pandas as pd
import torch
from pathlib import Path
from transformers import PegasusTokenizer, PegasusForConditionalGeneration, Trainer, TrainingArguments, DataCollatorForSeq2Seq
from datasets import Dataset, DatasetDict

# Load and preprocess the dataset
data_path = Path('..') / 'data' / 'data.csv'

# Fail early with a clear message, matching the check in the first cell
if not data_path.exists():
    raise FileNotFoundError(f"The file at {data_path} does not exist.")

data = pd.read_csv(data_path)

# Select and preprocess relevant columns (.copy() gives an independent frame
# so the column assignments below are unambiguous)
data = data[["reviews.text"]].dropna().copy()
data['reviews.text'] = data['reviews.text'].astype(str)

# Assume summaries are not available; generate synthetic summaries for demonstration
# In practice, you would use actual summaries if available
# NOTE(review): the target is a prefix of the input, so the loss measures how
# well the model copies the start of a review, not summary quality.
data['summary'] = data['reviews.text'].str[:512]  # Example: Use the first 512 characters as a summary

# Convert to Hugging Face Dataset; preserve_index=False keeps the pandas index
# left over from dropna() from becoming an extra dataset column
dataset = Dataset.from_pandas(data, preserve_index=False)

# Split the dataset into train and validation sets (seeded so reruns get the
# same split)
dataset = dataset.train_test_split(test_size=0.1, seed=42)
dataset_dict = DatasetDict({
    'train': dataset['train'],
    'test': dataset['test']
})

# Load tokenizer and model
tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-large")
model = PegasusForConditionalGeneration.from_pretrained("google/pegasus-large")

# Tokenize the input text and summaries
def tokenize_function(examples):
    """Tokenize a batch of reviews and attach the tokenized summaries as labels.

    Args:
        examples: A batch with 'reviews.text' and 'summary' entries.

    Returns:
        The tokenized reviews (truncated to 512 tokens) with a 'labels' entry
        holding the summary token ids (truncated to 128 tokens).
    """
    encoded = tokenizer(examples['reviews.text'], truncation=True, max_length=512)
    # Targets are tokenized inside the tokenizer's target-mode context
    with tokenizer.as_target_tokenizer():
        target_ids = tokenizer(examples['summary'], truncation=True, max_length=128)['input_ids']
    encoded['labels'] = target_ids
    return encoded

# Tokenize both splits batch-wise
tokenized_datasets = dataset_dict.map(tokenize_function, batched=True)

# Training configuration; the step budget is the author's estimate for a
# run of approximately 8 hours
training_args = TrainingArguments(
    # where checkpoints and logs go
    output_dir="./results",
    logging_dir="./logs",
    # optimisation
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    max_steps=6048,
    # evaluation / logging / checkpoint cadence
    evaluation_strategy="steps",
    eval_steps=500,
    logging_steps=50,
    save_steps=1000,
    save_total_limit=2,
)

# Collator that batches the tokenized examples for the seq2seq model
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Wire model, data and configuration together
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
)

# Run fine-tuning
trainer.train()

# Persist the fine-tuned model first, then its tokenizer, to the same directory
for artifact in (model, tokenizer):
    artifact.save_pretrained("./fine-tuned-pegasus")

# Summarize the reviews
def summarize_text(text, max_length=256, num_beams=8, length_penalty=0.8):
    """Generate a summary of `text` with the module-level `model` and `tokenizer`.

    Args:
        text: The text to summarize (truncated by the tokenizer if too long).
        max_length: Maximum length of the generated summary, in tokens.
        num_beams: Beam width used for beam search.
        length_penalty: Length penalty passed to `model.generate`.

    Returns:
        The decoded summary of the first (only) input sequence.
    """
    # Put the inputs on the model's device: after training the model may sit on
    # the GPU while the tokenizer returns CPU tensors.
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding="longest").to(model.device)
    # The model may still be in training mode after trainer.train(); switch to
    # eval mode so dropout does not perturb generation.
    if model.training:
        model.eval()
    summary_ids = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=max_length,
        num_beams=num_beams,
        length_penalty=length_penalty,
        forced_eos_token_id=tokenizer.eos_token_id
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Generate a summary for every review, replacing the 'summary' column, then
# print each review (first 500 characters) next to its summary.
# NOTE(review): this runs generation once per row of `data` and prints every
# row; consider a small sample while iterating.
data['summary'] = data['reviews.text'].map(summarize_text)
for idx, row in data.iterrows():
    print(
        f"Original Review: {row['reviews.text'][:500]}...",
        f"Summary: {row['summary']}\n",
        sep="\n",
    )


## Loading & summarizing with the fine-tuned model

In [None]:
# Restore the tokenizer and the model from the directory the fine-tuning
# cells saved them to.
# NOTE(review): this cell relies on the Pegasus classes (and, further down,
# `data`) already being defined by an earlier cell in the same kernel.
tokenizer, model = (
    PegasusTokenizer.from_pretrained("./fine-tuned-pegasus"),
    PegasusForConditionalGeneration.from_pretrained("./fine-tuned-pegasus"),
)

# Function to summarize text
def summarize_text(text, max_length=150, min_length=30):
    """Generate a summary of `text` with the module-level `model` and `tokenizer`.

    Args:
        text: The text to summarize (truncated by the tokenizer if too long).
        max_length: Maximum length of the generated summary, in tokens.
        min_length: Minimum length of the generated summary, in tokens.

    Returns:
        The decoded summary of the first (only) input sequence.
    """
    # Keep the inputs on whatever device the model lives on, and pass the
    # attention mask explicitly, as the chunked summarizer in the first cell does.
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding="longest").to(model.device)
    summary_ids = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=max_length,
        min_length=min_length,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Generate a summary for every review, replacing the 'summary' column, then
# print each review (first 500 characters) next to its summary.
# NOTE(review): this runs generation once per row of `data` and prints every
# row; consider a small sample while iterating.
data['summary'] = data['reviews.text'].map(summarize_text)
for idx, row in data.iterrows():
    print(
        f"Original Review: {row['reviews.text'][:500]}...",
        f"Summary: {row['summary']}\n",
        sep="\n",
    )
