In [None]:
import pandas as pd
review_df=pd.read_csv('/kaggle/input/amz-review/Reviews.csv')

In [None]:
# Drop all columns except 'Summary' and 'Text'
review_df = review_df[['Summary', 'Text']]

In [None]:
print(review_df.columns)

In [None]:
import pandas as pd
from bs4 import BeautifulSoup
import re
from transformers import GPT2Tokenizer
import nltk
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('wordnet')

!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

# Initialize the WordNet Lemmatizer
lemmatizer = WordNetLemmatizer()

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

In [None]:
# Preprocess text function with lemmatization and GPT2 tokenizer
def preprocess_text(text):
    # Check if text is not null and is a string
    if isinstance(text, str):
        # Remove HTML tags
        text = BeautifulSoup(text, "html.parser").get_text()

        # Convert text to lowercase
        text = text.lower()

        text = re.sub(r'[^\w\s]', '', text)

        # Tokenization using GPT2 tokenizer
        tokens = tokenizer.tokenize(text)

        # Perform lemmatization
        lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]

        # Join the tokens back into a single string and remove consecutive spaces
        processed_text = ' '.join(lemmatized_tokens).strip()

        return processed_text
    else:
        return ''

# Load or create the review_df dataframe here

# Apply preprocessing function to 'Summary' and 'Text' columns and save them as new columns
review_df['Summary_Preprocessed'] = review_df['Summary'].apply(preprocess_text)
review_df['Text_Preprocessed'] = review_df['Text'].apply(preprocess_text)

# Keep only the original and preprocessed summary and text columns
processed_df = review_df[['Summary', 'Summary_Preprocessed', 'Text', 'Text_Preprocessed']]

# Save the preprocessed dataframe to a CSV file
processed_df.to_csv("/kaggle/working/PREPROCESSE_REVIEW.csv", index=False)

In [None]:
import pandas as pd
preprocessed_df=pd.read_csv('/kaggle/input/preprocessed-rev/PREPROCESSE_REVIEW.csv')

In [None]:
print(preprocessed_df.head())

In [None]:
print(len(preprocessed_df))

In [None]:
new_1L_df = preprocessed_df.iloc[50000:60000]

In [None]:
print(len(new_1L_df))

In [None]:
from sklearn.model_selection import train_test_split

# Perform train-test split with 75% training data and 25% testing data
train_df, test_df = train_test_split(new_1L_df, test_size=0.25, random_state=42)

In [None]:
# Store the test dataset into a CSV file
test_df.to_csv('/kaggle/working/test.csv', index=False)

In [None]:
pip install rouge-score

In [None]:
!pip install --upgrade jax jaxlib

In [None]:
# Import necessary libraries
import torch
from transformers import GPT2LMHeadModel

# Load the model class
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Specify the file path from where you saved the model
model_load_path = "/kaggle/input/model-final/model.pth"

# Load the model's state dictionary
model.load_state_dict(torch.load(model_load_path))

# Put the model in evaluation mode
model.eval()

print("Model loaded successfully from:", model_load_path)


In [None]:
# Import libraries
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW
from rouge_score import rouge_scorer
from tqdm import tqdm

# Define the dataset class
class SummaryDataset(Dataset):
    def __init__(self, data_df, tokenizer, max_length, padding):
        self.data_df = data_df
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.padding = padding

    def __getitem__(self, idx):
        review_text = self.data_df.iloc[idx]['Text_Preprocessed']
        summary_text = self.data_df.iloc[idx]['Summary_Preprocessed']

        inputs = self.tokenizer(review_text, truncation=True, max_length=self.max_length, padding=self.padding, return_tensors="pt")
        if isinstance(summary_text, str):
            summary_text = [summary_text]  # Convert single summary to list

        labels = self.tokenizer(summary_text, truncation=True, max_length=self.max_length, padding=False, return_tensors="pt").input_ids.squeeze(0)

        return {
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0),
            'labels': labels
        }

    def __len__(self):
        return len(self.data_df)

# Define function to collate batches
def collate_fn(batch):
    max_length = max(len(item['input_ids']) for item in batch)
    input_ids = []
    attention_mask = []
    labels = []
    for item in batch:
        padded_input_ids = torch.nn.functional.pad(item['input_ids'], (0, max_length - len(item['input_ids'])), value=tokenizer.pad_token_id)
        padded_attention_mask = torch.nn.functional.pad(item['attention_mask'], (0, max_length - len(item['attention_mask'])), value=0)
        input_ids.append(padded_input_ids)
        attention_mask.append(padded_attention_mask)
        padded_labels = torch.nn.functional.pad(item['labels'], (0, max_length - len(item['labels'])), value=-100)  # Use -100 as padding for CrossEntropyLoss
        labels.append(padded_labels)
    input_ids = torch.stack(input_ids)
    attention_mask = torch.stack(attention_mask)
    labels = torch.stack(labels)
    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': labels
    }

# # Load data and preprocess
# train_df = pd.read_csv("/path/to/train_dataset.csv")
# test_df = pd.read_csv("/path/to/test_dataset.csv")
train_df.dropna(subset=['Text_Preprocessed', 'Summary_Preprocessed'], inplace=True)
test_df.dropna(subset=['Text_Preprocessed', 'Summary_Preprocessed'], inplace=True)

# Initialize tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2", padding_side="left")
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Define hyperparameters
learning_rate = 1e-5
batch_size = 8
num_epochs = 10
max_length = 128
padding = True

# Create datasets and data loaders
train_dataset = SummaryDataset(train_df, tokenizer, max_length, padding)
test_dataset = SummaryDataset(test_df, tokenizer, max_length, padding)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

# Define optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.95)

# Training loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.train()

for epoch in range(num_epochs):
    total_loss = 0
    progress_bar = tqdm(train_loader, desc=f'Epoch {epoch+1}/{num_epochs}', unit='batch')

    for batch in progress_bar:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        progress_bar.set_postfix({'loss': loss.item()})
    
    avg_train_loss = total_loss / len(train_loader)
    print(f"Epoch [{epoch+1}/{num_epochs}], Average Train Loss: {avg_train_loss:.4f}")
    scheduler.step()


In [None]:
model_save_path = "/kaggle/working/final-fine-tuned-model"  
torch.save(model.state_dict(), model_save_path)

print("Model saved successfully at:", model_save_path)


In [None]:
# Import necessary libraries
import torch
from transformers import GPT2LMHeadModel

# Load the model class
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Specify the file path from where you saved the model
model_load_path = "/kaggle/input/gpt-fine-tuned-model"

# Load the model's state dictionary
model.load_state_dict(torch.load(model_load_path))

# Put the model in evaluation mode
model.eval()

print("Model loaded successfully from:", model_load_path)


In [None]:
import logging

# Set the logging level to ERROR to suppress messages below this level
logging.getLogger("transformers").setLevel(logging.ERROR)


In [None]:
# test_df=pd.read_csv('/kaggle/input/test-dataset-original/test_dataset_original.csv')

In [None]:
# Initialize a list to store generated summaries
generated_summaries = []

# Set the model to evaluation mode
model.eval()

# Iterate through the test dataset
with torch.no_grad():
    for batch in tqdm(test_loader, desc='Generating Summaries', unit='batch'):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        # Generate summaries
        outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=max_length+1, num_beams=4, early_stopping=True)
        generated_summaries.extend([tokenizer.decode(output, skip_special_tokens=True, clean_up_tokenization_spaces=True) for output in outputs])

# Add generated summaries to the test dataset DataFrame
test_df['Generated_Summary'] = generated_summaries[:len(test_df)]


In [None]:
print(test_df.head())

In [None]:
# Function to decode text and summary
def decode_text(text):
    return text.replace('Ġ', ' ')

# Decoding 'Text' column
test_df['Decoded_Text'] = test_df['Text'].apply(decode_text)

# Decoding 'Summary' column
test_df['Decoded_Summary'] = test_df['Summary'].apply(decode_text)

# Decoding 'Generated_Summary' column
test_df['Decoded_Generated_Summary'] = test_df['Generated_Summary'].apply(decode_text)

# Displaying the DataFrame with decoded columns
print(test_df.head())


In [None]:
from rouge_score import rouge_scorer

# Initialize Rouge scorer
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Initialize lists to store ROUGE scores
rouge1_scores = []
rouge2_scores = []
rougeL_scores = []

# Iterate through each pair of generated summary and preprocessed summary
for generated_summary, preprocessed_summary in zip(test_df['Generated_Summary'], test_df['Summary_Preprocessed']):
    # Calculate ROUGE scores for each pair
    scores = scorer.score(generated_summary, preprocessed_summary)
    
    # Append individual ROUGE scores to respective lists
    rouge1_scores.append(scores['rouge1'].fmeasure)
    rouge2_scores.append(scores['rouge2'].fmeasure)
    rougeL_scores.append(scores['rougeL'].fmeasure)

# Add ROUGE scores to the test dataset DataFrame
test_df['ROUGE-1'] = rouge1_scores
test_df['ROUGE-2'] = rouge2_scores
test_df['ROUGE-L'] = rougeL_scores

In [None]:
print(test_df.head())

In [None]:
# Save the DataFrame to a CSV file
test_df.to_csv('/kaggle/working/test.csv', index=False)

In [None]:
print(test_df.head())

In [1]:
# Import necessary libraries
import torch
from transformers import GPT2LMHeadModel

# Load the model class
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Specify the file path from where you saved the model
model_load_path = "/kaggle/input/model-final/model.pth"

# Load the model's state dictionary
model.load_state_dict(torch.load(model_load_path))

# Pevaluation mode
model.eval()

print("Model loaded successfully from:", model_load_path)


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Model loaded successfully from: /kaggle/input/model-final/model.pth


In [3]:
pip install rouge-score

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25ldone
[?25h  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=334d2d53dfb34021c8f3eb7799d1d2666bdbf87d5e90747f4631ba26e0613802
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2
Note: you may need to restart the kernel to use updated packages.


In [14]:
from rouge_score import rouge_scorer
import torch
import pandas as pd
from bs4 import BeautifulSoup
import re
import nltk
from nltk.stem import WordNetLemmatizer
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from nltk.corpus import stopwords

# Download NLTK stopwords list 
nltk.download('stopwords')

# Get the list of stopwords
stop_words = set(stopwords.words('english'))

# Download NLTK resources 
nltk.download('punkt')
nltk.download('wordnet')

# Initialize the GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Initialize the WordNet Lemmatizer
lemmatizer = WordNetLemmatizer()

# Define the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define a function to preprocess text
def preprocess_text(text):
    if isinstance(text, str):
        text = BeautifulSoup(text, "html.parser").get_text()
        text = text.lower()
        text = re.sub(r'[^\w\s]', '', text)
        tokens = tokenizer.tokenize(text)
        # Remove stopwords
        tokens = [token for token in tokens if token not in stop_words]
        lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
        processed_text = ' '.join(lemmatized_tokens).strip()
        return processed_text
    else:
        return ''

# Define a function to generate summary
def generate_summary(review_text, model, tokenizer, device, max_length=128):
    preprocessed_review_text = preprocess_text(review_text)
    inputs = tokenizer.encode("summarize: " + preprocessed_review_text, return_tensors="pt", max_length=1024, truncation=True)
    inputs = inputs.to(device)
    model.to(device)
    tokenizer.pad_token = tokenizer.eos_token
    padding_token_id = tokenizer.pad_token_id
    attention_mask = inputs.ne(padding_token_id)
    summary_ids = model.generate(inputs, max_length=max_length, length_penalty=1.0, num_beams=4, early_stopping=True, attention_mask=attention_mask, max_new_tokens=100)
    generated_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return generated_summary[:max_length]

# Define a function to calculate ROUGE score
def calculate_rouge_score(generated_summary, actual_given_summary):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = scorer.score(generated_summary, actual_given_summary)
    return scores

# Take input text and reference summary from the user
input_text = input("Enter the input text: ")
actual_given_summary = input("Enter the reference summary: ")

# Generate summary
generated_summary = generate_summary(input_text, model, tokenizer, device)

# Decode the summary
decoded_summary = tokenizer.decode(tokenizer.encode(generated_summary), skip_special_tokens=True)

# Calculate ROUGE scores
rouge_scores = calculate_rouge_score(generated_summary, actual_given_summary)
# Decode the summary to remove special tokens
decoded_generated_summary = generated_summary.replace("Ġ", " ")


print("Generated Summary:", decoded_generated_summary)
print("ROUGE Scores:", rouge_scores)

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Enter the input text:  Great taffy at a great price.  There was a wide assortment of yummy taffy.  Delivery was very quick.  If your a taffy lover, this is a deal.
Enter the reference summary:  Great taffy


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=100) and `max_length`(=128) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Generated Summary: summarize: great  t aff  at  a  great  price    there  was  a  wide  assortment  of  y ummy  t aff    delivery  was  very  quick
ROUGE Scores: {'rouge1': Score(precision=0.5, recall=0.045454545454545456, fmeasure=0.08333333333333334), 'rouge2': Score(precision=0.0, recall=0.0, fmeasure=0.0), 'rougeL': Score(precision=0.5, recall=0.045454545454545456, fmeasure=0.08333333333333334)}
