In [7]:
import pandas as pd

# Load the original CSV file
original_csv = 'Reviews.csv'

# Read the first 1000 rows of the CSV file
df = pd.read_csv(original_csv, nrows=5000)

# Define the new CSV file name
new_csv = 'preprocessed.csv'

# Save the first 1000 rows to a new CSV file
df.to_csv(new_csv, index=False)

print("First 1000 rows saved to", new_csv)


First 1000 rows saved to preprocessed.csv


In [8]:
import re
import random
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelWithLMHead
import torch.optim as optim
from sklearn.model_selection import train_test_split
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Initialize GPT-2 tokenizer and model





In [9]:
preprocessed_reviews = pd.read_csv('preprocessed.csv')
train_data, test_data = train_test_split(preprocessed_reviews, test_size=0.25, random_state=42)

In [None]:

from transformers import GPT2Tokenizer, GPT2LMHeadModel, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from torch.utils.data import Dataset
import pandas as pd
from sklearn.model_selection import train_test_split





class CustomTextDataset(Dataset):
    def __init__(self, data, tokenizer, text_column='Text', block_size=128):
        self.data = data
        self.tokenizer = tokenizer
        self.text_column = text_column
        self.block_size = block_size

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        article = self.data[self.text_column].iloc[idx]
        encoding = self.tokenizer.encode_plus(
            article,
            add_special_tokens=True,
            max_length=self.block_size,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        input_ids = encoding["input_ids"].squeeze()
        attention_mask = encoding["attention_mask"].squeeze()
        return {'input_ids': input_ids, 'attention_mask': attention_mask}



def load_data_collator(tokenizer, mlm=False):
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=mlm,
    )
    return data_collator

def train(train_data, model_name,
          output_dir,
          overwrite_output_dir,
          per_device_train_batch_size,
          num_train_epochs,
          save_steps):
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    # Set the padding token to the end-of-sequence token
    tokenizer.pad_token = tokenizer.eos_token
    train_dataset = CustomTextDataset(train_data, tokenizer)
    data_collator = load_data_collator(tokenizer)

    tokenizer.save_pretrained(output_dir)

    model = GPT2LMHeadModel.from_pretrained(model_name)
    model.save_pretrained(output_dir)

    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=overwrite_output_dir,
        per_device_train_batch_size=per_device_train_batch_size,
        num_train_epochs=num_train_epochs,
        learning_rate=5e-6,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )

    trainer.train()
    trainer.save_model()



# Example usage:
#train_file_path = "train_data.csv"  # Path to your training data CSV file
model_name = "gpt2"  # Name of the pretrained model
output_dir = "./gpt2_finetuned"  # Output directory for saving the finetuned model
overwrite_output_dir = True  # Whether to overwrite the output directory if it already exists
per_device_train_batch_size = 8  # Batch size per GPU/CPU during training
num_train_epochs = 10  # Number of training epochs
save_steps = 10000  # Save checkpoint every specified number of steps during training

train_data = train_data  # Load your training data
train(train_data, model_name, output_dir, overwrite_output_dir, per_device_train_batch_size, num_train_epochs, save_steps)





In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from rouge import Rouge
import pandas as pd

# Load the fine-tuned model
model_path = "/home/mk/Desktop/IR_ass_4/gpt2_finetuned"
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path)

# Example review text
example_review_text = ""
# Example reference summary
example_reference_summary = ""

def generate_summary(text):
    inputs = tokenizer.encode("summarize: " + text, return_tensors="pt", max_length=1024, truncation=True)
    max_length = len(inputs[0]) + 10  # Adjust max_length to be greater than or equal to the length of the input sequence
    summary_ids = model.generate(inputs, max_length=max_length, min_length=30, length_penalty=2.0, num_beams=4, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

# Generate summary for the example review text
example_generated_summary = generate_summary(example_review_text)

# Print example summaries
print("Given Review Text:", example_review_text)
print("Given Summary:", example_reference_summary)
print("Generated Summary:", example_generated_summary)

# Initialize the Rouge object
rouge = Rouge()

# Example ROUGE score calculation
rouge_scores = rouge.get_scores(example_generated_summary, example_reference_summary)

# Print ROUGE scores
print("\nROUGE Scores:")
for metric, scores in rouge_scores[0].items():
    print(f"ROUGE-{metric}: Precision: {scores['p']:.2f}, Recall: {scores['r']:.2f}, F1-Score: {scores['f']:.2f}")


In [10]:

test_data.to_csv('summary.csv', index=False)

In [None]:
import pandas as pd
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from rouge import Rouge

# Load the fine-tuned model
model_path = "/home/mk/Desktop/IR_ass_4/gpt2_finetuned"
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path)

# Define the function to generate summaries
def generate_summary(text):
    inputs = tokenizer.encode("summarize: " + text, return_tensors="pt", max_length=1024, truncation=True)
    max_length = len(inputs[0]) + 10  # Adjust max_length to be greater than or equal to the length of the input sequence
    summary_ids = model.generate(inputs, max_length=max_length, min_length=30, length_penalty=2.0, num_beams=4, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

# Initialize the Rouge object
rouge = Rouge()

# Example function to calculate ROUGE scores for a DataFrame
def calculate_rouge_scores(df, generated_column, reference_column):
    rouge_scores = []
    for idx, row in df.iterrows():
        generated_summary = str(row[generated_column]).strip()  # Remove leading/trailing whitespaces
        reference_summary = str(row[reference_column]).strip()  # Remove leading/trailing whitespaces
        if generated_summary and reference_summary:  # Check if neither the generated nor reference summary is empty
            scores = rouge.get_scores(generated_summary, reference_summary)
            rouge_scores.append(scores)
    return rouge_scores

# Assuming you have a DataFrame df with generated summaries in "Generated_Summary" column
# and reference summaries in "Summary" column
test_data.to_csv('summary.csv', index=False)
test_data = pd.read_csv('summary.csv')  # Assuming you have test data in a CSV file
test_data["Generated_Summary"] = test_data["Text"].apply(lambda row: generate_summary(row))
test_data.to_csv('summary.csv', index=False)

# Calculate ROUGE scores
rouge_scores = calculate_rouge_scores(test_data, "Generated_Summary", "Summary")

# Print the ROUGE scores
for scores in rouge_scores:
    print(scores)


In [6]:
def calculate_average_rouge_scores(rouge_scores):
    rouge_1_precision = []
    rouge_1_recall = []
    rouge_1_f1 = []

    rouge_2_precision = []
    rouge_2_recall = []
    rouge_2_f1 = []

    rouge_l_precision = []
    rouge_l_recall = []
    rouge_l_f1 = []

    for scores in rouge_scores:
        # ROUGE-1
        rouge_1_precision.append(scores[0]['rouge-1']['p'])
        rouge_1_recall.append(scores[0]['rouge-1']['r'])
        rouge_1_f1.append(scores[0]['rouge-1']['f'])

        # ROUGE-2
        rouge_2_precision.append(scores[0]['rouge-2']['p'])
        rouge_2_recall.append(scores[0]['rouge-2']['r'])
        rouge_2_f1.append(scores[0]['rouge-2']['f'])

        # ROUGE-l
        rouge_l_precision.append(scores[0]['rouge-l']['p'])
        rouge_l_recall.append(scores[0]['rouge-l']['r'])
        rouge_l_f1.append(scores[0]['rouge-l']['f'])

    # Calculate average precision, recall, and F1-score for each ROUGE component
    avg_rouge_1_precision = sum(rouge_1_precision) / len(rouge_1_precision)
    avg_rouge_1_recall = sum(rouge_1_recall) / len(rouge_1_recall)
    avg_rouge_1_f1 = sum(rouge_1_f1) / len(rouge_1_f1)

    avg_rouge_2_precision = sum(rouge_2_precision) / len(rouge_2_precision)
    avg_rouge_2_recall = sum(rouge_2_recall) / len(rouge_2_recall)
    avg_rouge_2_f1 = sum(rouge_2_f1) / len(rouge_2_f1)

    avg_rouge_l_precision = sum(rouge_l_precision) / len(rouge_l_precision)
    avg_rouge_l_recall = sum(rouge_l_recall) / len(rouge_l_recall)
    avg_rouge_l_f1 = sum(rouge_l_f1) / len(rouge_l_f1)

    return {
        'rouge-1': {'precision': avg_rouge_1_precision, 'recall': avg_rouge_1_recall, 'f1': avg_rouge_1_f1},
        'rouge-2': {'precision': avg_rouge_2_precision, 'recall': avg_rouge_2_recall, 'f1': avg_rouge_2_f1},
        'rouge-l': {'precision': avg_rouge_l_precision, 'recall': avg_rouge_l_recall, 'f1': avg_rouge_l_f1},
        'average': {
            'rouge-1': avg_rouge_1_f1,
            'rouge-2': avg_rouge_2_f1,
            'rouge-l': avg_rouge_l_f1
        }
    }
# Example usage:
average_rouge_scores = calculate_average_rouge_scores(rouge_scores)

# Print the average ROUGE scores
print("Average ROUGE-1:")
print(f"Precision: {average_rouge_scores['rouge-1']['precision']:.2f}, "
      f"Recall: {average_rouge_scores['rouge-1']['recall']:.2f}, "
      f"F1-Score: {average_rouge_scores['rouge-1']['f1']:.2f}")

print("\nAverage ROUGE-2:")
print(f"Precision: {average_rouge_scores['rouge-2']['precision']:.2f}, "
      f"Recall: {average_rouge_scores['rouge-2']['recall']:.2f}, "
      f"F1-Score: {average_rouge_scores['rouge-2']['f1']:.2f}")

print("\nAverage ROUGE-l:")
print(f"Precision: {average_rouge_scores['rouge-l']['precision']:.2f}, "
      f"Recall: {average_rouge_scores['rouge-l']['recall']:.2f}, "
      f"F1-Score: {average_rouge_scores['rouge-l']['f1']:.2f}")

# Print the average ROUGE scores in a combined format
print("\nAverage ROUGE Scores (Combined):")
print("Average ROUGE-1 score:", average_rouge_scores['average']['rouge-1'])
print("Average ROUGE-2 score:", average_rouge_scores['average']['rouge-2'])
print("Average ROUGE-L score:", average_rouge_scores['average']['rouge-l'])


Average ROUGE-1:
Precision: 0.03, Recall: 0.23, F1-Score: 0.05

Average ROUGE-2:
Precision: 0.00, Recall: 0.06, F1-Score: 0.01

Average ROUGE-l:
Precision: 0.02, Recall: 0.22, F1-Score: 0.04

Average ROUGE Scores (Combined):
Average ROUGE-1 score: 0.045087922268717655
Average ROUGE-2 score: 0.009015151774630099
Average ROUGE-L score: 0.04400684118763658
