In [5]:
%pip install datasets torch
%pip install rouge-score transformers scikit-learn

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [6]:
%pip install --upgrade transformers
%pip install transformers[torch]

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [7]:
# Add before saving the model
import torch
import gc
gc.collect()
torch.cuda.empty_cache()  # If using GPU

In [8]:
import pandas as pd
import numpy as np
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    BartForConditionalGeneration,
    BartTokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForSeq2Seq
)
from datasets import Dataset
from sklearn.model_selection import train_test_split
import nltk
from nltk.translate.bleu_score import corpus_bleu
from rouge_score import rouge_scorer
import os
import warnings
warnings.filterwarnings('ignore')

# Create necessary directories
os.makedirs("models/fine_tuned_bart", exist_ok=True)
os.makedirs("data/evaluation", exist_ok=True)

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Check transformers version
import transformers
print(f"Transformers version: {transformers.__version__}")

# Load dataset
papers_df = pd.read_csv(r'C:\Users\Kanchan\Desktop\SciReX\data\cleaned\generative_ai_papers_enriched.csv')
print(f"Loaded {len(papers_df)} papers")

# Keep only necessary columns and remove missing values
summarization_df = papers_df[['abstract', 'title']].dropna()
print(f"After removing missing values: {len(summarization_df)} papers")

# Split data
train_df, test_df = train_test_split(summarization_df, test_size=0.2, random_state=42)
print(f"Training data: {len(train_df)}, Test data: {len(test_df)}")

# Load tokenizer and model
model_name = "facebook/bart-base"  # Could also use "allenai/scibert_scivocab_uncased" with adapter
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name).to(device)

# Define maximum input and output lengths
max_input_length = 512
max_output_length = 64

# Preprocess function
def preprocess_function(examples):
    inputs = [doc for doc in examples["abstract"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True, padding="max_length")

    labels = tokenizer(
        examples["title"], max_length=max_output_length, truncation=True, padding="max_length"
    )

    model_inputs["labels"] = labels["input_ids"]

    # Replace padding token id's with -100 so they're ignored in the loss
    labels_with_ignore = []
    for labels_example in model_inputs["labels"]:
        labels_example = [label if label != tokenizer.pad_token_id else -100 for label in labels_example]
        labels_with_ignore.append(labels_example)

    model_inputs["labels"] = labels_with_ignore

    return model_inputs

# Convert dataframes to datasets
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Preprocess datasets
tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_test = test_dataset.map(preprocess_function, batched=True)

# Data collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model
)

# Try to use modern parameters first, fall back to older ones if needed
try:
    # For newer versions of transformers
    training_args = TrainingArguments(
        output_dir="models/fine_tuned_bart",
        evaluation_strategy="steps",
        eval_steps=500,
        save_strategy="steps",
        save_steps=500,
        learning_rate=2e-5,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        weight_decay=0.01,
        save_total_limit=3,
        num_train_epochs=3,
        predict_with_generate=True,
        fp16=torch.cuda.is_available(),
        push_to_hub=False,
    )
except TypeError:
    # For older versions of transformers
    print("Falling back to older TrainingArguments configuration")
    training_args = TrainingArguments(
        output_dir="models/fine_tuned_bart",
        eval_steps=500,
        save_steps=500,
        learning_rate=2e-5,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        weight_decay=0.01,
        save_total_limit=3,
        num_train_epochs=3,
        # predict_with_generate is not available in older versions
        fp16=torch.cuda.is_available(),
        push_to_hub=False,
    )
    
# Define a custom trainer class if needed (for older transformers versions)
# that implements predict_with_generate functionality
class SummarizationTrainer(Trainer):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        
    def prediction_step(self, model, inputs, prediction_loss_only, ignore_keys=None):
        # Implement custom prediction step if needed
        # For older versions of transformers that don't support predict_with_generate
        # This is just a placeholder - implementation would depend on transformers version
        pass

# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Train the model
print("Starting model training...")
try:
    trainer.train()
except Exception as e:
    print(f"Error during training: {e}")
    print("Consider upgrading your transformers library: pip install --upgrade transformers")
    raise

# Save the fine-tuned model
model_save_path = "models/fine_tuned_bart"
trainer.save_model(model_save_path)
tokenizer.save_pretrained(model_save_path)
print(f"Model saved to {model_save_path}")

# Evaluation function
def generate_summary(abstract):
    inputs = tokenizer(abstract, max_length=max_input_length, truncation=True, return_tensors="pt").to(device)
    summary_ids = model.generate(
        inputs["input_ids"],
        num_beams=4,
        min_length=10,
        max_length=max_output_length,
        early_stopping=True
    )
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

# Generate summaries for test set
print("\nGenerating summaries for evaluation...")
test_abstracts = test_df["abstract"].tolist()
test_titles = test_df["title"].tolist()
generated_summaries = []

for abstract in test_abstracts:
    summary = generate_summary(abstract)
    generated_summaries.append(summary)

# Prepare for BLEU calculation
references_for_bleu = [[title.lower().split()] for title in test_titles]
candidates_for_bleu = [summary.lower().split() for summary in generated_summaries]

# Calculate BLEU score
bleu_score = corpus_bleu(references_for_bleu, candidates_for_bleu)
print(f"BLEU Score: {bleu_score:.4f}")

# Calculate ROUGE scores
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
rouge_scores = {'rouge1': 0, 'rouge2': 0, 'rougeL': 0}

for ref, hyp in zip(test_titles, generated_summaries):
    scores = scorer.score(ref, hyp)
    for key in rouge_scores:
        rouge_scores[key] += scores[key].fmeasure

# Average ROUGE scores
for key in rouge_scores:
    rouge_scores[key] /= len(test_titles)
    print(f"{key}: {rouge_scores[key]:.4f}")

# Save evaluation results
eval_results = pd.DataFrame({
    'abstract': test_abstracts[:10],  # Showing only first 10 for brevity
    'actual_title': test_titles[:10],
    'generated_summary': generated_summaries[:10]
})

eval_results.to_csv('data/evaluation/summarization_results.csv', index=False)
print("\nSample evaluation results saved to data/evaluation/summarization_results.csv")

Using device: cpu
Transformers version: 4.51.3
Loaded 200 papers
After removing missing values: 200 papers
Training data: 160, Test data: 40


Map: 100%|██████████| 160/160 [00:00<00:00, 285.73 examples/s]
Map: 100%|██████████| 40/40 [00:00<00:00, 250.36 examples/s]


Falling back to older TrainingArguments configuration
Starting model training...


Step,Training Loss


Model saved to models/fine_tuned_bart

Generating summaries for evaluation...
BLEU Score: 0.0554
rouge1: 0.3563
rouge2: 0.1477
rougeL: 0.3142

Sample evaluation results saved to data/evaluation/summarization_results.csv
