In [1]:
!pip install pandas torch transformers datasets scikit-learn rouge-score bert-score peft sacrebleu

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting peft
  Downloading peft-0.14.0-py3-none-any.whl.metadata (13 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.4.3-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.0.0-py3-none-any.whl.metadata (8.5 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading peft-0.14.0-py3-none-any.whl (374 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m374.8/374.8 kB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading sacrebleu-2.4.3-py3-none-any.whl 

In [3]:
import os
import pandas as pd
import torch
import random
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from rouge_score import rouge_scorer
from bert_score import score as bert_score
from peft import LoraConfig, get_peft_model, TaskType

In [4]:
# Load the dataset
input_path = '/kaggle/input/eco-news-toplu/eco_news_cleaned3.csv'
if not os.path.exists(input_path):
    raise FileNotFoundError(f"File not found: {input_path}")

# Read CSV file
df = pd.read_csv(input_path)

# Drop rows with NaN values in 'icerik' or 'ozet' columns
df = df.dropna(subset=['icerik', 'ozet'])

# Separate articles and summaries
articles = df['icerik'].tolist()
summaries = df['ozet'].tolist()

# Veriyi ilk olarak train ve test olarak ayırıyoruz
train_articles, test_articles, train_summaries, test_summaries = train_test_split(
    articles, summaries, test_size=0.2, random_state=42
)

# Train setini bir daha ayırarak train ve validation setlerini oluşturuyoruz
train_articles, val_articles, train_summaries, val_summaries = train_test_split(
    train_articles, train_summaries, test_size=0.1, random_state=42
)

print(f"Train set size: {len(train_articles)}")
print(f"Validation set size: {len(val_articles)}")
print(f"Test set size: {len(test_articles)}")

Train set size: 5576
Validation set size: 620
Test set size: 1550


In [5]:
# Her bir seti Hugging Face Dataset formatına dönüştürme
train_data = {'article': train_articles, 'summary': train_summaries}
val_data = {'article': val_articles, 'summary': val_summaries}
test_data = {'article': test_articles, 'summary': test_summaries}

train_dataset = Dataset.from_dict(train_data)
val_dataset = Dataset.from_dict(val_data)
test_dataset = Dataset.from_dict(test_data)

# DatasetDict formatında birleştirme
dataset = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset,
    'test': test_dataset
})

In [6]:
# Load tokenizer and model
model_name = 'facebook/mbart-large-50'
model = MBartForConditionalGeneration.from_pretrained(model_name)
tokenizer = MBart50TokenizerFast.from_pretrained(model_name)

config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/531 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

In [7]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [8]:
def preprocess_data(examples):
    inputs = [f"Özetle: {article}" for article in examples["article"]]
    targets = examples['summary']
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=150, truncation=True, padding="max_length")
    
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = dataset.map(preprocess_data, batched=True)

Map:   0%|          | 0/5576 [00:00<?, ? examples/s]

Map:   0%|          | 0/620 [00:00<?, ? examples/s]

Map:   0%|          | 0/1550 [00:00<?, ? examples/s]

In [None]:
# Eğitim ve doğrulama setini küçültme
#tokenized_dataset["train"] = tokenized_dataset["train"].filter(lambda example, index: index % 100 == 0, with_indices=True)
#tokenized_dataset["validation"] = tokenized_dataset["validation"].filter(lambda example, index: index % 100 == 0, with_indices=True)
#tokenized_dataset["test"] = tokenized_dataset["test"].filter(lambda example, index: index % 100 == 0, with_indices=True)


# Veri seti uzunluğunu yazdırma
#print(f"Filtered train set size: {len(tokenized_dataset['train'])}")
#print(f"Filtered validation set size: {len(tokenized_dataset['validation'])}")
#print(f"Filtered test set size: {len(tokenized_dataset['test'])}")

In [None]:
for name, module in model.named_modules():
    print(name)

In [10]:
lora_config = LoraConfig(
    r=32,
    lora_alpha=64,
    lora_dropout=0.1,
    bias='none',
    task_type=TaskType.SEQ_2_SEQ_LM,
    target_modules=["q_proj", "v_proj"],
)

model = get_peft_model(model, lora_config)

In [11]:
training_args = Seq2SeqTrainingArguments(
    output_dir='./results',
    eval_strategy="epoch",
    save_strategy="no",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=20,
    weight_decay=0.01,
    predict_with_generate=True,
    logging_dir='./logs',
    report_to="none",
)

In [12]:
# Initialize Seq2SeqTrainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
)

# Start training
trainer.train()

Epoch,Training Loss,Validation Loss
1,8.3252,8.105238
2,7.9835,7.811513
3,8.0381,7.713507
4,7.6652,7.572963
5,7.6203,7.537825
6,7.5665,7.519961
7,7.5733,7.506798
8,7.5379,7.500991
9,7.5218,7.497191
10,7.5323,7.491316


TrainOutput(global_step=27880, training_loss=7.634449546956263, metrics={'train_runtime': 16683.2145, 'train_samples_per_second': 6.685, 'train_steps_per_second': 1.671, 'total_flos': 1.2245569565097984e+17, 'train_loss': 7.634449546956263, 'epoch': 20.0})

In [13]:
# LoRA adaptasyonlarını kaydet
model.save_pretrained("lora_finetuned_model")
tokenizer.save_pretrained("lora_finetuned_model")
print("LoRA adaptasyonları başarıyla kaydedildi.")



LoRA adaptasyonları başarıyla kaydedildi.


In [14]:
# LoRA adaptasyonlarını temel modelle birleştir
merged_model = model.merge_and_unload()

# Birleştirilmiş tam modeli kaydet
merged_model.save_pretrained("full_finetuned_model")
tokenizer.save_pretrained("full_finetuned_model")
print("Birleştirilmiş tam model başarıyla kaydedildi.")



Birleştirilmiş tam model başarıyla kaydedildi.


In [None]:
# Save the fine-tuned model
#model.save_pretrained("fine_tuned_mbart_50_lora")
#tokenizer.save_pretrained("fine_tuned_mbart_50_lora")

#print("Fine-tuning complete. Model and tokenizer saved.")

# Eğitim tamamlandıktan sonra LoRA adaptasyonlarını kaydet
model.save_pretrained("lora_mbart_50_finetuned_model")
tokenizer.save_pretrained("lora_mbart_50_finetuned_model")
print("LoRA adaptasyonları kaydedildi.")

In [None]:

# LoRA ve temel modeli birleştirip kaydet
base_model = AutoModelForSeq2SeqLM.from_pretrained("facebook/mbart-large-50")
lora_model = PeftModel.from_pretrained(base_model, "lora_mbart_50_finetuned_model")
lora_model.save_pretrained("merged_model_lora_mbart_50")
tokenizer.save_pretrained("merged_model_lora_mbart_50")
print("Birleştirilmiş model kaydedildi.")

In [15]:
def generate_summary(model, tokenizer, text, max_length=150, min_length=30):
    """Haber metni için özet oluşturma."""
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    input_ids = tokenizer.encode(text, return_tensors="pt", max_length=512, truncation=True).to(device)
    summary_ids = model.generate(input_ids, max_length=max_length, min_length=min_length, num_beams=4, early_stopping=True)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# ROUGE Hesaplama Fonksiyonu
def evaluate_rouge(model, tokenizer):
    print("Calculating ROUGE scores...")
    rouge_results = []

    for article, reference_summary in zip(test_articles, test_summaries):
        predicted_summary = generate_summary(model, tokenizer, article)
        rouge = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
        scores = rouge.score(predicted_summary, reference_summary)
        rouge_results.append({
            "rouge1_f1": scores['rouge1'].fmeasure,
            "rouge2_f1": scores['rouge2'].fmeasure,
            "rougeL_f1": scores['rougeL'].fmeasure
        })

    avg_rouge = {
        key: sum(d[key] for d in rouge_results) / len(rouge_results)
        for key in rouge_results[0]
    }
    print(f"Average ROUGE Scores: {avg_rouge}")

# BLEU Hesaplama Fonksiyonu
def evaluate_bleu(model, tokenizer):
    print("Calculating BLEU scores...")
    bleu_results = []

    for article, reference_summary in zip(test_articles, test_summaries):
        predicted_summary = generate_summary(model, tokenizer, article)
        bleu_score = sacrebleu.sentence_bleu(predicted_summary, [reference_summary]).score
        bleu_results.append(bleu_score)

    avg_bleu = sum(bleu_results) / len(bleu_results)
    print(f"Average BLEU Score: {avg_bleu:.4f}")

# Corpus BLEU Hesaplama Fonksiyonu
def evaluate_corpus_bleu(model, tokenizer):
    print("Calculating Corpus BLEU score...")
    predictions = []
    references = []

    for article, reference_summary in zip(test_articles, test_summaries):
        predicted_summary = generate_summary(model, tokenizer, article)
        predictions.append(predicted_summary)
        references.append([reference_summary])  # SacreBLEU çoklu referansı destekler, bu yüzden liste içinde olmalı

    bleu_score = sacrebleu.corpus_bleu(predictions, references).score
    print(f"Corpus BLEU Score: {bleu_score:.4f}")

# BERTScore Hesaplama Fonksiyonu
def evaluate_bertscore(model, tokenizer):
    print("Calculating BERTScore...")
    bert_results = []

    for article, reference_summary in zip(test_articles, test_summaries):
        predicted_summary = generate_summary(model, tokenizer, article)
        P, R, F1 = bert_score([predicted_summary], [reference_summary], lang="tr")
        bert_results.append(F1.mean().item())

    avg_bert = sum(bert_results) / len(bert_results)
    print(f"Average BERTScore: {avg_bert:.4f}")

In [19]:
import sacrebleu

In [16]:
evaluate_rouge(merged_model, tokenizer)

Calculating ROUGE scores...
Average ROUGE Scores: {'rouge1_f1': 0.5450307142660759, 'rouge2_f1': 0.44234999014827164, 'rougeL_f1': 0.5138451239983941}


In [20]:
evaluate_bleu(merged_model, tokenizer)

Calculating BLEU scores...
Average BLEU Score: 32.8252


In [21]:
evaluate_corpus_bleu(merged_model, tokenizer)

Calculating Corpus BLEU score...
Corpus BLEU Score: 78.6645


In [17]:
evaluate_bertscore(merged_model, tokenizer)

Calculating BERTScore...


tokenizer_config.json:   0%|          | 0.00/60.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/251k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

Average BERTScore: 0.7756
