In [1]:
!pip install pandas torch transformers datasets scikit-learn rouge-score bert-score sacrebleu

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.4.3-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.0.0-py3-none-any.whl.metadata (8.5 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading sacrebleu-2.4.3-py3-none-any.whl (103 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.0/104.0 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading portalocker-3.0.0-py3-none-any.whl (19 kB)
Building wheels for collected packages: rouge-score
  Building 

In [2]:
import os
import pandas as pd
import torch
from transformers import MT5Tokenizer, MT5ForConditionalGeneration, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from rouge_score import rouge_scorer
from bert_score import score as bert_score
import sacrebleu
import random

In [3]:
# Load the dataset
input_path = '/kaggle/input/eco-news-toplu/eco_news_cleaned3.csv'
if not os.path.exists(input_path):
    raise FileNotFoundError(f"File not found: {input_path}")

# Read CSV file
df = pd.read_csv(input_path)

# Drop rows with NaN values in 'icerik' or 'ozet' columns
df = df.dropna(subset=['icerik', 'ozet'])

# Separate articles and summaries
articles = df['icerik'].tolist()
summaries = df['ozet'].tolist()

# Veriyi ilk olarak train ve test olarak ayırıyoruz
train_articles, test_articles, train_summaries, test_summaries = train_test_split(
    articles, summaries, test_size=0.2, random_state=42
)

# Train setini bir daha ayırarak train ve validation setlerini oluşturuyoruz
train_articles, val_articles, train_summaries, val_summaries = train_test_split(
    train_articles, train_summaries, test_size=0.1, random_state=42
)

print(f"Train set size: {len(train_articles)}")
print(f"Validation set size: {len(val_articles)}")
print(f"Test set size: {len(test_articles)}")

Train set size: 5576
Validation set size: 620
Test set size: 1550


In [4]:
# Her bir seti Hugging Face Dataset formatına dönüştürme
train_data = {'article': train_articles, 'summary': train_summaries}
val_data = {'article': val_articles, 'summary': val_summaries}
test_data = {'article': test_articles, 'summary': test_summaries}

train_dataset = Dataset.from_dict(train_data)
val_dataset = Dataset.from_dict(val_data)
test_dataset = Dataset.from_dict(test_data)

# DatasetDict formatında birleştirme
dataset = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset,
    'test': test_dataset
})

In [5]:
# Load tokenizer and model
model_name = 'google/mt5-base'
model = MT5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = MT5Tokenizer.from_pretrained(model_name)

config.json:   0%|          | 0.00/702 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.33G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/376 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


model.safetensors:   0%|          | 0.00/2.33G [00:00<?, ?B/s]

In [6]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [7]:
def preprocess_data(examples):
    inputs = [f"Özetle: {article}" for article in examples["article"]]
    targets = examples['summary']
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=150, truncation=True, padding="max_length")
    
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs
    
tokenized_dataset = dataset.map(preprocess_data, batched=True)

Map:   0%|          | 0/5576 [00:00<?, ? examples/s]

Map:   0%|          | 0/620 [00:00<?, ? examples/s]

Map:   0%|          | 0/1550 [00:00<?, ? examples/s]

In [8]:
training_args = Seq2SeqTrainingArguments(
    output_dir='./results',
    eval_strategy="epoch",
    save_strategy="no",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=20,
    weight_decay=0.01,
    predict_with_generate=True,
    logging_dir='./logs',
    report_to="none",
)

In [9]:
# Initialize Seq2SeqTrainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
)

# Start training
trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,3.3826,0.536709
2,0.5018,0.325035
3,0.3825,0.28042
4,0.3464,0.273189
5,0.3297,0.263144
6,0.3205,0.254422
7,0.2944,0.249078
8,0.2814,0.243436
9,0.2839,0.244794
10,0.2637,0.24175


TrainOutput(global_step=27880, training_loss=0.7404609048041586, metrics={'train_runtime': 17179.5512, 'train_samples_per_second': 6.491, 'train_steps_per_second': 1.623, 'total_flos': 1.3371762460852224e+17, 'train_loss': 0.7404609048041586, 'epoch': 20.0})

In [10]:
# Save the fine-tuned model
model.save_pretrained("fine_tuned_mt5-base_7200")
tokenizer.save_pretrained("fine_tuned_mt5-base_7200")

print("Fine-tuning complete. Model and tokenizer saved.")

Fine-tuning complete. Model and tokenizer saved.


In [11]:
def generate_summary(model, tokenizer, text, max_length=150, min_length=30):
    """Haber metni için özet oluşturma."""
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    text = f'Özetle {text}'
    input_ids = tokenizer.encode(text, return_tensors="pt", max_length=512, truncation=True).to(device)
    summary_ids = model.generate(input_ids, max_length=max_length, min_length=min_length, num_beams=4, early_stopping=True)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

In [12]:
# ROUGE Hesaplama Fonksiyonu
def evaluate_rouge(model, tokenizer):
    print("Calculating ROUGE scores...")
    rouge_results = []

    for article, reference_summary in zip(test_articles, test_summaries):
        predicted_summary = generate_summary(model, tokenizer, article)
        rouge = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
        scores = rouge.score(predicted_summary, reference_summary)
        rouge_results.append({
            "rouge1_f1": scores['rouge1'].fmeasure,
            "rouge2_f1": scores['rouge2'].fmeasure,
            "rougeL_f1": scores['rougeL'].fmeasure
        })

    avg_rouge = {
        key: sum(d[key] for d in rouge_results) / len(rouge_results)
        for key in rouge_results[0]
    }
    print(f"Average ROUGE Scores: {avg_rouge}")

# BLEU Hesaplama Fonksiyonu
def evaluate_bleu(model, tokenizer):
    print("Calculating BLEU scores...")
    bleu_results = []

    for article, reference_summary in zip(test_articles, test_summaries):
        predicted_summary = generate_summary(model, tokenizer, article)
        bleu_score = sacrebleu.sentence_bleu(predicted_summary, [reference_summary]).score
        bleu_results.append(bleu_score)

    avg_bleu = sum(bleu_results) / len(bleu_results)
    print(f"Average BLEU Score: {avg_bleu:.4f}")
    # Corpus BLEU Hesaplama Fonksiyonu

def evaluate_corpus_bleu(model, tokenizer):
    print("Calculating Corpus BLEU score...")
    predictions = []
    references = []

    for article, reference_summary in zip(test_articles, test_summaries):
        predicted_summary = generate_summary(model, tokenizer, article)
        predictions.append(predicted_summary)
        references.append([reference_summary])  # SacreBLEU çoklu referansı destekler, bu yüzden liste içinde olmalı

    bleu_score = sacrebleu.corpus_bleu(predictions, references).score
    print(f"Corpus BLEU Score: {bleu_score:.4f}")

# BERTScore Hesaplama Fonksiyonu
def evaluate_bertscore(model, tokenizer):
    print("Calculating BERTScore...")
    bert_results = []

    for article, reference_summary in zip(test_articles, test_summaries):
        predicted_summary = generate_summary(model, tokenizer, article)
        P, R, F1 = bert_score([predicted_summary], [reference_summary], lang="tr")
        bert_results.append(F1.mean().item())

    avg_bert = sum(bert_results) / len(bert_results)
    print(f"Average BERTScore: {avg_bert:.4f}")

In [13]:
evaluate_rouge(model, tokenizer)

Calculating ROUGE scores...
Average ROUGE Scores: {'rouge1_f1': 0.6002937514363476, 'rouge2_f1': 0.49924526596227137, 'rougeL_f1': 0.5724003102508476}


In [14]:
evaluate_bleu(model, tokenizer)
evaluate_corpus_bleu(model, tokenizer)

Calculating BLEU scores...
Average BLEU Score: 39.2424
Calculating Corpus BLEU score...
Corpus BLEU Score: 59.1272


In [15]:
evaluate_bertscore(model, tokenizer)

Calculating BERTScore...


tokenizer_config.json:   0%|          | 0.00/60.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/251k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

Average BERTScore: 0.8014


In [16]:
import random
# Rastgele bir örnek seçme
random_index = random.randint(0, len(test_articles) - 1)
article = test_articles[random_index]
reference_summary = test_summaries[random_index]
predicted_summary = generate_summary(model, tokenizer, article)

# Seçilen rastgele örneği yazdırma
print(f"\n------ Rastgele Seçilen Haber {random_index + 1} ------")
print(f"--- Orijinal Metin ---\n{article}")
print(f"--- Modelin Oluşturduğu Özet ---\n{predicted_summary}")
print(f"--- Gerçek Özet ---\n{reference_summary}\n")


------ Rastgele Seçilen Haber 1310 ------
--- Orijinal Metin ---
**Borsa İstanbul**'da **BIST 100 endeksi**, önceki **kapanış**a göre 0,61 puan azalırken, toplam işlem hacmi 19,7 milyar lira seviyesinde gerçekleşti.  Bankacılık endeksi yüzde 1,10 ve holding endeksi yüzde 0,37 değer kazandı. Sektör endeksleri arasında en fazla kazandıran yüzde 1,28 ile metal eşya makine, en çok gerileyen ise yüzde 9,93 ile spor oldu.  Güne yükselişle başlayan ve bankacılık hisselerinde yoğunlaşan alımların etkisiyle 1.105,98 puanı gören BIST 100 endeksi, kapanışa yakın kazançlarını geri vererek günü yüzde 0,06 düşüşle 1.099,06 puandan tamamladı.  Analistler, yarın yurt içinde temmuz ayı ödemeler dengesi istatistikleri, yurt dışında ise İngiltere'de sanayi üretimi, Avro Bölgesi, Almanya ve ABD'de Tüketici Fiyat Endeksi verilerinin takip edileceğini bildirdi.  Yeni tip koronavirüs tedavisine ilişkin haber akışı ve Doğu Akdeniz başta olmak üzere jeopolitik gelişmelerin gündemin odağındaki yerini koruduğun