In [1]:
!pip install pandas torch transformers datasets scikit-learn rouge-score bert-score sacrebleu

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.4.3-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.0.0-py3-none-any.whl.metadata (8.5 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading sacrebleu-2.4.3-py3-none-any.whl (103 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.0/104.0 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading portalocker-3.0.0-py3-none-any.whl (19 kB)
Building wheels for collected packages: rouge-score
  Building 

# VERİ SETİNİ HAZIRLA

In [2]:
import os
import pandas as pd
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from rouge_score import rouge_scorer
from bert_score import score as bert_score
import sacrebleu

In [3]:
# Load the dataset
input_path = '/kaggle/input/eco-news-me/haberler.csv'
if not os.path.exists(input_path):
    raise FileNotFoundError(f"File not found: {input_path}")

# Read CSV file
df = pd.read_csv(input_path)

# Drop rows with NaN values in 'icerik' or 'ozet' columns
df = df.dropna(subset=['icerik', 'ozet'])

# Separate articles and summaries
articles = df['icerik'].tolist()
summaries = df['ozet'].tolist()

# Split the dataset into train, validation and test sets
train_articles, test_articles, train_summaries, test_summaries = train_test_split(
    articles, summaries, test_size=0.2, random_state=42
)

# Split the train set into train and validation sets
train_articles, val_articles, train_summaries, val_summaries = train_test_split(
    train_articles, train_summaries, test_size=0.1, random_state=42
)

print(f"Train set size: {len(train_articles)}")
print(f"Validation set size: {len(val_articles)}")
print(f"Test set size: {len(test_articles)}")

Train set size: 172
Validation set size: 20
Test set size: 48


In [4]:
# Tokenizer
train_data = {'article': train_articles, 'summary': train_summaries}
val_data = {'article': val_articles, 'summary': val_summaries}
test_data = {'article': test_articles, 'summary': test_summaries}

train_dataset = Dataset.from_dict(train_data)
val_dataset = Dataset.from_dict(val_data)
test_dataset = Dataset.from_dict(test_data)

dataset = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset,
    'test': test_dataset
})

In [5]:
# Tokenize the dataset
model_name = 'google/flan-t5-base'
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [6]:
# Define the data collator
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [7]:
# Preprocess the data
def preprocess_data(examples):
    inputs = [f"Özetle: {article}" for article in examples["article"]]
    targets = examples['summary']
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=150, truncation=True, padding="max_length")
    
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs
    
tokenized_dataset = dataset.map(preprocess_data, batched=True)

Map:   0%|          | 0/172 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Map:   0%|          | 0/48 [00:00<?, ? examples/s]

# MODELİ EĞİT

In [8]:
# Define the evaluation metric
training_args = Seq2SeqTrainingArguments(
    output_dir='./results',
    eval_strategy="epoch",
    save_strategy="no",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=100,
    weight_decay=0.01,
    predict_with_generate=True,
    logging_dir='./logs',
    report_to="none",
)

In [9]:
# Define the trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
)

# Train the model
trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,No log,4.166616
2,No log,2.53634
3,No log,1.822128
4,No log,1.67906
5,No log,1.522172
6,No log,1.401485
7,No log,1.331476
8,No log,1.305739
9,No log,1.285748
10,No log,1.270098


TrainOutput(global_step=4300, training_loss=0.8692446083246276, metrics={'train_runtime': 1952.1654, 'train_samples_per_second': 8.811, 'train_steps_per_second': 2.203, 'total_flos': 1.17778264621056e+16, 'train_loss': 0.8692446083246276, 'epoch': 100.0})

In [None]:
# Save the fine-tuned model
model.save_pretrained("fine_tuned_flan-t5-200")
tokenizer.save_pretrained("fine_tuned_flan-t5-200")

print("Fine-tuning complete. Model and tokenizer saved.")

# TEST

In [16]:
def generate_summary(model, tokenizer, text, max_length=150, min_length=30):
    """Haber metni için özet oluşturma."""
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    text = f'Özetle {text}'
    input_ids = tokenizer.encode(text, return_tensors="pt", max_length=512, truncation=True).to(device)
    summary_ids = model.generate(input_ids, max_length=max_length, min_length=min_length, num_beams=4, early_stopping=True)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

In [11]:
# ROUGE Hesaplama Fonksiyonu
def evaluate_rouge(model, tokenizer):
    print("Calculating ROUGE scores...")
    rouge_results = []

    for article, reference_summary in zip(test_articles, test_summaries):
        predicted_summary = generate_summary(model, tokenizer, article)
        rouge = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
        scores = rouge.score(predicted_summary, reference_summary)
        rouge_results.append({
            "rouge1_f1": scores['rouge1'].fmeasure,
            "rouge2_f1": scores['rouge2'].fmeasure,
            "rougeL_f1": scores['rougeL'].fmeasure
        })

    avg_rouge = {
        key: sum(d[key] for d in rouge_results) / len(rouge_results)
        for key in rouge_results[0]
    }
    print(f"Average ROUGE Scores: {avg_rouge}")

# BLEU Hesaplama Fonksiyonu
def evaluate_bleu(model, tokenizer):
    print("Calculating BLEU scores...")
    bleu_results = []

    for article, reference_summary in zip(test_articles, test_summaries):
        predicted_summary = generate_summary(model, tokenizer, article)
        bleu_score = sacrebleu.sentence_bleu(predicted_summary, [reference_summary]).score
        bleu_results.append(bleu_score)

    avg_bleu = sum(bleu_results) / len(bleu_results)
    print(f"Average BLEU Score: {avg_bleu:.4f}")
    # Corpus BLEU Hesaplama Fonksiyonu

def evaluate_corpus_bleu(model, tokenizer):
    print("Calculating Corpus BLEU score...")
    predictions = []
    references = []

    for article, reference_summary in zip(test_articles, test_summaries):
        predicted_summary = generate_summary(model, tokenizer, article)
        predictions.append(predicted_summary)
        references.append([reference_summary])  # SacreBLEU çoklu referansı destekler, bu yüzden liste içinde olmalı

    bleu_score = sacrebleu.corpus_bleu(predictions, references).score
    print(f"Corpus BLEU Score: {bleu_score:.4f}")

# BERTScore Hesaplama Fonksiyonu
def evaluate_bertscore(model, tokenizer):
    print("Calculating BERTScore...")
    bert_results = []

    for article, reference_summary in zip(test_articles, test_summaries):
        predicted_summary = generate_summary(model, tokenizer, article)
        P, R, F1 = bert_score([predicted_summary], [reference_summary], lang="tr")
        bert_results.append(F1.mean().item())

    avg_bert = sum(bert_results) / len(bert_results)
    print(f"Average BERTScore: {avg_bert:.4f}")

In [12]:
evaluate_rouge(model, tokenizer)

Calculating ROUGE scores...
Average ROUGE Scores: {'rouge1_f1': 0.3370354590904768, 'rouge2_f1': 0.16195183920750106, 'rougeL_f1': 0.2819354606201934}


In [13]:
evaluate_bleu(model, tokenizer)
evaluate_corpus_bleu(model, tokenizer)

Calculating BLEU scores...
Average BLEU Score: 7.0374
Calculating Corpus BLEU score...
Corpus BLEU Score: 18.9022


In [14]:
evaluate_bertscore(model, tokenizer)

Calculating BERTScore...


tokenizer_config.json:   0%|          | 0.00/60.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/251k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

Average BERTScore: 0.5884


## Test Veri Setinden Random Örnek

In [17]:
import random
# Rastgele bir örnek seçme
random_index = random.randint(0, len(test_articles) - 1)
article = test_articles[random_index]
reference_summary = test_summaries[random_index]
predicted_summary = generate_summary(model, tokenizer, article)

# Seçilen rastgele örneği yazdırma
print(f"\n------ Rastgele Seçilen Haber {random_index + 1} ------")
print(f"--- Orijinal Metin ---\n{article}")
print(f"--- Modelin Oluşturduğu Özet ---\n{predicted_summary}")
print(f"--- Gerçek Özet ---\n{reference_summary}\n")


------ Rastgele Seçilen Haber 8 ------
--- Orijinal Metin ---
Otokar Otomotiv ve Savunma Sanayi (OTKAR), bugün Kamuyu Aydınlatma Platformu’na gönderdiği açıklamada temettü kararı hakkında bilgi verdi. Açıklamada, şirketin 2023 yılında 1.9 milyar lira net dönem karı elde ettiği belirtilerek ortaklara ödenecek toplam kar payı tutarının 720 milyon lira olduğu kaydedildi. Otokar'ın hisse başına 5 lira 40 kuruş temettü ödemesinin kararlaştırıldığı ifade edildi. Temettünün dağıtılacağı tarih henüz açıklanmadı.
--- Modelin Oluşturduğu Özet ---
Otokar, 2023 ylnda 1.9 milyar TL net dönem kar elde ettii belirtilerek ortaklara ödenecek toplam kar pay tutarn 720 milyon TL oldu. Temettü ödemesinin datlacak.
--- Gerçek Özet ---
Otokar, 2023 yılında elde ettiği 1,9 milyar TL net kar sonrası yatırımcılarına hisse başına 5,40 TL temettü ödeyeceğini duyurdu. Toplamda 720 milyon TL kar payı dağıtılacak, ancak dağıtım tarihi henüz açıklanmadı.

