In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer
import re

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import unicodedata # Emoji ve özel semboller için
import nltk

nltk.download("stopwords")
stopwords = nltk.corpus.stopwords.words('english')

def clean_text(text):
    text = text.lower()  # küçük harfe çevir
    text = re.sub(r'[^\w\s]', '', text)  # noktalama işaretlerini kaldır
    text = re.sub(r'\s+', ' ', text).strip()  # fazla boşlukları sadeleştir
    text = re.sub(r'<.*?>', '', text) # HTML etiketlerini kaldır
    text = re.sub(r"https?://\S+|www\S+", "", text) # URL'leri kaldır
    text = re.sub(r'#\S+', '', text) # Hashtag'leri kaldır
    text = re.sub(r"@\S+", "", text) # metindeki mentionleri kaldır
    text = " ".join([word for word in text.split() if word not in stopwords])
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8')
    return text

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/melikenurcaydan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
dataset = load_dataset("cnn_dailymail", "3.0.0")

In [4]:
tokenizer = AutoTokenizer.from_pretrained("t5-small")  # veya "facebook/bart-base"

max_input_length = 512
max_target_length = 128



In [5]:
def preprocess(batch):
    # article ve highlights birer liste olduğundan her eleman için ayrı ayrı temizle
    cleaned_articles = [clean_text(a) for a in batch["article"]]
    cleaned_summaries = [clean_text(s) for s in batch["highlights"]]

    # her biri için token işlemi uygula
    inputs = tokenizer(cleaned_articles, max_length=max_input_length, padding="max_length", truncation=True)
    targets = tokenizer(cleaned_summaries, max_length=max_target_length, padding="max_length", truncation=True)

    inputs["labels"] = targets["input_ids"]
    return inputs

In [6]:
tokenized_dataset = dataset.map(preprocess, batched=True)

Map: 100%|██████████| 11490/11490 [00:12<00:00, 939.22 examples/s]


In [7]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments
from transformers import EarlyStoppingCallback

In [8]:
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small") #özetleme görevinde kullanılacak Transformer modeli yüklenir ve ardından Hugging Face'in Trainer API’si için eğitim konfigürasyonlarını tanımlanır. 



In [13]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",                      # model ve log çıktıları buraya kaydedilecek
    evaluation_strategy="epoch",                 # her epoch sonunda değerlendirme
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=5,                          # hızlı prototipleme için 3 epoch
    weight_decay=0.01,
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",   # hangi metrik izlenecek
    greater_is_better=False,  
    logging_dir="./logs",
    logging_steps=100
)

In [17]:
import torch
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
model.to(device) 

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"].select(range(2000)),
    eval_dataset=tokenized_dataset["validation"].select(range(500)),
    tokenizer=tokenizer,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]  # 2 epoch boyunca gelişme yoksa dur
)

In [18]:
trainer.train()

  0%|          | 0/2500 [00:00<?, ?it/s]You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  4%|▍         | 100/2500 [00:56<20:52,  1.92it/s]

{'loss': 5.5415, 'learning_rate': 1.9200000000000003e-05, 'epoch': 0.2}


  8%|▊         | 200/2500 [01:53<37:25,  1.02it/s]

{'loss': 1.5303, 'learning_rate': 1.8400000000000003e-05, 'epoch': 0.4}


 12%|█▏        | 300/2500 [03:41<32:27,  1.13it/s]

{'loss': 1.3437, 'learning_rate': 1.76e-05, 'epoch': 0.6}


 16%|█▌        | 400/2500 [05:14<30:17,  1.16it/s]

{'loss': 1.2996, 'learning_rate': 1.6800000000000002e-05, 'epoch': 0.8}


 20%|██        | 500/2500 [06:47<33:01,  1.01it/s]

{'loss': 1.2425, 'learning_rate': 1.6000000000000003e-05, 'epoch': 1.0}


                                                  
 20%|██        | 500/2500 [07:23<33:01,  1.01it/s]

{'eval_loss': 0.9200246930122375, 'eval_runtime': 35.8698, 'eval_samples_per_second': 13.939, 'eval_steps_per_second': 3.485, 'epoch': 1.0}


 24%|██▍       | 600/2500 [09:06<28:08,  1.13it/s]  

{'loss': 1.1894, 'learning_rate': 1.5200000000000002e-05, 'epoch': 1.2}


 28%|██▊       | 700/2500 [10:38<31:03,  1.04s/it]

{'loss': 1.191, 'learning_rate': 1.4400000000000001e-05, 'epoch': 1.4}


 32%|███▏      | 800/2500 [12:15<28:50,  1.02s/it]

{'loss': 1.1775, 'learning_rate': 1.3600000000000002e-05, 'epoch': 1.6}


 36%|███▌      | 900/2500 [13:52<26:16,  1.01it/s]

{'loss': 1.1843, 'learning_rate': 1.2800000000000001e-05, 'epoch': 1.8}


 40%|████      | 1000/2500 [15:29<23:08,  1.08it/s]

{'loss': 1.1292, 'learning_rate': 1.2e-05, 'epoch': 2.0}


                                                   
 40%|████      | 1000/2500 [16:04<23:08,  1.08it/s]

{'eval_loss': 0.8656452894210815, 'eval_runtime': 35.1798, 'eval_samples_per_second': 14.213, 'eval_steps_per_second': 3.553, 'epoch': 2.0}


 44%|████▍     | 1100/2500 [17:53<25:35,  1.10s/it]  

{'loss': 1.15, 'learning_rate': 1.1200000000000001e-05, 'epoch': 2.2}


 48%|████▊     | 1200/2500 [19:31<22:28,  1.04s/it]

{'loss': 1.1129, 'learning_rate': 1.04e-05, 'epoch': 2.4}


 52%|█████▏    | 1300/2500 [21:13<20:00,  1.00s/it]

{'loss': 1.1398, 'learning_rate': 9.600000000000001e-06, 'epoch': 2.6}


 56%|█████▌    | 1400/2500 [22:51<20:16,  1.11s/it]

{'loss': 1.1311, 'learning_rate': 8.8e-06, 'epoch': 2.8}


 60%|██████    | 1500/2500 [24:41<19:39,  1.18s/it]

{'loss': 1.0892, 'learning_rate': 8.000000000000001e-06, 'epoch': 3.0}


                                                   
 60%|██████    | 1500/2500 [25:24<19:39,  1.18s/it]

{'eval_loss': 0.8523855209350586, 'eval_runtime': 43.8588, 'eval_samples_per_second': 11.4, 'eval_steps_per_second': 2.85, 'epoch': 3.0}


 64%|██████▍   | 1600/2500 [27:18<15:25,  1.03s/it]  

{'loss': 1.1199, 'learning_rate': 7.2000000000000005e-06, 'epoch': 3.2}


 68%|██████▊   | 1700/2500 [29:07<14:58,  1.12s/it]

{'loss': 1.1151, 'learning_rate': 6.4000000000000006e-06, 'epoch': 3.4}


 72%|███████▏  | 1800/2500 [30:56<12:05,  1.04s/it]

{'loss': 1.0851, 'learning_rate': 5.600000000000001e-06, 'epoch': 3.6}


 76%|███████▌  | 1900/2500 [32:46<09:51,  1.01it/s]

{'loss': 1.1001, 'learning_rate': 4.800000000000001e-06, 'epoch': 3.8}


 80%|████████  | 2000/2500 [34:29<08:15,  1.01it/s]

{'loss': 1.0886, 'learning_rate': 4.000000000000001e-06, 'epoch': 4.0}


                                                   
 80%|████████  | 2000/2500 [35:07<08:15,  1.01it/s]

{'eval_loss': 0.8468968272209167, 'eval_runtime': 37.8398, 'eval_samples_per_second': 13.214, 'eval_steps_per_second': 3.303, 'epoch': 4.0}


 84%|████████▍ | 2100/2500 [36:53<06:06,  1.09it/s]  

{'loss': 1.1123, 'learning_rate': 3.2000000000000003e-06, 'epoch': 4.2}


 88%|████████▊ | 2200/2500 [38:35<04:58,  1.01it/s]

{'loss': 1.1105, 'learning_rate': 2.4000000000000003e-06, 'epoch': 4.4}


 92%|█████████▏| 2300/2500 [40:14<03:04,  1.08it/s]

{'loss': 1.0923, 'learning_rate': 1.6000000000000001e-06, 'epoch': 4.6}


 96%|█████████▌| 2400/2500 [41:50<01:34,  1.06it/s]

{'loss': 1.1118, 'learning_rate': 8.000000000000001e-07, 'epoch': 4.8}


100%|██████████| 2500/2500 [43:10<00:00,  1.26it/s]

{'loss': 1.046, 'learning_rate': 0.0, 'epoch': 5.0}


                                                   
100%|██████████| 2500/2500 [43:41<00:00,  1.26it/s]

{'eval_loss': 0.8460965156555176, 'eval_runtime': 30.579, 'eval_samples_per_second': 16.351, 'eval_steps_per_second': 4.088, 'epoch': 5.0}


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].
100%|██████████| 2500/2500 [43:42<00:00,  1.05s/it]

{'train_runtime': 2622.6072, 'train_samples_per_second': 3.813, 'train_steps_per_second': 0.953, 'train_loss': 1.3373460021972656, 'epoch': 5.0}





TrainOutput(global_step=2500, training_loss=1.3373460021972656, metrics={'train_runtime': 2622.6072, 'train_samples_per_second': 3.813, 'train_steps_per_second': 0.953, 'train_loss': 1.3373460021972656, 'epoch': 5.0})

In [25]:
import evaluate
rouge = evaluate.load("rouge")

In [26]:
model.to(device)
model.eval() 

for i in range(5):
    article = tokenized_dataset["test"][i]["article"]
    reference = tokenized_dataset["test"][i]["highlights"]

    inputs = tokenizer(article, return_tensors="pt", truncation=True, padding="max_length", max_length=512) #Haber metni, tokenizer ile modele uygun formata dönüştürülüyor (tensor olarak).
    input_ids = inputs["input_ids"].to(device)  # giriş verisini MPS'e taşı

    summary_ids = model.generate(input_ids, max_length=128, num_beams=4, early_stopping=True)
    generated_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    print(f"\nHaber {i+1}")
    print("Model Özeti:\n", generated_summary)
    print("Gerçek Özet:\n", reference)



Haber 1
Model Özeti:
 the 123rd member of the international Criminal Court is a step that gives the court jurisdiction over alleged crimes in palestinians. the ICC opened a preliminary examination into the situation in Palestinian territories.
Gerçek Özet:
 Membership gives the ICC jurisdiction over alleged crimes committed in Palestinian territories since last June .
Israel and the United States opposed the move, which could open the door to war crimes investigations against Israelis .

Haber 2
Model Özeti:
 Theia is a friendly white-and-black bully breed mix now named theia. she was found four days after being hit by a car and buried in a field. the dog has been receiving care at the Veterinary Teaching Hospital. she suffered dislocated jaw, leg injuries and a caved-in sinus cavity.
Gerçek Özet:
 Theia, a bully breed mix, was apparently hit by a car, whacked with a hammer and buried in a field .
"She's a true miracle dog and she deserves a good life," says Sara Mellado, who is looki

In [27]:
generated_summaries = []
reference_summaries = []

for i in range(5):
    article = tokenized_dataset["test"][i]["article"]
    reference = tokenized_dataset["test"][i]["highlights"]

    inputs = tokenizer(article, return_tensors="pt", truncation=True, padding="max_length", max_length=512)
    input_ids = inputs["input_ids"].to(device)
    model.to(device)
    model.eval()

    summary_ids = model.generate(input_ids, max_length=128, num_beams=4, early_stopping=True)
    generated = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    generated_summaries.append(generated)
    reference_summaries.append(reference)

# ROUGE hesapla
rouge_scores = rouge.compute(predictions=generated_summaries, references=reference_summaries)

# Sonuçları yazdır
print("ROUGE-1:", rouge_scores["rouge1"])
print("ROUGE-2:", rouge_scores["rouge2"])
print("ROUGE-L:", rouge_scores["rougeL"])


ROUGE-1: 0.3342826888502629
ROUGE-2: 0.13014166959372436
ROUGE-L: 0.2439524660081373
