In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments

from datasets import Dataset, DatasetDict, concatenate_datasets, load_dataset

In [None]:
import os
# Set TOKENIZERS_PARALLELISM to "false" in this process's environment before any
# tokenizer is created further down the notebook.
os.environ["TOKENIZERS_PARALLELISM"] = "false"


In [None]:
# 1. Load the model and tokenizer
# NOTE(review): trust_remote_code=True is passed for the model load — confirm it
# is actually required for this checkpoint.
model_name = "SmolLM2-360M-Instruct-v1-model"  # checkpoint to fine-tune; the name indicates SmolLM2-360M-Instruct, not Mistral 7B
model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

In [None]:
#tokenizer.add_special_tokens({'pad_token': '[PAD]'})


In [None]:
# 2. Load the three Turkish instruction / QA datasets (none of them is Tapaco).
# dataset2 and dataset3 have their question/answer columns renamed to
# "Input"/"Output"; dataset is used as-is (presumably it already has those
# columns — verify). dataset3 is loaded without split=..., so it is indexed by
# split name ("train"/"test") later in the notebook.
# NOTE(review): the cache_dir is an absolute, machine-specific path repeated three times.
dataset = load_dataset("turkish-nlp-suite/InstrucTurca", split="train", cache_dir="/media/hosman/Yedek/Datasets/")  # InstrucTurca, train split
dataset2 = load_dataset("Metin/WikiRAG-TR", split="train", cache_dir="/media/hosman/Yedek/Datasets/").rename_columns({"question": "Input", "answer": "Output"})
dataset3 = load_dataset("kayrab/patient-doctor-qa-tr-167732",  cache_dir="/media/hosman/Yedek/Datasets/").rename_columns({"question_content": "Input", "question_answer": "Output"})

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments

from datasets import Dataset, DatasetDict, concatenate_datasets, load_dataset

In [None]:
# Display the three loaded datasets to inspect their splits and columns.
dataset,dataset2,dataset3 

In [None]:
# dataset3 was loaded without split=..., so it is keyed by split name; flatten
# its "train" and "test" splits into one Dataset. The isinstance guard makes the
# cell safe to re-run: once flattened, dataset3 no longer has split keys and
# indexing it with "train" would fail.
if isinstance(dataset3, DatasetDict):
    dataset3 = concatenate_datasets([dataset3["train"], dataset3["test"]])

In [None]:
# Merge the three sources into one training corpus.
# NOTE(review): this reassigns `dataset` from itself, so running the cell a
# second time appends dataset2 and dataset3 again (duplicated rows).
dataset = concatenate_datasets([dataset, dataset2, dataset3])

In [None]:
# Display the merged dataset (columns and row count).
dataset

In [None]:
from datasets import Dataset
import numpy as np

# Row predicate used to drop rows with a missing text field
def remove_none_rows(example):
    """Return True when both "Input" and "Output" of the row are not None.

    Intended as a ``Dataset.filter`` predicate: rows for which this returns
    False are dropped.
    """
    for field in ("Input", "Output"):
        if example[field] is None:
            return False
    return True

# Drop every row whose "Input" or "Output" is None (reassigns `dataset`).
dataset = dataset.filter(remove_none_rows)

In [None]:
from datasets import Dataset
import numpy as np

# Per-row length of the "Input" and "Output" fields
def get_token_lengths(example, tokenizer=None):
    """Measure the length of a row's "Input" and "Output" text.

    Despite the name, the default measurement is the number of *characters*
    (``len`` of the string), which is what this function has always returned.
    Pass a tokenizer to obtain real token counts instead, e.g.
    ``dataset.map(get_token_lengths, fn_kwargs={"tokenizer": tokenizer})``.

    Args:
        example: Mapping with string values under "Input" and "Output".
        tokenizer: Optional callable; ``tokenizer(text)["input_ids"]`` must be
            the token sequence for ``text``. When None, characters are counted.

    Returns:
        dict with integer "input_length" and "output_length".
    """
    if tokenizer is None:
        measure = len
    else:
        def measure(text):
            return len(tokenizer(text)["input_ids"])

    return {
        "input_length": measure(example["Input"]),
        "output_length": measure(example["Output"]),
    }

# Compute the per-row lengths for the whole dataset, one row at a time.
token_lengths = dataset.map(get_token_lengths, batched=False)

# Maximum and mean of each length column
max_input_length = max(token_lengths["input_length"])
max_output_length = max(token_lengths["output_length"])

avg_input_length = np.mean(token_lengths["input_length"])
avg_output_length = np.mean(token_lengths["output_length"])

# Print the results.
# NOTE(review): get_token_lengths is called here without a tokenizer and takes
# len() of the raw strings, so these figures are character counts even though
# the printed labels say "Token".
print(f"ğŸ“Œ Maksimum Input Token SayÄ±sÄ±: {max_input_length}")
print(f"ğŸ“Œ Ortalama Input Token SayÄ±sÄ±: {avg_input_length:.2f}")
print(f"ğŸ“Œ Maksimum Output Token SayÄ±sÄ±: {max_output_length}")
print(f"ğŸ“Œ Ortalama Output Token SayÄ±sÄ±: {avg_output_length:.2f}")


In [None]:
# Row predicate that rejects rows whose two text fields are both long
def filter_long_samples(example):
    """Return False only when "Input" AND "Output" are each longer than 128 characters.

    A row is kept (True) as soon as either field has at most 128 characters.
    Lengths are character counts (``len`` of the string), not token counts.
    """
    prompt_chars = len(example["Input"])
    answer_chars = len(example["Output"])

    # De Morgan form of "not (both longer than 128)".
    return prompt_chars <= 128 or answer_chars <= 128

# Apply the length predicate; `dataset` is reassigned to the filtered result.
dataset = dataset.filter(filter_long_samples)

# Report how many rows remain after filtering.
print(f"âœ… FiltrelenmiÅŸ veri seti satÄ±r sayÄ±sÄ±: {len(dataset)}")


In [None]:
from datasets import DatasetDict

# 1. Hold out 20% of the cleaned corpus; the remaining 80% is the training split.
#    The two splits are read by key ("train"/"test") rather than by relying on
#    the ordering of .values().
holdout_split = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset, temp_dataset = holdout_split["train"], holdout_split["test"]

# 2. Divide the held-out part evenly into validation and test (50% / 50%).
#    The previous test_size=0.2 contradicted this stated intent and produced an
#    80/20 validation/test split of the held-out data.
eval_split = temp_dataset.train_test_split(test_size=0.5, seed=42)
val_dataset, test_dataset = eval_split["train"], eval_split["test"]

# 3. Build causal-LM training features from the Input/Output text columns
def tokenize_function(example, max_length=128, tok=None):
    """Tokenize prompt/answer pairs into aligned causal-LM training features.

    The previous version put the tokenized prompt in ``input_ids`` and the
    separately tokenized answer in ``labels``. A causal LM compares the logits
    for ``input_ids`` with ``labels`` position by position, so the two must be
    the same sequence; unrelated sequences give a meaningless loss, and the
    padding in ``labels`` was also being scored.

    Each row now becomes one sequence ``prompt + answer + EOS`` that is
    right-padded to ``2 * max_length`` tokens:

    * ``input_ids``      - the full padded sequence
    * ``attention_mask`` - 1 on real tokens, 0 on padding
    * ``labels``         - the answer (+EOS) token ids, with -100 on prompt and
                           padding positions so only the answer contributes to
                           the loss

    NOTE(review): the prompt is used as plain text; no chat template is applied
    here — confirm that matches how the model is prompted at inference time.

    Args:
        example: A row or batch with "Input" and "Output" (a string each, or a
            list of strings each when used with ``map(..., batched=True)``).
        max_length: Token budget for the prompt and, separately, for the
            answer including its EOS token.
        tok: Tokenizer to use; defaults to the notebook-level ``tokenizer``.

    Returns:
        dict with "input_ids", "attention_mask" and "labels"; lists of lists
        for a batch, flat lists for a single row.
    """
    tok = tokenizer if tok is None else tok

    prompts, answers = example["Input"], example["Output"]
    is_single = isinstance(prompts, str)
    if is_single:
        prompts, answers = [prompts], [answers]

    eos_ids = [] if tok.eos_token_id is None else [tok.eos_token_id]
    pad_id = tok.pad_token_id
    if pad_id is None:
        # Padded positions are masked out, so any valid id works as filler.
        pad_id = eos_ids[0] if eos_ids else 0

    prompt_ids = tok(
        prompts, add_special_tokens=False, truncation=True, max_length=max_length
    )["input_ids"]
    # Reserve one slot of the answer budget for the EOS token.
    answer_ids = tok(
        answers, add_special_tokens=False, truncation=True, max_length=max_length - len(eos_ids)
    )["input_ids"]

    total_length = 2 * max_length
    features = {"input_ids": [], "attention_mask": [], "labels": []}
    for p_ids, a_ids in zip(prompt_ids, answer_ids):
        p_ids = list(p_ids)
        target_ids = list(a_ids) + eos_ids
        sequence = p_ids + target_ids
        pad_count = total_length - len(sequence)

        features["input_ids"].append(sequence + [pad_id] * pad_count)
        features["attention_mask"].append([1] * len(sequence) + [0] * pad_count)
        features["labels"].append([-100] * len(p_ids) + target_ids + [-100] * pad_count)

    if is_single:
        return {key: rows[0] for key, rows in features.items()}
    return features

# 4. Tokenize each split in batches, then drop the raw "Input"/"Output" text columns.
# NOTE(review): only those two columns are removed; any other columns carried
# over from the source datasets stay in the tokenized splits — confirm that is intended.
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True).remove_columns(["Input","Output"])
tokenized_val_dataset = val_dataset.map(tokenize_function, batched=True).remove_columns(["Input","Output"])
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True).remove_columns(["Input","Output"])

# Bundle the three tokenized splits under their conventional keys.
final_dataset = DatasetDict({
    'train': tokenized_train_dataset,
    'validation': tokenized_val_dataset,
    'test': tokenized_test_dataset
})



In [None]:
# Spot-check the first tokenized test row: its token ids and attention mask.
tokenized_test_dataset[0]["input_ids"], tokenized_test_dataset[0]["attention_mask"]

In [None]:
# Display the tokenized test split (columns and row count).
tokenized_test_dataset

In [None]:
#model.gradient_checkpointing_enable()
#model.use_cache = False


In [None]:
import math
import numpy as np
import wandb
import evaluate
from sentence_transformers import SentenceTransformer, util
from collections import Counter

# Load the text-generation metrics once; compute_metrics uses these module-level objects.
bleu = evaluate.load("sacrebleu")
rouge = evaluate.load("rouge")
chrf = evaluate.load("chrf")
#bert_model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")

# Distinct-N: share of unique n-grams in a sequence
def compute_distinct_n(preds, n=2):
    """Return (number of distinct n-grams) / (number of n-grams) for ``preds``.

    ``preds`` is treated as one flat, sliceable sequence; every window of ``n``
    consecutive items is one n-gram. When the sequence is too short to contain
    any n-gram the result is 0.0 (the denominator is clamped to 1).
    """
    window_count = len(preds) - n + 1

    unique_windows = set()
    for start in range(window_count):
        unique_windows.add(tuple(preds[start:start + n]))

    # range() above is empty for non-positive counts, so clamping here matches
    # dividing by max(1, number of windows actually produced).
    return len(unique_windows) / max(1, window_count)

# Turn the Trainer's raw evaluation arrays into decoded text
def _decode_eval_texts(eval_preds):
    """Convert a ``(predictions, label_ids)`` pair into two lists of strings.

    * A tuple of prediction outputs is reduced to its first element.
    * Predictions with one more dimension than the labels are treated as
      logits: the arg-max token is taken and, because a causal LM's output at
      position t predicts token t+1, predictions and labels are shifted by one
      so they line up.
      NOTE(review): predictions that arrive already as token ids are assumed
      to be aligned with the labels — confirm against the caller.
    * Positions whose label is -100 (ignored by the loss) are replaced by the
      pad id in both arrays before decoding.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    preds, labels = np.asarray(preds), np.asarray(labels)

    if preds.ndim == labels.ndim + 1:
        preds = preds.argmax(axis=-1)
        preds, labels = preds[..., :-1], labels[..., 1:]

    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        pad_id = 0
    scored = labels != -100
    preds = np.where(scored, preds, pad_id)
    labels = np.where(scored, labels, pad_id)

    pred_texts = [text.strip() for text in tokenizer.batch_decode(preds, skip_special_tokens=True)]
    ref_texts = [text.strip() for text in tokenizer.batch_decode(labels, skip_special_tokens=True)]
    return pred_texts, ref_texts


# Main metric function (signature expected by Trainer's compute_metrics hook)
def compute_metrics(eval_preds):
    """Compute BLEU, ROUGE, chrF and distinct-n on decoded predictions.

    Changes compared with the previous version:

    * Metrics are computed on decoded text; before, the raw prediction/label
      arrays were passed to metrics that expect strings.
    * The "perplexity" entry is gone: it was ``exp(mean(predictions))``, which
      is not a loss. Perplexity is ``exp`` of the evaluation loss and cannot
      be derived from the arrays received here.
    * ROUGE values are read as plain numbers (``evaluate``'s ROUGE returns
      floats, not objects with ``.mid.fmeasure``).
    * A metric that fails is reported as NaN (with a printed message) instead
      of an error string, so every returned value is numeric.
    * ``wandb.log`` is only called when a W&B run is active.

    Args:
        eval_preds: ``(predictions, label_ids)`` as passed by the Trainer.

    Returns:
        dict mapping metric name to float: "bleu", "rouge-1", "rouge-2",
        "rouge-L", "chrf", "distinct-1", "distinct-2".
    """
    pred_texts, ref_texts = _decode_eval_texts(eval_preds)
    wrapped_refs = [[ref] for ref in ref_texts]  # one reference per prediction
    results = {}

    def record(names, compute):
        # Best-effort: one failing metric must not discard the others.
        try:
            values = [float(value) for value in compute()]
        except Exception as e:
            print(f"compute_metrics: {', '.join(names)} failed: {e}")
            values = [math.nan] * len(names)
        results.update(zip(names, values))

    def rouge_values():
        scores = rouge.compute(predictions=pred_texts, references=ref_texts)
        return [scores["rouge1"], scores["rouge2"], scores["rougeL"]]

    # distinct-n is counted over the concatenated word stream of all
    # predictions, so a few n-grams span prediction boundaries.
    words = [word for text in pred_texts for word in text.split()]

    record(["bleu"], lambda: [bleu.compute(predictions=pred_texts, references=wrapped_refs)["score"]])
    record(["rouge-1", "rouge-2", "rouge-L"], rouge_values)
    record(["chrf"], lambda: [chrf.compute(predictions=pred_texts, references=wrapped_refs)["score"]])
    record(
        ["distinct-1", "distinct-2"],
        lambda: [compute_distinct_n(words, n=1), compute_distinct_n(words, n=2)],
    )

    # Log to W&B only when a run exists.
    if wandb.run is not None:
        wandb.log(results)
    return results


In [None]:
import wandb
from transformers import TrainerCallback
import torch

class WandBQuestionCallback(TrainerCallback):
    """Trainer callback that periodically logs the model's answers to fixed questions.

    Every ``log_interval`` optimizer steps the model generates an answer for
    each question and the answers are sent to Weights & Biases.

    Args:
        tokenizer: Tokenizer providing ``apply_chat_template``, ``encode`` and ``decode``.
        model: Model being trained; it is moved to ``device`` on construction.
        questions: List of question strings to ask at each logging point.
        device: Device on which prompts are placed and generation runs.
        log_interval: Number of global steps between two logging rounds.
    """

    def __init__(self, tokenizer, model, questions, device="cuda", log_interval=1000):
        self.tokenizer = tokenizer
        self.model = model.to(device)
        self.device = device
        self.questions = questions  # List of question strings
        self.log_interval = log_interval

    def on_step_end(self, args, state, control, **kwargs):
        """Log sample responses whenever the global step is a multiple of ``log_interval``."""
        if state.global_step % self.log_interval == 0:
            wandb.log({"step": state.global_step})
            self.log_model_responses()

    def log_model_responses(self):
        """Generate one answer per question and log them under "model_responses".

        Fixes over the previous version:
        * the chat template is rendered with ``add_generation_prompt=True`` so
          the prompt ends where the assistant's turn begins;
        * only the newly generated tokens are decoded, so the logged answer no
          longer contains the echoed prompt;
        * generation runs in eval mode under ``torch.no_grad()`` and the
          model's previous train/eval mode is restored afterwards.
        """
        responses = {}
        was_training = self.model.training
        self.model.eval()
        try:
            with torch.no_grad():
                for question in self.questions:
                    messages = [
                        {"role": "system", "content": "Sen yardÄ±msever bir asistansÄ±n"},
                        {"role": "user", "content": question}
                    ]
                    input_text = self.tokenizer.apply_chat_template(
                        messages, tokenize=False, add_generation_prompt=True
                    )
                    inputs = self.tokenizer.encode(input_text, return_tensors="pt").to(self.device)
                    outputs = self.model.generate(inputs, max_new_tokens=128, temperature=0.2, top_p=0.9, do_sample=True)
                    # outputs[0] starts with the prompt tokens; keep only the continuation.
                    generated = outputs[0][inputs.shape[-1]:]
                    responses[question] = self.tokenizer.decode(generated, skip_special_tokens=True)
        finally:
            # Hand the model back to the Trainer in the mode it was in.
            if was_training:
                self.model.train()

        wandb.log({"model_responses": responses})

In [None]:
# Sample questions asked at every logging point.
# Fix: a comma was missing after the sixth string, so Python silently joined it
# with the seventh into one long prompt; the list now has eight separate questions.
questions = [
    "433 * b - 7420490 = -7413995 denklemini Ã§Ã¶z.",
    "TÃ¼rkiye'nin baÅŸkenti neresidir?",
    "E=mc^2 denkleminin fiziksel anlamÄ± nedir?",
    "Merhaba.NasÄ±lsÄ±n?",
    "Merhaba, dÃ¼n diÅŸ Ã§ekimi yapÄ±ldÄ±ktan sonra bu sabah aÅŸÄ±rÄ± kanama ile hekime baÅŸvurdum. Pihtinin oluÅŸtuÄŸunu, ancak kanamanÄ±n durmadÄ±ÄŸÄ± gerekÃ§esiyle dikiÅŸ iÅŸlemi uyguladÄ±. BugÃ¼n herhangi bir kanama veya aÄŸrÄ± yok, yalnÄ±z dikiÅŸ bÃ¶lgesinde mukusa benzer bir doku oluÅŸtu. Tekrar gitmem gerekir mi?",
    "Merhaba, ben 18 yaÅŸÄ±ndayÄ±m, geÃ§en yÄ±l elimin Ã¼st kÄ±smÄ± yanmÄ±ÅŸtÄ±, ÅŸimdi iyileÅŸti ancak elimin Ã¼stÃ¼nde yanÄ±k izi kaldÄ±. Bu iz iÃ§in herhangi bir ilaÃ§ veya farklÄ± tedavi yÃ¶ntemi var mÄ±dÄ±r?",
    "Mulan filminin hikayesi hangi kaynaktan esinlenmiÅŸtir?",
    "Kartografya gÃ¼nÃ¼mÃ¼zde nasÄ±l teknolojilerden faydalanÄ±yor?"
]

# Create the callback with its defaults (device="cuda", log_interval=1000);
# constructing it moves `model` to that device.
wandb_callback = WandBQuestionCallback(tokenizer, model, questions)

In [None]:
# 4. Training configuration
training_args = TrainingArguments(
    output_dir="./SmolLM2-360M-Instruct-v1",  # output directory (checkpoints are written here)
    evaluation_strategy="epoch",         # evaluate once per epoch
    save_strategy="steps",               # checkpoint on a step schedule
    save_steps=300,                      # save a checkpoint every 300 steps
    logging_dir="./logs",                # directory for log files
    logging_steps=500,                   # log every 500 steps
    learning_rate=2e-5,                  # learning rate
    num_train_epochs=6,                  # number of epochs
    per_device_train_batch_size=32,       # training batch size per device
    per_device_eval_batch_size=32,       # evaluation batch size per device
    gradient_accumulation_steps=4,       # accumulate 4 batches per optimizer step (32 * 4 = 128 samples per device)
    bf16=True,                           # bfloat16 mixed precision
    fp16_opt_level="O2",                # NOTE(review): an fp16 option set alongside bf16=True — presumably has no effect here; confirm
    dataloader_num_workers=4,
    #evaluation_strategy="no",           # alternative: training only, no evaluation
    report_to="wandb",                    # report logs to Weights & Biases
    save_total_limit=2,                  # keep only the two most recent checkpoints
    lr_scheduler_type="cosine",  # cosine learning-rate decay
    warmup_steps=1000,           # warm the learning rate up over the first 1000 steps
    weight_decay=0.01,           # weight decay used with AdamW
    optim="adamw_torch",         # PyTorch AdamW optimizer
    
    
)

# 5. Build the Trainer: train on the "train" split, evaluate on "validation".
# compute_metrics is deliberately left disabled below; the question-logging
# callback is attached instead.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=final_dataset["train"],
    eval_dataset=final_dataset["validation"],
    tokenizer=tokenizer,
    #compute_metrics=compute_metrics,
    callbacks=[wandb_callback]
)

In [None]:
# Start training, resuming from the checkpoint saved at step 6300 under the
# training output directory (the path is relative to the working directory).
trainer.train(resume_from_checkpoint="SmolLM2-360M-Instruct-v1/checkpoint-6300")

 70%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ   | 123382/175782 [14:06:39<5:47:32,  2.51it/s]

In [None]:
# 6. Save the trained model and tokenizer into the same directory that
# TrainingArguments uses as output_dir.
model.save_pretrained("./SmolLM2-360M-Instruct-v1")
tokenizer.save_pretrained("./SmolLM2-360M-Instruct-v1")

# Completion message.
print("EÄŸitim tamamlandÄ± ve model kaydedildi.")