In [1]:
import torch
from transformers import AutoConfig, AutoModelForCausalLM
from MyLLM.CrispyLLM_RoPE2.modeling_crispy_rope import CrispyLLMConfig, CrispyForCausalLM
from transformers import XLMRobertaTokenizer


# 3. Registration: lets the Auto* factories resolve the "crispy" model type to the custom classes
AutoConfig.register("crispy", CrispyLLMConfig)
AutoModelForCausalLM.register(CrispyLLMConfig, CrispyForCausalLM)

In [2]:
import os

# NOTE(review): torch and transformers were already imported in the first cell.
# Confirm each variable below is still read after this point; some libraries
# read their environment only once, at import time.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "0"

In [3]:
max_seq_length = 1024  # Token budget per example: reused below for the length filter and for tokenizer padding/truncation
dtype = torch.bfloat16 # NOTE(review): not referenced by later cells (the model load passes torch.bfloat16 directly); the old comment listed bfloat16 for T4/V100 too, which looks like a copy-paste slip - confirm the GPU supports bfloat16
load_in_4bit = False  # not referenced by any later cell shown in this notebook
load_in_8bit = False  # not referenced by any later cell shown in this notebook

In [4]:


# XLM-Roberta tokenizer yükleniyor
#tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")

In [5]:
# Special tokens meant for the tokenizer. In this cell the add_special_tokens call
# that would consume the dict is commented out, so defining it has no effect here.
# Note: "<|eot_id|>" is listed both as eos_token and among additional_special_tokens.
special_tokens_dict = {
    "bos_token": "<s>",
    "eos_token": "<|eot_id|>",
    "additional_special_tokens":  [
        "<|im_start|>", "<|im_end|>",
        "<|system|>", "<|user|>", "<|assistant|>",
        "<|start_header_id|>", "<|end_header_id|>", "<|eot_id|>"
    ]
}


#tokenizer.add_special_tokens(special_tokens_dict)

In [6]:



#crispy_config = CrispyLLMConfig(attn_implementation="flash_attention_2", use_flash_attention_2=True, vocab_size=len(tokenizer.get_vocab()), n_heads=16, max_seq_len=max_seq_length, hidden_size=64*16, num_hidden_layers=16, dtype="bfloat16")

#crispy_config._attn_implementation_autoset = True  # 👈 Buraya ekliyorsun

#model = AutoModelForCausalLM.from_config(crispy_config)

In [7]:
# Checkpoint of the previous (V1) run that this notebook continues training from.
model_path = "./Crispy-330M-V1-Rope-NewTokenizer-JustLanguage/checkpoint-19600"

model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="auto",
    trust_remote_code=True,
)

# The tokenizer is stored in the run directory, one level above the checkpoint folder.
tokenizer = XLMRobertaTokenizer.from_pretrained("./Crispy-330M-V1-Rope-NewTokenizer-JustLanguage")

📦 Loading weights from model.safetensors


In [8]:
model

CrispyForCausalLM(
  (embedding): EmbeddingLayer(
    (token_embedding): TokenEmbedding(
      (embedding_layer): Embedding(250010, 1024)
    )
  )
  (decoderBlocks): ModuleList(
    (0-15): 16 x DecoderBlock(
      (attention_block): AttentionBlock(
        (qkv_proj): Linear(in_features=1024, out_features=3072, bias=True)
        (o_proj): Linear(in_features=1024, out_features=1024, bias=True)
        (rms_norm1): RMSNormBlock(
          (rmsNorm): RMSNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (attn): FlashAttentionBlockBase()
        (rope): RotaryPositionalEmbedding()
      )
      (feedforward_network): FeedforwardNetwork(
        (ln1): Linear(in_features=1024, out_features=4096, bias=True)
        (swiglu): SwiGLU(
          (linear1): Linear(in_features=4096, out_features=2048, bias=True)
          (linear2): Linear(in_features=1024, out_features=4096, bias=True)
        )
        (ln2): Linear(in_features=4096, out_features=1024, bias=True)
      )
   

In [9]:
torch.autograd.set_detect_anomaly(True)  # debugging aid; NOTE(review): nothing turns it off before trainer.train() runs - confirm it should stay enabled for the full run


<torch.autograd.anomaly_mode.set_detect_anomaly at 0x7851eb12bb00>

In [10]:
# Fail fast if the loaded tokenizer does not use id 1 for padding.
# NOTE(review): presumably the model code relies on this id - verify against CrispyLLMConfig.
assert tokenizer.pad_token_id == 1, "pad_token_id yanlış!"


In [11]:
model = model.train()

In [12]:
model

CrispyForCausalLM(
  (embedding): EmbeddingLayer(
    (token_embedding): TokenEmbedding(
      (embedding_layer): Embedding(250010, 1024)
    )
  )
  (decoderBlocks): ModuleList(
    (0-15): 16 x DecoderBlock(
      (attention_block): AttentionBlock(
        (qkv_proj): Linear(in_features=1024, out_features=3072, bias=True)
        (o_proj): Linear(in_features=1024, out_features=1024, bias=True)
        (rms_norm1): RMSNormBlock(
          (rmsNorm): RMSNorm((1024,), eps=1e-05, elementwise_affine=True)
        )
        (attn): FlashAttentionBlockBase()
        (rope): RotaryPositionalEmbedding()
      )
      (feedforward_network): FeedforwardNetwork(
        (ln1): Linear(in_features=1024, out_features=4096, bias=True)
        (swiglu): SwiGLU(
          (linear1): Linear(in_features=4096, out_features=2048, bias=True)
          (linear2): Linear(in_features=1024, out_features=4096, bias=True)
        )
        (ln2): Linear(in_features=4096, out_features=1024, bias=True)
      )
   

In [None]:
from datasets import load_dataset, concatenate_datasets

# Turkish C4: shuffle with a fixed seed, then keep the first 250,000 rows.
datasetC4 = (
    load_dataset(
        "allenai/c4",
        "tr",
        split="train",
        cache_dir="/media/hosman/Yedek/Datasets/",
        num_proc=4,
    )
    .shuffle(seed=42)
    .select(range(250000))
)
datasetC4

Resolving data files:   0%|          | 0/1024 [00:00<?, ?it/s]

In [None]:
# Turkish Wikipedia dump (20231101): shuffle with a fixed seed, then keep the first 250,000 rows.
datasetWiki = (
    load_dataset(
        "wikimedia/wikipedia",
        "20231101.tr",
        split="train",
        cache_dir="/media/hosman/Yedek/Datasets/",
        num_proc=4,
    )
    .shuffle(seed=42)
    .select(range(250000))
)
datasetWiki

In [None]:
# Turkish CC100: shuffle with a fixed seed, then keep the first 250,000 rows.
# This loader is invoked with trust_remote_code=True.
datasetCC100 = (
    load_dataset(
        "statmt/cc100",
        "tr",
        split="train",
        cache_dir="/media/hosman/Yedek/Datasets/",
        num_proc=4,
        trust_remote_code=True,
    )
    .shuffle(seed=42)
    .select(range(250000))
)
datasetCC100

In [None]:
# Turkish OSCAR-2201: shuffle with a fixed seed, then keep the first 250,000 rows.
datasetOscar = (
    load_dataset(
        "oscar-corpus/OSCAR-2201",
        "tr",
        split="train",
        cache_dir="/media/hosman/Yedek/Datasets/",
        num_proc=4,
    )
    .shuffle(seed=42)
    .select(range(250000))
)
datasetOscar

In [None]:
def replace_empty_with_none(example):
    """Turn an empty ``text`` field into ``None``.

    The row is modified in place and the same object is returned, which is the
    shape ``Dataset.map`` expects. Rows whose ``text`` is anything other than
    the empty string are returned unchanged.
    """
    if not example['text'] == "":
        return example

    example['text'] = None
    return example

# Replace empty strings in the 'text' column of each corpus with None
datasetC4 = datasetC4.map(replace_empty_with_none)
datasetWiki = datasetWiki.map(replace_empty_with_none)
datasetCC100 = datasetCC100.map(replace_empty_with_none)
datasetOscar = datasetOscar.map(replace_empty_with_none)

In [None]:
# Merge the four corpora into one dataset.
# Only the "text" column is kept: every later cell reads nothing but "text"
# (and tokenization drops all original columns anyway), so the merge does not
# need to depend on the other, source-specific columns lining up.
# NOTE(review): whether those extra columns actually conflict is not visible
# here - confirm nothing downstream needs them.
dataset = concatenate_datasets(
    [
        corpus.select_columns(["text"])
        for corpus in (datasetC4, datasetWiki, datasetCC100, datasetOscar)
    ]
)

In [None]:
# Drop the rows whose text was replaced with None above (identity check, not `!= None`).
dataset = dataset.filter(lambda x: x["text"] is not None)


In [None]:
dataset

In [None]:
print(dataset[5]["text"])

In [None]:
print(tokenizer.tokenize(dataset[5]["text"]))

In [None]:
print(tokenizer.decode(tokenizer.encode(dataset[5]["text"])))

In [None]:
# Keep only the examples whose tokenizer.encode output is shorter than max_seq_length.
dataset = dataset.filter(
    lambda row: len(tokenizer.encode(row["text"])) < max_seq_length
)

In [None]:
# Four consecutive seeded shuffles, applied in the same order as before.
# NOTE(review): a single seeded shuffle already yields a random permutation;
# the extra passes look redundant - confirm before simplifying, since doing so
# changes the resulting row order.
dataset = (
    dataset
    .shuffle(seed=42)
    .shuffle(seed=41)
    .shuffle(seed=40)
    .shuffle(seed=39)
)

In [None]:
from datasets import DatasetDict

# 1. Split the data into a training part and a held-out part.
# `dataset` is one single collection at this point.
# test_size=0.1 keeps 90% of the rows for training and sets 10% aside.
train_dataset, temp_dataset = dataset.train_test_split(test_size=0.1, seed=42).values()

# 2. Split the held-out part in half: 50% validation, 50% test (5% of all rows each)
val_dataset, test_dataset = temp_dataset.train_test_split(test_size=0.5, seed=42).values()

In [None]:
#model.gradient_checkpointing_enable()
# NOTE(review): this sets a plain attribute on the model object; whether
# CrispyForCausalLM reads `use_cache` from here (rather than from its config)
# is not visible in this notebook - confirm.
model.use_cache = True

In [None]:
import wandb

# Open the W&B run for this training session (resume="allow").
wb_c = wandb.init(project="Basic LLM Train", name="Crispy-330M-V2-Rope-NewTokenizer-JustLanguage" , resume="allow") #id="a7zeymst",id="ecibz7e4" id="dbaxrwf4"
# NOTE(review): log="all" presumably records both gradients and parameters - confirm the logging overhead is acceptable.
wb_c.watch(model, log="all")

In [None]:
import wandb
import evaluate
import numpy as np
from prettytable import PrettyTable
import torch
import re
from rapidfuzz import fuzz

def exact_match(prediction, reference):
    """Return True when both strings are equal after trimming surrounding whitespace and lower-casing."""
    normalized_prediction = prediction.strip().lower()
    normalized_reference = reference.strip().lower()
    return normalized_prediction == normalized_reference

def contains_correct_result(prediction, reference):
    """Check whether any integer written in ``reference`` also occurs in ``prediction``.

    Every run of digits in each string is parsed as an integer, so "007" and
    "7" count as the same number.

    Args:
        prediction: Generated text.
        reference: Expected text.

    Returns:
        bool: True if the two strings share at least one integer. False if
        they share none, if ``reference`` contains no digits, or if either
        argument cannot be scanned or parsed (for example ``None``).
    """
    try:
        ref_nums = {int(s) for s in re.findall(r"\d+", reference)}
        pred_nums = {int(s) for s in re.findall(r"\d+", prediction)}
    except (TypeError, ValueError):
        # Only bad input is treated as "no match"; KeyboardInterrupt, SystemExit
        # and genuine programming errors are no longer swallowed.
        return False
    return not ref_nums.isdisjoint(pred_nums)

def fuzzy_match_score(prediction, reference):
    """Return ``fuzz.ratio`` of the two strings divided by 100, i.e. rescaled to the 0-1 range."""
    percent_similarity = fuzz.ratio(prediction, reference)
    return percent_similarity / 100.0

def evaluate_model(model, tokenizer, test_dataset, max_seq_length=256):
    """
    Evaluate a trained model on an instruction/response test set and log the metrics to wandb.

    Parameters:
    - model: trained language model; ``generate``, ``eval`` and ``device`` are used
    - tokenizer: tokenizer that matches the model
    - test_dataset: iterable of examples with "instruction" and "output" fields
    - max_seq_length: maximum number of newly generated tokens per example (default: 256)

    Output:
    - a text table (printed with PrettyTable)
    - the same metrics logged through ``wandb.log``

    The model is put in eval mode while generating. If it was in training mode
    on entry, it is switched back to training mode afterwards.
    """

    # Load the evaluation metrics
    rouge = evaluate.load("rouge")
    bleu = evaluate.load("bleu")
    meteor = evaluate.load("meteor")
    bertscore = evaluate.load("bertscore")

    predictions = []
    references = []
    exact_matches = []
    correct_results = []
    fuzzy_scores = []

    # Switch to evaluation mode, remembering the mode the caller had selected
    was_training = getattr(model, "training", False)
    model.eval()

    print("🚀 Model test verisi üzerinde değerlendiriliyor...\n")

    try:
        for example in test_dataset:
            input_text = f"### Talimat:\n{example['instruction']}\n\n### Yanıt:\n"
            reference_text = example["output"]

            inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

            with torch.no_grad():
                output_ids = model.generate(**inputs, max_new_tokens=max_seq_length)

            decoded_output = tokenizer.decode(output_ids[0], skip_special_tokens=True)
            # The prompt ends with "### Yanıt:", so the text after the marker
            # starts with the prompt's own ":". Remove it as well, otherwise every
            # prediction begins with a stray colon and can never match its reference.
            decoded_output = decoded_output.split("### Yanıt")[-1].lstrip(":").strip()

            predictions.append(decoded_output)
            references.append(reference_text)

            exact_matches.append(exact_match(decoded_output, reference_text))
            correct_results.append(contains_correct_result(decoded_output, reference_text))
            fuzzy_scores.append(fuzzy_match_score(decoded_output, reference_text))
    finally:
        # Do not leave a model that was being trained stuck in eval mode
        if was_training:
            model.train()

    # Metric computation
    rouge_scores = rouge.compute(predictions=predictions, references=references)
    bleu_score = bleu.compute(predictions=predictions, references=[[ref] for ref in references])
    meteor_score = meteor.compute(predictions=predictions, references=references)
    bert_scores = bertscore.compute(predictions=predictions, references=references, lang="tr")

    bert_precision = np.mean(bert_scores["precision"])
    bert_recall = np.mean(bert_scores["recall"])
    bert_f1 = np.mean(bert_scores["f1"])
    exact_match_score = np.mean(exact_matches)
    correct_result_score = np.mean(correct_results)
    fuzzy_match_avg = np.mean(fuzzy_scores)

    # Collect the results in a table
    table = PrettyTable()
    table.field_names = ["Metrik", "Değer"]
    table.add_row(["ROUGE-1", round(rouge_scores["rouge1"], 4)])
    table.add_row(["ROUGE-2", round(rouge_scores["rouge2"], 4)])
    table.add_row(["ROUGE-L", round(rouge_scores["rougeL"], 4)])
    table.add_row(["BLEU", round(bleu_score["bleu"], 4)])
    table.add_row(["METEOR", round(meteor_score["meteor"], 4)])
    table.add_row(["BERTScore Precision", round(bert_precision, 4)])
    table.add_row(["BERTScore Recall", round(bert_recall, 4)])
    table.add_row(["BERTScore F1", round(bert_f1, 4)])
    table.add_row(["Exact Match", round(exact_match_score, 4)])
    table.add_row(["Contains Correct Result", round(correct_result_score, 4)])
    table.add_row(["Fuzzy Match", round(fuzzy_match_avg, 4)])

    # Print the results
    print(table)

    # wandb log
    wandb.log({
        "ROUGE-1": rouge_scores["rouge1"],
        "ROUGE-2": rouge_scores["rouge2"],
        "ROUGE-L": rouge_scores["rougeL"],
        "BLEU": bleu_score["bleu"],
        "METEOR": meteor_score["meteor"],
        "BERTScore Precision": bert_precision,
        "BERTScore Recall": bert_recall,
        "BERTScore F1": bert_f1,
        "Exact Match": exact_match_score,
        "Contains Correct Result": correct_result_score,
        "Fuzzy Match": fuzzy_match_avg
    })

    print("\n✅ Model değerlendirme tamamlandı ve tüm metrikler wandb'a loglandı.")


In [None]:
train_dataset[0]["text"]

In [None]:
#model.gradient_checkpointing_enable()

In [None]:
# NOTE(review): repeats the set_detect_anomaly(True) call made right after the model was loaded.
torch.autograd.set_detect_anomaly(True)

In [None]:
#val_dataset = val_dataset.select(range(10100, 11000))

In [None]:
import math

def get_warmup_steps_from_dataset(dataset_len, batch_size, num_epochs, pct=0.05):
    """
    Derive the number of warmup steps from the size of the dataset.

    The number of batches per epoch is rounded up, multiplied by the number of
    epochs, and the requested fraction of that total is truncated to an int.

    Args:
        dataset_len (int): Total number of examples in the dataset.
        batch_size (int): Number of examples per batch.
        num_epochs (int): Total number of epochs.
        pct (float): Warmup fraction (0.03 - 0.1 is the suggested range).

    Returns:
        int: Number of warmup steps.
    """
    total_steps = math.ceil(dataset_len / batch_size) * num_epochs
    return int(total_steps * pct)


In [None]:
from transformers import TrainerCallback
import torch

class GradientCheckCallback(TrainerCallback):
    """Request a training stop as soon as any parameter gradient contains NaN or Inf.

    The check is wired to ``on_pre_optimizer_step``, which fires while the
    gradients of the current update are still attached to the parameters.
    NOTE(review): in current transformers releases the Trainer clears the
    gradients before ``on_step_end`` is fired, so a check that lives only there
    never sees a gradient - confirm against the installed version.
    ``on_step_end`` is kept and performs the same check, so versions without the
    earlier event behave as before.
    """

    def _check_gradients(self, state, control, **kwargs):
        """Scan every available gradient; set ``should_training_stop`` if one is not finite."""
        model = kwargs["model"]

        found_problem = False
        for name, param in model.named_parameters():
            grad = param.grad
            if grad is None:
                # Parameter took no part in this update (or gradients were already cleared)
                continue
            if torch.isnan(grad).any():
                print(f"🚨 NaN in gradients of {name}")
                found_problem = True
            if torch.isinf(grad).any():
                print(f"🚨 Inf in gradients of {name}")
                found_problem = True

        if found_problem:
            print(f"⛔ Problematic gradients detected at step {state.global_step}!")
            control.should_training_stop = True  # stop the training run

        return control

    def on_pre_optimizer_step(self, args, state, control, **kwargs):
        # Gradients are still populated here. NOTE(review): state.global_step may
        # not yet include the update in progress - confirm when reading the log.
        return self._check_gradients(state, control, **kwargs)

    def on_step_end(self, args, state, control, **kwargs):
        return self._check_gradients(state, control, **kwargs)


class ManualGradientClipCallback(TrainerCallback):
    """Clip the global gradient norm to ``max_grad_norm`` and report abnormal norms.

    Clipping is wired to ``on_pre_optimizer_step`` so that it runs before the
    optimizer consumes the gradients.
    NOTE(review): in current transformers releases the gradients are already
    cleared when ``on_step_end`` fires, which turns clipping there into a no-op -
    confirm against the installed version. ``on_step_end`` is kept and performs
    the same operation, so versions without the earlier event behave as before.

    Args:
        max_grad_norm (float): Upper bound for the total gradient norm.
    """

    def __init__(self, max_grad_norm=1.0):
        self.max_grad_norm = max_grad_norm

    def _clip_gradients(self, state, control, **kwargs):
        """Clip in place and print a message when the norm was non-finite or above the limit."""
        model = kwargs["model"]

        # Clip the gradients; the returned value is the norm measured before clipping
        total_norm = torch.nn.utils.clip_grad_norm_(
            model.parameters(), self.max_grad_norm
        )

        if torch.isnan(total_norm) or torch.isinf(total_norm):
            print(f"🚨 NaN/Inf gradyan normu! Step: {state.global_step}")
        elif total_norm > self.max_grad_norm:
            print(f"⚠️ Gradyan norm ({total_norm:.2f}) sınırı aştı, kliplendi.")

        return control

    def on_pre_optimizer_step(self, args, state, control, **kwargs):
        return self._clip_gradients(state, control, **kwargs)

    def on_step_end(self, args, state, control, **kwargs):
        return self._clip_gradients(state, control, **kwargs)
    

In [None]:
def tokenize_fn(example):
    """Tokenize one example's ``text`` into fixed-length training inputs.

    The text is padded/truncated to ``max_seq_length`` tokens. The returned
    encoding carries ``input_ids``, ``attention_mask`` and ``labels``, where
    ``labels`` is a copy of ``input_ids``.
    """
    encoding = tokenizer(
        example["text"],
        max_length=max_seq_length,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
    )

    # Only one text was encoded, so take row 0 of each returned tensor
    token_ids = encoding["input_ids"][0]
    encoding["input_ids"] = token_ids
    # NOTE(review): labels are a verbatim copy, padding positions included.
    # Confirm how CrispyForCausalLM's loss treats padding before relying on this.
    encoding["labels"] = token_ids.clone()
    encoding["attention_mask"] = encoding["attention_mask"][0]

    return encoding

# Tokenize the three splits in order (train, validation, test), dropping the
# original columns so that only the tokenizer outputs remain.
train_dataset, val_dataset, test_dataset = (
    split.map(tokenize_fn, remove_columns=split.column_names)
    for split in (train_dataset, val_dataset, test_dataset)
)

In [None]:
tokenizer.decode(train_dataset[100]["input_ids"]), tokenizer.decode(train_dataset[100]["labels"])

In [None]:
train_dataset[0]

In [None]:
np.array(train_dataset[0]["input_ids"]).shape

In [None]:
from transformers import TrainingArguments, DataCollatorForSeq2Seq, Trainer
from unsloth import is_bfloat16_supported

# Precision: bf16 when is_bfloat16_supported() says so, fp16 otherwise.
use_bf16 = is_bfloat16_supported()

training_args = TrainingArguments(
    output_dir = "Crispy-330M-V2-Rope-NewTokenizer-JustLanguage",
    seed = 3407,

    # Batching
    num_train_epochs=1,
    per_device_train_batch_size=4,       # batch size per GPU
    per_device_eval_batch_size=4,        # batch size per GPU
    gradient_accumulation_steps = 16,
    eval_accumulation_steps=16,
    auto_find_batch_size=True,
    gradient_checkpointing=False,

    # Optimizer and schedule
    optim = "adamw_torch_fused",
    learning_rate =  0.001 ,
    weight_decay = 0.01,
    adam_beta2=0.95,
    max_grad_norm=1.0,
    lr_scheduler_type = "polynomial",
    warmup_steps=1000,                   # ramp the LR up gradually over the first 1000 steps

    # Precision / device. The deprecated `no_cuda` flag is no longer passed;
    # `use_cpu=False` expresses the same choice.
    fp16 = not use_bf16,
    bf16 = use_bf16,
    use_cpu=False,
    torch_empty_cache_steps=50,

    # Logging, evaluation, checkpoints
    report_to="wandb",                   # metrics are reported to Weights & Biases
    logging_steps = 50,
    logging_nan_inf_filter=True,
    eval_strategy="steps",
    eval_steps=20000,
    # NOTE(review): a checkpoint every 50 steps while evaluating every 20000 - confirm this cadence is intended.
    save_steps=50,
    save_total_limit=2,                  # keep only the last two checkpoints
)

trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = train_dataset,
    eval_dataset = val_dataset,
    callbacks=[GradientCheckCallback(), ManualGradientClipCallback()],
)

In [None]:
# Start a fresh run: no checkpoint from the output directory is resumed.
trainer.train(resume_from_checkpoint=False)

In [None]:
# Test değerlendirmesi
#evaluate_model(model, tokenizer, test_dataset, max_seq_length=max_seq_length)

In [None]:
# 6. Save the trained model and its tokenizer
model.save_pretrained("./Crispy-330M-V2-Rope-NewTokenizer-JustLanguage")
tokenizer.save_pretrained("./Crispy-330M-V2-Rope-NewTokenizer-JustLanguage")

print("Eğitim tamamlandı ve model kaydedildi.")

In [None]:
import unsloth

from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
import torch
from datasets import Dataset, DatasetDict, concatenate_datasets, load_dataset

In [None]:
from transformers import XLMRobertaTokenizer

# Load the XLM-RoBERTa base tokenizer and extend it with the chat/special tokens
tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")
# Note: "<|eot_id|>" is listed both as eos_token and among additional_special_tokens.
special_tokens_dict = {
    "bos_token": "<s>",
    "eos_token": "<|eot_id|>",
    "additional_special_tokens":  [
        "<|im_start|>", "<|im_end|>",
        "<|system|>", "<|user|>", "<|assistant|>",
        "<|start_header_id|>", "<|end_header_id|>", "<|eot_id|>"
    ]
}


tokenizer.add_special_tokens(special_tokens_dict)

In [None]:
import torch

# Model ve tokenizer'ını yükle
from transformers import PreTrainedTokenizerFast

from transformers import AutoConfig, AutoModelForCausalLM
from MyLLM.CrispyLLM_RoPE2.modeling_crispy_rope import CrispyLLMConfig, CrispyForCausalLM
from transformers import PreTrainedTokenizerFast

# 3. Registration: lets the Auto* factories resolve the "crispy" model type to the custom classes
AutoConfig.register("crispy", CrispyLLMConfig)
AutoModelForCausalLM.register(CrispyLLMConfig, CrispyForCausalLM)

# Device placement is left to device_map="auto"; the extra .cuda() that used to
# follow was redundant with it and could override the placement chosen at load time.
model = AutoModelForCausalLM.from_pretrained(
    "./Crispy-330M-V1-Rope-NewTokenizer-JustLanguage/checkpoint-19600",
    attn_implementation="flash_attention_2",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    device_map="auto",
).eval()


In [None]:
""" 
# Sohbet geçmişi
chat_history = ""

# Cevap üretme fonksiyonu
def generate_response(prompt, max_new_tokens=256):
    input_text = chat_history + prompt
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
    
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            use_cache=True,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id
        )
    
    output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    response = output_text[len(input_text):].strip()
    return response

print("🧠 Crispy Chatbot hazır! Çıkmak için Ctrl+C, sıfırlamak için '/reset' yaz.")
print("-" * 50)

# Sonsuz konuşma döngüsü
while True:
    user_input = input("👤 Sen: ")
    
    if user_input.strip().lower() == "/reset":
        chat_history = ""
        print("🔁 Sohbet sıfırlandı.")
        continue

    chat_history += f"👤 Sen: {user_input}\n"
    response = generate_response(f"👤 Sen: {user_input}\n🤖 Crispy:")
    chat_history += f"🤖 Crispy: {response}\n"

    print(f"🤖 Crispy: {response}")
 """

In [None]:
input_text = """E-postanın tonunu değerlendirin ve resmi mi yoksa gayri resmi mi olduğunu ."""

In [None]:
# Encode the single prompt WITHOUT padding. The next cell generates from these
# ids, and trailing padding would make the model continue from pad tokens
# instead of from the end of the prompt.
input_ids = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=512)
labels = input_ids["input_ids"].clone()
# Kept for parity with batched use; with an unpadded prompt this normally changes nothing.
labels[labels == tokenizer.pad_token_id] = -100


In [None]:
# Generate from the real prompt tokens only. The attention mask marks which
# positions are real, so any padding present in `input_ids` is dropped first.
prompt_ids = input_ids["input_ids"][input_ids["attention_mask"].bool()].unsqueeze(0)
outputs = model.generate(
    input_ids=prompt_ids.cuda(),
    attention_mask=torch.ones_like(prompt_ids).cuda(),
    max_new_tokens=50,
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))


In [None]:
# Quick sanity check: generate a continuation for a short Turkish prompt.
prompt = "Ali sabah uyanır ve"
# NOTE(review): the tokenizer is called with its default special-token handling;
# presumably that appends an end-of-sequence marker after the prompt - confirm
# the model is expected to continue from there.
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
outputs = model.generate(**inputs, max_new_tokens=50)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))


In [None]:
# Encode the prompt without padding: generation must continue from the last real
# prompt token, not from a run of pad tokens.
input_ids = tokenizer(input_text, return_tensors="pt").to(model.device)
prompt_length = input_ids["input_ids"].shape[1]

with torch.no_grad():
    # Greedy decoding. The sampling-only options (top_k, top_p, temperature) and
    # the beam-search-only early_stopping flag were removed because do_sample=False
    # without beams does not use them.
    # NOTE(review): prompt length + 1024 new tokens can exceed the 1024-token
    # length used for training - confirm the model copes with that.
    generated_ids = model.generate(
        **input_ids,
        max_new_tokens=1024,
        do_sample=False,
        use_cache=True,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
        #num_beams=5,
        no_repeat_ngram_size=3,
    )

# Decode the generated tokens back to text
generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

# Show only the continuation. It is cut at the token boundary of the prompt,
# because the decoded text is not guaranteed to start with the exact characters
# of `input_text`.
continuation = tokenizer.decode(generated_ids[0][prompt_length:], skip_special_tokens=True)
print(continuation)

