In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, DataCollatorForLanguageModeling, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, TaskType
from datasets import load_dataset
import torch

In [None]:
# Hugging Face Hub repository id of the base model; the loading cells below
# pass it to AutoTokenizer / AutoModelForCausalLM.from_pretrained.
model_id = "ytu-ce-cosmos/Turkish-Gemma-9b-v0.1"


In [None]:
# Target device name.
# NOTE(review): no visible cell reads `device`; the loading cells place the
# model via device_map="auto" instead. Confirm whether this is still needed.
device="cuda"

In [None]:
# Mount Google Drive into the Colab filesystem (this import only exists on Colab).
from google.colab import drive
drive.mount('/content/drive')


# Drive folder that the rest of this cell creates and saves the model and
# tokenizer into.
save_path_on_drive = "/content/drive/MyDrive/LLM_Modellerim/Turkish-Gemma-9b-v0.1" 


# bitsandbytes 8-bit quantization settings for loading the base model.
# The CPU-offload flag allows modules that do not fit on the GPU to be kept on
# the CPU (see the transformers bitsandbytes documentation).
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_threshold=6.0,
    llm_int8_enable_fp32_cpu_offload=True,
)

# Tokenizer and quantized model both come from the same Hub repository.
# NOTE(review): trust_remote_code=True allows code shipped in the model
# repository to run locally; confirm this checkpoint actually requires it.
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    torch_dtype=torch.float16,
    device_map="auto",
    low_cpu_mem_usage=True,
    trust_remote_code=True,
)

import os

# Sampling defaults stored on the model's generation config.
# Sampling is switched on alongside temperature / top_p / top_k; the load log
# warns that those three flags are "not valid" otherwise.
# NOTE(review): presumably this keeps the generation config consistent when it
# is written out by save_pretrained below; confirm.
gen_cfg = model.generation_config
gen_cfg.do_sample = True
gen_cfg.temperature = 0.7  # adjust to taste
gen_cfg.top_p = 0.9        # adjust to taste
gen_cfg.top_k = 50

# Make sure the destination folder exists, then write the model and the
# tokenizer into it.
os.makedirs(save_path_on_drive, exist_ok=True)

print(f"Model '{save_path_on_drive}' konumuna kaydediliyor...")
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_path_on_drive)
print("Model Drive'a kaydedildi.")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Model 'ytu-ce-cosmos/Turkish-Gemma-9b-v0.1' Hugging Face Hub'dan indiriliyor...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Model indirildi.
Model '/content/drive/MyDrive/LLM_Modellerim/Turkish-Gemma-9b-v0.1' konumuna kaydediliyor...
Model Drive'a kaydedildi.


In [None]:
import gc

# Release any model left over from an earlier cell (or from a previous run of
# this cell) before loading again. Without this, the old weights stay
# referenced until the new `model` is assigned, so two copies of the 9B model
# occupy memory at the same time and device_map="auto" sees less free GPU
# memory than is really available.
if "model" in globals():
    del model
gc.collect()
torch.cuda.empty_cache()

# bitsandbytes 8-bit quantization; the CPU-offload flag allows modules that do
# not fit on the GPU to be kept on the CPU.
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_enable_fp32_cpu_offload=True
)

# Load the tokenizer and the quantized base model that the LoRA adapters are
# attached to in the next cell.
# NOTE(review): trust_remote_code=True allows code shipped in the model
# repository to run locally; confirm this checkpoint actually requires it.
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    attn_implementation='eager',
    device_map="auto",
    quantization_config=bnb_config,
    trust_remote_code=True
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


In [None]:
# 3. LoRA configuration
# Rank-256 adapters on every linear layer, for causal language modelling.
# With PEFT's default scaling (lora_alpha / r) the adapter output is scaled by
# 128 / 256 = 0.5.
# NOTE(review): the base model is loaded in 8-bit; PEFT's quantization guide
# calls prepare_model_for_kbit_training(model) before get_peft_model. Decide
# whether that is wanted here.
# NOTE(review): re-running this cell wraps the already-wrapped `model` again.
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules="all-linear",
    r=256,
    lora_alpha=128,
    lora_dropout=0.05,
    bias="none",
)
model = get_peft_model(model, peft_config)

# Report how many parameters the adapters make trainable.
model.print_trainable_parameters()

trainable params: 864,288,768 || all params: 10,105,994,752 || trainable%: 8.5522


In [None]:
import json
from datasets import Dataset



# Path of the JSON Lines training file (one JSON object per line).
file_path = "datas.jsonl"

# Parse every non-empty line. Blank lines (for example a trailing newline at
# the end of the file) are skipped instead of making json.loads raise.
with open(file_path, 'r', encoding='utf-8') as f:
    data = [json.loads(line) for line in f if line.strip()]

# Build an in-memory Hugging Face dataset from the parsed records.
dataset = Dataset.from_list(data)



# Tokenizasyon ve LoRA İnce Ayarı (Tokenization and LoRA Fine-Tuning)

In [None]:
def tokenize(example):
    """Render one chat example with the tokenizer's chat template and tokenize it.

    Args:
        example: A dataset row with a "messages" list; each message is a
            mapping that has at least "role" and "content" keys.

    Returns:
        The tokenizer output for the rendered conversation, truncated and
        padded to 512 tokens.
    """
    # Keep only the two fields passed on to the chat template.
    formatted_messages = [
        {"role": message["role"], "content": message["content"]}
        for message in example["messages"]
    ]

    prompt = tokenizer.apply_chat_template(
        formatted_messages,
        tokenize=False,
        add_generation_prompt=False
    )

    # The template has already been rendered to text, and chat templates are
    # expected to emit their own special tokens (Gemma's template starts with
    # the BOS token; confirm for this checkpoint). Tokenizing with the default
    # add_special_tokens=True would prepend them a second time.
    return tokenizer(
        prompt,
        truncation=True,
        padding="max_length",
        max_length=512,
        add_special_tokens=False,
    )

# Run `tokenize` over the dataset one example at a time (batched=False).
tokenized = dataset.map(tokenize, batched=False)


Map:   0%|          | 0/766 [00:00<?, ? examples/s]

In [None]:
# Batch collator for causal language modelling: mlm=False turns off masked-LM
# masking. Per the transformers documentation, labels are then a copy of the
# inputs with padding positions ignored.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [None]:
# Training hyperparameters. Batch size 1 with 8 accumulation steps gives an
# effective batch of 8 examples per optimizer step.
training_args = TrainingArguments(
    output_dir="./turkish-gemma-lora",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    num_train_epochs=3,
    logging_steps=10,
    save_steps=100,
    save_total_limit=1,  # keep only the most recent checkpoint on disk
    learning_rate=2e-4,
    fp16=True,
    optim="paged_adamw_8bit",
    report_to="none",
    # Trainer cannot infer the label column through the PeftModel wrapper and
    # otherwise falls back to an empty list (see the "No label_names provided"
    # warning), so name it explicitly.
    label_names=["labels"],
)


# `model` is the LoRA-wrapped model; the collator builds the labels per batch.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized,
    data_collator=data_collator
)

trainer.train()

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
10,2.1048
20,0.9891
30,0.9362
40,0.8387
50,0.7821
60,0.8434
70,0.7876
80,0.7078
90,0.768
100,0.6239




TrainOutput(global_step=288, training_loss=0.5608267693056, metrics={'train_runtime': 2508.4571, 'train_samples_per_second': 0.916, 'train_steps_per_second': 0.115, 'total_flos': 6.486574617015091e+16, 'train_loss': 0.5608267693056, 'epoch': 3.0})

In [None]:
# Save the trained (PEFT-wrapped) model and the tokenizer into the same local
# folder that TrainingArguments uses as output_dir.
# NOTE(review): this is a path relative to the working directory, and no
# visible cell copies it to Google Drive. Confirm it is persisted before the
# Colab session ends.
model.save_pretrained("./turkish-gemma-lora")
tokenizer.save_pretrained("./turkish-gemma-lora")

('./turkish-gemma-lora/tokenizer_config.json',
 './turkish-gemma-lora/special_tokens_map.json',
 './turkish-gemma-lora/chat_template.jinja',
 './turkish-gemma-lora/tokenizer.model',
 './turkish-gemma-lora/added_tokens.json',
 './turkish-gemma-lora/tokenizer.json')

In [None]:
# Mount Google Drive (the same call as the mount near the top of the notebook).
# NOTE(review): nothing visible after this point writes to the mounted Drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive
