In [7]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, DataCollatorForLanguageModeling, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, TaskType
from datasets import load_dataset
import torch

In [8]:
# model_id = "ytu-ce-cosmos/Turkish-Gemma-9b-v0.1"
local_base_model_path = r"C:\Users\Oğuzkaan\.cache\huggingface\hub\models--ytu-ce-cosmos--Turkish-Gemma-9b-v0.1\snapshots\1bbe9f19a26a70dc4d0c709e1a816e6a9378a7f5"

In [9]:
device="cuda"

In [None]:
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_enable_fp32_cpu_offload=True,
    llm_int8_threshold=6.0
)

tokenizer = AutoTokenizer.from_pretrained(local_base_model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    local_base_model_path,
    device_map="auto",
    quantization_config=bnb_config,
    trust_remote_code=True
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

: 

In [None]:
#LoRA Konfigürasyonu
peft_config = LoraConfig(
    r=256,               
    lora_alpha=128,             
    target_modules="all-linear", 
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)
model = get_peft_model(model, peft_config)

model.print_trainable_parameters()

trainable params: 864,288,768 || all params: 10,105,994,752 || trainable%: 8.5522


In [8]:
dataset = load_dataset("json", data_files={"train": "datas.jsonl"})

In [9]:
def tokenize(example):
    messages = example["messages"]
    
    formatted_messages = []
    for message in messages:
        formatted_messages.append({
            "role": message["role"],
            "content": message["content"]
        })

    prompt = tokenizer.apply_chat_template(
        formatted_messages, 
        tokenize=False, 
        add_generation_prompt=False
    )
    
    return tokenizer(prompt, truncation=True, padding="max_length", max_length=512)

tokenized = dataset["train"].map(tokenize, batched=False)


In [10]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [None]:
training_args = TrainingArguments(
    output_dir="./turkish-gemma-lora",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    num_train_epochs=3,
    logging_steps=10,
    save_steps=100,
    save_total_limit=1,
    learning_rate=2e-4,
    fp16=True,
    optim="paged_adamw_8bit",
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized,
    data_collator=data_collator
)

trainer.train()

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 0 has a total capacity of 6.00 GiB of which 0 bytes is free. Of the allocated memory 12.61 GiB is allocated by PyTorch, and 40.38 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
model.save_pretrained("./turkish-gemma-lora")
tokenizer.save_pretrained("./turkish-gemma-lora")

In [6]:
local_base_model_path = r"C:\Users\Oğuzkaan\.cache\huggingface\hub\models--ytu-ce-cosmos--Turkish-Gemma-9b-v0.1\snapshots\1bbe9f19a26a70dc4d0c709e1a816e6a9378a7f5"

In [None]:
def test_model():
    from peft import PeftModel
    
    base_model = AutoModelForCausalLM.from_pretrained(
        local_base_model_path,
        device_map="auto",
        quantization_config=bnb_config,
        trust_remote_code=True,
        low_cpu_mem_usage=True  
    )
    lora_model = PeftModel.from_pretrained(base_model, "../finetuned-gemma")
    
    messages = [{"role": "user", "content": "Merhaba, nasılsın?"}]
    inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to(lora_model.device)
    eos_token_id = tokenizer.eos_token_id

    generated_ids = lora_model.generate(
        inputs,
        max_new_tokens=256,
        do_sample=True,
        eos_token_id=eos_token_id,
        temperature=0.7,
        top_p=0.9
    )
    output = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    print(output[0])

test_model()

Loading checkpoint shards: 100%|██████████| 4/4 [00:07<00:00,  1.95s/it]
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


ValueError: weight is on the meta device, we need a `value` to put in on cpu.