# LoRA Model Training in Google Colab

This notebook fine-tunes `mistralai/Mistral-7B-v0.1` with LoRA on the provided dataset (`data/merged_min.jsonl`), using the training logic from `train_lora.py`. Run the cells below in order to set up the environment and start training.

In [None]:
# Install necessary libraries
!pip install torch transformers datasets peft


In [None]:
# Load the dataset
from datasets import load_dataset

# Read the JSON Lines file and keep its `train` split as the working dataset.
dataset = load_dataset(
    'json',
    split='train',
    data_files='data/merged_min.jsonl',
)
print(f'Dataset loaded with {len(dataset)} examples.')

In [None]:
# Import necessary libraries
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from peft import get_peft_model, LoraConfig, TaskType

# Base checkpoint to adapt, and the device to place it on (GPU when available)
model_name = 'mistralai/Mistral-7B-v0.1'
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
# Preprocessing pads every example to a fixed length, which fails when the
# tokenizer defines no padding token; fall back to the EOS token in that case.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# NOTE(review): no torch_dtype is passed, so the weights load in the library's
# default dtype -- confirm the target GPU has enough memory for a 7B model.
model = AutoModelForCausalLM.from_pretrained(model_name, low_cpu_mem_usage=False).to(device)

# LoRA adapter settings: rank 8 with alpha 16, applied to the q_proj and v_proj
# modules, adapter dropout of 0.05, bias mode 'none', causal-LM task type.
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=['q_proj', 'v_proj'],
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias='none',
)
# Wrap the base model with the adapter configuration defined above
model = get_peft_model(model, peft_config)

# Preprocessing function
def format_example(example):
    """Build the instruction prompt for one dataset record and tokenize it.

    The prompt has three sections (instruction, input, output). When the record
    has a non-empty ``metadata['source']`` and a non-empty output, a
    "Детальніше: <source>" footer is appended to the output section.

    Args:
        example: Mapping with ``instruction``, ``input`` and ``output`` keys and
            an optional ``metadata`` mapping that may contain ``source``.

    Returns:
        dict with ``input_ids`` and, when the tokenizer provides it,
        ``attention_mask``; sequences are truncated/padded to 512 tokens.
    """
    # `metadata` may be missing or null on some records; both mean "no source".
    metadata = example['metadata'] if 'metadata' in example else None
    source = (metadata or {}).get('source') or ''
    output = example['output']
    if source and output:
        output = f'{output}\n\nДетальніше: {source}'

    # Pull the fields into locals first: reusing the f-string's own quote
    # character inside a replacement field is a SyntaxError before Python 3.12.
    instruction = example['instruction']
    model_input = example['input']
    text = f'### Інструкція:\n{instruction}\n\n### Вхід:\n{model_input}\n\n### Вихід:\n{output}'

    encoded = tokenizer(text, truncation=True, max_length=512, padding='max_length')
    # Keep the attention mask next to the ids so padded positions can be masked
    # out during training instead of being treated as real tokens.
    return {key: encoded[key] for key in ('input_ids', 'attention_mask') if key in encoded}

# Run the formatter over every record to build the tokenized training set
tokenized_dataset = dataset.map(function=format_example)

# Training arguments
training_args = TrainingArguments(
    output_dir='./lora_model',
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,  # 4 x 4 = 16 examples per optimizer step per device
    learning_rate=2e-4,
    num_train_epochs=3,
    logging_dir='./logs',
    logging_steps=10,
    save_steps=500,
    # Ask for bf16 only when a CUDA device reports support for it, so the
    # CPU / older-GPU fallback chosen for `device` does not fail here.
    bf16=torch.cuda.is_available() and torch.cuda.is_bf16_supported(),
    optim='adamw_torch'
)

# Assemble the Trainer from the adapted model, the run settings and the
# tokenized data; the collator is configured with mlm=False.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer),
    train_dataset=tokenized_dataset,
)

# Start training
train_output = trainer.train()

# Persist the final weights: periodic checkpoints are written only every
# `save_steps` steps, so a run that ends between checkpoints would otherwise
# leave the trained adapter only in kernel memory.
trainer.save_model()

# Show the training summary as the cell output
train_output