In [1]:
import zipfile
import os
from datasets import load_from_disk

# Load the training and validation datasets that were saved to local disk.
train_dataset, val_dataset = (
    load_from_disk(split_dir)
    for split_dir in ('./train_dataset', './val_dataset')
)


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Log in to the Hugging Face Hub through the CLI.
# NOTE(review): presumably required to download the model used below — confirm.
!huggingface-cli login

In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from peft import get_peft_model, LoraConfig, TaskType
from datasets import Dataset

# Hub identifier of the base checkpoint that is fine-tuned in this notebook.
model_name = "mistralai/Mistral-7B-Instruct-v0.3"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the causal-LM weights in bfloat16 and let `device_map="auto"` decide
# where each module lives; weights that do not fit are offloaded to
# `offload_folder`.
# `trust_remote_code` is deliberately NOT enabled: the Mistral architecture is
# implemented in transformers itself, so there is no reason to allow code
# shipped in the Hub repository to be executed locally.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    # load_in_4bit=True,
    low_cpu_mem_usage=True,
    offload_folder="offload_weights",
)

Downloading shards:   0%|          | 0/3 [01:34<?, ?it/s]


KeyboardInterrupt: 

In [7]:
# Padding setup: reuse the EOS token as the pad token and pad on the right.
# NOTE(review): if the collator masks pad positions in the labels, genuine EOS
# tokens would be masked as well because pad == eos — confirm this is intended.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'right'

# Register the two marker tokens, then resize the embedding matrix so that it
# matches the enlarged vocabulary.
tokenizer.add_special_tokens(
    {'additional_special_tokens': ['<context>', '<response>']}
)
model.resize_token_embeddings(len(tokenizer))

# Freeze every parameter of the base model.
model.requires_grad_(False)

# TODO(check): gradient checkpointing combined with input-require-grads;
# the original author marked both calls as needing verification.
model.gradient_checkpointing_enable()
model.enable_input_require_grads()

In [8]:
from transformers import DataCollatorForLanguageModeling
from torch.optim import AdamW
from torch.nn import CrossEntropyLoss
from torch.utils.data import DataLoader
import torch.nn.functional as F
from tqdm import tqdm

# LoRA adapter configuration for causal-LM fine-tuning: rank 8, alpha 16,
# dropout 0.1, no bias training, adapters attached to the `q_proj` modules only.
# NOTE(review): the '<context>' / '<response>' tokens added to the tokenizer get
# new embedding rows, but all base weights are frozen and only `q_proj` is
# targeted here, so those rows appear to stay untrained — confirm whether the
# embeddings should be made trainable (e.g. via `modules_to_save`).
peft_config = LoraConfig(
    target_modules=["q_proj"],
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
)

# Wrap the base model with the adapters described by `peft_config`.
model = get_peft_model(model, peft_config)

In [9]:
# NOTE(review): self-assignment — this line leaves `train_dataset` unchanged.
train_dataset = train_dataset

def tokenize_function(examples):
    """Tokenize the ``"text"`` field of a batch with the notebook's tokenizer.

    Args:
        examples: Mapping with a ``"text"`` entry holding the raw strings.

    Returns:
        Whatever the global ``tokenizer`` returns when called with truncation
        enabled and ``max_length`` padding at a length of 120.
    """
    texts = examples["text"]
    return tokenizer(
        texts,
        max_length=120,
        padding="max_length",
        truncation=True,
    )

# Apply tokenize_function to both splits in batched mode.
tokenized_dataset, tokenized_val_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, val_dataset)
)


Map:   0%|          | 0/180000 [00:00<?, ? examples/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

In [10]:
# Report the number of parameters that require gradients, followed by the
# total number of parameters in the model.
print(
    "Trainable parameters:",
    sum(w.numel() for w in filter(lambda w: w.requires_grad, model.parameters())),
)
print(
    "Total parameters (excluding quantization):",
    sum(w.numel() for w in model.parameters()),
)


Trainable parameters: 2097152
Total parameters (excluding quantization): 3760476160


In [None]:
# Scratch arithmetic on parameter counts (original note: llama 7b 65536).
# NOTE(review): as written, this chained division evaluates to roughly 4.44e-18,
# whereas 2097152 / 65536 on its own gives the 32.0 shown in the recorded
# output — confirm which computation was intended.
2097152 / 65536 / 3276800 / 2097152 / 1048576

32.0

In [None]:
# Training hyper-parameters. The effective batch size per device is
# 32 * 8 = 256 sequences (per-device batch size times gradient accumulation).
training_args = TrainingArguments(
    output_dir='/content/drive/MyDrive/bot_telegram/results',  # TODO: change path + checkpoints + quantization
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=1,
    evaluation_strategy="steps",
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='/content/drive/MyDrive/bot_telegram/logs',
    learning_rate=1e-4,
    # The model is loaded with torch_dtype=torch.bfloat16, so mixed precision
    # must be bf16 as well; fp16 autocast/grad-scaling does not match bf16 weights.
    bf16=True,
    gradient_accumulation_steps=8,
    save_steps=100,
    save_total_limit=3,
    optim="adamw_torch_fused",
    gradient_checkpointing=True,
)

# Optimizer over the trainable parameters only (those with requires_grad=True).
# NOTE(review): because this optimizer is handed to the Trainer explicitly, the
# `optim` and `weight_decay` values in TrainingArguments presumably do not
# affect it — confirm against the Trainer documentation.
optimizer = AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=1e-4)

# No `model.to('cuda')` here: the model was loaded with device_map="auto"
# (with an offload folder), so its modules are already placed on devices, and
# moving a dispatched model by hand is redundant and can fail when some
# modules are offloaded.

# Causal-LM collator (mlm=False).
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_val_dataset,
    # train_dataset=tokenized_val_dataset,
    data_collator=data_collator,
    # The scheduler slot is None, so the Trainer builds the LR scheduler itself.
    optimizers=(optimizer, None),
)

trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss,Validation Loss
