In [None]:
!pip install -q -U transformers bitsandbytes peft datasets accelerate trl

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.3/9.3 MB[0m [31m64.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.6/251.6 kB[0m [31m29.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m47.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m314.1/314.1 kB[0m [31m36.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m245.8/245.8 kB[0m [31m33.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m40.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━

In [None]:
import torch
from datasets import load_dataset
import re

from transformers import (
    AutoTokenizer,
    TFAutoModelForCausalLM,
    AutoModel,
    AutoModelForCausalLM,
    TrainingArguments,
    BitsAndBytesConfig,
    MBart50TokenizerFast,
    MBartForConditionalGeneration,
    pipeline,
    logging
)

from peft import (
    prepare_model_for_kbit_training,
    LoraConfig,
    get_peft_model,
    PeftModel,
    PeftConfig
)

from trl import SFTTrainer
import os

In [None]:
# Interactive Hugging Face Hub login; the widget it renders is the VBox shown in this cell's output.
# NOTE(review): presumably required for the gated base model download and the later push_to_hub — confirm.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Hub id of the base checkpoint loaded by the tokenizer and model cells.
base_model = "mistralai/Mistral-7B-Instruct-v0.2"
# Access token read from the HF_TOKEN environment variable; os.getenv returns None when it is unset.
auth_token = os.getenv("HF_TOKEN")

In [None]:
# Load the tokenizer that belongs to `base_model`.
tokenizer = AutoTokenizer.from_pretrained(
    base_model,
    token=auth_token,
    # Pad on the right-hand side of each sequence.
    padding_side="right",
    # NOTE(review): presumably appends the EOS token to every encoded sequence — confirm.
    add_eos_token=True,
)
# Reuse the EOS token as the padding token.
# NOTE(review): presumably because this tokenizer ships without a dedicated pad token — confirm.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# 4-bit NF4 quantization settings used when loading the base model (double quantization off).
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
    # NOTE(review): compute dtype is float16 while `torch_dtype` below is bfloat16 —
    # confirm the mix is intended for the target GPU, or align the two.
    bnb_4bit_compute_dtype=torch.float16
)

# Load the quantized base model; `device_map="auto"` leaves device placement to the library.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    # `token` replaces the deprecated `use_auth_token` keyword and matches the tokenizer cell.
    token=auth_token,
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Dataset id and configuration name passed to `load_dataset`.
dataset_name = "wikitext"
config = "wikitext-103-raw-v1"

# Rows 0-15,999 of the train split are used for training ...
train_dataset = load_dataset(dataset_name, config, split="train[0:16000]")
# ... and rows 16,000-19,999 of the same train split are held out for evaluation.
eval_dataset = load_dataset(dataset_name, config, split="train[16000:20000]")

# Bare expression so the notebook displays the dataset summary (features and row count).
train_dataset

Dataset({
    features: ['text'],
    num_rows: 16000
})

In [None]:
def generate_prompt(sample):
    """Build the training record for one dataset row.

    Args:
        sample: Mapping with a ``'text'`` entry.

    Returns:
        A new dict whose only key is ``'text'``, carrying the value from
        ``sample`` unchanged; any other keys of ``sample`` are not copied.

    Raises:
        KeyError: If ``sample`` has no ``'text'`` entry.
    """
    prompt_text = sample["text"]
    return dict(text=prompt_text)

# Run `generate_prompt` over every row of both splits and rebind the names to the mapped datasets.
train_dataset = train_dataset.map(generate_prompt)
eval_dataset = eval_dataset.map(generate_prompt)


In [None]:
# Print the first few training rows as raw dicts to inspect what the model will see.
preview_count = 5
for row_idx in range(preview_count):
    print(train_dataset[row_idx])

{'text': ''}
{'text': ' = Valkyria Chronicles III = \n'}
{'text': ''}
{'text': ' Senjō no Valkyria 3 : Unrecorded Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @-@ playing video game developed by Sega and Media.Vision for the PlayStation Portable . Released in January 2011 in Japan , it is the third game in the Valkyria series . Employing the same fusion of tactical and real @-@ time gameplay as its predecessors , the story runs parallel to the first game and follows the " Nameless " , a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit " Calamaty Raven " . \n'}
{'text': " The game began development in 2010 , carrying over a large portion of the work done on Valkyria Chronicles II . While it retained the standard features of the series , it also underwent multiple adju

In [None]:
# Prepare the 4-bit model for training and configure gradient checkpointing in one place.
# `prepare_model_for_kbit_training` enables checkpointing itself when
# `use_gradient_checkpointing=True`, so a separate `model.gradient_checkpointing_enable()`
# call is redundant. Non-reentrant checkpointing is requested explicitly through
# `gradient_checkpointing_kwargs`, which is the supported way to select the mode.
model = prepare_model_for_kbit_training(
    model,
    use_gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
)

In [None]:
def print_trainable_parameters(model):
    """Print how many of the model's parameters are trainable.

    Iterates ``model.named_parameters()``, sums ``numel()`` over all
    parameters and over those with ``requires_grad`` set, and prints both
    totals together with the trainable percentage.

    Args:
        model: Any object exposing ``named_parameters()`` that yields
            ``(name, param)`` pairs, where ``param`` has ``numel()`` and
            ``requires_grad``.

    Returns:
        None. The summary is written to stdout.

    Note:
        Counts are whatever ``numel()`` reports for each parameter.
        NOTE(review): for the 4-bit model in this notebook the saved output
        shows a total well below 7B, presumably because quantized weights are
        stored packed — confirm before quoting the total.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count
    # A model without parameters would otherwise raise ZeroDivisionError.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct}"
    )

In [None]:
# LoRA adapter configuration; it is handed to SFTTrainer as `peft_config` in a later cell.
lora_config = LoraConfig(
    r=8, # LoRA rank; the author notes 8-128 as the range to try
    lora_alpha=16,
    # Names of the modules that receive LoRA adapters.
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
        "lm_head",
    ],
    # No bias terms are trained.
    bias="none",
    lora_dropout=0.1,
    task_type="CAUSAL_LM",
)

In [None]:
# Parameter counts of `model` at this point in the notebook; the saved output reports 0 trainable parameters.
# NOTE(review): `lora_config` is only passed to SFTTrainer further down, so presumably no
# adapters are attached to `model` yet — confirm, or report on `trainer.model` after the trainer is built.
print_trainable_parameters(model)

trainable params: 0 || all params: 3752071168 || trainable%: 0.0


In [None]:
# Trainer hyperparameters: checkpointing, logging and evaluation all run every 25 steps.
training_arguments = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=4,
    # 4 batches of 4 are accumulated per optimizer step (16 sequences per device per step).
    gradient_accumulation_steps=4,
    optim="paged_adamw_32bit",
    save_strategy="steps",
    save_steps=25,
    logging_steps=25,
    learning_rate=2e-4,
    weight_decay=0.001,
    max_steps=125, # kept short; the author suggests 1000 steps when resources allow
    eval_strategy ="steps",
    eval_steps=25,
    do_eval=True,
    # Disable external experiment-tracking integrations.
    report_to="none",
)

# Supervised fine-tuning trainer; `peft_config` hands the LoRA configuration to SFTTrainer.
# NOTE(review): the saved output warns that these SFTTrainer arguments are deprecated in
# favour of SFTConfig — migrate once a TRL version is pinned.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_arguments,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    peft_config=lora_config,
    # Name of the dataset column that holds the training text.
    dataset_text_field="text",
    max_seq_length=1024
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [None]:
# NOTE(review): `use_reentrant` is documented as an argument of `torch.utils.checkpoint.checkpoint()`;
# assigning it as a module attribute likely has no effect — confirm, and request non-reentrant
# checkpointing via `gradient_checkpointing_kwargs={"use_reentrant": False}` where checkpointing is enabled.
torch.utils.checkpoint.use_reentrant = False
# Turn off `use_cache` on the model config before training.
# NOTE(review): presumably because caching conflicts with gradient checkpointing — confirm.
model.config.use_cache = False
# Run fine-tuning; the cell output is the per-step loss table followed by the returned TrainOutput.
trainer.train()

Step,Training Loss,Validation Loss
25,2.0534,2.089742
50,2.4376,2.045264
75,2.3056,2.030692
100,2.2325,2.019893
125,2.0257,2.01777




TrainOutput(global_step=125, training_loss=2.2109418029785157, metrics={'train_runtime': 5264.4917, 'train_samples_per_second': 0.38, 'train_steps_per_second': 0.024, 'total_flos': 1.7982195834765312e+16, 'train_loss': 2.2109418029785157, 'epoch': 0.125})

In [None]:
# Repository name on the Hugging Face Hub for the fine-tuned weights.
my_finetuned_model = "mistral-7B-wikitext-finetuned"

# Upload the trained model; the saved output shows `adapter_model.safetensors` being uploaded.
# NOTE(review): the tokenizer is not pushed by this cell.
trainer.model.push_to_hub(my_finetuned_model)



adapter_model.safetensors:   0%|          | 0.00/609M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Mesutby/mistral-7B-wikitext-finetuned/commit/443aab72bce3709f4082cec6fc93509d3a863f0e', commit_message='Upload model', commit_description='', oid='443aab72bce3709f4082cec6fc93509d3a863f0e', pr_url=None, pr_revision=None, pr_num=None)