pip install -q git+https://github.com/huggingface/trl

In [1]:
import os

# Disable parallelism in the tokenizers library; set here so it is in place
# before any tokenizer is created further down.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# --- Hugging Face Hub identifiers -------------------------------------------
# Base checkpoint that gets fine-tuned.
model_name = "mistralai/Mistral-7B-Instruct-v0.2"
# Instruction dataset used for training and evaluation.
dataset_name = "Conjuror64/C64-inst-format"
# Name for the fine-tuned adapter (used as training output dir and Hub repo).
new_model = "Conjuror64/mistral-C64Wizard-PEFT"
# Hub repo that receives the merged (base + adapter) model.
hub_name = "Conjuror64/Mistral-C64Wizard-instruct"

# --- Training budget --------------------------------------------------------
# NOTE(review): max_steps is not referenced by the cells visible in this notebook.
max_steps = 1000 # to tweak to get the best out of the model
nb_epochs = 3

In [2]:
import wandb
import random

# Hyperparameters / run metadata stored alongside the W&B run.
run_config = {"epochs": nb_epochs}

# Open a new W&B run in the "C64Wizard" project to track this notebook.
wandb.init(project="C64Wizard", config=run_config)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mpierre-emmanuel-chaut[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [3]:
import torch
from datasets import load_dataset, Dataset
from peft import LoraConfig, AutoPeftModelForCausalLM
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from trl import SFTTrainer
import os
 
# Load the tokenizer that ships with the base model (Mistral-7B-Instruct).
# - token=True authenticates with the locally stored Hugging Face credentials
#   (replaces the deprecated `use_auth_token=True`).
# - add_eos_token=True asks the slow tokenizer to append EOS to encoded samples.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    token=True,
    add_eos_token=True,
    use_fast=False,
)
tokenizer.padding_side = "right"
# NOTE(review): 18610 is a hard-coded vocabulary id with no explanation; the later
# cells of this notebook pad with eos/unk instead. Confirm which token this id maps
# to and that masking it as padding during training is intended.
tokenizer.pad_token_id = 18610

# 4-bit NF4 quantization with double quantization; matmuls run in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

# Load the quantized base model entirely on GPU 0.
# `load_in_4bit` is deliberately NOT repeated as a keyword here: the setting already
# lives in `bnb_config`, and passing both is redundant (and rejected by recent
# transformers releases).
model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path=model_name,
    trust_remote_code=True,
    quantization_config=bnb_config,
    low_cpu_mem_usage=True,
    device_map={"": 0},
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading (…)lve/main/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading (…)model.bin.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)l-00001-of-00002.bin:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

Downloading (…)l-00002-of-00002.bin:   0%|          | 0.00/5.06G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

In [6]:
from datasets import load_dataset, Dataset
from peft import LoraConfig, AutoPeftModelForCausalLM, prepare_model_for_kbit_training, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from trl import SFTTrainer
import os

# Make the embedding matrix match the tokenizer's vocabulary size.
vocab_size = len(tokenizer)
model.resize_token_embeddings(vocab_size)

# Config overrides applied before training.
base_config = model.config
base_config.use_cache = False      # turn the KV cache off while training
base_config.pretraining_tp = 1
# NOTE(review): Mistral's config documents `sliding_window`, not `window` — confirm
# that this assignment (and `pretraining_tp` above) has any effect on this model.
base_config.window = 512

# Enable gradient checkpointing, then let PEFT prepare the quantized model for training.
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)


Loading Dataset

In [7]:
from datasets import load_dataset

# Load the dataset once and pick both splits from the returned mapping.
# The previous version called `load_dataset` twice with `force_redownload`, which
# downloaded and regenerated every split two times.
# `verification_mode="no_checks"` replaces the deprecated `ignore_verifications=True`.
dataset_splits = load_dataset(
    dataset_name,
    download_mode="force_redownload",
    verification_mode="no_checks",
)
dataset_train = dataset_splits["train"]
dataset_test = dataset_splits["eval"]



Downloading readme:   0%|          | 0.00/531 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/302k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.36k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/3226 [00:00<?, ? examples/s]

Generating eval split:   0%|          | 0/5 [00:00<?, ? examples/s]

Downloading readme:   0%|          | 0.00/531 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/302k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.36k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/3226 [00:00<?, ? examples/s]

Generating eval split:   0%|          | 0/5 [00:00<?, ? examples/s]

In [8]:



# LoRA adapter settings: rank-64 adapters on the attention projections and the
# MLP gate projection of every decoder layer.
peft_config = LoraConfig(
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
    ],
    inference_mode=False,
)

# Wrap the prepared base model with the LoRA adapters.
# NOTE(review): re-running this cell wraps `model` again; restart from the model
# loading cell before re-executing.
model = get_peft_model(model, peft_config)

training_arguments = TrainingArguments(
    output_dir=new_model,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,  # effective batch of 8 samples per optimizer step
    gradient_checkpointing=True,
    # No eval_steps is given; the recorded logs show one evaluation per logging step.
    evaluation_strategy="steps",
    learning_rate=2e-5,
    # NOTE(review): the "constant" scheduler does not apply warmup, so `warmup_ratio`
    # below has no effect; switch to "constant_with_warmup" if warmup is wanted.
    lr_scheduler_type="constant",
    warmup_ratio=0.03,
    max_grad_norm=0.3,
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=50,
    num_train_epochs=nb_epochs,
    group_by_length=True,
    fp16=False,
    report_to="wandb",
    push_to_hub=True,
    adam_beta2=0.999,
    do_train=True,
)

# `model` is already a PEFT model at this point, so `peft_config` is not handed to
# the trainer a second time (it would be redundant for an already-wrapped model).
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset_train,
    eval_dataset=dataset_test,
    dataset_text_field="text",  # column that holds the formatted training text
    args=training_arguments,
    tokenizer=tokenizer,
    packing=False,
    max_seq_length=None,
    neftune_noise_alpha=5,
)




Map:   0%|          | 0/3226 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

In [9]:

# Run the fine-tuning loop configured on `trainer`.
trainer.train()
# Upload the trained model held by the trainer, then the tokenizer, to the
# `new_model` repository on the Hub. The last call's return value is the cell output.
trainer.model.push_to_hub(new_model)
tokenizer.push_to_hub(new_model)

  0%|          | 0/1209 [00:00<?, ?it/s]



{'loss': 3.2415, 'learning_rate': 2e-05, 'epoch': 0.12}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 2.79343843460083, 'eval_runtime': 1.1051, 'eval_samples_per_second': 4.524, 'eval_steps_per_second': 0.905, 'epoch': 0.12}
{'loss': 2.0392, 'learning_rate': 2e-05, 'epoch': 0.25}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 2.1339714527130127, 'eval_runtime': 1.0142, 'eval_samples_per_second': 4.93, 'eval_steps_per_second': 0.986, 'epoch': 0.25}
{'loss': 1.8411, 'learning_rate': 2e-05, 'epoch': 0.37}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 2.0130653381347656, 'eval_runtime': 1.1022, 'eval_samples_per_second': 4.537, 'eval_steps_per_second': 0.907, 'epoch': 0.37}
{'loss': 1.6808, 'learning_rate': 2e-05, 'epoch': 0.5}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.8966524600982666, 'eval_runtime': 1.0423, 'eval_samples_per_second': 4.797, 'eval_steps_per_second': 0.959, 'epoch': 0.5}
{'loss': 1.5244, 'learning_rate': 2e-05, 'epoch': 0.62}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.7386853694915771, 'eval_runtime': 1.019, 'eval_samples_per_second': 4.907, 'eval_steps_per_second': 0.981, 'epoch': 0.62}
{'loss': 1.4074, 'learning_rate': 2e-05, 'epoch': 0.74}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.7484862804412842, 'eval_runtime': 1.1046, 'eval_samples_per_second': 4.527, 'eval_steps_per_second': 0.905, 'epoch': 0.74}
{'loss': 1.4123, 'learning_rate': 2e-05, 'epoch': 0.87}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.6649881601333618, 'eval_runtime': 1.1103, 'eval_samples_per_second': 4.503, 'eval_steps_per_second': 0.901, 'epoch': 0.87}
{'loss': 1.38, 'learning_rate': 2e-05, 'epoch': 0.99}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.6485092639923096, 'eval_runtime': 1.0312, 'eval_samples_per_second': 4.849, 'eval_steps_per_second': 0.97, 'epoch': 0.99}




{'loss': 1.2501, 'learning_rate': 2e-05, 'epoch': 1.12}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.549824595451355, 'eval_runtime': 1.1113, 'eval_samples_per_second': 4.499, 'eval_steps_per_second': 0.9, 'epoch': 1.12}
{'loss': 1.2612, 'learning_rate': 2e-05, 'epoch': 1.24}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.5468645095825195, 'eval_runtime': 1.0299, 'eval_samples_per_second': 4.855, 'eval_steps_per_second': 0.971, 'epoch': 1.24}
{'loss': 1.2268, 'learning_rate': 2e-05, 'epoch': 1.36}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.5825997591018677, 'eval_runtime': 1.06, 'eval_samples_per_second': 4.717, 'eval_steps_per_second': 0.943, 'epoch': 1.36}
{'loss': 1.2155, 'learning_rate': 2e-05, 'epoch': 1.49}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.5543769598007202, 'eval_runtime': 1.0348, 'eval_samples_per_second': 4.832, 'eval_steps_per_second': 0.966, 'epoch': 1.49}
{'loss': 1.1903, 'learning_rate': 2e-05, 'epoch': 1.61}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.548553705215454, 'eval_runtime': 1.0465, 'eval_samples_per_second': 4.778, 'eval_steps_per_second': 0.956, 'epoch': 1.61}
{'loss': 1.1934, 'learning_rate': 2e-05, 'epoch': 1.74}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.5154931545257568, 'eval_runtime': 1.0788, 'eval_samples_per_second': 4.635, 'eval_steps_per_second': 0.927, 'epoch': 1.74}
{'loss': 1.1816, 'learning_rate': 2e-05, 'epoch': 1.86}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.516025185585022, 'eval_runtime': 1.0895, 'eval_samples_per_second': 4.589, 'eval_steps_per_second': 0.918, 'epoch': 1.86}
{'loss': 1.1261, 'learning_rate': 2e-05, 'epoch': 1.98}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.5335047245025635, 'eval_runtime': 1.0697, 'eval_samples_per_second': 4.674, 'eval_steps_per_second': 0.935, 'epoch': 1.98}




{'loss': 1.0258, 'learning_rate': 2e-05, 'epoch': 2.11}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.5121917724609375, 'eval_runtime': 1.1021, 'eval_samples_per_second': 4.537, 'eval_steps_per_second': 0.907, 'epoch': 2.11}
{'loss': 1.0478, 'learning_rate': 2e-05, 'epoch': 2.23}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.5202537775039673, 'eval_runtime': 1.013, 'eval_samples_per_second': 4.936, 'eval_steps_per_second': 0.987, 'epoch': 2.23}
{'loss': 1.0447, 'learning_rate': 2e-05, 'epoch': 2.36}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.4992659091949463, 'eval_runtime': 1.0539, 'eval_samples_per_second': 4.744, 'eval_steps_per_second': 0.949, 'epoch': 2.36}
{'loss': 1.0317, 'learning_rate': 2e-05, 'epoch': 2.48}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.4722049236297607, 'eval_runtime': 1.0621, 'eval_samples_per_second': 4.708, 'eval_steps_per_second': 0.942, 'epoch': 2.48}
{'loss': 1.0453, 'learning_rate': 2e-05, 'epoch': 2.6}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.4938623905181885, 'eval_runtime': 1.1006, 'eval_samples_per_second': 4.543, 'eval_steps_per_second': 0.909, 'epoch': 2.6}
{'loss': 0.9746, 'learning_rate': 2e-05, 'epoch': 2.73}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.473103404045105, 'eval_runtime': 0.9835, 'eval_samples_per_second': 5.084, 'eval_steps_per_second': 1.017, 'epoch': 2.73}
{'loss': 1.011, 'learning_rate': 2e-05, 'epoch': 2.85}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.490756630897522, 'eval_runtime': 1.0958, 'eval_samples_per_second': 4.563, 'eval_steps_per_second': 0.913, 'epoch': 2.85}
{'loss': 0.9939, 'learning_rate': 2e-05, 'epoch': 2.98}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 1.4571453332901, 'eval_runtime': 1.0139, 'eval_samples_per_second': 4.931, 'eval_steps_per_second': 0.986, 'epoch': 2.98}
{'train_runtime': 11587.2114, 'train_samples_per_second': 0.835, 'train_steps_per_second': 0.104, 'train_loss': 1.345323654913133, 'epoch': 3.0}


adapter_model.bin:   0%|          | 0.00/369M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/StarkWizard/Mistral-7b-instruct-cairo-PEFT/commit/82e95a57e5ee7caed38d5f054f69afc320958c05', commit_message='Upload tokenizer', commit_description='', oid='82e95a57e5ee7caed38d5f054f69afc320958c05', pr_url=None, pr_revision=None, pr_num=None)

---
Optional: merge the LoRA adapter into the base model and push the merged model to the Hub. The merge is done on the CPU.

---

In [10]:
from transformers import AutoModelForCausalLM
from peft import PeftModel
import torch
from transformers import  AutoTokenizer, BitsAndBytesConfig

# Tokenizer of the base model; the EOS token doubles as the padding token here.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token

# Reload the base model in float16 on the CPU. No quantization config is passed:
# the adapter is merged into the float16 weights loaded here.
model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path=model_name,
    trust_remote_code=True,
    low_cpu_mem_usage=True,
    device_map={"": "cpu"},
    torch_dtype=torch.float16,
)

# Attach the fine-tuned adapter stored under `new_model` to the base model.
model_to_merge = PeftModel.from_pretrained(
    model,
    new_model,
    torch_dtype=torch.float16,
    device_map={"": "cpu"},
)

# Merge the adapter into the base model and unload the PEFT wrapper.
merged_model = model_to_merge.merge_and_unload()

# Upload the merged model to `hub_name`, capping each shard at 1GB.
merged_model.push_to_hub(hub_name, max_shard_size="1GB")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00011-of-00015.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

pytorch_model-00013-of-00015.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

pytorch_model-00012-of-00015.bin:   0%|          | 0.00/956M [00:00<?, ?B/s]

pytorch_model-00014-of-00015.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

pytorch_model-00015-of-00015.bin:   0%|          | 0.00/816M [00:00<?, ?B/s]

Upload 15 LFS files:   0%|          | 0/15 [00:00<?, ?it/s]

pytorch_model-00010-of-00015.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

pytorch_model-00009-of-00015.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

pytorch_model-00008-of-00015.bin:   0%|          | 0.00/956M [00:00<?, ?B/s]

pytorch_model-00007-of-00015.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

pytorch_model-00006-of-00015.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

pytorch_model-00005-of-00015.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

pytorch_model-00004-of-00015.bin:   0%|          | 0.00/956M [00:00<?, ?B/s]

pytorch_model-00003-of-00015.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

pytorch_model-00002-of-00015.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

pytorch_model-00001-of-00015.bin:   0%|          | 0.00/900M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/StarkWizard/Mistral-7b-instruct-cairo-instruct/commit/fcd6105c1f889a046b73dc76c72b966513e16c4f', commit_message='Upload MistralForCausalLM', commit_description='', oid='fcd6105c1f889a046b73dc76c72b966513e16c4f', pr_url=None, pr_revision=None, pr_num=None)

In [11]:
import torch
from transformers import AutoTokenizer, TextStreamer, GenerationConfig, BitsAndBytesConfig
from attention_sinks import AutoModelForCausalLM

# Tokenizer comes from the base checkpoint; the unknown token is reused for padding.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
tokenizer.pad_token = tokenizer.unk_token

# 4-bit NF4 quantization with double quantization; compute dtype is float16.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype="float16",
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

# Settings forwarded to the attention_sinks model loader.
sink_options = {
    "attention_sink_size": 4,
    "attention_sink_window_size": 252,  # <- Low for the sake of faster generation
}

# Load the merged model from the Hub, quantized, on GPU 0.
model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path=hub_name,
    quantization_config=bnb_config,
    device_map={"": 0},
    trust_remote_code=True,
    **sink_options,
)

# Switch to evaluation mode; the returned module is shown as the cell output.
model.eval()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading (…)lve/main/config.json:   0%|          | 0.00/622 [00:00<?, ?B/s]

Downloading (…)model.bin.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/15 [00:00<?, ?it/s]

Downloading (…)l-00001-of-00015.bin:   0%|          | 0.00/900M [00:00<?, ?B/s]

Downloading (…)l-00002-of-00015.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

Downloading (…)l-00003-of-00015.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

Downloading (…)l-00004-of-00015.bin:   0%|          | 0.00/956M [00:00<?, ?B/s]

Downloading (…)l-00005-of-00015.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

Downloading (…)l-00006-of-00015.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

Downloading (…)l-00007-of-00015.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

Downloading (…)l-00008-of-00015.bin:   0%|          | 0.00/956M [00:00<?, ?B/s]

Downloading (…)l-00009-of-00015.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

Downloading (…)l-00010-of-00015.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

Downloading (…)l-00011-of-00015.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

Downloading (…)l-00012-of-00015.bin:   0%|          | 0.00/956M [00:00<?, ?B/s]

Downloading (…)l-00013-of-00015.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

Downloading (…)l-00014-of-00015.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

Downloading (…)l-00015-of-00015.bin:   0%|          | 0.00/816M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/15 [00:00<?, ?it/s]

[Attention Sinks] Injected Position Shifting into 32 attention classes.
[Attention Sinks] Injected Attention Sink KV Cache into 1 model class.


MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MistralRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): MistralRMSNorm()
        (post_attention_layernorm): MistralRMSNorm()
      )

In [None]:
quantization_config: 
load_in_8bit: False 
load_in_4bit: True 
llm_int8_threshold: 6.0 
llm_int8_skip_modules: None 
llm_int8_enable_fp32_cpu_offload: False 
llm_int8_has_fp16_weight: False 
bnb_4bit_quant_type: "nf4" 
bnb_4bit_use_double_quant: False 
bnb_4bit_compute_dtype: "float16"