In [1]:
import os
import sys
from typing import List

import fire
import torch
import transformers
from datasets import load_dataset
from transformers import GenerationConfig
from peft import PeftModel
"""
Unused imports:
import torch.nn as nn
"""
import bitsandbytes as bnb

from peft import (
    LoraConfig,
    get_peft_model,
    get_peft_model_state_dict,
    prepare_model_for_int8_training,
    set_peft_model_state_dict,
)

from transformers import AutoModelForCausalLM, AutoTokenizer
from modeling_mpt import MPTForCausalLM
from adapt_tokenizer import AutoTokenizerForMOD

from utils.prompter import Prompter

Setting ds_accelerator to cuda (auto detect)

Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /home/leucha/ls/envs/MPT/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda117.so
CUDA SETUP: CUDA runtime path found: /home/leucha/ls/envs/MPT/lib/libcudart.so.11.0
CUDA SETUP: Highest compute capability among GPUs detected: 7.5
CUDA SETUP: Detected CUDA version 117
CUDA SETUP: Loading binary /home/leucha/ls/envs/MPT/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda117.so...


In [2]:
# Device placement handed to `from_pretrained(..., device_map=device_map)` in the model-loading cell.
# NOTE(review): presumably {"": 0} places the whole model on GPU 0 — confirm against the transformers/accelerate docs.
device_map = {"": 0}

def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.

    Args:
        model: Any object exposing ``named_parameters()`` that yields
            ``(name, param)`` pairs, where each ``param`` provides ``numel()``
            and a ``requires_grad`` flag.

    Prints a header line followed by the trainable count, the total count and
    the trainable percentage. A model without any parameters is reported as
    0.0% instead of raising ``ZeroDivisionError``.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count
    # Guard the ratio: an empty model would otherwise divide by zero.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print("Printing Trainable Params")
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct}"
    )


In [6]:

# Load the 'mosaicml/mpt-7b-instruct' checkpoint in float16 through the local
# MPTForCausalLM class.
# `trust_remote_code` is intentionally not passed: it only applies to the Auto*
# classes, and transformers warns that it is ignored on a concrete model class.
model = MPTForCausalLM.from_pretrained(
    'mosaicml/mpt-7b-instruct',
    torch_dtype=torch.float16,
    device_map=device_map,
    # 8-bit loading options, currently disabled:
    # load_in_8bit=True,
    # quantization_config=quantization_config,
    # load_in_8bit_fp32_cpu_offload=True,
)
# Display the module tree of the loaded model.
model

The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

MPTForCausalLM(
  (transformer): MPTModel(
    (wte): Embedding(50432, 4096)
    (emb_drop): Dropout(p=0, inplace=False)
    (blocks): ModuleList(
      (0-31): 32 x MPTBlock(
        (norm_1): LPLayerNorm((4096,), eps=1e-05, elementwise_affine=True)
        (attn): MultiheadAttention(
          (Wqkv): Linear(in_features=4096, out_features=12288, bias=False)
          (out_proj): Linear(in_features=4096, out_features=4096, bias=False)
        )
        (norm_2): LPLayerNorm((4096,), eps=1e-05, elementwise_affine=True)
        (ffn): MPTMLP(
          (up_proj): Linear(in_features=4096, out_features=16384, bias=False)
          (act): GELU(approximate='none')
          (down_proj): Linear(in_features=16384, out_features=4096, bias=False)
        )
        (resid_attn_dropout): Dropout(p=0, inplace=False)
        (resid_ffn_dropout): Dropout(p=0, inplace=False)
      )
    )
    (norm_f): LPLayerNorm((4096,), eps=1e-05, elementwise_affine=True)
  )
)

In [4]:
# Run PEFT's int8-training preparation on the current `model`, with gradient checkpointing enabled.
# NOTE(review): the model-loading cell currently has `load_in_8bit` commented out, so this may be
# applied to a float16 model — confirm the intended load mode.
model = prepare_model_for_int8_training(model, use_gradient_checkpointing=True)
    
# LoRA settings: r=8 adapters attached only to modules named 'Wqkv'.
config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=['Wqkv'],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    # NOTE(review): presumably inference_mode=True leaves the adapter weights frozen —
    # confirm this is intended if this model is meant to be trained.
    inference_mode=True
)
# Wrap the model with the LoRA adapters and display the resulting module tree.
model = get_peft_model(model, config)
model



PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): MPTForCausalLM(
      (transformer): MPTModel(
        (wte): Embedding(50432, 4096)
        (emb_drop): Dropout(p=0, inplace=False)
        (blocks): ModuleList(
          (0-31): 32 x MPTBlock(
            (norm_1): LPLayerNorm((4096,), eps=1e-05, elementwise_affine=True)
            (attn): MultiheadAttention(
              (Wqkv): Linear8bitLt(
                in_features=4096, out_features=12288, bias=False
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=12288, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
   

In [9]:
# Load adapter weights from 'lora-mpt' on top of the current `model`
# ('./lora-mpt' is the --output_dir used by the fine-tuning command in this notebook).
model = PeftModel.from_pretrained(
    model,
    'lora-mpt',
    torch_dtype=torch.float16,
    device_map={'': 0}
)
# NOTE(review): the return value of merge_and_unload() is only displayed, never assigned, so
# `model` keeps referring to the PeftModel wrapper. If the merged base model is what later cells
# need, assign the result — confirm against the installed PEFT version.
model.merge_and_unload()

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): MPTForCausalLM(
      (transformer): MPTModel(
        (wte): Embedding(50432, 4096)
        (emb_drop): Dropout(p=0, inplace=False)
        (blocks): ModuleList(
          (0-31): 32 x MPTBlock(
            (norm_1): LPLayerNorm((4096,), eps=1e-05, elementwise_affine=True)
            (attn): MultiheadAttention(
              (Wqkv): Linear(in_features=4096, out_features=12288, bias=False)
              (out_proj): Linear(in_features=4096, out_features=4096, bias=False)
            )
            (norm_2): LPLayerNorm((4096,), eps=1e-05, elementwise_affine=True)
            (ffn): MPTMLP(
              (up_proj): Linear(in_features=4096, out_features=16384, bias=False)
              (act): GELU(approximate='none')
              (down_proj): Linear(in_features=16384, out_features=4096, bias=False)
            )
            (resid_attn_dropout): Dropout(p=0, inplace=False)
            (resid_ffn_dropout): Dropout(p=0, inplace=

In [19]:
# Tokenizer, prompt template and tokenization settings read as globals by
# `tokenize` and `generate_and_tokenize_prompt`.
base_model="mosaicml/mpt-7b-instruct"
tokenizer = AutoTokenizerForMOD.from_pretrained(base_model)
prompt_template_name = "alpaca"
prompter = Prompter(prompt_template_name)
cutoff_len = 64  # passed as `max_length` to the tokenizer in `tokenize`
train_on_inputs=True  # when False, `generate_and_tokenize_prompt` overwrites the prompt part of the labels with -100
tokenizer.pad_token_id = (
    0  # unk. we want this to be different from the eos token
)
tokenizer.padding_side = "left"  # Allow batched inference

def tokenize(prompt, add_eos_token=True):
    """
    Tokenize a prompt and attach labels for causal-LM training.

    Reads the module-level ``tokenizer`` and ``cutoff_len``.

    Args:
        prompt: Text to tokenize.
        add_eos_token: When True, append the EOS token id if the encoded
            sequence does not already end with it and is shorter than
            ``cutoff_len``.

    Returns:
        The tokenizer output (unpadded, truncated to ``cutoff_len``) with an
        extra ``"labels"`` entry holding a copy of ``"input_ids"``.
    """
    result = tokenizer(
        prompt,
        truncation=True,
        max_length=cutoff_len,
        padding=False,
        return_tensors=None,
    )
    input_ids = result["input_ids"]
    # An empty encoding has no last token to inspect; treat it as "no EOS yet"
    # rather than raising IndexError on input_ids[-1].
    ends_with_eos = bool(input_ids) and input_ids[-1] == tokenizer.eos_token_id
    if add_eos_token and not ends_with_eos and len(input_ids) < cutoff_len:
        result["input_ids"].append(tokenizer.eos_token_id)
        result["attention_mask"].append(1)

    result["labels"] = result["input_ids"].copy()

    return result

def generate_and_tokenize_prompt(data_point):
    """
    Build the full prompt for one dataset record and tokenize it.

    Reads the module-level ``prompter``, ``tokenize`` and ``train_on_inputs``.

    Args:
        data_point: Mapping with ``"instruction"``, ``"input"`` and
            ``"output"`` entries.

    Returns:
        The output of ``tokenize`` for the full prompt. When
        ``train_on_inputs`` is False, the leading label positions covering the
        prompt without the output are replaced with -100 (presumably the
        ignore index of the training loss — confirm).
    """
    full_prompt = prompter.generate_prompt(
        data_point["instruction"],
        data_point["input"],
        data_point["output"],
    )
    tokenized_full_prompt = tokenize(full_prompt)
    if not train_on_inputs:
        user_prompt = prompter.generate_prompt(
            data_point["instruction"], data_point["input"]
        )
        # Tokenize the prompt-only text without EOS so its length is exactly
        # the number of leading positions to mask. This avoids relying on an
        # `add_eos_token` name that is not defined in this notebook, and the
        # "-1" correction that was wrong whenever truncation prevented the EOS
        # from being appended.
        tokenized_user_prompt = tokenize(user_prompt, add_eos_token=False)
        user_prompt_len = len(tokenized_user_prompt["input_ids"])

        full_labels = tokenized_full_prompt["labels"]
        tokenized_full_prompt["labels"] = (
            [-100] * user_prompt_len + full_labels[user_prompt_len:]
        )
    return tokenized_full_prompt

Using pad_token, but it is not set yet.


In [20]:
val_set_size = 200
data_path = "yahma/alpaca-cleaned"
# Fixed seed for every shuffle below so a fresh kernel reproduces the same example order.
shuffle_seed = 42

# Paths ending in .json/.jsonl go through the generic "json" loader; anything
# else is handed to load_dataset unchanged.
if data_path.endswith((".json", ".jsonl")):
    data = load_dataset("json", data_files=data_path)
else:
    data = load_dataset(data_path)

if val_set_size > 0:
    # Hold out `val_set_size` examples for validation, then tokenize both splits.
    train_val = data["train"].train_test_split(
        test_size=val_set_size, shuffle=True, seed=shuffle_seed
    )
    train_data = (
        train_val["train"].shuffle(seed=shuffle_seed).map(generate_and_tokenize_prompt)
    )
    val_data = (
        train_val["test"].shuffle(seed=shuffle_seed).map(generate_and_tokenize_prompt)
    )
else:
    # No validation split requested: tokenize the whole training set.
    train_data = data["train"].shuffle(seed=shuffle_seed).map(generate_and_tokenize_prompt)
    val_data = None


Found cached dataset json (/home/leucha/.cache/huggingface/datasets/yahma___json/yahma--alpaca-cleaned-5d24553f76c14acc/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)


  0%|          | 0/1 [00:00<?, ?it/s]

Loading cached split indices for dataset at /home/leucha/.cache/huggingface/datasets/yahma___json/yahma--alpaca-cleaned-5d24553f76c14acc/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4/cache-de58c1b008d5400c.arrow and /home/leucha/.cache/huggingface/datasets/yahma___json/yahma--alpaca-cleaned-5d24553f76c14acc/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4/cache-f51d2ccd499bdf78.arrow


Map:   0%|          | 0/51560 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [28]:
# Inspect the declared feature type of the "input" column.
train_data.features["input"]

AttributeError: 'Value' object has no attribute 'value'

In [30]:
# Peek at the first mapped training record: the original text fields plus the
# input_ids / attention_mask added by generate_and_tokenize_prompt.
next(iter(train_data))

{'input': 'Question 1: What is the capital of Bangladesh?\nQuestion 2: Who is the current Prime Minister of Bangladesh?',
 'output': 'Answer 1: The capital of Bangladesh is Dhaka.\nAnswer 2: The current Prime Minister of Bangladesh is Sheikh Hasina.',
 'instruction': 'Find the answers to the following questions.',
 'input_ids': [30003,
  310,
  271,
  9775,
  326,
  8631,
  247,
  4836,
  13,
  18433,
  342,
  271,
  3280,
  326,
  3400,
  2007,
  3634,
  15,
  19566,
  247,
  2380,
  326,
  20420,
  29141,
  253,
  2748,
  15,
  187,
  187,
  4118,
  41959,
  27,
  187,
  9867,
  253,
  9172,
  281,
  253,
  1563,
  3533,
  15,
  187,
  187,
  4118,
  19832,
  27,
  187,
  23433,
  337,
  27,
  1737,
  310,
  253,
  5347,
  273,
  29310,
  32,
  187,
  23433,
  374,
  27,
  8452,
  310,
  253],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,


In [1]:
import numpy as np

# Small integer example: a 3x3 matrix multiplied by a 3x2 matrix.
x = np.array(
    [
        [2, -1, -1],
        [0, 3, 2],
        [-1, -1, 0],
    ]
)
y = np.array(
    [
        [-1, 0],
        [0, -2],
        [-1, 2],
    ]
)
# Display the matrix product.
np.matmul(x, y)

array([[-1,  0],
       [-2, -2],
       [ 1,  2]])

In [12]:
# Integer operands in the [-127, 127] range.
# NOTE(review): these look like the rows of the earlier x and the columns of the
# earlier y, each rescaled so its largest magnitude becomes 127 (fractions
# truncated) — confirm.
x = np.array(
    [
        [127, -63, -63],
        [0, 127, 84],
        [-127, -127, 0],
    ]
)
y = np.array(
    [
        [-127, 0],
        [0, -127],
        [-127, 127],
    ]
)
# Integer matrix product of the rescaled operands; displayed as the cell output.
z = np.matmul(x, y)
z

array([[ -8128,      0],
       [-10668,  -5461],
       [ 16129,  16129]])

In [13]:
# Multiply every entry of z by its factor from the table, then divide by 127**2.
# NOTE(review): the table looks like the outer product of [2, 3, 1] and [1, 2],
# the per-row / per-column max magnitudes of the first example — confirm.
z * np.array([[2, 4], [3, 6], [1, 2]]) / 127**2

array([[-1.00787402,  0.        ],
       [-1.98425197, -2.03149606],
       [ 1.        ,  2.        ]])

In [11]:
# Arithmetic check: 127*127 - 63*127, which matches the magnitude of the
# top-left entry of z shown earlier in this notebook.
(127**2)-(63*127)


8128

In [None]:
# Run the project's fine-tuning script on 'mosaicml/mpt-7b-instruct' with the 'yahma/alpaca-cleaned'
# data, './lora-mpt' as the output directory and Wqkv as the LoRA target module.
!python src/finetune.py --base_model 'mosaicml/mpt-7b-instruct' --data_path 'yahma/alpaca-cleaned' --output_dir './lora-mpt' --lora_target_modules '[Wqkv]'

/bin/bash: /home/leucha/ls/lib/libtinfo.so.6: no version information available (required by /bin/bash)

Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /home/leucha/ls/envs/MPT/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda117.so
CUDA SETUP: CUDA runtime path found: /home/leucha/ls/envs/MPT/lib/libcudart.so.11.0
CUDA SETUP: Highest compute capability among GPUs detected: 7.5
CUDA SETUP: Detected CUDA version 117
CUDA SETUP: Loading binary /home/leucha/ls/envs/MPT/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda117.so...
Setting ds_accelerator to cuda (auto detect)
Training mosaicml/mpt-7b-instruct model with params:
base_model: mosaicml/mpt-7b-instruct
data_path: yahma/alpaca-cleaned
output_dir: ./lora-mpt
batch_size: 128
micro_batch_size: 4
num_epochs: 3
learning_rate: 2e-05
cutoff_len: 256
val_set_size: 2000