## 1. Getting started


In [1]:
%%capture
%pip install accelerate peft bitsandbytes transformers trl torch datasets

In [2]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig
from trl import SFTTrainer
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


## 2. Model configuration

In [18]:
# Model from Hugging Face hub
base_model = "meta-llama/Llama-2-7b-hf"

# tokenizer_mode = "meta-llama/Llama-2-7b-hf"
# tokenizer_mode = "csebuetnlp/mT5_multilingual_XLSum"
tokenizer_mode = "dagim/amharic_tokenizer"



# New instruction dataset
guanaco_dataset = "mlabonne/guanaco-llama2-1k"

# Fine-tuned model
new_model = "llama-2-7b-hf-finetunned"

## 3. Loading dataset, model, and tokenizer

In [5]:
dataset = load_dataset('text', data_files={'train': '../data/cleaned/cleaned.txt'})

In [6]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 569264
    })
})

## 4. 8-bit quantization configuration

In [7]:
# Set the compute dtype to float16 for 8-bit quantization
compute_dtype = getattr(torch, "float16")

# Create the BitsAndBytesConfig for 8-bit quantization
quant_config = BitsAndBytesConfig(
    load_in_8bit=True,  # Enable 8-bit loading
    bnb_8bit_quant_type="nuq",  # Use No-Uniform Quantization (NUQ)
    bnb_8bit_compute_dtype=compute_dtype,  # Set the compute dtype
    bnb_8bit_use_double_quant=False,  # Disable double quantization
)

## 5. Loading Llama 2 model

In [8]:
access_token = "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"
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    token=access_token,
    quantization_config=quant_config,
    device_map={"": 0}
)
model.config.use_cache = False
model.config.pretraining_tp = 1

Loading checkpoint shards: 100%|██████████| 2/2 [00:16<00:00,  8.45s/it]


## 6. Loading tokenizer

In [14]:
access_token = "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"

tokenizer = AutoTokenizer.from_pretrained(tokenizer_mode, trust_remote_code=True,
token=access_token)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# !pip install sentencepiece

In [19]:
#  !pip install huggingface_hub
# !huggingface-cli login

## 7. PEFT parameters

In [15]:
peft_params = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
)

## 8. Training parameters

In [16]:
training_params = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    save_steps=25,
    logging_steps=25,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
    report_to="tensorboard"
)

## 9. Model fine-tuning

In [17]:
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset['train'],  # Use the key 'train' to access the training split
    peft_config=peft_params,
    dataset_text_field="text",  # Specify the key for the text field
    max_seq_length=None,
    tokenizer=tokenizer,
    args=training_params,
    packing=False,
)

# !pip install tensorboardX

Map: 100%|██████████| 569264/569264 [00:25<00:00, 21908.66 examples/s]


In [19]:
trainer.model.save_pretrained(new_model)
trainer.tokenizer.save_pretrained(new_model)

('llama-2-7b-hf-finetunned/tokenizer_config.json',
 'llama-2-7b-hf-finetunned/special_tokens_map.json',
 'llama-2-7b-hf-finetunned/tokenizer.json')

## 10. Evaluation

In [20]:
from tensorboard import notebook
log_dir = "results/runs"
notebook.start("--logdir {} --port 4000".format(log_dir))

# !pip install tensorboard

In [25]:
logging.set_verbosity(logging.CRITICAL)

prompt = "የ ኢትዮጽያ ዋና ከተማ ማን ናት?"
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [37]:
logging.set_verbosity(logging.CRITICAL)

prompt = "what is the capital city of addis ababa?"
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])

<s>[INST] what is the capital city of addis ababa? [/INST]
[INST] what is the capital city of addis ababa? [/INST]
[INST] what is the capital city of addis ababa?
[INST] what is the capital city of addis ababa? [/INST]


In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Replace 'new_model' with the path to the directory where you saved your fine-tuned model
# new_model = "llama-2-7b-chat-guanaco"
new_model = "meta-llama/Llama-2-7b-hf"

# Load the fine-tuned model and tokenizer
model = AutoModelForCausalLM.from_pretrained(new_model)
# tokenizer = AutoTokenizer.from_pretrained(new_model)

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards:  50%|█████     | 1/2 [00:37<00:37, 37.30s/it]

: 

In [None]:
tokenizer = AutoTokenizer.from_pretrained(new_model)

# Provide a prompt
prompt = "hi"

# Tokenize and generate text
input_ids = tokenizer.e# tokenizer = AutoTokenizer.from_pretrained(new_model)ncode(prompt, return_tensors="pt")
output = model.generate(input_ids, max_length=50, num_beams=5, no_repeat_ngram_size=2, top_k=50, top_p=0.95)

# Decode the generated text
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)