In [1]:
# ! pip install transformers datasets bitsandbytes 

### Llama 3 
 - Trained on 15T tokens
 - Standard decoder only architecture, no MoE

In [15]:
import torch
import json
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig
from trl import SFTTrainer
from peft import LoraConfig, AutoPeftModelForCausalLM

from huggingface_hub import notebook_login, login
import wandb


_ = torch.manual_seed(0)

<torch._C.Generator at 0x7c04f3586a90>

#### Load the model

In [3]:
with open('secrets/secrets.json', 'r') as f:
    secrets = json.load(f)

HF_TOKEN = secrets['HF_TOKEN']
WANDB_TOKEN = secrets['WANDB_TOKEN']


# login and init both hugging face and wandb
login(token=HF_TOKEN, add_to_git_credential=True, write_permission=True)

wandb.login(key=WANDB_TOKEN)

In [5]:
config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_quant_type='nf4', )
model = AutoModelForCausalLM.from_pretrained('meta-llama/Meta-Llama-3-8B', quantization_config=config, trust_remote_code=True, low_cpu_mem_usage=True, attn_implementation='flash_attention_2')

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [6]:
tokenizer = AutoTokenizer.from_pretrained('meta-llama/Meta-Llama-3-8B')

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [7]:
tokenizer.save_pretrained('tokenizers/meta-llama/Meta-Llama-3-8B')

('tokenizers/meta-llama/Meta-Llama-3-8B/tokenizer_config.json',
 'tokenizers/meta-llama/Meta-Llama-3-8B/special_tokens_map.json',
 'tokenizers/meta-llama/Meta-Llama-3-8B/tokenizer.json')

### Inference on the model

In [8]:
# look at the chat template
message = [
    {"role": "system", "content": "You are a helpful chat assistant."},
    {"role": "user", "content": "What are Large Language Models ?"},
]
tokenizer.apply_chat_template(
    message,
    tokenize=False,
    add_generation_prompt=True,
) 


No chat template is defined for this tokenizer - using a default chat template that implements the ChatML format (without BOS/EOS tokens!). If the default is not appropriate for your model, please set `tokenizer.chat_template` to an appropriate template. See https://huggingface.co/docs/transformers/main/chat_templating for more information.



'<|im_start|>system\nYou are a helpful chat assistant.<|im_end|>\n<|im_start|>user\nWhat are Large Language Models ?<|im_end|>\n<|im_start|>assistant\n'

In [13]:
from transformers import pipeline
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

prompts = [
    "A rectangular garden has a length of 25 feet and a width of 15 feet. If you want to build a fence around the entire garden, how many feet of fencing will you need?",
    "It's Bengay for muscle relief, a combination of methyl salicylate, menthol, and what other active ingredient commonly found in aspirin?",
    "How can i get rid of llamas in my backyard?",
]

# messages = []
# for prompt in prompts:
#     messages.append(
#         [
#             {"role": "system", "content": "You are a helpul chat assistant."},
#             {"role": "user", "content": prompt},
#         ]
#     )

generation_args = {
    "max_new_tokens": 500,
    "return_full_text": False,
    "temperature": 0.7,
    "do_sample": False,
    'pad_token_id': tokenizer.eos_token_id
}

generations = pipe(prompts, **generation_args)



In [14]:
for i, generation in enumerate(generations):
    print("=" * 25)
    print("PROMPT:")
    print(f"{prompts[i]}")
    print("=" * 25)

    print("GENERATION:")
    print(f"{generation[0]['generated_text']}")
    print("=" * 25)

PROMPT:
A rectangular garden has a length of 25 feet and a width of 15 feet. If you want to build a fence around the entire garden, how many feet of fencing will you need?
GENERATION:
 A erotik.getOwnPropertyebo '\''orth Bazplieričesian laidriba_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<?_<