In [None]:
# for colab
# Log in to the Hugging Face Hub from the shell (presumably needed to fetch the meta-llama weights used below — confirm).
!huggingface-cli login
# One-time setup, kept commented out: download the base model into a local directory and install the Python dependencies.
#!huggingface-cli download meta-llama/Llama-3.2-1B-Instruct --exclude "original/*" --local-dir meta-llama/Llama-3.2-1B-Instruct
#!pip install -q datasets trl torch transformers peft bitsandbytes
# NOTE(review): the recorded output of the trainer cell shows an ImportError asking for `accelerate>=0.26.0`,
# which is not in the pip list above — consider adding it there.

In [4]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model
from trl import SFTTrainer, SFTConfig
import torch

In [5]:
# Candidate backends in priority order: CUDA first, then Apple MPS, with CPU as the always-available fallback.
d_opts = [
    ('cuda', torch.cuda.is_available()),
    ('mps', torch.backends.mps.is_available()),
    ('cpu', True),
]

# Keep only the backends flagged as available and take the highest-priority one.
device = [backend for backend, usable in d_opts if usable][0]
print(f'using device: {device}')

using device: mps


In [None]:
# 4-bit NF4 quantization settings with double quantization and float16 compute.
# NOTE: currently not applied — the `quantization_config=bnb_config` argument is commented out
# in the cell that loads the model.
bnb_4bit_settings = {
    'load_in_4bit': True,
    'bnb_4bit_quant_type': 'nf4',
    'bnb_4bit_use_double_quant': True,
    'bnb_4bit_compute_dtype': torch.float16,
}
bnb_config = BitsAndBytesConfig(**bnb_4bit_settings)

In [9]:
# Local directory holding the base checkpoint (relative to the notebook's working directory).
model_path = '../meta-llama/Llama-3.2-1B-Instruct'

# Tokenizer: reuse the EOS token (string and id) as the padding token.
# NOTE(review): padding with EOS may lead the SFT data collator to mask genuine EOS tokens in the labels —
# confirm against the TRL version in use.
tokenizer = AutoTokenizer.from_pretrained(model_path)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id

# Model: loaded without quantization, then moved to the selected device.
# Alternative `from_pretrained` arguments kept for reference (disabled):
#   quantization_config=bnb_config,
#   device_map=device,
model = AutoModelForCausalLM.from_pretrained(model_path)
model = model.to(device)

In [10]:
# Show the module tree; the projection names listed here (q_proj, v_proj, ...) are what the LoRA
# `target_modules` setting further down refers to.
print(model)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((2048,), eps=1e-05)
    (rotary_emb):

In [12]:
# Hub id of the instruction dataset used for fine-tuning.
dataset_hf_path = 'iamtarun/python_code_instructions_18k_alpaca'
dataset = load_dataset(dataset_hf_path)

# Carve a 20% validation set out of the 'train' split; the fixed seed keeps the split reproducible.
split_dataset = dataset['train'].train_test_split(seed=42, test_size=0.2)
train_dataset, val_dataset = split_dataset['train'], split_dataset['test']

# Report how many rows ended up on each side of the split.
print(f'Training set size: {len(train_dataset)}')
print(f'Validation set size: {len(val_dataset)}')

Generating train split: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 18612/18612 [00:00<00:00, 482814.75 examples/s]

Training set size: 14889
Validation set size: 3723





In [13]:
# Quick look at the training split: shape, column names, row count, and one fully rendered example.
print(f'-- data info: {dataset_hf_path} --')
print(f'dataset shape: {train_dataset.shape}')
print(f'dataset columns: {train_dataset.column_names}')
print(f'dataset rows: {train_dataset.num_rows}')
print()
# The 'prompt' column is the text that `formatting_func` later hands to the trainer.
print('-> example fine-tuning prompt:')
print(f"prompt: {train_dataset[0]['prompt']}")

-- data info: iamtarun/python_code_instructions_18k_alpaca --
dataset shape: (14889, 4)
dataset columns: ['instruction', 'input', 'output', 'prompt']
dataset rows: 14889

-> example fine-tuning prompt:
prompt: Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Develop an algorithm in Python for predicting the motion of a particle.

### Input:
Not applicable

### Output:
import numpy as np

def predict_motion(pos, vel, acc, dt):
  # predict position
  pos_pred = pos + vel * dt + 0.5 * acc * dt ** 2 
  
  #predict velocity
  vel_pred = vel + acc * dt
  
  return pos_pred, vel_pred

# test code
pos = np.array([0.0, 0.0])
vel = np.array([1.0, 1.0])
acc = np.array([1.0, 0.0])
dt = 0.1
pos_pred, vel_pred = predict_motion(pos, vel, acc, dt)
print(pos_pred, vel_pred)


In [14]:
def formatting_func(example):
    """Return the training text for a single dataset row.

    The row's 'prompt' field is passed through unchanged; no template is
    applied and nothing is appended here.

    Args:
        example: Mapping for one row; must contain the key 'prompt'.

    Returns:
        The value stored under 'prompt'.

    Raises:
        KeyError: If `example` is a dict without a 'prompt' key.
    """
    training_text = example['prompt']
    return training_text

In [15]:
# Sanity check: show the formatter's result next to the raw 'prompt' column for the first training row.
ex1_p = train_dataset[0]['prompt']
ex1 = formatting_func(train_dataset[0])
(ex1, ex1_p)

('Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nDevelop an algorithm in Python for predicting the motion of a particle.\n\n### Input:\nNot applicable\n\n### Output:\nimport numpy as np\n\ndef predict_motion(pos, vel, acc, dt):\n  # predict position\n  pos_pred = pos + vel * dt + 0.5 * acc * dt ** 2 \n  \n  #predict velocity\n  vel_pred = vel + acc * dt\n  \n  return pos_pred, vel_pred\n\n# test code\npos = np.array([0.0, 0.0])\nvel = np.array([1.0, 1.0])\nacc = np.array([1.0, 0.0])\ndt = 0.1\npos_pred, vel_pred = predict_motion(pos, vel, acc, dt)\nprint(pos_pred, vel_pred)',
 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nDevelop an algorithm in Python for predicting the motion of a particle.\n\n### Input:\nNot applicable\n\n### Output:\nimport numpy as np\n\ndef predict_motion(pos, vel, acc, dt):\n  # predict position\n  pos_

In [16]:
# peft (lora)
# Rank-8 adapters on the attention query/value projections only; bias terms are left alone.
lora_config = LoraConfig(
    task_type='CAUSAL_LM',
    target_modules=['q_proj', 'v_proj'], # (see model architecture)
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias='none',
    init_lora_weights='gaussian',
)

# Wrap the base model and report how many parameters remain trainable.
# NOTE(review): `model` is rebound to the wrapped model, so re-running this cell passes an
# already-wrapped model back into `get_peft_model` — reload the base model first.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 851,968 || all params: 1,236,666,368 || trainable%: 0.0689


In [19]:
# Training hyper-parameters for the supervised fine-tuning run.
training_args = SFTConfig(
    output_dir='./finetuned-llama-3.2-1b-instruct',

    # Run length is fixed by `max_steps`; a positive `max_steps` overrides `num_train_epochs`,
    # so the (ineffective) epoch count is no longer passed.
    max_steps=250,
    save_steps=100,

    # 4 sequences per device x 4 accumulation steps = 16 sequences per optimizer step.
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    max_seq_length=512,

    # Name the label key explicitly: an empty list tells the Trainer that batches carry no labels,
    # which keeps `trainer.evaluate()` from reporting an eval loss for the PEFT-wrapped model.
    label_names=['labels'],

    # fp16 mixed precision is only enabled on CUDA; TrainingArguments rejects it on mps/cpu,
    # both of which the device-selection cell can pick.
    fp16=(device == 'cuda'),
    report_to='none',

    logging_steps=10,
    logging_first_step=True,

    # Cosine decay after a 100-step warmup (out of the 250 total steps).
    lr_scheduler_type='cosine',
    warmup_steps=100,
)

# The trainer tokenizes the text returned by `formatting_func` for each row of both splits.
trainer = SFTTrainer(
    model=model,
    args=training_args,

    train_dataset=train_dataset,
    eval_dataset=val_dataset,

    formatting_func=formatting_func,
    processing_class=tokenizer,
)

ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.26.0`: Please run `pip install transformers[torch]` or `pip install 'accelerate>=0.26.0'`

In [8]:
# Run the fine-tuning loop configured above; the returned TrainOutput is shown as the cell result.
trainer.train()

Step,Training Loss
1,1.5957
10,1.6449
20,1.4579
30,1.1499
40,0.9703
50,0.8552
60,0.8805
70,0.8282
80,0.9203
90,0.8378


TrainOutput(global_step=250, training_loss=0.910294692993164, metrics={'train_runtime': 406.3178, 'train_samples_per_second': 9.845, 'train_steps_per_second': 0.615, 'total_flos': 5977301545205760.0, 'train_loss': 0.910294692993164})

In [None]:
# Evaluate on the validation split that was passed to the trainer as `eval_dataset`.
trainer.evaluate()

In [9]:
# Directory that receives the standalone fine-tuned checkpoint.
output_dir = 'llama-3.2-1b-instruct-ft'
# Merge the LoRA adapter into the underlying model (PEFT `merge_and_unload`) so the saved checkpoint
# can be reloaded with plain `AutoModelForCausalLM.from_pretrained(output_dir)`.
merged_model = trainer.model.merge_and_unload()
merged_model.save_pretrained(output_dir)
# Save the tokenizer next to the weights; the tuple of written file paths is the cell's output.
tokenizer.save_pretrained(output_dir)

('llama-3.2-1b-instruct-ft/tokenizer_config.json',
 'llama-3.2-1b-instruct-ft/special_tokens_map.json',
 'llama-3.2-1b-instruct-ft/tokenizer.json')

In [14]:
# Reload the merged checkpoint in half precision for inference.
# Place it on the backend chosen by the device-selection cell instead of a hard-coded 'cuda',
# so the notebook also runs on mps/cpu machines.
model = AutoModelForCausalLM.from_pretrained(output_dir, torch_dtype=torch.float16, device_map=device)
print(model.dtype)

torch.float16


In [19]:
input_text = 'write a python function to generate the first n fibonacci numbers. exclude any extra comments or examples. just the python function bare bones.'
# Send the encoded prompt to whatever device the model lives on (previously hard-coded to 'cuda').
inputs = tokenizer(input_text, return_tensors='pt').to(model.device)
outputs = model.generate(**inputs, max_new_tokens=250)
# TODO: how to prevent from outputing more tokens than needed?
# NOTE(review): generation only ends early when an EOS token is produced; otherwise it runs to
# max_new_tokens. Two things worth checking: (1) this prompt is plain text, not the
# "### Instruction: / ### Input: / ### Output:" layout of the training rows; (2) `formatting_func`
# returns the 'prompt' column unchanged, so verify that the training text actually ends with EOS.
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


write a python function to generate the first n fibonacci numbers. exclude any extra comments or examples. just the python function bare bones. 

```python
def fibonacci(n):
    fib = [0, 1]
    while len(fib) < n:
        fib.append(fib[-1] + fib[-2])
    return fib
```
```python
# Example usage
n = 10
print(fibonacci(n)) # [0, 1, 1, 2, 3, 5, 8, 13, 21, 34]
``` 
```python
# Example usage
n = 10
print(fibonacci(n)) # [0, 1, 1, 2, 3, 5, 8, 13, 21, 34]
``` 
```python
# Example usage
n = 10
print(fibonacci(n)) # [0, 1, 1, 2, 3, 5, 8, 13, 21, 34]
``` 
```python
# Example usage
n = 10
print(fibonacci(n)) # [0, 1, 1, 2, 3, 5, 8, 13, 21, 34]
``` 
```


In [None]:
# ---------------------------------------------------------------------------------

In [15]:
# Report CUDA memory statistics for device 0, converted from bytes to GiB.
bytes_per_gib = 1024 * 1024 * 1024
print("torch.cuda.memory_allocated: %fGB"%(torch.cuda.memory_allocated(0) / bytes_per_gib))
print("torch.cuda.memory_reserved: %fGB"%(torch.cuda.memory_reserved(0) / bytes_per_gib))
print("torch.cuda.max_memory_reserved: %fGB"%(torch.cuda.max_memory_reserved(0) / bytes_per_gib))

torch.cuda.memory_allocated: 6.932020GB
torch.cuda.memory_reserved: 6.951172GB
torch.cuda.max_memory_reserved: 13.992188GB


In [None]:
def generate_chat_response(conversation, max_length=100):
    """Sample a reply from the global `model` for a chat-style request.

    The prompt is rendered with the tokenizer's own chat template rather
    than hand-written `<s>[INST] ... [/INST]` markup, which is a different
    model family's format and would reach this tokenizer as plain text.

    Args:
        conversation: The user's message as a string, or a list of
            {'role': ..., 'content': ...} message dicts for multi-turn input.
        max_length: Upper bound on the number of newly generated tokens
            (forwarded as `max_new_tokens`; the prompt is not counted).

    Returns:
        The decoded reply with special tokens stripped. Only the newly
        generated tokens are decoded, so the prompt is not echoed back.
    """
    if isinstance(conversation, str):
        messages = [{'role': 'user', 'content': conversation}]
    else:
        messages = list(conversation)

    # add_generation_prompt appends the assistant header so the model starts a reply
    # instead of continuing the user's turn.
    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors='pt',
    ).to(model.device)  # follow the model's device rather than the global `device`

    outputs = model.generate(
        **inputs,
        max_new_tokens=max_length,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id
    )

    # The returned sequence starts with the prompt tokens; drop them before decoding.
    prompt_len = inputs['input_ids'].shape[-1]
    return tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)

In [None]:
# Demo request for the chat helper. The wording was corrected: it asked for "the 13th Friday" and ended
# with a stray "3", both of which contradict the Friday-the-13th behaviour described in the rest of the prompt.
conversation = 'Write a function in python to detect whether a given month and year contains a Friday the 13th. The function should accept two parameters: the month (as a number) and the year (as a four-digit number). It should return True if the month contains a Friday the 13th, and False otherwise.'
response = generate_chat_response(conversation, max_length=400)
print(response)