In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from dotenv import load_dotenv
load_dotenv(override=True)

False

In [3]:
from src.env import print_env_details
print_env_details()

  from .autonotebook import tqdm as notebook_tqdm


* CUDA:
	- GPU:
		- NVIDIA H100 PCIe
	- available:         True
	- version:           12.1
* Packages:
	- numpy:             1.23.1
	- pandas:            1.4.4
	- pyTorch_debug:     False
	- pyTorch_version:   2.3.1+cu121
	- pytorch-lightning: 2.3.3
	- sklearn:           1.3.2
	- transformers:      4.42.4
* System:
	- OS:                Linux
	- architecture:
		- 64bit
		- ELF
	- processor:         x86_64
	- python:            3.10.12
	- ram:               202329743360
	- version:           #10-Ubuntu SMP PREEMPT_DYNAMIC Wed Apr 26 00:40:27 UTC 2023


In [58]:
import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.utils import is_flash_attn_2_available

## Load model and tokenizer

In [50]:
model_id = "microsoft/Phi-3-mini-4k-instruct"

In [51]:
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.model_max_length = 2048
tokenizer.pad_token = tokenizer.unk_token  # use unk rather than eos token to prevent endless generation
tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
tokenizer.padding_side = 'right'

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [60]:
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    use_cache=False,
    device_map="auto", 
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2" if is_flash_attn_2_available() else "eager"
)

Loading checkpoint shards: 100%|██████████████████| 2/2 [00:01<00:00,  1.48it/s]


## Process dataset

In [43]:
from src.utils import load_jsonl

samples = list(load_jsonl("data/MATH/train_gpt4.jsonl"))


In [45]:
with open("prompts/python.md", "r") as f:
    prompt = f.read()

messages = []
for s in samples:
    messages.append(
        {
            "messages": [
                {
                    "content": f"{prompt}\n Question: {s['problem'].strip()}\n Solution: {s['solution'].strip()}",
                    "role": "user",
                },
            ]
        }
    )

In [52]:
from datasets import Dataset, load_dataset

def apply_chat_template(
    example,
    tokenizer,
):
    messages = example["messages"]
    example["text"] = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=False)
    return example

train_dataset = Dataset.from_list(messages).map(apply_chat_template, fn_kwargs={"tokenizer": tokenizer}, num_proc=10)


Map (num_proc=10): 100%|███████████| 7500/7500 [00:02<00:00, 3198.72 examples/s]


## Prepare for training

In [63]:
from trl import SFTTrainer
from transformers import TrainingArguments, BitsAndBytesConfig

In [64]:
training_config = {
    "bf16": True,
    "do_eval": False,
    "learning_rate": 2e-5,
    "log_level": "info",
    "logging_steps": 20,
    "logging_strategy": "steps",
    "lr_scheduler_type": "linear",
    "num_train_epochs": 3,
    "max_steps": -1,
    "output_dir": "./models",
    "overwrite_output_dir": True,
    "per_device_eval_batch_size": 4,
    "per_device_train_batch_size": 4,
    "remove_unused_columns": True,
    "save_steps": 100,
    "save_total_limit": 1,
    "seed": 0,
    "gradient_checkpointing": True,
    "gradient_checkpointing_kwargs":{"use_reentrant": False},
    "gradient_accumulation_steps": 1,
    "warmup_ratio": 0.03,
    "optim": "adamw_torch_fused"
    }

In [65]:
trainer = SFTTrainer(
    model=model,
    args=TrainingArguments(**training_config),
    peft_config=None,
    train_dataset=train_dataset,
    max_seq_length=2048,
    dataset_text_field="text",
    tokenizer=tokenizer,
    packing=True
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
Generating train split: 0 examples [00:00, ? examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (2686 > 2048). Running this sequence through the model will result in indexing errors
Generating train split: 8399 examples [00:19, 430.15 examples/s]
Using auto half precision backend


In [66]:
train_result = trainer.train()

***** Running training *****
  Num examples = 8,399
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 6,300
  Number of trainable parameters = 3,821,079,552


Step,Training Loss
20,0.6719
40,0.6321
60,0.4951
80,0.2992
100,0.1954
120,0.1499
140,0.1433
160,0.1385
180,0.1336
200,0.1223


Saving model checkpoint to ./models/checkpoint-100
Configuration saved in ./models/checkpoint-100/config.json
Configuration saved in ./models/checkpoint-100/generation_config.json
The model is bigger than the maximum size per checkpoint (5GB) and is going to be split in 2 checkpoint shards. You can find where each parameters has been saved in the index located at ./models/checkpoint-100/model.safetensors.index.json.
tokenizer config file saved in ./models/checkpoint-100/tokenizer_config.json
Special tokens file saved in ./models/checkpoint-100/special_tokens_map.json
Saving model checkpoint to ./models/checkpoint-200
Configuration saved in ./models/checkpoint-200/config.json
Configuration saved in ./models/checkpoint-200/generation_config.json
The model is bigger than the maximum size per checkpoint (5GB) and is going to be split in 2 checkpoint shards. You can find where each parameters has been saved in the index located at ./models/checkpoint-200/model.safetensors.index.json.
tokeni

In [68]:
trainer.save_model("models/phi3_ft")

Saving model checkpoint to models/phi3_ft
Configuration saved in models/phi3_ft/config.json
Configuration saved in models/phi3_ft/generation_config.json
The model is bigger than the maximum size per checkpoint (5GB) and is going to be split in 2 checkpoint shards. You can find where each parameters has been saved in the index located at models/phi3_ft/model.safetensors.index.json.
tokenizer config file saved in models/phi3_ft/tokenizer_config.json
Special tokens file saved in models/phi3_ft/special_tokens_map.json


In [71]:
# model.push_to_hub("parasdahal/Phi-3-mini-4k-instruct-finetuned_MATH")

In [72]:
del model
del trainer
torch.cuda.empty_cache()