## Fine-tune Llama 2 7B on a single node with 8 GPUs

### Docker setup
```bash
# Start an interactive NVIDIA PyTorch container with all GPUs visible and the
# local ./epfllm-megatron-llm/ directory mounted at /epfllm (also the workdir).
# The volume argument is quoted so that a working directory containing spaces
# does not split into several words.
sudo docker run --gpus all -it --rm \
  -v "$(pwd)/epfllm-megatron-llm/:/epfllm/" \
  --workdir /epfllm \
  --shm-size=128gb \
  --ulimit memlock=-1 \
  --ulimit stack=67108864 \
  --memory 480G \
  nvcr.io/nvidia/pytorch:23.07-py3
```

Install the Megatron-LLM requirements and run `make` in `megatron/data/`:
```bash
# Install the Python requirements, then run make in megatron/data/.
# Everything happens in a subshell, so the caller's working directory is left
# untouched (no trailing `cd ../../../` needed), and the steps are chained with
# && so a failed cd or install does not run the remaining commands in the
# wrong directory.
(
  cd Megatron-LLM \
    && pip install -r requirements.txt \
    && cd megatron/data/ \
    && make
)
```

Log in to the Hugging Face Hub:

`huggingface-cli login`



### 1. Setup Environment 

```bash
# Install torch from the CUDA 11.7 wheel index. Check `nvcc --version` first and
# adjust the index so torch matches the CUDA version apex will be built with.
pip install torch --extra-index-url https://download.pytorch.org/whl/cu117 --upgrade
# Regular build prerequisites.
pip install packaging ninja pybind11

# apex (when not using the container).
# The build runs in a subshell and the clone is removed only if it was actually
# created. Previously a failed clone or `cd apex` still reached `cd ../` and
# `rm -rf apex`, which would then run in the parent directory.
if git clone https://github.com/NVIDIA/apex; then
  (cd apex && python setup.py install --cuda_ext)
  rm -rf -- apex
fi

# Megatron LLM
# Install inside a subshell so the shell ends up where it started, i.e. in the
# directory that contains ./Megatron-LLM (the value used for MEGATRON_PATH).
# The former trailing `cd ../../` left the shell inside Megatron-LLM instead.
if git clone https://github.com/epfLLM/Megatron-LLM.git; then
  (
    cd Megatron-LLM/ \
      && pip install -r requirements.txt \
      && cd megatron/data/ \
      && make
  )
fi
```

In [30]:
import os

# Paths and model id shared by the rest of the notebook.
# They are defined as Python variables because the Python cells read
# CACHE_PATH / DATASET_PATH / MODEL_PATH / MODEL_ID directly and the `!python`
# cell interpolates {MEGATRON_PATH}-style names from the Python namespace.
CACHE_PATH = "./cache"
DATASET_PATH = "./dataset"
MEGATRON_PATH = "./Megatron-LLM"
MODEL_PATH = "./model"
MODEL_ID = "meta-llama/Llama-2-7b-hf"

# `export` inside a %%bash cell only lives for that cell's subprocess, so the
# values never reached later cells. Putting them into os.environ makes every
# subsequently started %%bash subprocess inherit them as ${CACHE_PATH}, etc.
os.environ.update(
    CACHE_PATH=CACHE_PATH,
    DATASET_PATH=DATASET_PATH,
    MEGATRON_PATH=MEGATRON_PATH,
    MODEL_PATH=MODEL_PATH,
    MODEL_ID=MODEL_ID,
)

### 2. Load dataset

In [32]:
import os
import json
from datasets import load_dataset


# Load the "train" split of the dataset; passing `cache_dir` is optional.
dataset = load_dataset(
    "philschmid/wikipedia-230601-de-minhash-dedup",
    split="train",
    cache_dir=CACHE_PATH,
)

# Dump the dataset as JSON Lines: one object per row, keeping only the
# "id" and "text" fields.
os.makedirs(DATASET_PATH, exist_ok=True)
with open(f"{DATASET_PATH}/raw.jsonl", "w") as jsonl_file:
    for row in dataset:
        record = {"id": row["id"], "text": row["text"]}
        jsonl_file.write(json.dumps(record) + "\n")

### 3. Preprocess dataset

First, download the tokenizer:

In [34]:
from huggingface_hub import hf_hub_download

# Make sure the target directory exists, then fetch tokenizer.model from the
# model repository into it as a regular file (symlinks disabled).
# The call is left as the last expression so the notebook displays the
# returned path.
os.makedirs(MODEL_PATH, exist_ok=True)

hf_hub_download(
    repo_id=MODEL_ID,
    filename="tokenizer.model",
    repo_type="model",
    local_dir=MODEL_PATH,
    local_dir_use_symlinks=False,
)

Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

'./model/tokenizer.model'

Process the dataset. For reference, check out https://github.com/LAION-AI/Open-Assistant/tree/main/model/pretokenizer

In [None]:
!python {MEGATRON_PATH}/tools/preprocess_data.py --input={DATASET_PATH}/raw.jsonl \
	--output_prefix={DATASET_PATH}/megatron \
	--tokenizer_type=SentencePieceTokenizer \
	--vocab_file={MODEL_PATH}/tokenizer.model \
	--chunk_size=32 \
	--workers=96 \
	--append_eod \
  --log_interval 10000 \
	--no_new_tokens 


### 4. Weight conversion

In [39]:
%%bash
# Convert the Llama 2 7B weights identified by MODEL_ID into MODEL_PATH using
# Megatron's hf_to_megatron.py, with CACHE_PATH as the cache directory.
# Fail early with a clear message if a variable did not reach this shell;
# otherwise an empty expansion silently turns into a path like
# "/weights_conversion/hf_to_megatron.py".
: "${MEGATRON_PATH:?MEGATRON_PATH must be set in the environment}"
: "${MODEL_ID:?MODEL_ID must be set in the environment}"
: "${MODEL_PATH:?MODEL_PATH must be set in the environment}"
: "${CACHE_PATH:?CACHE_PATH must be set in the environment}"

python "${MEGATRON_PATH}/weights_conversion/hf_to_megatron.py" llama2 --size=7 \
  --model-path "${MODEL_ID}" \
  --out="${MODEL_PATH}" \
  --cache-dir="${CACHE_PATH}"

[2023-09-22 15:28:28,110] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)
Getting llama...
Weights at cache do not look like a meta checkpoint, assuming huggingface cache_dir instead
Loading checkpoint shards: 100%|██████████████████| 2/2 [00:44<00:00, 22.01s/it]
Converting weights: 100%|███████████████████████| 32/32 [00:03<00:00,  8.86it/s]
Saved weights in model
Saved tokenizer.model in model/tokenizer.model
Done


Validate the converted model:

In [None]:
%%bash
# Run Megatron's verify_correctness.py on the converted checkpoint in a single
# process, with the Hugging Face model placed on cuda:1.
# Fail early with a clear message if a variable did not reach this shell.
: "${MEGATRON_PATH:?MEGATRON_PATH must be set in the environment}"
: "${MODEL_PATH:?MODEL_PATH must be set in the environment}"
: "${DATASET_PATH:?DATASET_PATH must be set in the environment}"
: "${CACHE_PATH:?CACHE_PATH must be set in the environment}"

# Argument groups are arrays so each flag stays a separate word without relying
# on unquoted word splitting.
# arguments required by `torchrun`
DISTRIBUTED_ARGS=(--nproc_per_node 1 --nnodes 1 --node_rank 0 --master_addr localhost --master_port 8000)
LLAMA_ARGS=(--use_rms_norm --glu_activation swiglu --no_tie_embed_logits --no_new_tokens --layernorm_epsilon 1e-5)
COMMON_ARGS=(--hidden_dropout 0.0 --attention_dropout 0.0 --no_bias_gelu_fusion)

# --data_path uses the same tokenized dataset prefix as the training step
# (${DATASET_PATH}/megatron_text_document) rather than the bare directory.
torchrun "${DISTRIBUTED_ARGS[@]}" "${MEGATRON_PATH}/verify_correctness.py" \
  --model_name=llama2 \
  --model_size=7 \
  --load="${MODEL_PATH}" \
  --data_path="${DATASET_PATH}/megatron_text_document" \
  --tokenizer_type=SentencePieceTokenizer \
  --vocab_file="${MODEL_PATH}/tokenizer.model" \
  --huggingface_cache="${CACHE_PATH}" \
  --huggingface_device=cuda:1 \
  "${COMMON_ARGS[@]}" "${LLAMA_ARGS[@]}"

### 5. Shard model

In [None]:
%%bash
# Reshard the checkpoint in MODEL_PATH to tensor-parallel size 4 and
# pipeline-parallel size 1, writing the result to "${MODEL_PATH}_sharded".
# Fail early with a clear message if a variable did not reach this shell.
: "${MEGATRON_PATH:?MEGATRON_PATH must be set in the environment}"
: "${MODEL_PATH:?MODEL_PATH must be set in the environment}"

python "${MEGATRON_PATH}/tools/checkpoint_util.py" \
  --target_tensor_parallel_size 4 \
  --target_pipeline_parallel_size 1 \
  --load_dir "${MODEL_PATH}" \
  --save_dir "${MODEL_PATH}_sharded" \
  --model_type llama2 \
  --true_vocab_size 32000 \
  --bf16

### 6. Train

In [None]:
%%bash
# Fine-tune the sharded checkpoint with 8 processes on this node. The
# tensor/pipeline parallel sizes (4/1) match the sizes the checkpoint was
# sharded with. Checkpoints and TensorBoard logs go to "${MODEL_PATH}_sharded".
# Fail early with a clear message if a variable did not reach this shell.
: "${MEGATRON_PATH:?MEGATRON_PATH must be set in the environment}"
: "${MODEL_PATH:?MODEL_PATH must be set in the environment}"
: "${DATASET_PATH:?DATASET_PATH must be set in the environment}"

# LLAMA_ARGS and COMMON_ARGS must be defined in this cell: every %%bash cell
# runs in its own shell, so the definitions from the validation cell are not
# visible here and the two variables previously expanded to nothing.
# Argument groups are arrays so each flag stays a separate word.
LOG_ARGS=(--log_interval 1 --save_interval 100 --eval_interval 50)
TRAIN_ARGS=(--train_iters 500 --lr_decay_style cosine --lr_warmup_iters 50 --lr 3e-4 --min_lr 1e-6)
DISTRIBUTED_ARGS=(--nproc_per_node 8 --nnodes 1 --node_rank 0 --master_addr localhost --master_port 8000)
LLAMA_ARGS=(--use_rms_norm --glu_activation swiglu --no_tie_embed_logits --no_new_tokens --layernorm_epsilon 1e-5)
COMMON_ARGS=(--hidden_dropout 0.0 --attention_dropout 0.0 --no_bias_gelu_fusion)

torchrun "${DISTRIBUTED_ARGS[@]}" "${MEGATRON_PATH}/finetune.py" \
  --tensor_model_parallel_size 4 \
  --pipeline_model_parallel_size 1 \
  --load "${MODEL_PATH}_sharded" \
  --save "${MODEL_PATH}_sharded" \
  --tensorboard_dir "${MODEL_PATH}_sharded" \
  --data_path "${DATASET_PATH}/megatron_text_document" \
  --model_name llama2 \
  --tokenizer_type SentencePieceTokenizer \
  --vocab_file="${MODEL_PATH}/tokenizer.model" \
  --bf16 \
  --use_flash_attn \
  --micro_batch_size 5 \
  --global_batch_size 1000 \
  --sequence_parallel \
  --recompute_granularity selective \
  --use_checkpoint_args \
  "${COMMON_ARGS[@]}" "${LOG_ARGS[@]}" "${TRAIN_ARGS[@]}" "${LLAMA_ARGS[@]}"