# ZeRO with Deepspeed

In [1]:
%%writefile phi3_guanaco_accelerate_deepspeed.py
import torch
from accelerate import Accelerator
from datasets import load_dataset
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from trl import SFTTrainer, SFTConfig
import pynvml
import deepspeed

def print_gpu_utilization():
    pynvml.nvmlInit()
    device_count = pynvml.nvmlDeviceGetCount()
    memory_used = []
    for device_index in range(device_count):
        device_handle = pynvml.nvmlDeviceGetHandleByIndex(device_index)
        device_info = pynvml.nvmlDeviceGetMemoryInfo(device_handle)
        memory_used.append(device_info.used / 1024**3)
    print('Memory occupied on GPUs: ' + ' + '.join([f'{mem:.1f}' for mem in memory_used]) + ' GB.')

def main():
    # Initialize Accelerator; its configuration (including DeepSpeed) is loaded from the config file.
    accelerator = Accelerator()
    device = accelerator.device

    if accelerator.is_main_process:
        print(f"Running on device: {device}")

    # Define model name and load tokenizer.
    model_name = '/leonardo_scratch/fast/EUHPC_D20_063/huggingface/models/microsoft--phi-3.5-mini-instruct'
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.padding_side = 'right'

    # Load the model with 4-bit quantization.
    # Note: We no longer specify a manual device map because DeepSpeed (via. Accelerate) will handle device placement.
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type='nf4',
            bnb_4bit_compute_dtype=torch.bfloat16,
            bnb_4bit_quant_storage=torch.bfloat16,
        ),
        attn_implementation='eager',
        trust_remote_code=True,
        torch_dtype=torch.bfloat16,
    )
  
    # For DeepSpeed integration, you can remove or comment out the following:
    # model.to(device)
    
    # Disable caching (KV cache is only useful during inference).
    #model.config.use_cache = False

    # Add LoRA adapters.
    peft_config = LoraConfig(
        task_type='CAUSAL_LM',
        r=16,
        lora_alpha=32,       # rule of thumb: lora_alpha should be about 2 * r
        lora_dropout=0.05,
        bias='none',
        target_modules='all-linear',
    )
    model = get_peft_model(model, peft_config)

    # Load and preprocess the dataset.
    guanaco_train = load_dataset(
        '/leonardo_scratch/fast/EUHPC_D20_063/huggingface/datasets/timdettmers--openassistant-guanaco', 
        split='train'
    )
    guanaco_test = load_dataset(
        '/leonardo_scratch/fast/EUHPC_D20_063/huggingface/datasets/timdettmers--openassistant-guanaco', 
        split='test'
    )
    # Process each example to extract the user prompt and assistant response.
    guanaco_train = guanaco_train.map(lambda entry: {
        'question1': entry['text'].split('###')[1].removeprefix(' Human: '),
        'answer1': entry['text'].split('###')[2].removeprefix(' Assistant: ')
    })
    guanaco_test = guanaco_test.map(lambda entry: {
        'question1': entry['text'].split('###')[1].removeprefix(' Human: '),
        'answer1': entry['text'].split('###')[2].removeprefix(' Assistant: ')
    })
    # Restructure to a chat format expected by our formatting function.
    guanaco_train = guanaco_train.map(lambda entry: {'messages': [
        {'role': 'user', 'content': entry['question1']},
        {'role': 'assistant', 'content': entry['answer1']}
    ]})
    guanaco_test = guanaco_test.map(lambda entry: {'messages': [
        {'role': 'user', 'content': entry['question1']},
        {'role': 'assistant', 'content': entry['answer1']}
    ]})

    # Define training arguments.
    # Here, we add the `deepspeed` parameter so that the Trainer will use DeepSpeed.
    training_arguments = SFTConfig(
        output_dir='output/phi-3.5-mini-instruct-guanaco-deepspeed',
        #per_device_train_batch_size=8,
        #gradient_accumulation_steps=1,
        gradient_checkpointing=True,
        #gradient_checkpointing_kwargs={'use_reentrant': False},
        optim='adamw_torch',
        learning_rate=2e-4, # QLoRA suggestions: 2e-4 for 7B or 13B, 1e-4 for 33B or 65B
        logging_strategy='no',
        save_strategy='no',
        max_steps=100,
        bf16=True,
        report_to='none',
        max_seq_length=1024,
    )
    
    
    def formatting_func(entry):
        return tokenizer.apply_chat_template(entry['messages'], tokenize=False)

    # Create the SFTTrainer.
    trainer = SFTTrainer(
        model=model,
        args=training_arguments,
        train_dataset=guanaco_train,
        eval_dataset=guanaco_test,
        processing_class=tokenizer,
        formatting_func=formatting_func,
    )

    # Optionally print trainable parameters on the main process only.
    if accelerator.is_main_process and hasattr(trainer.model, "print_trainable_parameters"):
        trainer.model.print_trainable_parameters()

    # Evaluate before training.
    eval_result = trainer.evaluate()
    if accelerator.is_main_process:
        print("Evaluation on test dataset before finetuning:")
        print(eval_result)

    # Train the model.
    train_result = trainer.train()
    if accelerator.is_main_process:
        print("Training result:")
        print(train_result)

    # Evaluate after training.
    eval_result = trainer.evaluate()
    if accelerator.is_main_process:
        print("Evaluation on test dataset after finetuning:")
        print(eval_result)

    # Print GPU memory usage (only once per node).
    if accelerator.local_process_index == 0:
        print_gpu_utilization()

if __name__ == "__main__":
    main()


Writing phi3_guanaco_accelerate_deepspeed.py


In [2]:
# Inline DeepSpeed configuration meant to roughly correspond to your FSDP settings
# Use bf16 for mixed precision, similar to mixed_precision: bf16 in the FSDP config.
# Optimizer settings (you can adjust these as needed, or remove if not required):
# Use ZeRO optimization stage 3 to mimic FSDP's full sharding:
# Do not offload parameters (fsdp_offload_params: false)
# These options are chosen to be simple; they differ from FSDP’s wrapping or prefetching policies.

In [3]:
%%writefile accelerate_deepspeed_config.yaml
compute_environment: LOCAL_MACHINE
debug: false
distributed_type: DEEPSPEED
downcast_bf16: 'no'
deepspeed_config:
    bf16: true
    zero_stage: 3
machine_rank: 0
main_training_function: main
mixed_precision: bf16
num_machines: 2
num_processes: 4
rdzv_backend: c10d
same_network: true
use_cpu: false

Writing accelerate_deepspeed_config.yaml


In [4]:
%%writefile run_phi3_guanaco_accelerate_deepspeed.slurm
#!/bin/bash

#SBATCH --partition=boost_usr_prod
# #SBATCH --qos=boost_qos_dbg
#SBATCH --account=EUHPC_D20_063
#SBATCH --reservation=s_tra_ncc

## Specify resources:
## Leonardo Booster: 32 CPU cores and 4 GPUs per node => request 8 * number of GPUs CPU cores
## Leonardo Booster: 512 GB in total => request approx. 120 GB * number of GPUs requested
#SBATCH --nodes=2
#SBATCH --gpus-per-task=2  # up to 4 on Leonardo
#SBATCH --ntasks-per-node=1  # always 1
#SBATCH --mem=120GB  # should be 120GB * gpus-per-task on Leonardo
#SBATCH --cpus-per-task=16  # should be 8 * gpus-per-task on Leonardo

#SBATCH --time=0:30:00

# Load conda:
module purge
module load anaconda3
eval "$(conda shell.bash hook)"
conda activate /leonardo/pub/userexternal/mpfister/conda_env_martin24

# Include commands in output:
set -x

# Print current time and date:
date

# Print host name:
hostname

# List available GPUs:
nvidia-smi

# Set environment variables for communication between nodes:
export MASTER_PORT=$(shuf -i 20000-30000 -n 1)  # Choose a random port
export MASTER_ADDR=$(scontrol show hostnames ${SLURM_JOB_NODELIST} | head -n 1)
export OMP_NUM_THREADS=$SLURM_CPUS_PER_TASK

# Set launcher and launcher arguments:
export LAUNCHER="accelerate launch \
    --num_machines $SLURM_NNODES \
    --num_processes $((SLURM_NNODES * SLURM_GPUS_ON_NODE/2)) \
    --num_cpu_threads_per_process 8 \
    --main_process_ip $MASTER_ADDR \
    --main_process_port $MASTER_PORT \
    --machine_rank \$SLURM_PROCID \
    --config_file \"./accelerate_deepspeed_config.yaml\" \
    "
# Set training script that will be executed:
export PROGRAM="phi3_guanaco_accelerate_deepspeed.py"

# Run:
time srun bash -c "$LAUNCHER $PROGRAM"

Writing run_phi3_guanaco_accelerate_deepspeed.slurm


#### We can now execute the SLURM script and, once the job ran, look at the output:

In [5]:
!sbatch run_phi3_guanaco_accelerate_deepspeed.slurm

Submitted batch job 19837961


In [6]:
!squeue --me

             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
          19826558 boost_usr trainee4 a08trb42  R    4:50:25      1 lrdn0851
          19837961 boost_usr run_phi3 a08trb42  R       0:00      2 lrdn[3428,3434]


In [7]:
!cat slurm-19837961.out

ERROR: Unable to locate a modulefile for 'anaconda3'
/var/spool/slurmd/job19837961/slurm_script: line 22: conda: command not found
/var/spool/slurmd/job19837961/slurm_script: line 23: conda: command not found
+ date
Thu Sep 11 13:08:52 CEST 2025
+ hostname
lrdn3428.leonardo.local
+ nvidia-smi
Thu Sep 11 13:08:52 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.54.03              Driver Version: 535.54.03    CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM-64GB           On  | 00000000:1D:00.0 Off |                    0 |
| N/A   43C    P0              61W / 465W | 

#### Before we close the notebook, we should clean up the files created:

In [8]:
!rm phi3_guanaco_accelerate_deepspeed.py run_phi3_guanaco_accelerate_deepspeed.slurm slurm-*.out accelerate_deepspeed_config*.yaml deepspeed_config*.json

rm: cannot remove 'deepspeed_config*.json': No such file or directory
