In [2]:
# Preliminary setup of execution environment
import os
from pathlib import Path
import subprocess

# Root of the NNTile source tree; this notebook is expected to be run from a
# directory located directly inside it.
nntile_dir = Path.cwd() / ".."

# Every variable below is written into os.environ, so it is inherited by all child
# processes started later from this notebook (subprocess.Popen and `!` shell commands).

# Set environment variables
os.environ["CUDA_VISIBLE_DEVICES"] = "0" # Limit CUDA visibility
os.environ["OMP_NUM_THREADS"] = "1" # Disable BLAS parallelism
os.environ["PYTHONPATH"] = str(nntile_dir / "build" / "wrappers" / "python") # Path to a binary dir of NNTile Python wrappers

# All StarPU environment variables are available at https://files.inria.fr/starpu/doc/html/ExecutionConfigurationThroughEnvironmentVariables.html
os.environ["STARPU_NCPU"] = "1" # Use only 1 CPU core
os.environ["STARPU_NCUDA"] = "1" # Use only 1 CUDA device
os.environ["STARPU_SILENT"] = "1" # Do not show lots of StarPU outputs
os.environ["STARPU_SCHED"] = "dmdasd" # Name StarPU scheduler to be used
os.environ["STARPU_FXT_TRACE"] = "0" # Do not generate FXT traces (set to "1" to enable them)
os.environ["STARPU_WORKERS_NOBIND"] = "1" # Do not bind workers (it helps if several instances of StarPU run in parallel)
os.environ["STARPU_PROFILING"] = "1" # This enables logging performance of workers and bandwidth of memory nodes
os.environ["STARPU_HOME"] = str(Path.cwd() / "starpu") # Main directory in which StarPU stores its configuration files
os.environ["STARPU_PERF_MODEL_DIR"] = str(Path(os.environ["STARPU_HOME"]) / "sampling") # Main directory in which StarPU stores its performance model files
os.environ["STARPU_PERF_MODEL_HOMOGENEOUS_CPU"] = "1" # Assume all CPU cores are equal
os.environ["STARPU_PERF_MODEL_HOMOGENEOUS_CUDA"] = "1" # Assume all CUDA devices are equal
os.environ["STARPU_HOSTNAME"] = "Llama_example" # Force the hostname to be used when managing performance model files
os.environ["STARPU_FXT_PREFIX"] = str(Path(os.environ["STARPU_HOME"]) / "fxt") # Directory to store FXT traces if enabled

# NNTile-related
os.environ["NNTILE_LOGGER"] = "0" # Logger is disabled; set to any value other than "0" to have the next cell launch the logger server
os.environ["NNTILE_LOGGER_SERVER_ADDR"] = "127.0.0.1" # Logger will be running on the localhost
os.environ["NNTILE_LOGGER_SERVER_PORT"] = "5001" # Port for logger server
os.environ["NNTILE_LOGGER_CLIENT_PORT"] = "6006" # Port for client web interface of the logger
os.environ["NNTILE_LOGGER_SERVER_DIR"] = str(Path.cwd() / "logs") # Directory to store logs on the logger server

In [3]:
# Launch the NNTile logger server as a background process, if it is enabled.
#
# The launch is guarded so that this cell is safe to re-run: without the guard every
# re-run would start one more server with the same SERVER_PORT and overwrite
# `logger_proc`, losing the handle to the process that is already running.
if os.getenv("NNTILE_LOGGER", "0") != "0":
    try:
        # Popen.poll() returns None while the child process is still alive
        logger_running = logger_proc.poll() is None
    except NameError:
        # First execution of this cell in the current kernel: nothing was launched yet
        logger_running = False

    if not logger_running:
        # The server is configured through its environment, built on top of the
        # variables exported by the setup cell.
        logger_env = os.environ.copy()
        logger_env.update({
            "LOG_DIR": os.getenv("NNTILE_LOGGER_SERVER_DIR"),
            "SPLIT_HOURS": "720",
            "CLEAR_LOGS": "0",
            "SERVER_PORT": os.getenv("NNTILE_LOGGER_SERVER_PORT")
        })
        # Keep the handle in `logger_proc` so that the server can be inspected or
        # stopped later (e.g. logger_proc.terminate()).
        logger_proc = subprocess.Popen(
            ["python", str(nntile_dir / "logger" / "server.py")],
            env=logger_env,
        )

In [5]:
# Prepare TinyStories dataset into train.bin file
!python ../wrappers/python/examples/causal_lm_data_preparation.py --seq-len=1024 --batch-size=64 --dataset-select=500

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message.


In [6]:
# Launch an external python process to finetune a pretrained LLaMa model on TinyStories
# If logger server is launched, then TensorBoard results can be accessed at localhost:6006
!PYTHONPATH=${HOME}/mikhalev/nntile/build-1.4.7/wrappers/python LD_LIBRARY_PATH=${HOME}/mikhalev/install/starpu-1.4.7/lib python ../wrappers/python/examples/llama_training.py \
    --restrict="cuda" --pretrained=local --config-path="../wrappers/python/examples/llama_1.3b_config.json" \
    --optimizer="adamw" --lr=1e-4 --dtype=bf16 --nepochs=1 --batch-size=64 --minibatch-size=8 --minibatch-size-tile=8 \
    --dataset-file="tinystories/train.bin" `#--logger --logger-server-addr=127.0.0.1`

Namespace(remote_model_name='kimihailv/llama-1.3b', pretrained='local', checkpoint_path='', config_path='../wrappers/python/examples/llama_1.3b_config.json', save_checkpoint_path='.model', optimizer='adamw', model_path='.model', seq_len=1024, seq_len_tile=-1, batch_size=64, minibatch_size=8, minibatch_size_tile=8, hidden_size_tile=-1, intermediate_size_tile=-1, n_head_tile=-1, dtype='bf16', restrict='cuda', flash_attention=False, use_redux=False, dataset_path='.data', dataset_file='tinystories/train.bin', lr=0.0001, nepochs=1, logger=False, logger_server_addr='localhost', logger_server_port=5001)
LlamaConfig {
  "_attn_implementation_autoset": true,
  "activation_function": "silu",
  "architectures": [
    "LlamaCasualForLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "dtype": "fp32",
  "eos_token_id": 2,
  "flashattention": false,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size":

In [15]:
# Launch an external python process to finetune a pretrained LLaMa model on TinyStories
# If logger server is launched, then TensorBoard results can be accessed at localhost:6006
!PYTHONPATH=${HOME}/mikhalev/nntile/build-1.4.7/wrappers/python LD_LIBRARY_PATH=${HOME}/mikhalev/install/starpu-1.4.7/lib python ../wrappers/python/examples/llama_training.py \
    --restrict="cuda" --pretrained=local --config-path="../wrappers/python/examples/llama_1.3b_config.json" \
    --optimizer="adamw" --lr=1e-4 --dtype=bf16 --nepochs=1 --batch-size=1024 --minibatch-size=64 --minibatch-size-tile=8 \
    --dataset-file="tinystories/train.bin" `#--logger --logger-server-addr=127.0.0.1`

Namespace(remote_model_name='kimihailv/llama-1.3b', pretrained='local', checkpoint_path='', config_path='../wrappers/python/examples/llama_1.3b_config.json', save_checkpoint_path='.model', optimizer='adamw', model_path='.model', seq_len=1024, seq_len_tile=-1, batch_size=1024, minibatch_size=64, minibatch_size_tile=8, hidden_size_tile=-1, intermediate_size_tile=-1, n_head_tile=-1, dtype='bf16', restrict='cuda', flash_attention=False, use_redux=False, dataset_path='.data', dataset_file='tinystories/train.bin', lr=0.0001, nepochs=1, logger=False, logger_server_addr='localhost', logger_server_port=5001)
LlamaConfig {
  "_attn_implementation_autoset": true,
  "activation_function": "silu",
  "architectures": [
    "LlamaCasualForLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "dtype": "fp32",
  "eos_token_id": 2,
  "flashattention": false,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_siz