In [1]:
pip install -r requirements.txt

Collecting datasets==2.14.6
  Downloading datasets-2.14.6-py3-none-any.whl (493 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting peft==0.4.0
  Downloading peft-0.4.0-py3-none-any.whl (72 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 kB[0m [31m258.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torch==2.0.1
  Downloading torch-2.0.1-cp39-cp39-manylinux1_x86_64.whl (619.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m619.9/619.9 MB[0m [31m179.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting transformers==4.31.0
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m226.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting trl==0.4.7
  Downloading trl-0.4.7-py3-none-any.whl (77 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32

In [2]:
import logging
import torch
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig
from trl import SFTTrainer

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [1]:
import IPython
IPython.Application.instance().kernel.do_shutdown(True) # "True" is to restart


{'status': 'ok', 'restart': True}

In [7]:
!nvidia-smi

import torch

# Loop over all available GPUs
for i in range(torch.cuda.device_count()):
    print(f"Device Name (GPU {i}): {torch.cuda.get_device_name(i)}")

    # Get the total memory of the current GPU
    total_memory = torch.cuda.get_device_properties(i).total_memory / 1024**3
    print(f'Total CUDA memory on GPU {i}: {total_memory:.2f} GB')

    # Get the memory currently allocated on the current GPU
    allocated_memory = torch.cuda.memory_allocated(i) / 1024**3
    print(f'Used CUDA memory on GPU {i}: {allocated_memory:.2f} GB')
    
    # Get the memory currently allocated on the current GPU
    used_memory = torch.cuda.memory_stats(i)
    print(f'Used CUDA memory on GPU: {used_memory}')

    # Get the memory currently reserved on the current GPU
    reserved_memory = torch.cuda.memory_reserved(i) / 1024**3
    print(f'Reserved CUDA memory on GPU {i}: {reserved_memory:.2f} GB')
    
    print()  # Print a newline for better readability between GPU reports

Thu Nov  9 19:46:36 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       On  | 00000000:00:1B.0 Off |                    0 |
| N/A   40C    P0              69W /  70W |  14662MiB / 15360MiB |    100%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  Tesla T4                       On  | 00000000:00:1C.0 Off |  

In [4]:
# Check for available GPU devices and list them
if torch.cuda.is_available():
    available_gpus = [f'cuda:{i}' for i in range(torch.cuda.device_count())]
    logger.info(f"Available CUDA devices: {available_gpus}")
else:
    logger.warning("No CUDA devices available. Using CPU.")
    available_gpus = ['cpu']

INFO:__main__:Available CUDA devices: ['cuda:0', 'cuda:1', 'cuda:2', 'cuda:3']


In [5]:
# Load dataset
train_dataset = load_dataset("tatsu-lab/alpaca", split="train")
logger.info("Dataset loaded successfully.")

INFO:__main__:Dataset loaded successfully.


In [6]:
# Prepare tokenizer
tokenizer = AutoTokenizer.from_pretrained("daryl149/llama-2-7b-chat-hf", trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
logger.info("Tokenizer prepared successfully.")

INFO:__main__:Tokenizer prepared successfully.


In [7]:
# Prepare model for quantization and load pretrained weights
quantization_config = BitsAndBytesConfig(llm_int8_enable_fp32_cpu_offload=True)

model = AutoModelForCausalLM.from_pretrained(
    "daryl149/llama-2-7b-chat-hf",
    load_in_4bit=True,
    torch_dtype=torch.float16,
    device_map = "auto",
    # Remove the device_map argument since we are using DataParallel

    quantization_config=quantization_config,
)

model.resize_token_embeddings(len(tokenizer))
logger.info("Model loaded and token embeddings resized successfully.")


Loading checkpoint shards: 100%|██████████| 2/2 [00:07<00:00,  3.87s/it]
INFO:__main__:Model loaded and token embeddings resized successfully.


In [16]:

# # If we have multiple GPUs, wrap the model with nn.DataParallel
# if torch.cuda.device_count() > 1:
#     logger.info(f"Using {torch.cuda.device_count()} GPUs for DataParallel")
#     model = torch.nn.DataParallel(model)

# # Move the model to GPU
# model = model.to('cuda:2')  # DataParallel will automatically use the other GPUs

INFO:__main__:Using 4 GPUs for DataParallel


In [8]:

# Prepare model for k-bit training
model = prepare_model_for_kbit_training(model)
logger.info("Model Prepared for kbit training")

INFO:__main__:Model Prepared for kbit training


In [9]:
# Define PEFT configuration
peft_config = LoraConfig(r=16, lora_alpha=32, lora_dropout=0.05, bias="none", task_type="CAUSAL_LM")
model = get_peft_model(model, peft_config)
logger.info("PEFT configuration prepared successfully.")


INFO:__main__:PEFT configuration prepared successfully.


In [15]:
# Define training arguments
use_fp16 = torch.cuda.is_available()
training_args = TrainingArguments(
    output_dir="llama-finetuned-7b2",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=16,
    optim="adamw_torch",
    logging_steps=100,
    learning_rate=2e-4,
    fp16=use_fp16,
    warmup_ratio=0.1,
    lr_scheduler_type="linear",
    num_train_epochs=1,
    save_strategy="epoch",
    push_to_hub=False,
    push_to_hub_token="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",
)
logger.info(f"Training arguments: {training_args}")

INFO:__main__:Training arguments: TrainingArguments(
_n_gpu=4,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=False,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=None,
evaluation_strategy=no,
fp16=True,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=16,
gradient_checkpointing=False,
greater_is_better=None,
group_by_length=False,
half_precision_backend=auto,
hub_model_id=Non

In [19]:
# Initialize the trainer
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    dataset_text_field="text",
    max_seq_length=1024,
    tokenizer=tokenizer,
    args=training_args,
    packing=True,
    peft_config=peft_config,
)
logger.info("Trainer initialized successfully.")


INFO:__main__:Trainer initialized successfully.


In [22]:
# Clear any cached memory to free up as much GPU memory as possible
torch.cuda.empty_cache()

# Initialize DDP if you're using multiple GPUs
from torch.nn.parallel import DistributedDataParallel as DDP
if torch.cuda.device_count() > 1:
    model = DDP(model)

# Now, when you call methods on this model, it will spread the work across the GPUs more efficiently.
trainer.train()
logger.info("Training started.")

RuntimeError: Default process group has not been initialized, please make sure to call init_process_group.

In [8]:
import torch

# Loop over all available GPUs
for i in range(torch.cuda.device_count()):
    print(f"Device Name (GPU {i}): {torch.cuda.get_device_name(i)}")

    # Get the total memory of the current GPU
    total_memory = torch.cuda.get_device_properties(i).total_memory / 1024**3
    print(f'Total CUDA memory on GPU {i}: {total_memory:.2f} GB')

    # Get the memory currently allocated on the current GPU
    allocated_memory = torch.cuda.memory_allocated(i) / 1024**3
    print(f'Used CUDA memory on GPU {i}: {allocated_memory:.2f} GB')

    # Get the memory currently reserved on the current GPU
    reserved_memory = torch.cuda.memory_reserved(i) / 1024**3
    print(f'Reserved CUDA memory on GPU {i}: {reserved_memory:.2f} GB')
    
    print()  # Print a newline for better readability between GPU reports

Device Name (GPU 0): Tesla T4
Total CUDA memory on GPU 0: 14.58 GB
Used CUDA memory on GPU 0: 0.00 GB
Reserved CUDA memory on GPU 0: 0.00 GB

Device Name (GPU 1): Tesla T4
Total CUDA memory on GPU 1: 14.58 GB
Used CUDA memory on GPU 1: 0.00 GB
Reserved CUDA memory on GPU 1: 0.00 GB

Device Name (GPU 2): Tesla T4
Total CUDA memory on GPU 2: 14.58 GB
Used CUDA memory on GPU 2: 0.00 GB
Reserved CUDA memory on GPU 2: 0.00 GB

Device Name (GPU 3): Tesla T4
Total CUDA memory on GPU 3: 14.58 GB
Used CUDA memory on GPU 3: 0.00 GB
Reserved CUDA memory on GPU 3: 0.00 GB



In [9]:
!nvidia-smi


Thu Nov  9 21:02:45 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       On  | 00000000:00:1B.0 Off |                    0 |
| N/A   40C    P0              72W /  70W |  14662MiB / 15360MiB |    100%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  Tesla T4                       On  | 00000000:00:1C.0 Off |  