In [1]:
# Install required libraries
!pip install transformers datasets bitsandbytes accelerate peft
!pip install scikit-learn torch evaluate flash-attn wandb

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.44.0-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Collecting peft
  Downloading peft-0.13.0-py3-none-any.whl.metadata (13 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bitsandbytes-0.44.0-py3-none-manylinux_2_24_x86_64.whl (1

In [None]:
import os
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from peft import get_peft_model, LoraConfig, TaskType
from datasets import Dataset
from sklearn.model_selection import train_test_split
import json
import hashlib
import wandb
import logging
import warnings
from torch.utils.data import DataLoader
from tqdm.auto import tqdm

warnings.filterwarnings("ignore", category=FutureWarning, module="torch.utils.checkpoint")


In [None]:

# Data loading and processing functions
def load_jsonl(path):
    with open(path, 'r') as file:
        return [json.loads(line) for line in file]

def format_ultrachat_data(data):
    formatted_data = []
    for item in data:
        text = item['text']
        query_start = text.find("### Query:") + len("### Query:")
        response_start = text.find("### Response:") + len("### Response:")
        references_start = text.find("### References:") + len("### References:")

        query = text[query_start:response_start - len("### Response:")].strip()
        response = text[response_start:references_start - len("### References:")].strip()

        prompt_id = hashlib.sha256(query.encode()).hexdigest()

        formatted_item = {
            "prompt": query,
            "prompt_id": prompt_id,
            "messages": [
                {"content": query, "role": "user"},
                {"content": response, "role": "assistant"}
            ]
        }
        formatted_data.append(formatted_item)
    return formatted_data

def collate_and_tokenize(examples, tokenizer, max_length):
    texts = [" ".join([msg['content'] for msg in example['messages']]) for example in examples['data']]

    encoded = tokenizer(
        texts,
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="pt"
    )

    encoded['labels'] = encoded['input_ids'].clone()
    return encoded

def prepare_datasets(data_path, tokenizer, max_length=2048):
    try:
        data = load_jsonl(data_path)
    except FileNotFoundError:
        raise FileNotFoundError(f"The file {data_path} was not found. Please check the file path and try again.")

    if not data:
        raise ValueError(f"The file {data_path} is empty or could not be read properly.")

    train_data, test_data = train_test_split(data, test_size=0.3, random_state=42)

    train_data_formatted = format_ultrachat_data(train_data)
    test_data_formatted = format_ultrachat_data(test_data)

    train_dataset = Dataset.from_dict({"data": train_data_formatted})
    test_dataset = Dataset.from_dict({"data": test_data_formatted})

    print(f"Dataset size - Train: {len(train_dataset)}, Test: {len(test_dataset)}")

    tokenized_train = train_dataset.map(
        lambda examples: collate_and_tokenize(examples, tokenizer, max_length),
        batched=True,
        remove_columns=train_dataset.column_names
    )
    tokenized_test = test_dataset.map(
        lambda examples: collate_and_tokenize(examples, tokenizer, max_length),
        batched=True,
        remove_columns=test_dataset.column_names
    )

    return tokenized_train, tokenized_test



In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import get_peft_model, LoraConfig, TaskType
from torch.utils.data import DataLoader
from tqdm.auto import tqdm

# Model and tokenizer setup
model_name = "microsoft/Phi-3.5-mini-instruct"

# Quantization config
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

# Load model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    device_map="auto",
    trust_remote_code=True
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--microsoft--Phi-3.5-mini-instruct/snapshots/af0dfb8029e8a74545d0736d30cb6b58d2f0f3f0/config.json
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--microsoft--Phi-3.5-mini-instruct/snapshots/af0dfb8029e8a74545d0736d30cb6b58d2f0f3f0/config.json
Model config Phi3Config {
  "_name_or_path": "microsoft/Phi-3.5-mini-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3.5-mini-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3.5-mini-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 3072,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 131072,
  "model_type": "phi3",
  "num_attenti

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

All model checkpoint weights were used when initializing Phi3ForCausalLM.

All the weights of Phi3ForCausalLM were initialized from the model checkpoint at microsoft/Phi-3.5-mini-instruct.
If your task is similar to the task the model of the checkpoint was trained on, you can already use Phi3ForCausalLM for predictions without further training.
loading configuration file generation_config.json from cache at /root/.cache/huggingface/hub/models--microsoft--Phi-3.5-mini-instruct/snapshots/af0dfb8029e8a74545d0736d30cb6b58d2f0f3f0/generation_config.json
Generate config GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": [
    32007,
    32001,
    32000
  ],
  "pad_token_id": 32000
}

loading file tokenizer.model from cache at /root/.cache/huggingface/hub/models--microsoft--Phi-3.5-mini-instruct/snapshots/af0dfb8029e8a74545d0736d30cb6b58d2f0f3f0/tokenizer.model
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--microsoft--Phi-3.5-mini-instruct/snapshots/af0

In [None]:
# LoRA configuration
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=[
        "self_attn.o_proj",
        "self_attn.qkv_proj",
        "mlp.gate_up_proj",
        "mlp.down_proj",
    ]
)

# Wrap the model with LoRA
model = get_peft_model(model, peft_config)

# Ensure only LoRA parameters are trainable
for name, param in model.named_parameters():
    if "lora" in name:
        param.requires_grad = True
    else:
        param.requires_grad = False

# Prepare optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

# Training loop
def train(model, train_dataloader, optimizer, num_epochs=3):
    device = next(model.parameters()).device
    model.train()

    for epoch in range(num_epochs):
        total_loss = 0
        for batch in tqdm(train_dataloader, desc=f"Epoch {epoch + 1}"):
            batch = {k: v.to(device) for k, v in batch.items()}

            optimizer.zero_grad()

            outputs = model(**batch)
            loss = outputs.loss
            total_loss += loss.item()

            loss.backward()
            optimizer.step()

        print(f"Epoch {epoch + 1}, Average loss: {total_loss / len(train_dataloader)}")

In [None]:
from peft import get_peft_model, LoraConfig, TaskType
import torch

# Define QLoRA Config
lora_config = LoraConfig(
    r=32,
    lora_alpha=32,
    target_modules=[
        "self_attn.o_proj",
        "self_attn.qkv_proj",
        "mlp.gate_up_proj",
        "mlp.down_proj",
    ],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# Get PEFT model
lora_model = get_peft_model(model, lora_config)

# Function to print trainable parameters
def print_trainable_parameters(model):
    trainable_params = 0
    all_param = 0
    for name, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
        print(f"Parameter: {name}, Shape: {param.shape}, Requires Grad: {param.requires_grad}, dtype: {param.dtype}")
    print(f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param:.2f}")

# Print information about the model parameters
print_trainable_parameters(lora_model)

Parameter: base_model.model.model.embed_tokens.weight, Shape: torch.Size([32064, 3072]), Requires Grad: False, dtype: torch.float16
Parameter: base_model.model.model.layers.0.self_attn.o_proj.base_layer.weight, Shape: torch.Size([4718592, 1]), Requires Grad: False, dtype: torch.uint8
Parameter: base_model.model.model.layers.0.self_attn.o_proj.lora_A.default.weight, Shape: torch.Size([32, 3072]), Requires Grad: True, dtype: torch.float32
Parameter: base_model.model.model.layers.0.self_attn.o_proj.lora_B.default.weight, Shape: torch.Size([3072, 32]), Requires Grad: True, dtype: torch.float32
Parameter: base_model.model.model.layers.0.self_attn.qkv_proj.base_layer.weight, Shape: torch.Size([14155776, 1]), Requires Grad: False, dtype: torch.uint8
Parameter: base_model.model.model.layers.0.self_attn.qkv_proj.lora_A.default.weight, Shape: torch.Size([32, 3072]), Requires Grad: True, dtype: torch.float32
Parameter: base_model.model.model.layers.0.self_attn.qkv_proj.lora_B.default.weight, Shap

In [None]:
# Ensure only appropriate parameters require gradients
for name, param in lora_model.named_parameters():
    if 'lora' in name:  # Only LoRA parameters should be trainable
        param.requires_grad = True
    else:
        param.requires_grad = False

print("\nAfter setting requires_grad:")
print_trainable_parameters(lora_model)

# Prepare the model for training
lora_model.train()


After setting requires_grad:
Parameter: base_model.model.model.embed_tokens.weight, Shape: torch.Size([32064, 3072]), Requires Grad: False, dtype: torch.float16
Parameter: base_model.model.model.layers.0.self_attn.o_proj.base_layer.weight, Shape: torch.Size([4718592, 1]), Requires Grad: False, dtype: torch.uint8
Parameter: base_model.model.model.layers.0.self_attn.o_proj.lora_A.default.weight, Shape: torch.Size([32, 3072]), Requires Grad: True, dtype: torch.float32
Parameter: base_model.model.model.layers.0.self_attn.o_proj.lora_B.default.weight, Shape: torch.Size([3072, 32]), Requires Grad: True, dtype: torch.float32
Parameter: base_model.model.model.layers.0.self_attn.qkv_proj.base_layer.weight, Shape: torch.Size([14155776, 1]), Requires Grad: False, dtype: torch.uint8
Parameter: base_model.model.model.layers.0.self_attn.qkv_proj.lora_A.default.weight, Shape: torch.Size([32, 3072]), Requires Grad: True, dtype: torch.float32
Parameter: base_model.model.model.layers.0.self_attn.qkv_pr

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Phi3ForCausalLM(
      (model): Phi3Model(
        (embed_tokens): Embedding(32064, 3072, padding_idx=32000)
        (embed_dropout): Dropout(p=0.0, inplace=False)
        (layers): ModuleList(
          (0-31): 32 x Phi3DecoderLayer(
            (self_attn): Phi3Attention(
              (o_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=3072, out_features=3072, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=3072, out_features=32, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=32, out_features=3072, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnit

In [None]:
# Prepare datasets
train_dataset, test_dataset = prepare_datasets("combined_UnitOps_Training_ZAR.jsonl", tokenizer, max_length=2048)


Dataset size - Train: 4370, Test: 1873


Map:   0%|          | 0/4370 [00:00<?, ? examples/s]

Map:   0%|          | 0/1873 [00:00<?, ? examples/s]

In [None]:
# Initialize wandb
wandb.init(project="DataScience_CapStone", entity="kunalraghuvanshi-the-university-of-western-australia")

# Training arguments
training_args = TrainingArguments(
    output_dir="./phi3_5_mini_instruct_qlora_chemical_eng",
    num_train_epochs=5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=8,
    learning_rate=1e-4,
    weight_decay=0.01,
    warmup_ratio=0.03,
    lr_scheduler_type="cosine",
    logging_dir='./logs',
    logging_steps=1,
    save_strategy="steps",
    save_steps=100,
    evaluation_strategy="steps",
    eval_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    greater_is_better=False,
    fp16=True,
    max_grad_norm=0.3,
    report_to=["wandb"],
)

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


PyTorch: setting up devices


In [None]:

# Data collator
def data_collator(examples):
    return tokenizer.pad(examples, padding=True, return_tensors="pt")

# Setup DataLoader
train_dataloader = DataLoader(
    train_dataset,
    batch_size=training_args.per_device_train_batch_size,
    shuffle=True,
    collate_fn=data_collator
)

eval_dataloader = DataLoader(
    test_dataset,
    batch_size=training_args.per_device_eval_batch_size,
    collate_fn=data_collator
)

# Prepare optimizer
optimizer = torch.optim.AdamW(lora_model.parameters(), lr=training_args.learning_rate)

# Training loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
lora_model.to(device)


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Phi3ForCausalLM(
      (model): Phi3Model(
        (embed_tokens): Embedding(32064, 3072, padding_idx=32000)
        (embed_dropout): Dropout(p=0.0, inplace=False)
        (layers): ModuleList(
          (0-31): 32 x Phi3DecoderLayer(
            (self_attn): Phi3Attention(
              (o_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=3072, out_features=3072, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=3072, out_features=32, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=32, out_features=3072, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnit

In [None]:

# Data collator
def data_collator(examples):
    return tokenizer.pad(examples, padding=True, return_tensors="pt")

# Setup DataLoader
train_dataloader = DataLoader(
    train_dataset,
    batch_size=training_args.per_device_train_batch_size,
    shuffle=True,
    collate_fn=data_collator
)

eval_dataloader = DataLoader(
    test_dataset,
    batch_size=training_args.per_device_eval_batch_size,
    collate_fn=data_collator
)

# Prepare optimizer
optimizer = torch.optim.AdamW(lora_model.parameters(), lr=training_args.learning_rate)

# Training loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
lora_model.to(device)


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Phi3ForCausalLM(
      (model): Phi3Model(
        (embed_tokens): Embedding(32064, 3072, padding_idx=32000)
        (embed_dropout): Dropout(p=0.0, inplace=False)
        (layers): ModuleList(
          (0-31): 32 x Phi3DecoderLayer(
            (self_attn): Phi3Attention(
              (o_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=3072, out_features=3072, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=3072, out_features=32, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=32, out_features=3072, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnit

In [None]:
lora_model.train()
for epoch in range(int(training_args.num_train_epochs)):
    total_loss = 0
    for step, batch in enumerate(tqdm(train_dataloader, desc=f"Epoch {epoch+1}")):
        batch = {k: v.to(device) for k, v in batch.items()}

        optimizer.zero_grad()

        outputs = lora_model(**batch)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

        if step % 100 == 0:
            avg_loss = total_loss / (step + 1)
            wandb.log({"train_loss": avg_loss, "epoch": epoch, "step": step})

    # Evaluation
    lora_model.eval()
    eval_loss = 0
    with torch.no_grad():
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = lora_model(**batch)
            eval_loss += outputs.loss.item()

    avg_eval_loss = eval_loss / len(eval_dataloader)
    wandb.log({"eval_loss": avg_eval_loss, "epoch": epoch})

    lora_model.train()


Epoch 1:   0%|          | 0/1093 [00:00<?, ?it/s]

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

In [None]:
# Define QLoRA Config
target_modules = []
for i in range(10):
    target_modules.extend([
        f'model.layers.{i}.self_attn.o_proj',
        f'model.layers.{i}.self_attn.qkv_proj',
        f'model.layers.{i}.mlp.gate_up_proj',
        f'model.layers.{i}.mlp.down_proj',
    ])

config = LoraConfig(
    r=32,
    lora_alpha=32,
    target_modules=target_modules,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# Get PEFT model
lora_model = get_peft_model(model, config)

# Ensure all parameters require gradients
for param in lora_model.parameters():
    param.requires_grad = True

# Print trainable parameters
def print_trainable_parameters(model):
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param:.2f}"
    )

print_trainable_parameters(lora_model)


RuntimeError: only Tensors of floating point and complex dtype can require gradients