In [9]:
#pip install --upgrade transformers


In [10]:
#Install the required packages for this project
!pip install transformers==4.31.0
!pip install peft==0.4.0
!pip install bitsandbytes==0.40.0
!pip install accelerate==0.21.0
!pip install trl==0.4.7

#!pip install transformers datasets bitsandbytes accelerate peft
!pip install scikit-learn
!pip install torch --upgrade
!pip install evaluate
!pip install flash-attn
!pip install wandb
#!pip install logging



In [11]:
pip list | grep transformers.

transformers                     4.31.0


In [16]:
import os
import torch
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer, AutoTokenizer, BitsAndBytesConfig
from transformers import EarlyStoppingCallback, TrainerCallback
from peft import get_peft_model, LoraConfig, TaskType
from datasets import Dataset
from sklearn.model_selection import train_test_split
import json
import hashlib
import random
import evaluate
import numpy as np
from huggingface_hub import notebook_login
import time
import math
import warnings
import wandb
import logging
warnings.filterwarnings("ignore", category=FutureWarning, module="torch.utils.checkpoint")
from torch.utils.data import DataLoader
from peft import prepare_model_for_kbit_training
#from bitsandbytes import AutoTokenizer as AutoTokenizer8bit
#from transformers import AutoTokenizer

In [19]:
# Quantization configuration
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Data loading and preprocessing functions
def load_jsonl(path):
    with open(path, 'r') as file:
        return [json.loads(line) for line in file]

def format_ultrachat_data(data):
    formatted_data = []
    for item in data:
        text = item['text']
        query_start = text.find("### Query:") + len("### Query:")
        response_start = text.find("### Response:") + len("### Response:")
        references_start = text.find("### References:") + len("### References:")

        query = text[query_start:response_start - len("### Response:")].strip()
        response = text[response_start:references_start - len("### References:")].strip()

        prompt_id = hashlib.sha256(query.encode()).hexdigest()

        formatted_item = {
            "prompt": query,
            "prompt_id": prompt_id,
            "messages": [
                {"content": query, "role": "user"},
                {"content": response, "role": "assistant"}
            ]
        }
        formatted_data.append(formatted_item)
    return formatted_data

def collate_and_tokenize(examples, tokenizer, max_length):
    texts = [" ".join([msg['content'] for msg in example['messages']]) for example in examples['data']]

    encoded = tokenizer(
        texts,
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="pt"
    )

    encoded['labels'] = encoded['input_ids'].clone()
    return encoded

def prepare_datasets(data_path, tokenizer, max_length=2048):
    try:
        data = load_jsonl(data_path)
    except FileNotFoundError:
        raise FileNotFoundError(f"The file {data_path} was not found. Please check the file path and try again.")

    if not data:
        raise ValueError(f"The file {data_path} is empty or could not be read properly.")

    # Use 70-30 split
    train_data, test_data = train_test_split(data, test_size=0.3, random_state=42)

    train_data_formatted = format_ultrachat_data(train_data)
    test_data_formatted = format_ultrachat_data(test_data)

    train_dataset = Dataset.from_dict({"data": train_data_formatted})
    test_dataset = Dataset.from_dict({"data": test_data_formatted})

    print(f"Dataset size - Train: {len(train_dataset)}, Test: {len(test_dataset)}")

    # Tokenize datasets
    tokenized_train = train_dataset.map(
        lambda examples: collate_and_tokenize(examples, tokenizer, max_length),
        batched=True,
        remove_columns=train_dataset.column_names
    )
    tokenized_test = test_dataset.map(
        lambda examples: collate_and_tokenize(examples, tokenizer, max_length),
        batched=True,
        remove_columns=test_dataset.column_names
    )

    return tokenized_train, tokenized_test

In [14]:
import os

from huggingface_hub import notebook_login
# Set the token as an environment variable
os.environ["HUGGINGFACE_TOKEN"] = "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"

# Login to Hugging Face
notebook_login()

# Set HF_HOME
os.environ['HF_HOME'] = 'REDACTED'

#model_name = "microsoft/phi-3-mini"

model_name = "microsoft/Phi-3.5-mini-instruct"



# Load model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)



# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.truncation_side = "left"

# Enable gradient checkpointing
model.gradient_checkpointing_enable()



VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…



ModuleNotFoundError: No module named 'transformers_modules.microsoft.Phi-3'

In [None]:
# Define LoRA Config
target_modules = []
for i in range(10):  # Phi-3.5-mini has 10 layers
    target_modules.extend([
        f'model.layers.{i}.self_attn.o_proj',
        f'model.layers.{i}.self_attn.qkv_proj',
        f'model.layers.{i}.mlp.gate_up_proj',
        f'model.layers.{i}.mlp.down_proj',
    ])

config = LoraConfig(
    r=32,
    lora_alpha=32,
    target_modules=target_modules,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# Get PEFT model
qlora_model = get_peft_model(model, config)

# Print initial trainable parameters
def print_trainable_parameters(model):
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param:.2f}"
    )

print_trainable_parameters(qlora_model)

# Prepare datasets
train_dataset, test_dataset = prepare_datasets("combined_UnitOps_Training_ZAR.jsonl", tokenizer, max_length=2048)



trainable params: 15728640 || all params: 3836808192 || trainable%: 0.41
Dataset size - Train: 4370, Test: 1873


Map:   0%|          | 0/4370 [00:00<?, ? examples/s]

Map:   0%|          | 0/1873 [00:00<?, ? examples/s]

In [None]:
!wandb.login(key='9355717ea3791047d912fe694267ef105fd67648')

/bin/bash: -c: line 1: syntax error near unexpected token `key='9355717ea3791047d912fe694267ef105fd67648''
/bin/bash: -c: line 1: `wandb.login(key='9355717ea3791047d912fe694267ef105fd67648')'


In [None]:
# Initialize wandb
wandb.init(project="DataScience_CapStone", entity="kunalraghuvanshi-the-university-of-western-australia")
#logging.basicConfig(level=logging.INFO)
import random
training_args = TrainingArguments(
    output_dir="./phi3_5_mini_instruct_lora_chemical_eng",
    run_name=f"phi3-5-mini-instruct-lora-run-{time.strftime('%Y%m%d-%H%M%S')}",
    num_train_epochs=5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=8,
    learning_rate=1e-4,
    weight_decay=0.01,
    warmup_ratio=0.03,
    lr_scheduler_type="cosine",
    gradient_checkpointing=True,
    optim="paged_adamw_32bit",
    logging_dir='./logs',
    logging_strategy="steps",
    logging_steps=1,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=10,
    eval_strategy="steps",
    eval_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    greater_is_better=False,
    fp16=False,
    fp16_full_eval=True,
    bf16=True,
    max_grad_norm=0.3,
    report_to=["wandb"],
)

early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=3
)

class DetailedLoggingCallback(TrainerCallback):
    def on_log(self, args, state, control, logs=None, **kwargs):
        if state.is_local_process_zero:
            if 'loss' in logs:
                wandb.log({"train_loss": logs['loss'], "step": state.global_step})
            if 'eval_loss' in logs:
                wandb.log({"eval_loss": logs['eval_loss'], "step": state.global_step})
                perplexity = math.exp(logs['eval_loss'])
                wandb.log({"perplexity": perplexity, "step": state.global_step})

            # Log memory usage
            memory_used = torch.cuda.memory_allocated() / 1e9  # Convert to GB
            wandb.log({"memory_used_gb": memory_used, "step": state.global_step})

def data_collator(examples):
    return tokenizer.pad(examples, padding=True, return_tensors="pt")

train_dataloader = DataLoader(
    train_dataset,
    batch_size=4,
    shuffle=True,
    collate_fn=data_collator
)

eval_dataloader = DataLoader(
    test_dataset,
    batch_size=4,
    collate_fn=data_collator
)

trainer = Trainer(
    model=qlora_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
    callbacks=[early_stopping_callback, DetailedLoggingCallback()],
)

# Disable cache to prevent warning, re-enable for inference
model.config.use_cache = False

# Efficiency metrics
start_time = time.time()
start_memory = torch.cuda.memory_allocated()
trainer.train()
end_time = time.time()
end_memory = torch.cuda.memory_allocated()

training_time = end_time - start_time
memory_used = end_memory - start_memory

# Performance evaluation
eval_results = trainer.evaluate()

print(f"Training Time: {training_time:.2f} seconds")
print(f"Memory Used: {memory_used / 1e9:.2f} GB")
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

# Add this after training
wandb.finish()

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss


In [None]:
# Save the fine-tuned model
qlora_model.save_pretrained("./phi3_mini_qlora_chemical_eng_final")
tokenizer.save_pretrained("./phi3_mini_qlora_chemical_eng_final")

# Push the model and tokenizer to Hugging Face hub
qlora_model.push_to_hub("KunalRaghuvanshi/phi3_mini_qlora_chemical_eng")
tokenizer.push_to_hub("KunalRaghuvanshi/phi3_mini_qlora_chemical_eng")

# Example of generating text with the fine-tuned model
input_text = "Explain the basic principles in chemical engineering."
input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(qlora_model.device)
with torch.no_grad():
    outputs = qlora_model.generate(input_ids, max_new_tokens=200, temperature=0.7, top_p=0.9)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

# Optional: Merge LoRA weights with the base model for easier deployment
from peft import AutoPeftModelForCausalLM

merged_model = AutoPeftModelForCausalLM.from_pretrained("./phi3_mini_qlora_chemical_eng_final", torch_dtype=torch.float16)
merged_model = merged_model.merge_and_unload()
merged_model.save_pretrained("./phi3_mini_merged_qlora_chemical_eng_model")
tokenizer.save_pretrained("./phi3_mini_merged_chemical_eng_model")





