# **Install** required packages

In [None]:

!pip install transformers datasets bitsandbytes accelerate peft
!pip install scikit-learn torch --upgrade
!pip install evaluate optuna
!pip install unsloth


Collecting datasets
  Downloading datasets-3.0.0-py3-none-any.whl.metadata (19 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.43.3-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Collecting peft
  Downloading peft-0.12.0-py3-none-any.whl.metadata (13 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.0-py3-none-any.whl (474 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.3/474.3 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bitsandbytes-0.43.3-py3-none-manylinux_2_24_x86_64.whl (1

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting optuna
  Downloading optuna-4.0.0-py3-none-any.whl.metadata (16 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.13.2-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.8.2-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.5-py3-none-any.whl.metadata (2.9 kB)
[31mERROR: Operation cancelled by user[0m[31m
[0mCollecting unsloth
  Downloading unsloth-2024.9-py3-none-any.whl.metadata (54 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.7/54.7 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Collecting xformers==0.0.27.post2 (from unsloth)
  Downloading xformers-0.0.27.post2-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.0 kB)
Collecting triton>=3.0.0 (from unsloth)
  Downloading triton-3.0.0-1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadat

In [3]:
import hashlib
import json
import math
import os
import time
import warnings

import evaluate
import numpy as np
import optuna
import torch
from datasets import Dataset
from huggingface_hub import notebook_login
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, TrainingArguments, Trainer
from transformers import BitsAndBytesConfig, EarlyStoppingCallback

ModuleNotFoundError: No module named 'evaluate'

# **Import Unsloth**

In [4]:
from unsloth import FastLanguageModel
from unsloth.models.phi2 import modeling_phi
# Suppress FutureWarning messages whose originating module name matches
# "torch.utils.checkpoint", so they do not flood the training output.
warnings.filterwarnings("ignore", category=FutureWarning, module="torch.utils.checkpoint")

ModuleNotFoundError: No module named 'unsloth'

Data loading and preprocessing functions

In [5]:
def load_jsonl(path):
    """Read a JSON Lines file and return its records as a list.

    Args:
        path: Path of the JSONL file to read (one JSON document per line).

    Returns:
        list: The decoded JSON value of every non-blank line, in file order.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        json.JSONDecodeError: If a non-blank line is not valid JSON. The
            message names the file and the 1-based line number.
    """
    records = []
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(path, 'r', encoding='utf-8') as file:
        for line_number, line in enumerate(file, start=1):
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline at EOF) carry no record.
                continue
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError as err:
                # Re-raise with the same exception type, adding where in the
                # file the bad record lives.
                raise json.JSONDecodeError(
                    f"{err.msg} (file {path!r}, line {line_number})",
                    err.doc,
                    err.pos,
                ) from err
    return records

SyntaxError: 'return' outside function (<ipython-input-5-3395ab5306d5>, line 1)

In [None]:
def format_ultrachat_data(data):
    """Convert raw ``{'text': ...}`` records into prompt/messages records.

    Each record's ``text`` is expected to contain, in this order, a
    ``### Query:`` section, a ``### Response:`` section and optionally a
    ``### References:`` section. The query and response bodies are extracted
    and stripped of surrounding whitespace; the references body is discarded.

    Args:
        data: Iterable of dicts, each with a ``'text'`` string.

    Returns:
        list[dict]: One dict per input record with the keys ``prompt`` (the
        query), ``prompt_id`` (SHA-256 hex digest of the query) and
        ``messages`` (a user message holding the query followed by an
        assistant message holding the response).

    Raises:
        ValueError: If a record lacks the ``### Query:`` marker or has no
            ``### Response:`` marker after it.
    """
    query_marker = "### Query:"
    response_marker = "### Response:"
    references_marker = "### References:"

    formatted_data = []
    for index, item in enumerate(data):
        text = item['text']

        # str.find returns -1 when a marker is absent; that must be checked
        # explicitly, otherwise the slices below silently return wrong text.
        query_pos = text.find(query_marker)
        if query_pos == -1:
            raise ValueError(f"Record {index} has no '{query_marker}' marker.")
        query_start = query_pos + len(query_marker)

        # Search for each marker after the previous one so the sections are
        # always taken in Query -> Response -> References order.
        response_pos = text.find(response_marker, query_start)
        if response_pos == -1:
            raise ValueError(
                f"Record {index} has no '{response_marker}' marker after '{query_marker}'."
            )
        response_start = response_pos + len(response_marker)

        # The references section is optional: without it the response runs to
        # the end of the text.
        references_pos = text.find(references_marker, response_start)
        if references_pos == -1:
            references_pos = len(text)

        query = text[query_start:response_pos].strip()
        response = text[response_start:references_pos].strip()

        # Stable identifier derived from the query text.
        prompt_id = hashlib.sha256(query.encode()).hexdigest()

        formatted_data.append({
            "prompt": query,
            "prompt_id": prompt_id,
            "messages": [
                {"content": query, "role": "user"},
                {"content": response, "role": "assistant"}
            ]
        })
    return formatted_data



In [None]:
def collate_and_tokenize(examples, tokenizer, max_length):
    """Tokenize a batch of chat records for causal-LM fine-tuning.

    The contents of every message in a record are joined with single spaces
    into one training string, which is then tokenized, truncated and padded to
    exactly ``max_length`` tokens.

    Args:
        examples: Batch mapping with a ``'data'`` entry: a sequence of records,
            each holding a ``'messages'`` list of dicts with a ``'content'``
            string.
        tokenizer: Callable tokenizer accepting ``padding``, ``truncation``,
            ``max_length`` and ``return_tensors`` keyword arguments.
        max_length: Length every sequence is padded/truncated to.

    Returns:
        The tokenizer output with an added ``'labels'`` entry: a copy of
        ``input_ids`` in which padding positions are replaced by ``-100``.
    """
    texts = [
        " ".join(msg['content'] for msg in example['messages'])
        for example in examples['data']
    ]

    encoded = tokenizer(
        texts,
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="pt"
    )

    labels = encoded['input_ids'].clone()
    if 'attention_mask' in encoded:
        # Exclude padding from the loss. -100 is the ignore index of the
        # cross-entropy loss used for language-model labels. The attention
        # mask is used instead of the pad token id because this notebook sets
        # the pad token equal to the EOS token.
        labels[encoded['attention_mask'] == 0] = -100
    encoded['labels'] = labels
    return encoded

In [None]:
def prepare_datasets(data_path, tokenizer, max_length=2048, test_size=0.3, random_state=42):
    """Load a JSONL corpus and return tokenized train and test datasets.

    The records are split into train/test partitions, each partition is
    converted with ``format_ultrachat_data``, wrapped in a ``Dataset`` with a
    single ``'data'`` column and tokenized with ``collate_and_tokenize``.

    Args:
        data_path: Path of the JSONL file, read with ``load_jsonl``.
        tokenizer: Tokenizer forwarded to ``collate_and_tokenize``.
        max_length: Sequence length every example is padded/truncated to.
        test_size: Share of the records held out for the test split, passed to
            ``train_test_split``.
        random_state: Seed passed to ``train_test_split`` so the split is
            reproducible.

    Returns:
        tuple: ``(tokenized_train, tokenized_test)``.

    Raises:
        FileNotFoundError: If ``data_path`` does not exist.
        ValueError: If the file yields no records.
    """
    try:
        data = load_jsonl(data_path)
    except FileNotFoundError as err:
        # Re-raise with a friendlier message while keeping the original cause.
        raise FileNotFoundError(
            f"The file {data_path} was not found. Please check the file path and try again."
        ) from err

    if not data:
        raise ValueError(f"The file {data_path} is empty or could not be read properly.")

    train_data, test_data = train_test_split(data, test_size=test_size, random_state=random_state)

    # Format after splitting so each partition is converted independently.
    train_dataset = Dataset.from_dict({"data": format_ultrachat_data(train_data)})
    test_dataset = Dataset.from_dict({"data": format_ultrachat_data(test_data)})

    print(f"Dataset size - Train: {len(train_dataset)}, Test: {len(test_dataset)}")

    def _tokenize(dataset):
        # Replace the raw 'data' column with the tokenizer outputs.
        return dataset.map(
            lambda examples: collate_and_tokenize(examples, tokenizer, max_length),
            batched=True,
            remove_columns=dataset.column_names
        )

    return _tokenize(train_dataset), _tokenize(test_dataset)

Set up environment and login

In [None]:
# Authenticate with the Hugging Face Hub interactively. Never hardcode an access
# token in a notebook: anyone who can read the file can use it. The token that
# used to be written in this cell must be treated as leaked and revoked in the
# Hugging Face account settings.
notebook_login()

# NOTE(review): huggingface_hub appears to read HF_HOME when it is first
# imported, so setting it here (after the imports cell has run) may have no
# effect on the cache location - confirm, and move this before the imports if a
# custom cache directory is required.
os.environ['HF_HOME'] = 'REDACTED'

# Load model and tokenizer using Unsloth
model_name = "microsoft/Phi-3-medium-4k-instruct"
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_name,
    max_seq_length = 2048,  # Unsloth's keyword is `max_seq_length`, not `max_sequence_length`
    dtype = None,
    load_in_4bit = True,
)

# Pad with the EOS token and, when an example is too long, drop tokens from its
# beginning rather than its end.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.truncation_side = "left"


In [None]:
# Build the tokenized train/test splits from the JSONL training corpus.
train_dataset, test_dataset = prepare_datasets(
    data_path="combined_UnitOps_Training_ZAR.jsonl",
    tokenizer=tokenizer,
    max_length=512,
)

Define LoRA config using Unsloth

In [None]:
# LoRA hyperparameters. Unsloth does not provide a `get_peft_config` helper;
# `FastLanguageModel.get_peft_model` takes the LoRA settings directly as keyword
# arguments and builds the PEFT configuration itself.
lora_config = dict(
    r=32,
    lora_alpha=32,
    # NOTE(review): Unsloth is understood to load Phi-3 in a Llama/Mistral-style
    # layout with separate q/k/v projections and to reject other module names
    # such as the fused "qkv_proj" - confirm against the installed version.
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_dropout=0.05,
    bias="none",
    # `task_type` is intentionally not passed: Unsloth is understood to set the
    # causal-LM task type internally - TODO confirm.
)

# Apply LoRA
model = FastLanguageModel.get_peft_model(model, **lora_config)

In [None]:
# Define the objective function for Optuna
def objective(trial):
    """Optuna objective: fine-tune with sampled hyperparameters, return eval loss.

    Samples a learning rate and a weight decay on a logarithmic scale, trains
    the global ``model`` on ``train_dataset`` with early stopping, and
    evaluates it on ``test_dataset``.

    NOTE(review): every trial trains the same global ``model`` object, so each
    trial appears to start from the weights left behind by the previous one
    rather than from the freshly loaded adapter. Consider re-creating the model
    per trial if independent trials are intended.

    Args:
        trial: The ``optuna`` trial used to sample hyperparameters.

    Returns:
        float: The ``eval_loss`` reported by ``trainer.evaluate()``.
    """
    # suggest_float(..., log=True) is the supported replacement for the
    # deprecated suggest_loguniform and samples from the same distribution.
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-3, log=True)
    weight_decay = trial.suggest_float('weight_decay', 1e-5, 1e-1, log=True)

    training_args = TrainingArguments(
        output_dir="./phi3_unsloth_chemical_eng",
        num_train_epochs=5,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        gradient_accumulation_steps=8,
        learning_rate=learning_rate,
        weight_decay=weight_decay,
        warmup_ratio=0.1,
        lr_scheduler_type="cosine",
        logging_steps=10,
        save_strategy="steps",
        save_steps=100,
        evaluation_strategy="steps",
        eval_steps=50,
        # Needed by EarlyStoppingCallback, which tracks the best checkpoint.
        load_best_model_at_end=True,
        push_to_hub=False,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    )

    trainer.train()
    eval_results = trainer.evaluate()
    return eval_results['eval_loss']

# Run Optuna optimization: 20 trials of `objective`, keeping the trial with the
# lowest returned value (the objective returns the evaluation loss).
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=20)

# Report the best trial's objective value and the hyperparameters it sampled.
print("Best trial:")
trial = study.best_trial
print(f"  Value: {trial.value}")
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

# Train with the best hyperparameters found by the Optuna study.
best_lr = study.best_params['learning_rate']
best_wd = study.best_params['weight_decay']

# Same settings as the arguments built inside `objective`, except for the output
# directory, the tuned learning rate / weight decay and push_to_hub=True.
# NOTE(review): no hub repository id is given here while the notebook later
# pushes explicitly to a named repository - confirm which repository the
# Trainer's own push is meant to target.
final_training_args = TrainingArguments(
    output_dir="./phi3_unsloth_chemical_eng_final",
    num_train_epochs=5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=8,
    learning_rate=best_lr,
    weight_decay=best_wd,
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",
    logging_steps=10,
    save_strategy="steps",
    save_steps=100,
    evaluation_strategy="steps",
    eval_steps=50,
    load_best_model_at_end=True,
    push_to_hub=True,
)

# NOTE(review): `model` is the same object the Optuna trials trained, so this
# run presumably continues from the weights left by the last trial rather than
# from the freshly loaded adapter - confirm this is intended.
final_trainer = Trainer(
    model=model,
    args=final_training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

final_trainer.train()

# Evaluate the model on the test split; perplexity is reported as exp(eval_loss).
eval_results = final_trainer.evaluate()
print(f"Final Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

# Save the fine-tuned model and tokenizer next to the final training checkpoints.
model.save_pretrained("./phi3_unsloth_chemical_eng_final")
tokenizer.save_pretrained("./phi3_unsloth_chemical_eng_final")

# Push model and tokenizer to the same Hugging Face Hub repository.
model.push_to_hub("ShilpaSandhya/phi3_unsloth_chemical_eng")
tokenizer.push_to_hub("ShilpaSandhya/phi3_unsloth_chemical_eng")

# Generate example text as a quick smoke test of the fine-tuned model.
input_text = "Explain the basic principles in chemical engineering."
# Move the encoded prompt to the device the model lives on.
inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
# Gradients are not needed for generation.
with torch.no_grad():
    outputs = model.generate(**inputs, max_new_tokens=200)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))