In [None]:
import gc
import os

import torch
from datasets import load_dataset
from peft import LoraConfig, get_peft_model
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    Trainer,
    TrainingArguments,
)
from trl import SFTTrainer  # Hugging Face Fine-Tuning API

# Configure CUDA allocator and debugging environment variables

In [None]:

# Process-wide CUDA settings, applied in a single environment update.
# NOTE(review): TORCH_USE_CUDA_DSA is usually described as a build-time option
# for PyTorch; confirm that setting it at runtime has any effect.
os.environ.update(
    {
        "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True",
        "TORCH_USE_CUDA_DSA": "1",
    }
)

# Check for torch.compile support (skipped when unavailable)

In [None]:
# torch.compile() wraps one specific module or callable; it cannot be switched
# on globally, and it has no `optimizations` keyword. The previous code imported
# a private `torch._dynamo.optimizations` symbol and called
# `torch.compile(optimizations=True)`, which could only end in the "not
# available" message or in an uncaught TypeError. Only record availability here
# so later cells can decide whether to compile a concrete model.
TORCH_COMPILE_AVAILABLE = hasattr(torch, "compile")
if not TORCH_COMPILE_AVAILABLE:
    print("Torch.compile is not available. Skipping.")

# Enable GPU/CPU Selection

In [None]:
# Pick the device string once: "cuda" when a GPU is visible, otherwise "cpu".
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Ask the CUDA caching allocator to release any unused cached blocks.
torch.cuda.empty_cache()

# Load Model with 4-bit Quantization (bitsandbytes)

In [None]:
# Hub checkpoint to fine-tune. The Mistral instruct checkpoints are published
# with a version suffix; the bare "mistralai/Mistral-7B-Instruct" id does not
# resolve to a repository.
# NOTE(review): v0.2 is chosen here; switch the suffix if another release is intended.
model_name = "mistralai/Mistral-7B-Instruct-v0.2"

# 4-bit quantization is requested through an explicit BitsAndBytesConfig rather
# than the bare `load_in_4bit` keyword. The compute dtype is bfloat16 so that it
# agrees with `bf16=True` in the training arguments.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,  # dtype for the modules that are not quantized
    device_map="auto",
    quantization_config=quantization_config,
    # `trust_remote_code` is intentionally not enabled: it allows code shipped
    # in the Hub repository to run locally, and this checkpoint is loaded
    # through the library's own model classes.
)

# Use AutoTokenizer (Faster Tokenization)

In [None]:
# Load the fast tokenizer that belongs to `model_name`.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    use_fast=True,
)

# Reuse the end-of-sequence token for padding so that padded batches can be built.
tokenizer.pad_token = tokenizer.eos_token

# Use LoRA (Optimized Fine-Tuning)

In [None]:
# LoRA adapter settings: rank-16 adapters with alpha 32 and 10% dropout on the
# q/k/v/o projection modules; bias terms are left untouched.
config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
)

# Wrap the loaded model so the adapters defined above are attached to it.
model = get_peft_model(model, config)

# Load Dataset and Create Train/Test Split

In [None]:
# Read the instruction/response records from the local JSON-lines file.
dataset = load_dataset("json", data_files="business_qlora.jsonl")

# Hold out 10% of the records for evaluation. The split is seeded so that a
# fresh kernel run reproduces the same train/test membership.
split_dataset = dataset["train"].train_test_split(test_size=0.1, seed=42)

In [None]:
def format_prompt(example):
    """Build the training text for one record.

    Args:
        example: Mapping with "instruction" and "response" entries.

    Returns:
        dict: A single-key mapping whose "example" value is
            "[INST] <instruction> [/INST] <response>".
    """
    instruction = example["instruction"]
    response = example["response"]
    text = "[INST] {} [/INST] {}".format(instruction, response)
    return {"example": text}

# Add the combined "example" text column to both splits, then drop the raw
# source columns it was built from.
formatted_splits = split_dataset.map(format_prompt)
dataset = formatted_splits.remove_columns(["instruction", "response"])

# Tokenize Dataset Efficiently

In [None]:
def tokenize_function(examples):
    """Tokenize the "example" text of a batch with the notebook's tokenizer.

    Args:
        examples: Batch mapping whose "example" entry holds the text to encode.

    Returns:
        The tokenizer's output for that text, truncated and padded to a
        fixed length of 1024 tokens.
    """
    encode_options = {
        "truncation": True,
        "max_length": 1024,
        "padding": "max_length",
    }
    texts = examples["example"]
    return tokenizer(texts, **encode_options)


In [None]:
# Run the tokenizer over the dataset in batches rather than record by record.
tokenized_data = dataset.map(
    tokenize_function,
    batched=True,
)

# Optimized Training Arguments

# Hyper-parameters for the fine-tuning run.
# The DeepSpeed config is no longer passed: the model in this notebook is loaded
# with `device_map="auto"` and 4-bit weights, which the DeepSpeed ZeRO-3 path
# cannot take over, and a notebook kernel does not provide the distributed
# launcher environment DeepSpeed expects.
# NOTE(review): if DeepSpeed is required, load the model without `device_map`
# and launch from a script instead.
training_args = TrainingArguments(
    output_dir="./output",
    # 16 sequences per device x 4 accumulation steps = 64 sequences per
    # optimizer step on each device.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    # Checkpoint, log and evaluate once per epoch.
    save_strategy="epoch",
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    warmup_ratio=0.03,  # warm up over the first 3% of training steps
    bf16=True,  # bfloat16 mixed precision
    optim="adamw_torch",
    report_to="none",  # no external experiment tracker
)

# Train Model using SFTTrainer (TRL)

# Build the supervised fine-tuning trainer.
# - The notebook's tokenizer is passed explicitly so the trainer does not load
#   a separate one (which would not carry the pad-token setting made earlier).
# - The text dataset (`dataset`, with its "example" column) is passed instead of
#   the pre-tokenized, max-length-padded copy: `dataset_text_field` and
#   `packing` operate on raw text, and already-tokenized input can cause the
#   trainer to skip packing altogether.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    dataset_text_field="example",
    max_seq_length=1024,  # same limit as the tokenization step
    packing=True,  # concatenate short examples into full-length sequences
)

# Train
# Disable the generation key/value cache while training.
model.config.use_cache = False
trainer.train()

# Merge LoRA with Base Model

# Fold the LoRA weights into the base model and write the result, together
# with the tokenizer, into one directory.
# NOTE(review): the base weights were loaded 4-bit quantized; confirm that
# merging into quantized weights produces the checkpoint you intend to ship.
export_dir = "./business_llm"
model = model.merge_and_unload()
for artifact in (model, tokenizer):
    artifact.save_pretrained(export_dir)

# Cleanup GPU Memory

# `trainer` holds its own references to the model and the optimizer state, so
# deleting `model` alone leaves those tensors alive and `empty_cache()` has
# nothing to release. Drop both names, collect unreachable objects, and only
# then return the cached blocks to the driver.
del model, trainer
gc.collect()
torch.cuda.empty_cache()

# Load Fine-Tuned Model

# "./business_llm" contains the merged model saved after training, not a LoRA
# adapter, so it is loaded directly as a causal LM. The previous code passed
# that path to `get_peft_model`, which expects a PeftConfig object rather than
# a directory, and additionally re-loaded the full base model in the default
# precision.
# `torch_dtype="auto"` keeps the dtype recorded in the saved checkpoint.
model = AutoModelForCausalLM.from_pretrained(
    "./business_llm",
    device_map="auto",
    torch_dtype="auto",
)

# Generate Response

In [None]:
def generate_response(user_input):
    """Generate a reply to ``user_input`` with the notebook's model.

    The text is wrapped as "[INST] ... [/INST]", tokenized, and passed to
    ``model.generate`` with a budget of 200 new tokens.

    Args:
        user_input: Text placed between the [INST] markers.

    Returns:
        str: The first generated sequence decoded with special tokens skipped.
            NOTE(review): this presumably still includes the prompt text, which
            is why the caller splits on "[/INST]" — confirm.
    """
    prompt = f"[INST] {user_input} [/INST]"
    # Place the inputs on whatever device the model lives on instead of
    # assuming "cuda", so the function also works on CPU-only machines and
    # when device_map="auto" chose a different device.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    output = model.generate(**inputs, max_new_tokens=200)
    return tokenizer.decode(output[0], skip_special_tokens=True)

In [None]:
comment = "I want to start a foodbank in Washington, how do I?"
response = generate_response(comment)

# Show the text that follows the "[/INST]" marker. If the marker is missing
# from the decoded output (for example because the tokenizer treats it as a
# special token and strips it), print the whole response instead of raising
# IndexError.
segments = response.split("[/INST]")
print(segments[1] if len(segments) > 1 else response)