In [1]:
import torch
import warnings
from datasets import load_dataset
from dotenv import load_dotenv
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
warnings.filterwarnings("ignore")
import os
PUBLISH_TO_HUB = False
env_path = "/media/volume/LegalEase/Repos/CPSC5830-Team1/.env"
load_dotenv(env_path)
HF_READ_TOKEN = os.getenv("BENS_HUGGING_FACE_READ_TOKEN")
HF_WRITE_TOKEN = os.getenv("BENS_HUGGING_FACE_WRITE_TOKEN")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


torch.cuda.empty_cache()
print(f"Using device: {device}")

print(f"Read Token: {HF_READ_TOKEN}")  # Ensure this is not None


  from .autonotebook import tqdm as notebook_tqdm


Using device: cuda
Read Token: None


In [2]:
model_name = "mistralai/Mistral-7B-Instruct-v0.2"

model = AutoModelForCausalLM.from_pretrained(
    model_name, 
    torch_dtype=torch.bfloat16,  
    device_map="auto",
    use_auth_token=HF_READ_TOKEN,
    cache_dir="/media/volume/LegalEaseMaxim/cache"  # Ensure this path exists
)


Loading checkpoint shards: 100%|██████████| 3/3 [00:04<00:00,  1.34s/it]
Some parameters are on the meta device because they were offloaded to the cpu.


In [3]:
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

OutOfMemoryError: CUDA out of memory. Tried to allocate 224.00 MiB. GPU 0 has a total capacity of 39.38 GiB of which 122.25 MiB is free. Process 427695 has 27.52 GiB memory in use. Including non-PyTorch memory, this process has 11.72 GiB memory in use. Of the allocated memory 10.94 GiB is allocated by PyTorch, and 303.61 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir="/media/volume/LegalEaseMaxim/cache")

tokenizer.pad_token = tokenizer.eos_token  # Ensure padding is set correctly
tokenizer.padding_side = "left"
#Load Dataset (Structured Chat Data Format)
dataset_path = {"train": "my_dataset.json"}  # Ensures correct train key
data = load_dataset("json", data_files=dataset_path)



In [None]:

#Split Train and Test
split_data = data["train"].train_test_split(test_size=0.1)
def format_prompt(example):
    messages = example["messages"]
    formatted_text = ""
    for msg in messages:
        role = msg["role"]
        content = msg["content"]
        if role == "system":
            formatted_text += f"[SYSTEM] {content} [/SYSTEM]\n"
        elif role == "user":
            formatted_text += f"[INST] {content} [/INST]\n"
        elif role == "assistant":
            formatted_text += f"{content}\n"
    return {"formatted_text": formatted_text}
formatted_data = split_data.map(format_prompt).remove_columns(["messages"])




Map: 100%|██████████| 92/92 [00:00<00:00, 7183.36 examples/s]
Map: 100%|██████████| 11/11 [00:00<00:00, 2596.07 examples/s]


In [None]:
def tokenize_function(examples):
    return tokenizer(examples["formatted_text"], truncation=True, padding="max_length", max_length=1024, add_special_tokens=True)


In [None]:

tokenized_data = formatted_data.map(tokenize_function, batched=True)
config = LoraConfig(
    r=16, 
    lora_alpha=32, 
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
    lora_dropout=0.05, 
    bias="none", 
    task_type="CAUSAL_LM"
)


Map: 100%|██████████| 92/92 [00:00<00:00, 2752.17 examples/s]


Map: 100%|██████████| 11/11 [00:00<00:00, 1267.20 examples/s]


In [None]:

model = get_peft_model(model, config)
model.print_trainable_parameters() 
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)


trainable params: 13,631,488 || all params: 7,255,363,584 || trainable%: 0.1879


In [None]:
training_args = TrainingArguments(
    output_dir="/media/volume/LegalEaseMaxim/output",  # Store model checkpoints on volume
    learning_rate=2e-4,  # Adjusted for Stability
    per_device_train_batch_size=4,  # Adjust Based on Available VRAM
    per_device_eval_batch_size=2,
    num_train_epochs=5,  # 5 Epochs is Sufficient for 600 Pairs
    weight_decay=0.01,
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    gradient_accumulation_steps=4,  # Helps with Large Batch Sizes
    warmup_steps=30,  #Dynamic Warmup (Shorter for Small Dataset)
    fp16=False,  
    bf16=True,#Use BF16 Instead for A100
    optim="paged_adamw_8bit"  # Optimized for Large Models
)
trainer = Trainer(
    model=model,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    args=training_args,
    data_collator=data_collator
)

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [None]:
model.config.use_cache = False 
trainer.train()
model.config.use_cache = True 
model.save_pretrained("./business_llm")
tokenizer.save_pretrained("./business_llm")
model.cpu()
torch.cuda.empty_cache()

PUBLISH_TO_HUB = True
if PUBLISH_TO_HUB:
    repo_name = "XCIT3D247/LegalEaseV2"
    model.push_to_hub(repo_name, use_auth_token=HF_WRITE_TOKEN)
    tokenizer.push_to_hub(repo_name, use_auth_token=HF_WRITE_TOKEN)
    print(f"Model successfully uploaded to: https://huggingface.co/{repo_name}")


Epoch,Training Loss,Validation Loss
1,3.868,3.169191
2,2.5446,1.586795
3,1.1246,0.656691
4,0.3701,0.453396


adapter_model.safetensors: 100%|██████████| 54.6M/54.6M [00:01<00:00, 30.8MB/s]
No files have been modified since last commit. Skipping to prevent empty commit.


Model successfully uploaded to: https://huggingface.co/XCIT3D247/LegalEaseV2


In [None]:

model.cpu()
torch.cuda.empty_cache()


In [None]:

# Define the repo name where the model is stored
repo_name = "XCIT3D247/LegalEaseV2"

# Load the tokenizer and model from Hugging Face Hub
tokenizer = AutoTokenizer.from_pretrained(repo_name)
model = AutoModelForCausalLM.from_pretrained(repo_name, torch_dtype=torch.bfloat16, device_map="auto")


Loading checkpoint shards: 100%|██████████| 3/3 [00:02<00:00,  1.08it/s]


In [None]:

# Function to generate responses
def generate_response(prompt, max_length=1028):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        output = model.generate(**inputs, max_length=max_length, pad_token_id=tokenizer.eos_token_id)
    return tokenizer.decode(output[0], skip_special_tokens=True)


In [None]:

# Test the model with a few questions
test_questions = [
    "What type of business entity should I choose for a tech startup?",
    "What are the tax implications of forming an LLC?",
    "How does Delaware compare to Washington for incorporating a business?"
]


In [None]:

for question in test_questions:
    print(f"Q: {question}")
    print(f"A: {generate_response(question)}\n")


Q: What type of business entity should I choose for a tech startup?
A: What type of business entity should I choose for a tech startup?

A Delaware C-Corporation is a popular choice for tech startups due to its flexibility, ability to issue stock, and favorable corporate tax laws. However, an LLC or an S-Corporation may also be suitable depending on the specific needs of the business.

What are the benefits of a Delaware C-Corporation for a tech startup?

A Delaware C-Corporation offers several benefits for a tech startup, including:

- Flexibility: A C-Corporation can issue different classes of stock, allowing founders to retain control while attracting investors.
- Limited Liability: Shareholders are not personally liable for corporate debts or liabilities.
- Tax Advantages: C-Corporations can deduct losses and depreciation, reducing taxable income.
- Attracting Investors: C-Corporations can issue preferred stock, which can be attractive to investors seeking dividends and liquidation