# Cortex Compliance AI - Fine-Tuning Notebook

Fine-tunes Mistral-7B-Instruct-v0.2 (loaded in 4-bit, trained with LoRA adapters) for Russian compliance document generation.

## Quick Start:
1. Runtime → Change runtime type → **T4 GPU**
2. Run all cells
3. Enter your HuggingFace token when prompted

In [None]:
# Step 1: Install dependencies
# transformers, accelerate, peft and bitsandbytes are pinned to exact versions;
# datasets, huggingface_hub and trl are NOT pinned and resolve at install time.
# NOTE(review): trl is installed but never imported in the cells of this notebook —
# confirm whether it is still needed.
!pip install -q transformers==4.44.0 accelerate==0.33.0 peft==0.12.0 bitsandbytes==0.43.1 datasets huggingface_hub trl
# Presumably required by the model's tokenizer — TODO confirm.
!pip install -q sentencepiece protobuf

In [None]:
# Step 2: Login to Hugging Face
# Prompts for the HuggingFace token mentioned in the Quick Start.
# Step 8 calls whoami() and push_to_hub(), so the token presumably needs
# write access — TODO confirm.
from huggingface_hub import notebook_login
notebook_login()

In [None]:
# Step 3: Load Training Data - 217 Russian Business Document Templates
# Includes: Contracts, Corporate Docs, Financial, HR, Legal, Tax, Industry, Specialized and more!

import json

# Download training data from GitHub (217 examples from 265 templates)
!wget -q https://raw.githubusercontent.com/maanisingh/cortex-compliance-ai/main/combined_training_data.jsonl -O training_data.jsonl

# Load training data
TRAINING_DATA = []
with open('training_data.jsonl', 'r') as f:
    for line in f:
        TRAINING_DATA.append(json.loads(line))

print(f"Loaded {len(TRAINING_DATA)} training examples")
print(f"\nSample categories:")
for i, item in enumerate(TRAINING_DATA[:5]):
    print(f"  {i+1}. {item['instruction'][:60]}...")

In [None]:
# Step 4: Prepare dataset
from datasets import Dataset

def format_prompt(example):
    """Render one training record as a single prompt/response string.

    Args:
        example: Mapping with an ``'instruction'`` and an ``'output'`` entry.

    Returns:
        A dict with one key, ``"text"``, holding the instruction followed by
        the response under ``### Instruction:`` / ``### Response:`` headers.

    Raises:
        KeyError: If either ``'instruction'`` or ``'output'`` is missing.
    """
    instruction = example['instruction']
    response = example['output']
    text = f"### Instruction:\n{instruction}\n\n### Response:\n{response}"
    return {"text": text}

# Wrap the loaded records in a Dataset and add the rendered "text" column
# produced by format_prompt to every row.
dataset = Dataset.from_list(TRAINING_DATA).map(format_prompt)
print(dataset)

In [None]:
# Step 5: Load model with 4-bit quantization
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Using Mistral-7B-Instruct for better instruction following
MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.2"

# 4-bit NF4 weights with double quantization.
# The compute dtype is float16 (not bfloat16): the Quick Start targets a T4 GPU,
# which has no native bfloat16 support, and Step 7 trains with fp16=True, so the
# quantized matmuls should run in the same half-precision format as the trainer.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Reuse EOS as the padding token and pad on the right, so padded batches keep
# the prompt at the start of each sequence.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Load model with quantization.
# trust_remote_code is deliberately NOT enabled: the Mistral architecture is
# implemented inside transformers itself, so there is no reason to allow
# arbitrary Python from the Hub repository to run in this runtime.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
)
print(f"Model loaded: {MODEL_NAME}")

In [None]:
# Step 6: Configure LoRA
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# Adapter hyper-parameters: rank 16, alpha 32, 5% dropout, no bias terms,
# applied to the q/k/v/o projection modules.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# Prepare the quantized model for k-bit training, then attach the adapters
# and report how many parameters will actually be trained.
model = get_peft_model(prepare_model_for_kbit_training(model), lora_config)
model.print_trainable_parameters()

In [None]:
# Step 7: Train with the Hugging Face Trainer
from transformers import (
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    DataCollatorForSeq2Seq,
)

# Maximum number of tokens kept per example; longer documents are truncated.
MAX_LENGTH = 512


def tokenize_function(examples):
    """Tokenize a batch of prompts and build causal-LM labels.

    Args:
        examples: Batch dict from ``Dataset.map(batched=True)``; only the
            ``"text"`` column is read.

    Returns:
        Dict of lists with ``input_ids``, ``attention_mask`` and ``labels``
        (``labels`` is a copy of ``input_ids``). Sequences are NOT padded
        here; padding happens per batch in the data collator.
    """
    encoded = tokenizer(examples["text"], truncation=True, max_length=MAX_LENGTH)
    input_ids = encoded["input_ids"]
    attention_mask = encoded["attention_mask"]

    # Terminate every example that was not truncated with EOS so the model
    # learns where a document ends. Without this, generation after fine-tuning
    # has no trained stop signal and simply runs until max_new_tokens.
    for ids, mask in zip(input_ids, attention_mask):
        if len(ids) < MAX_LENGTH and (not ids or ids[-1] != tokenizer.eos_token_id):
            ids.append(tokenizer.eos_token_id)
            mask.append(1)

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": [list(ids) for ids in input_ids],
    }


tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=dataset.column_names)

training_args = TrainingArguments(
    output_dir="./cortex-compliance-ai",
    num_train_epochs=3,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,  # effective batch size of 4
    warmup_steps=10,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=1,
    save_strategy="epoch",
    optim="paged_adamw_8bit",
    report_to="none",
)

# Pad dynamically to the longest sequence in each batch instead of always to
# MAX_LENGTH, and pad labels with -100 so padding is ignored by the loss.
# DataCollatorForLanguageModeling is not used here because it masks every
# label equal to pad_token_id, and the tokenizer's pad token is the EOS token,
# so the EOS labels added above would be masked out as well.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    padding=True,
    label_pad_token_id=-100,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

print("Starting training...")
trainer.train()
print("Training complete!")

In [None]:
# Step 8: Save to Hugging Face Hub
from huggingface_hub import whoami

# The target repository lives under the account of the logged-in user.
hf_user = whoami()["name"]
MODEL_REPO = f"{hf_user}/cortex-compliance-ai"

print(f"Uploading to: {MODEL_REPO}")
# Upload the model first, then the tokenizer, into the same repository.
for artifact in (model, tokenizer):
    artifact.push_to_hub(MODEL_REPO)
print(f"Model saved to: https://huggingface.co/{MODEL_REPO}")

In [None]:
# Step 9: Test the model
# The prompt uses the same "### Instruction / ### Response" layout as the
# training text produced by format_prompt in Step 4.
test_prompt = "### Instruction:\nGenerate a Personal Data Processing Policy for ООО Тест (INN: 1234567890)\n\n### Response:\n"

# The model is still in training mode after trainer.train(); switch to eval
# mode so LoRA dropout is disabled while generating.
model.eval()

# Send the inputs to whichever device the model was placed on rather than
# assuming "cuda".
inputs = tokenizer(test_prompt, return_tensors="pt").to(model.device)

outputs = model.generate(
    **inputs,
    max_new_tokens=300,
    temperature=0.7,
    do_sample=True,
    pad_token_id=tokenizer.eos_token_id,
)

print(tokenizer.decode(outputs[0], skip_special_tokens=True))

## Done!

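Your model is now available at: https://huggingface.co/<your-username>/cortex-compliance-ai (the exact URL is printed at the end of Step 8; for example, https://huggingface.co/maaninder/cortex-compliance-ai)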

The Cortex GRC backend is configured to use this model for AI document generation.