# Fine-tuning CodeLlama-7b for Minder Rules and Profiles Generation

This notebook fine-tunes CodeLlama-7b to generate Minder rules and profiles. The existing rule and profile YAML files are converted into Alpaca-style instruction/response pairs, which are then used as the training data.

## Setup and Dependencies

In [None]:
!pip install transformers datasets peft torch pyyaml tqdm wandb

In [None]:
import os
import yaml
import json
import glob
from pathlib import Path
import torch
from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import (
    prepare_model_for_kbit_training,
    LoraConfig,
    get_peft_model
)

## Data Processing

First, we'll create functions to process the YAML files and convert them into training data in Alpaca format.

In [None]:
def load_yaml_file(file_path):
    """Load and parse a YAML file.

    Args:
        file_path: Path of the YAML file to read.

    Returns:
        The document parsed by ``yaml.safe_load``, or ``None`` when the
        file is not valid YAML (the parse error is printed, not raised).
    """
    # Decode as UTF-8 explicitly so the result does not depend on the
    # platform's default locale encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
        try:
            return yaml.safe_load(f)
        except yaml.YAMLError as e:
            print(f"Error parsing {file_path}: {e}")
            return None

def create_instruction_for_rule(rule_name):
    """Build the instruction prompts that ask for a given rule.

    Returns a list of four differently worded instructions, each ending
    with ``rule_name``.
    """
    openings = (
        "Create a Minder rule for",
        "Generate a YAML configuration for",
        "Write a Minder security rule to implement",
        "Define a Minder rule specification for",
    )
    return [f"{opening} {rule_name}" for opening in openings]

def create_instruction_for_profile(profile_name):
    """Build the instruction prompts that ask for a given profile.

    Returns a list of four differently worded instructions, each ending
    with ``profile_name``.
    """
    variants = []
    for template in (
        "Create a Minder profile for {name}",
        "Generate a YAML profile configuration for {name}",
        "Write a Minder security profile to implement {name}",
        "Define a Minder profile specification for {name}",
    ):
        variants.append(template.format(name=profile_name))
    return variants

def yaml_to_string(yaml_data):
    """Serialize parsed YAML data to text without re-sorting mapping keys."""
    serialized = yaml.dump(yaml_data, sort_keys=False)
    return serialized

def process_rules_and_profiles(base_path):
    """Process all rules and profiles into training data."""
    training_data = []
    
    # Process rules
    rules_path = os.path.join(base_path, 'rule-types', 'github', '*.yaml')
    for rule_file in glob.glob(rules_path):
        rule_data = load_yaml_file(rule_file)
        if rule_data:
            rule_name = rule_data.get('name', '')
            instructions = create_instruction_for_rule(rule_name)
            response = yaml_to_string(rule_data)
            
            for instruction in instructions:
                training_data.append({
                    "instruction": instruction,
                    "input": "",
                    "output": response
                })
    
    # Process profiles
    profiles_path = os.path.join(base_path, 'profiles', 'github', '*.yaml')
    for profile_file in glob.glob(profiles_path):
        profile_data = load_yaml_file(profile_file)
        if profile_data:
            profile_name = profile_data.get('name', '')
            instructions = create_instruction_for_profile(profile_name)
            response = yaml_to_string(profile_data)
            
            for instruction in instructions:
                training_data.append({
                    "instruction": instruction,
                    "input": "",
                    "output": response
                })
    
    return training_data

## Prepare Training Data

In [None]:
# Process the data
base_path = "minder-rules-and-profiles"  # Update this path as needed
training_data = process_rules_and_profiles(base_path)

# Fail early with an actionable message: with no examples the split below
# would otherwise raise a much less obvious error.
if not training_data:
    raise ValueError(
        f"No training examples found under {base_path!r}; "
        "check that base_path points at the rules-and-profiles checkout."
    )

# Convert to HuggingFace dataset
dataset = Dataset.from_list(training_data)

# Split into train/validation. The seed is fixed so the same examples land in
# each split on every run.
# NOTE(review): every YAML file yields several records with identical output,
# so a random split can put the same YAML in both train and validation;
# validation loss may therefore be optimistic.
dataset = dataset.train_test_split(test_size=0.1, seed=42)

print(f"Training examples: {len(dataset['train'])}")
print(f"Validation examples: {len(dataset['test'])}")

## Model Setup and Fine-tuning

We'll load CodeLlama-7b in 8-bit and attach LoRA adapters, so that only a small set of additional weights is trained.

In [None]:
def prepare_model(model_name="codellama/CodeLlama-7b-hf"):
    """Prepare the model for fine-tuning.

    Loads the tokenizer and the 8-bit quantized base model, prepares the
    model for k-bit training and wraps it with LoRA adapters.

    Args:
        model_name: Hub id or local path of the base causal-LM checkpoint.
            Defaults to CodeLlama-7b.

    Returns:
        A ``(model, tokenizer)`` tuple, where ``model`` is the PEFT-wrapped
        model returned by ``get_peft_model``.
    """
    # Imported here so this cell does not depend on the notebook's import
    # cell listing BitsAndBytesConfig.
    from transformers import BitsAndBytesConfig

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # Reuse EOS as the padding token so batches can be padded.
    # NOTE(review): with pad == EOS, a collator that masks pad positions out
    # of the loss will also mask genuine EOS tokens — confirm this is intended.
    tokenizer.pad_token = tokenizer.eos_token

    # Load model in 8-bit. The quantization settings are passed through
    # quantization_config; the bare load_in_8bit keyword is deprecated.
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=BitsAndBytesConfig(load_in_8bit=True),
        device_map="auto",
        torch_dtype=torch.float16
    )

    # Prepare for k-bit training
    model = prepare_model_for_kbit_training(model)

    # Configure LoRA: adapters are attached to the q_proj and v_proj modules only.
    lora_config = LoraConfig(
        r=16,
        lora_alpha=32,
        target_modules=["q_proj", "v_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM"
    )

    # Get PEFT model
    model = get_peft_model(model, lora_config)

    return model, tokenizer

def format_prompt(example):
    """Render one Alpaca-style record as a single training string.

    The result has an instruction section, an input section only when
    ``example["input"]`` is non-empty, and a response section, joined by
    newlines.
    """
    sections = [f"### Instruction: {example['instruction']}"]
    if example["input"]:
        sections.append(f"### Input: {example['input']}")
    sections.append(f"### Response: {example['output']}")
    return "\n".join(sections)

def preprocess_function(examples):
    """Preprocess the examples for training.

    Args:
        examples: A batch from ``Dataset.map(..., batched=True)``: a mapping
            with parallel ``instruction``, ``input`` and ``output`` lists.

    Returns:
        The tokenizer output for the formatted prompts, truncated to at most
        2048 tokens and left unpadded.

    Uses the notebook-level ``tokenizer``, which must exist before this
    function is called.
    """
    prompts = [
        format_prompt({
            "instruction": instruction,
            "input": input_text,
            "output": output
        })
        for instruction, input_text, output in zip(
            examples["instruction"], examples["input"], examples["output"]
        )
    ]

    # No padding here: padding inside a batched map stretches every example
    # to the longest prompt in the whole map batch and stores that padding in
    # the dataset. The DataCollatorForLanguageModeling given to the Trainer
    # pads each training batch to its own longest sequence instead.
    return tokenizer(prompts, truncation=True, max_length=2048)

In [None]:
# Prepare model and tokenizer
model, tokenizer = prepare_model()

# Preprocess datasets
tokenized_train = dataset["train"].map(preprocess_function, batched=True)
tokenized_val = dataset["test"].map(preprocess_function, batched=True)

# The evaluation-schedule argument was renamed from `evaluation_strategy` to
# `eval_strategy` in newer transformers releases. The pip install above is
# unpinned, so pick whichever name the installed version accepts.
import inspect

_eval_strategy_arg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments).parameters
    else "evaluation_strategy"
)

# Training arguments
# NOTE(review): warmup_steps/eval_steps/save_steps are all 100 optimizer
# steps; with a small dataset the whole run may be shorter than that —
# confirm against the number of training examples printed earlier.
training_args = TrainingArguments(
    output_dir="minder-codellama-7b",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    eval_steps=100,
    save_steps=100,
    warmup_steps=100,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=10,
    report_to="wandb",
    **{_eval_strategy_arg: "steps"}
)

# Initialize trainer. mlm=False makes the collator build causal-LM labels
# from the input ids.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
)

In [None]:
# Start training
# Runs the fine-tuning loop on the `trainer` built in the previous cell,
# using its model, datasets and TrainingArguments.
trainer.train()

## Save and Test the Model

In [None]:
# Save the fine-tuned model
# Written to a directory separate from the training output_dir.
# NOTE(review): for a PEFT-wrapped model this presumably stores the LoRA
# adapter rather than full base weights, and the tokenizer is not passed to
# the Trainer — confirm what is needed to reload the model later.
trainer.save_model("minder-codellama-7b-final")

def generate_minder_config(instruction, model, tokenizer):
    """Generate Minder configuration using the fine-tuned model.

    Args:
        instruction: Natural-language request, e.g. "Create a Minder rule for ...".
        model: The fine-tuned causal language model.
        tokenizer: The tokenizer that matches ``model``.

    Returns:
        The decoded text that follows the last "### Response:" marker, with
        surrounding whitespace stripped.
    """
    # Same layout as the training prompts, stopping where the model should
    # start writing the response.
    prompt = f"### Instruction: {instruction}\n### Response:"

    # Put the inputs on whatever device the model lives on instead of
    # assuming CUDA is present.
    device = next(model.parameters()).device
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    # Generate in eval mode so dropout (including LoRA dropout) is inactive,
    # then restore the caller's mode.
    was_training = model.training
    model.eval()
    try:
        outputs = model.generate(
            **inputs,
            max_length=2048,
            # temperature only has an effect when sampling is enabled;
            # without do_sample=True decoding is greedy and it is ignored.
            do_sample=True,
            temperature=0.7,
            num_return_sequences=1,
            pad_token_id=tokenizer.eos_token_id
        )
    finally:
        model.train(was_training)

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # The decoded sequence includes the prompt; keep only the generated part.
    return response.split("### Response:")[-1].strip()

# Test the model
# Smoke test: request a single rule and print the text the model produces
# after the "### Response:" marker.
test_instruction = "Create a Minder rule for checking if repository has branch protection enabled"
generated_config = generate_minder_config(test_instruction, model, tokenizer)
print(generated_config)