# QLoRA Finetune — Qwen3 for NER (Entity Extraction) ➜ then TTP Classification

**Models**:
- Base A (1.7B): `unsloth/Qwen3-1.7B-bnb-4bit`
- Base B (4B): `unsloth/Qwen3-4B-bnb-4bit` (baseline zero-shot only)

**Pipeline**:
1. Load the datasets for **NER** (`data/entity-extraction/`) and **TTP** (`data/TTP-classification/`)
2. Evaluate **zero-shot**: 1.7B and 4B
3. **Finetune 1.7B** on NER (QLoRA), save adapter
4. **Continue finetune** (same adapter) on TTP, save final adapter
5. **Compare metrics** (Precision, Recall, F1, plus multi-label accuracy for TTP) across 3 models:
   - 1.7B **before** finetune (zero-shot)
   - 1.7B **after** finetune (NER➜TTP sequential QLoRA adapter)
   - 4B **before** finetune (zero-shot)


## Installation and Setup


In [None]:
%%capture
# Install the notebook's dependencies. The %%capture cell magic above hides
# the (long) pip output.
import os
# Detect the runtime by looking for "COLAB_" anywhere in the concatenated
# environment variable names (presumably set by Google Colab - confirm).
if "COLAB_" not in "".join(os.environ.keys()):
    # No COLAB_ variable: plain install, pip resolves unsloth's dependencies.
    !pip install unsloth
else:
    # COLAB_ variable present: install the listed packages with --no-deps so
    # pip does not touch their dependencies, then unsloth itself the same way.
    !pip install --no-deps bitsandbytes accelerate peft trl triton unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1" huggingface_hub
    !pip install --no-deps unsloth

# Evaluation / plotting packages, installed in both environments.
!pip install scikit-learn seaborn matplotlib pandas numpy


## Import Libraries and Configuration


In [None]:
from unsloth import FastLanguageModel
import torch
import json
import pandas as pd
import numpy as np
from datasets import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from trl import SFTConfig, SFTTrainer
from transformers import TextStreamer
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import re

# Configuration
# Maximum sequence length; passed to both model loads and both SFT trainers.
max_seq_length = 2048
# Forwarded unchanged to FastLanguageModel.from_pretrained
# (None presumably lets Unsloth choose the compute dtype - TODO confirm).
dtype = None
# Load weights 4-bit quantized; the checkpoints used are "-bnb-4bit" variants.
load_in_4bit = True


## Mount Google Drive (if using Colab)


In [None]:
# Resolve the project root: Google Drive when running in Colab, the parent
# directory otherwise.
try:
    from google.colab import drive
except ImportError:
    # google.colab is not importable, so this is not a Colab runtime.
    BASE_PATH = '..'  # Local development
else:
    try:
        drive.mount('/content/drive/')
        BASE_PATH = '/content/drive/MyDrive/LLM-TKIG'
    except Exception as exc:
        # Keep the best-effort fallback, but make the failure visible instead
        # of silently pointing every later path at a local directory.
        # KeyboardInterrupt / SystemExit are deliberately not caught.
        print(f"Google Drive mount failed ({exc!r}); falling back to local path")
        BASE_PATH = '..'

print(f"Base path: {BASE_PATH}")


## Data Loading and Preprocessing


In [None]:
# Load datasets
def load_json_data(file_path):
    """Read the UTF-8 encoded JSON file at *file_path* and return the parsed object."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        parsed = json.load(handle)
    return parsed

# NER dataset: the parsed JSON is used directly as the list of records.
ner_path = f'{BASE_PATH}/data/entity-extraction/entity_extraction_instruction.json'
ner_data = load_json_data(ner_path)
print(f"NER dataset size: {len(ner_data)}")

# TTP dataset: the records live under the top-level 'dataset' key.
ttp_path = f'{BASE_PATH}/data/TTP-classification/augmented_ttp_dataset_20250824_070057.json'
ttp_data = load_json_data(ttp_path)
ttp_dataset = ttp_data['dataset']
print(f"TTP dataset size: {len(ttp_dataset)}")

# Convert TTP to instruction format. Each technique is reduced to its 'id'
# and 'name', and the result is serialized to a JSON string so the target
# output is plain text like the NER targets.
ttp_formatted = [
    {
        'instruction': record['instruction'],
        # Falsy inputs (None, "") are normalized to an empty string.
        'input': record['input'] or "",
        'output': json.dumps({
            'techniques': [
                {'id': entry['id'], 'name': entry['name']}
                for entry in record['output']['techniques']
            ]
        }),
    }
    for record in ttp_dataset
]

print(f"Formatted TTP dataset size: {len(ttp_formatted)}")
print(f"Sample output: {ttp_formatted[0]['output']}")


## Dataset Formatting Functions


In [None]:
# Alpaca prompt format: three positional slots (instruction, input, response).
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

def formatting_prompts_func(examples, tokenizer):
    """Render a batch of records into Alpaca-style training texts.

    Args:
        examples: Mapping with parallel "instruction", "input" and "output"
            sequences (the batched form used by ``Dataset.map``).
        tokenizer: Object exposing ``eos_token``; the token is appended to
            every rendered text.

    Returns:
        ``{"text": [...]}`` with one rendered prompt per record. Falsy inputs
        are rendered as an empty string.
    """
    rows = zip(examples["instruction"], examples["input"], examples["output"])
    rendered = [
        alpaca_prompt.format(task, context or "", answer) + tokenizer.eos_token
        for task, context, answer in rows
    ]
    return {"text": rendered}

# Convert to datasets.
# The evaluation cells score the first 30 samples of `ner_data` and
# `ttp_formatted`, both before and after fine-tuning. Those samples are held
# out of the training sets here; otherwise the post-training metrics would be
# measured on examples the adapter was trained on.
# NOTE(review): the TTP file name says "augmented"; if augmented variants of
# one source sample can land on both sides of this split, some leakage
# remains - confirm against the dataset.
EVAL_HOLDOUT_SIZE = 30

if min(len(ner_data), len(ttp_formatted)) <= EVAL_HOLDOUT_SIZE:
    raise ValueError(
        f"Need more than {EVAL_HOLDOUT_SIZE} samples per task to hold out an "
        f"evaluation set (NER={len(ner_data)}, TTP={len(ttp_formatted)})"
    )

ner_dataset = Dataset.from_list(ner_data[EVAL_HOLDOUT_SIZE:])
ttp_dataset_obj = Dataset.from_list(ttp_formatted[EVAL_HOLDOUT_SIZE:])

print(f"NER dataset: {ner_dataset}")
print(f"TTP dataset: {ttp_dataset_obj}")


## Load Qwen3 1.7B Model (Main Training Model)


In [None]:
# Load Qwen3 1.7B model for training
# This is the model that later receives the LoRA adapters; it is also
# evaluated zero-shot before any training.
model_1_7b, tokenizer_1_7b = FastLanguageModel.from_pretrained(
    model_name="unsloth/Qwen3-1.7B-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

print("Qwen3 1.7B model loaded successfully")


## Load Qwen3 4B Model (Baseline Comparison)


In [None]:
# Load Qwen3 4B model for baseline comparison
# This model is only evaluated zero-shot; no adapter is attached to it.
# It is loaded with the same settings as the 1.7B model.
model_4b, tokenizer_4b = FastLanguageModel.from_pretrained(
    model_name="unsloth/Qwen3-4B-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

print("Qwen3 4B model loaded successfully")


## Evaluation Functions


In [None]:
def evaluate_model(model, tokenizer, test_data, task_type="ner", num_samples=50):
    """Generate greedy predictions for the first *num_samples* records.

    Args:
        model: Model to evaluate; it is switched to Unsloth inference mode.
        tokenizer: Tokenizer paired with *model*.
        test_data: Sliceable sequence of dicts with 'instruction', 'output'
            and optionally 'input'.
        task_type: "ner" allows 256 new tokens; any other value allows 512.
        num_samples: Upper bound on the number of records evaluated.

    Returns:
        ``(predictions, ground_truth)``: two equally long lists of strings.
    """
    FastLanguageModel.for_inference(model)

    samples = test_data[:num_samples]
    total = len(samples)
    max_new_tokens = 256 if task_type == "ner" else 512
    # Headers of the Alpaca template. If the model keeps writing after its
    # answer, the continuation starts with one of these.
    section_markers = ("### Instruction:", "### Input:", "### Response:")

    predictions = []
    ground_truth = []

    for i, sample in enumerate(samples):
        instruction = sample['instruction']
        input_text = sample.get('input', "") or ""
        expected_output = sample['output']

        # Generate prediction
        inputs = tokenizer(
            [alpaca_prompt.format(instruction, input_text, "")],
            return_tensors="pt"
        ).to("cuda")
        prompt_length = inputs["input_ids"].shape[1]

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=False,
                pad_token_id=tokenizer.eos_token_id
            )

        # Decode only the newly generated tokens. Splitting the full text on
        # the last "### Response:" returns the wrong segment when the input
        # contains that marker or the model emits further template sections.
        prediction = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
        for marker in section_markers:
            prediction = prediction.split(marker)[0]
        prediction = prediction.strip()

        predictions.append(prediction)
        ground_truth.append(expected_output)

        # Report the number of samples actually completed.
        done = i + 1
        if done % 10 == 0 or done == total:
            print(f"Processed {done}/{total} samples")

    return predictions, ground_truth

# Matches "(entity, type)" pairs: everything up to the first comma is the
# entity, everything up to the next ")" is the type.
_NER_ENTITY_PATTERN = re.compile(r'\(([^,]+),\s*([^)]+)\)')


def extract_entities_from_ner(text):
    """Extract ``(entity, type)`` tuples from NER output text.

    Args:
        text: Model output or reference string containing pairs written as
            ``(entity, type)``.

    Returns:
        List of ``(entity, type)`` tuples with surrounding whitespace
        stripped, in order of appearance. Non-string input yields ``[]``.
    """
    # An explicit guard replaces the former bare ``except: pass``, whose only
    # reachable failure was a non-string argument.
    if not isinstance(text, str):
        return []
    return [
        (entity.strip(), label.strip())
        for entity, label in _NER_ENTITY_PATTERN.findall(text)
    ]

# Technique IDs such as "T1059" or sub-technique IDs such as "T1566.001".
_TTP_ID_PATTERN = re.compile(r'T\d{4}(?:\.\d{3})?')


def extract_techniques_from_ttp(text):
    """Extract technique IDs from TTP output text.

    The text is first parsed as JSON of the form
    ``{"techniques": [{"id": ...}, ...]}``. If it is not valid JSON, has a
    different shape, or any entry lacks a non-empty string ``id``, the IDs
    are instead collected from the raw text with a regular expression.

    Args:
        text: Model output or reference string.

    Returns:
        List of technique ID strings (may contain duplicates). Non-string
        input yields ``[]``.
    """
    if not isinstance(text, str):
        return []

    try:
        data = json.loads(text)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        data = None

    entries = data.get('techniques') if isinstance(data, dict) else None
    if isinstance(entries, list):
        ids = [
            entry['id'].strip()
            for entry in entries
            if isinstance(entry, dict)
            and isinstance(entry.get('id'), str)
            and entry['id'].strip()
        ]
        # Trust the structured result only when every entry was well formed.
        # This keeps '' (a missing id) out of the result, where it would
        # otherwise count as a technique in the metrics.
        if len(ids) == len(entries):
            return ids

    # Fallback: extract T-numbers using regex
    return _TTP_ID_PATTERN.findall(text)

def calculate_ner_metrics(predictions, ground_truth):
    """Calculate micro-averaged NER metrics over paired outputs.

    Entities are compared as sets of ``(entity, type)`` tuples per sample, and
    the counts are summed over all samples before precision/recall are taken.

    Args:
        predictions: Predicted output strings.
        ground_truth: Reference output strings, paired by position with
            *predictions* (extra items on either side are ignored).

    Returns:
        Dict with 'precision', 'recall', 'f1_score', 'exact_match' and
        'total_samples'. Every ratio is 0 when its denominator is 0,
        including the case of an empty *predictions*.
    """
    total_pred_entities = 0
    total_true_entities = 0
    correct_entities = 0
    exact_matches = 0

    for pred, gt in zip(predictions, ground_truth):
        pred_entities = set(extract_entities_from_ner(pred))
        true_entities = set(extract_entities_from_ner(gt))

        total_pred_entities += len(pred_entities)
        total_true_entities += len(true_entities)
        correct_entities += len(pred_entities & true_entities)

        # Exact sequence match
        if pred.strip() == gt.strip():
            exact_matches += 1

    # Calculate precision, recall, F1
    precision = correct_entities / total_pred_entities if total_pred_entities > 0 else 0
    recall = correct_entities / total_true_entities if total_true_entities > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    # Guard against an empty evaluation set instead of raising ZeroDivisionError.
    total_samples = len(predictions)
    exact_match = exact_matches / total_samples if total_samples > 0 else 0

    return {
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'exact_match': exact_match,
        'total_samples': total_samples
    }

def calculate_ttp_metrics(predictions, ground_truth):
    """Calculate micro-averaged TTP classification metrics over paired outputs.

    Technique IDs are compared as sets per sample, and the counts are summed
    over all samples before precision/recall are taken.

    Args:
        predictions: Predicted output strings.
        ground_truth: Reference output strings, paired by position with
            *predictions* (extra items on either side are ignored).

    Returns:
        Dict with 'precision', 'recall', 'f1_score', 'exact_match',
        'multi_label_accuracy' (share of samples whose predicted ID set equals
        the reference set) and 'total_samples'. Every ratio is 0 when its
        denominator is 0, including the case of an empty *predictions*.
    """
    total_pred_techniques = 0
    total_true_techniques = 0
    correct_techniques = 0
    exact_matches = 0
    full_set_matches = 0

    # Single pass: each output is parsed once and feeds every metric.
    for pred, gt in zip(predictions, ground_truth):
        pred_techniques = set(extract_techniques_from_ttp(pred))
        true_techniques = set(extract_techniques_from_ttp(gt))

        total_pred_techniques += len(pred_techniques)
        total_true_techniques += len(true_techniques)
        correct_techniques += len(pred_techniques & true_techniques)

        # Exact sequence match
        if pred.strip() == gt.strip():
            exact_matches += 1

        # Multi-label accuracy (all techniques must match)
        if pred_techniques == true_techniques:
            full_set_matches += 1

    # Calculate precision, recall, F1
    precision = correct_techniques / total_pred_techniques if total_pred_techniques > 0 else 0
    recall = correct_techniques / total_true_techniques if total_true_techniques > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    # Guard against an empty evaluation set instead of raising ZeroDivisionError.
    total_samples = len(predictions)
    exact_match = exact_matches / total_samples if total_samples > 0 else 0
    multi_label_accuracy = full_set_matches / total_samples if total_samples > 0 else 0

    return {
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'exact_match': exact_match,
        'multi_label_accuracy': multi_label_accuracy,
        'total_samples': total_samples
    }


## Zero-Shot Evaluation


In [None]:
print("=== ZERO-SHOT EVALUATION ===")

# Evaluate both models on both tasks (zero-shot)
# Each call scores the first 30 records of the given list; these numbers are
# the "before fine-tuning" baselines used in the final comparison.
# NOTE(review): confirm these 30 records are excluded from the training sets,
# otherwise the later before/after comparison is biased.
print("\n1. NER Task:")
pred_1_7b_ner_zero, gt_1_7b_ner_zero = evaluate_model(model_1_7b, tokenizer_1_7b, ner_data, "ner", 30)
metrics_1_7b_ner_zero = calculate_ner_metrics(pred_1_7b_ner_zero, gt_1_7b_ner_zero)
print(f"1.7B NER Zero-shot: P={metrics_1_7b_ner_zero['precision']:.3f}, R={metrics_1_7b_ner_zero['recall']:.3f}, F1={metrics_1_7b_ner_zero['f1_score']:.3f}")

pred_4b_ner_zero, gt_4b_ner_zero = evaluate_model(model_4b, tokenizer_4b, ner_data, "ner", 30)
metrics_4b_ner_zero = calculate_ner_metrics(pred_4b_ner_zero, gt_4b_ner_zero)
print(f"4B NER Zero-shot: P={metrics_4b_ner_zero['precision']:.3f}, R={metrics_4b_ner_zero['recall']:.3f}, F1={metrics_4b_ner_zero['f1_score']:.3f}")

# The TTP runs use the simplified records (technique id + name only).
print("\n2. TTP Task:")
pred_1_7b_ttp_zero, gt_1_7b_ttp_zero = evaluate_model(model_1_7b, tokenizer_1_7b, ttp_formatted, "ttp", 30)
metrics_1_7b_ttp_zero = calculate_ttp_metrics(pred_1_7b_ttp_zero, gt_1_7b_ttp_zero)
print(f"1.7B TTP Zero-shot: P={metrics_1_7b_ttp_zero['precision']:.3f}, R={metrics_1_7b_ttp_zero['recall']:.3f}, F1={metrics_1_7b_ttp_zero['f1_score']:.3f}, MLA={metrics_1_7b_ttp_zero['multi_label_accuracy']:.3f}")

pred_4b_ttp_zero, gt_4b_ttp_zero = evaluate_model(model_4b, tokenizer_4b, ttp_formatted, "ttp", 30)
metrics_4b_ttp_zero = calculate_ttp_metrics(pred_4b_ttp_zero, gt_4b_ttp_zero)
print(f"4B TTP Zero-shot: P={metrics_4b_ttp_zero['precision']:.3f}, R={metrics_4b_ttp_zero['recall']:.3f}, F1={metrics_4b_ttp_zero['f1_score']:.3f}, MLA={metrics_4b_ttp_zero['multi_label_accuracy']:.3f}")


## Setup QLoRA for Sequential Training


In [None]:
# Add LoRA adapters to 1.7B model
# model_1_7b is rebound to the adapter-wrapped model; both training stages
# below update this single adapter.
model_1_7b = FastLanguageModel.get_peft_model(
    model_1_7b,
    r=16,  # LoRA rank
    # Attention projections plus the MLP projections.
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,  # same value as the trainers' seed
    use_rslora=False,
    loftq_config=None,
)

print("LoRA adapters added to Qwen3 1.7B model")


## Step 1: NER Fine-tuning


In [None]:
# Format NER dataset and train
# Stage 1 of the sequential schedule: supervised fine-tuning on NER.
# map() adds a "text" column holding the rendered Alpaca prompt + EOS token.
ner_dataset_formatted = ner_dataset.map(
    lambda examples: formatting_prompts_func(examples, tokenizer_1_7b),
    batched=True
)

# NOTE(review): evaluate_model() called FastLanguageModel.for_inference() on
# this model during zero-shot evaluation; confirm it is back in training mode
# before train() (e.g. FastLanguageModel.for_training) - TODO verify against
# the installed Unsloth version.
trainer_ner = SFTTrainer(
    model=model_1_7b,
    tokenizer=tokenizer_1_7b,
    train_dataset=ner_dataset_formatted,
    dataset_text_field="text",  # column produced by formatting_prompts_func
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,
    args=SFTConfig(
        # 2 sequences x 4 accumulation steps = 8 sequences per optimizer step.
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        warmup_steps=5,
        max_steps=100,  # training length is fixed in steps, not epochs
        learning_rate=2e-4,
        logging_steps=10,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs_ner",
        report_to="none",
    ),
)

print("Starting NER training...")
trainer_stats_ner = trainer_ner.train()
# The runtime is read again by the final training-summary cell.
print(f"NER training completed in {trainer_stats_ner.metrics['train_runtime']:.1f} seconds")


## Step 2: Sequential TTP Fine-tuning


In [None]:
# Format TTP dataset and continue training on same adapter
# Stage 2 of the sequential schedule: the adapter trained on NER is trained
# further on TTP classification. The same prompt template is used.
ttp_dataset_formatted = ttp_dataset_obj.map(
    lambda examples: formatting_prompts_func(examples, tokenizer_1_7b),
    batched=True
)

# A new trainer is built here, so this stage has its own step count, warmup
# and learning-rate schedule, separate from the NER stage.
trainer_ttp = SFTTrainer(
    model=model_1_7b,  # Same model with NER adapter
    tokenizer=tokenizer_1_7b,
    train_dataset=ttp_dataset_formatted,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,
    args=SFTConfig(
        # 2 sequences x 4 accumulation steps = 8 sequences per optimizer step.
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        warmup_steps=5,
        max_steps=150,
        learning_rate=1e-4,  # Lower LR for sequential training
        logging_steps=10,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs_ttp",
        report_to="none",
    ),
)

print("Starting TTP training (sequential after NER)...")
trainer_stats_ttp = trainer_ttp.train()
# The runtime is read again by the final training-summary cell.
print(f"TTP training completed in {trainer_stats_ttp.metrics['train_runtime']:.1f} seconds")


## Save Sequential Adapter


In [None]:
# Save final sequential adapter (NER → TTP)
# Only the state after both training stages is saved; no checkpoint of the
# NER-only stage is written here.
final_adapter_path = f"{BASE_PATH}/models/qwen3_1_7b_ner_ttp_sequential_adapter"
model_1_7b.save_pretrained(final_adapter_path)
# The tokenizer is stored in the same directory as the adapter.
tokenizer_1_7b.save_pretrained(final_adapter_path)

print(f"Sequential adapter saved to: {final_adapter_path}")


## Post-Training Evaluation


In [None]:
print("=== POST-TRAINING EVALUATION ===")

# Evaluate fine-tuned 1.7B model on both tasks
# Same records (first 30 of each list) and same metric functions as the
# zero-shot run, so the numbers are directly comparable.
# NER is scored after the TTP stage, so it reflects the final adapter.
print("\n1. NER Task (After Sequential Training):")
pred_1_7b_ner_ft, gt_1_7b_ner_ft = evaluate_model(model_1_7b, tokenizer_1_7b, ner_data, "ner", 30)
metrics_1_7b_ner_ft = calculate_ner_metrics(pred_1_7b_ner_ft, gt_1_7b_ner_ft)
print(f"1.7B NER After Training: P={metrics_1_7b_ner_ft['precision']:.3f}, R={metrics_1_7b_ner_ft['recall']:.3f}, F1={metrics_1_7b_ner_ft['f1_score']:.3f}")

print("\n2. TTP Task (After Sequential Training):")
pred_1_7b_ttp_ft, gt_1_7b_ttp_ft = evaluate_model(model_1_7b, tokenizer_1_7b, ttp_formatted, "ttp", 30)
metrics_1_7b_ttp_ft = calculate_ttp_metrics(pred_1_7b_ttp_ft, gt_1_7b_ttp_ft)
print(f"1.7B TTP After Training: P={metrics_1_7b_ttp_ft['precision']:.3f}, R={metrics_1_7b_ttp_ft['recall']:.3f}, F1={metrics_1_7b_ttp_ft['f1_score']:.3f}, MLA={metrics_1_7b_ttp_ft['multi_label_accuracy']:.3f}")


## Results Comparison and Visualization


In [None]:
# Compile and display results
# One row per evaluated model: (label, NER metrics, TTP metrics).
metric_runs = [
    ('Qwen3 1.7B (Zero-shot)', metrics_1_7b_ner_zero, metrics_1_7b_ttp_zero),
    ('Qwen3 1.7B (After Sequential FT)', metrics_1_7b_ner_ft, metrics_1_7b_ttp_ft),
    ('Qwen3 4B (Zero-shot)', metrics_4b_ner_zero, metrics_4b_ttp_zero),
]

# (output column, metric key) pairs, listed in the final column order.
ner_columns = [
    ('NER_Precision', 'precision'),
    ('NER_Recall', 'recall'),
    ('NER_F1', 'f1_score'),
]
ttp_columns = [
    ('TTP_Precision', 'precision'),
    ('TTP_Recall', 'recall'),
    ('TTP_F1', 'f1_score'),
    ('TTP_MultiLabel_Acc', 'multi_label_accuracy'),
]

results_summary = {'Model': [run_label for run_label, _, _ in metric_runs]}
for column_name, metric_key in ner_columns:
    results_summary[column_name] = [ner_metrics[metric_key] for _, ner_metrics, _ in metric_runs]
for column_name, metric_key in ttp_columns:
    results_summary[column_name] = [ttp_metrics[metric_key] for _, _, ttp_metrics in metric_runs]

results_df = pd.DataFrame(results_summary)
print("\n=== FINAL RESULTS SUMMARY ===")
print(results_df.to_string(index=False))

# Performance visualization: a 2x3 grid, NER metrics on the top row and TTP
# metrics on the bottom row, one bar per model.
fig, axes = plt.subplots(2, 3, figsize=(18, 10))
fig.suptitle('Qwen3 Sequential QLoRA Fine-tuning Results', fontsize=16)

# Plot comparisons
colors = ['red', 'blue', 'green']

# Shorten the model names so they fit under the bars.
model_labels = []
for full_label in results_df['Model']:
    short_label = full_label.replace('Qwen3 ', '')
    short_label = short_label.replace(' (Zero-shot)', ' (Z)')
    short_label = short_label.replace(' (After Sequential FT)', ' (FT)')
    model_labels.append(short_label)

# (results column, panel title) in row-major panel order.
panels = [
    ('NER_Precision', 'NER Precision'),
    ('NER_Recall', 'NER Recall'),
    ('NER_F1', 'NER F1-Score'),
    ('TTP_Precision', 'TTP Precision'),
    ('TTP_Recall', 'TTP Recall'),
    ('TTP_F1', 'TTP F1-Score'),
]
for panel_ax, (column_name, panel_title) in zip(axes.flat, panels):
    panel_ax.bar(model_labels, results_df[column_name], color=colors)
    panel_ax.set_title(panel_title)
    panel_ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

# Save results
import os

timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
results_dir = f"{BASE_PATH}/results"
# Create the target directory first: DataFrame.to_csv does not create missing
# parent directories, and failing here would lose the results of the full run.
os.makedirs(results_dir, exist_ok=True)
results_path = f"{results_dir}/qwen3_sequential_results_{timestamp}.csv"
results_df.to_csv(results_path, index=False)
print(f"\nResults saved to: {results_path}")


## Training Summary and Key Insights


In [None]:
# Report training time, the metric deltas, and a conclusion derived from the
# measured deltas rather than asserted unconditionally.
ner_runtime = trainer_stats_ner.metrics['train_runtime']
ttp_runtime = trainer_stats_ttp.metrics['train_runtime']

print("=== TRAINING SUMMARY ===")
print(f"NER Training Time: {ner_runtime:.1f} seconds")
print(f"TTP Training Time: {ttp_runtime:.1f} seconds")
print(f"Total Training Time: {ner_runtime + ttp_runtime:.1f} seconds")

# All deltas are "left minus right" as named in the printed label, except
# items 3 and 4, which are 4B minus 1.7B (positive means the 4B model is ahead).
ner_f1_gain = metrics_1_7b_ner_ft['f1_score'] - metrics_1_7b_ner_zero['f1_score']
ttp_f1_gain = metrics_1_7b_ttp_ft['f1_score'] - metrics_1_7b_ttp_zero['f1_score']
zero_shot_gap_ner = metrics_4b_ner_zero['f1_score'] - metrics_1_7b_ner_zero['f1_score']
zero_shot_gap_ttp = metrics_4b_ttp_zero['f1_score'] - metrics_1_7b_ttp_zero['f1_score']
ft_vs_4b_ner = metrics_1_7b_ner_ft['f1_score'] - metrics_4b_ner_zero['f1_score']
ft_vs_4b_ttp = metrics_1_7b_ttp_ft['f1_score'] - metrics_4b_ttp_zero['f1_score']
mla_gain = metrics_1_7b_ttp_ft['multi_label_accuracy'] - metrics_1_7b_ttp_zero['multi_label_accuracy']

print("\n=== KEY INSIGHTS ===")
print(f"1. NER F1 Improvement (1.7B): {ner_f1_gain:.3f}")
print(f"2. TTP F1 Improvement (1.7B): {ttp_f1_gain:.3f}")
print(f"3. 1.7B vs 4B NER F1 (Zero-shot): {zero_shot_gap_ner:.3f}")
print(f"4. 1.7B vs 4B TTP F1 (Zero-shot): {zero_shot_gap_ttp:.3f}")
print(f"5. 1.7B Fine-tuned vs 4B Zero-shot (NER F1): {ft_vs_4b_ner:.3f}")
print(f"6. 1.7B Fine-tuned vs 4B Zero-shot (TTP F1): {ft_vs_4b_ttp:.3f}")
print(f"7. TTP Multi-label Accuracy Improvement: {mla_gain:.3f}")


def _describe_change(delta):
    """Return a plain-language verb for the sign of a metric delta."""
    if delta > 0:
        return "improved"
    if delta < 0:
        return "declined"
    return "did not change"


print("\n=== CONCLUSION ===")
print(
    f"After sequential QLoRA fine-tuning, 1.7B NER F1 {_describe_change(ner_f1_gain)} "
    f"and TTP F1 {_describe_change(ttp_f1_gain)} relative to zero-shot."
)

# Number of tasks on which the fine-tuned 1.7B model is at least level with
# the zero-shot 4B model.
tasks_level_with_4b = sum(delta >= 0 for delta in (ft_vs_4b_ner, ft_vs_4b_ttp))
if tasks_level_with_4b == 2:
    print("The fine-tuned 1.7B model matches or exceeds the zero-shot 4B model on both tasks.")
elif tasks_level_with_4b == 1:
    print("The fine-tuned 1.7B model matches or exceeds the zero-shot 4B model on one of the two tasks.")
else:
    print("The zero-shot 4B model still scores a higher F1 than the fine-tuned 1.7B model on both tasks.")

print(
    f"Scores are based on {metrics_1_7b_ner_ft['total_samples']} NER and "
    f"{metrics_1_7b_ttp_ft['total_samples']} TTP samples; treat small differences with caution."
)
