In [1]:
import json
import os
import torch
import numpy as np
import pandas as pd
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Any, Tuple
import logging
import re
from tqdm import tqdm

# Transformers and training
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

print("📚 Dependencies loaded successfully!")
print(f"🔥 PyTorch version: {torch.__version__}")
print(f"🤖 CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"🎮 GPU: {torch.cuda.get_device_name(0)}")
    print(f"💾 GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")


📚 Dependencies loaded successfully!
🔥 PyTorch version: 2.8.0
🤖 CUDA available: False


In [2]:
# Load merged MITRE ATT&CK dataset
dataset_path = "../data/TTP-classification/merged_mitre_attack_dataset.json"

print("📂 Loading MITRE ATT&CK dataset...")
with open(dataset_path, 'r', encoding='utf-8') as f:
    mitre_data = json.load(f)

# Extract training samples
training_samples = mitre_data['dataset']
print(f"✅ Loaded {len(training_samples):,} training samples")

# Analyze dataset structure
print("\n📊 Dataset Analysis:")
matrices = {'enterprise': 0, 'mobile': 0, 'ics': 0}
techniques_count = 0
sub_techniques_count = 0
technique_ids = set()

for sample in training_samples:
    technique = sample['output']['techniques'][0]
    matrix = technique['matrix']
    technique_id = technique['id']

    matrices[matrix] += 1
    technique_ids.add(technique_id)

    if '.' in technique_id:
        sub_techniques_count += 1
    else:
        techniques_count += 1

print(f"  • Enterprise: {matrices['enterprise']:,} samples")
print(f"  • Mobile: {matrices['mobile']:,} samples")
print(f"  • ICS: {matrices['ics']:,} samples")
print(f"  • Techniques: {techniques_count:,} samples")
print(f"  • Sub-techniques: {sub_techniques_count:,} samples")
print(f"  • Unique technique IDs: {len(technique_ids):,}")

# Show sample data
print("\n📝 Sample Training Example:")
sample = training_samples[0]
print(f"Instruction: {sample['instruction'][:100]}...")
print(f"Technique ID: {sample['output']['techniques'][0]['id']}")
print(f"Technique Name: {sample['output']['techniques'][0]['name']}")
print(f"Matrix: {sample['output']['techniques'][0]['matrix']}")


📂 Loading MITRE ATT&CK dataset...
✅ Loaded 921 training samples

📊 Dataset Analysis:
  • Enterprise: 691 samples
  • Mobile: 135 samples
  • ICS: 95 samples
  • Techniques: 406 samples
  • Sub-techniques: 515 samples
  • Unique technique IDs: 921

📝 Sample Training Example:
Instruction: Adversaries may inject malicious code into process via Extra Window Memory (EWM) in order to evade p...
Technique ID: T1055.011
Technique Name: Extra Window Memory Injection
Matrix: enterprise


In [3]:
def format_training_sample(sample: Dict[str, Any]) -> Dict[str, str]:
    """
    Format training sample for instruction tuning

    Input format:
    {
        "instruction": "Adversary behavior description...",
        "output": {
            "techniques": [{
                "id": "T1055.011",
                "name": "Extra Window Memory Injection",
                "description": "...",
                "matrix": "enterprise"
            }]
        }
    }

    Output format for instruction tuning:
    {
        "text": "<|im_start|>system\nYou are a cybersecurity expert...\n<|im_end|>\n<|im_start|>user\n...\n<|im_end|>\n<|im_start|>assistant\n...\n<|im_end|>"
    }
    """
    instruction = sample['instruction']
    technique = sample['output']['techniques'][0]

    # Create system prompt
    system_prompt = """You are a cybersecurity expert specializing in MITRE ATT&CK framework. Your task is to analyze threat intelligence descriptions and identify the corresponding MITRE ATT&CK techniques.

Given a description of adversary behavior, identify the most relevant MITRE ATT&CK technique and provide:
1. Technique ID (e.g., T1055.011)
2. Technique Name
3. Matrix (enterprise/mobile/ics)

Respond in JSON format."""

    # Create user input
    user_input = f"Analyze this threat behavior and identify the MITRE ATT&CK technique:\n\n{instruction}"

    # Create assistant response
    assistant_response = json.dumps({
        "technique_id": technique['id'],
        "technique_name": technique['name'],
        "matrix": technique['matrix'],
        "description": technique['description']
    }, ensure_ascii=False)

    # Format for Qwen chat template
    formatted_text = f"""<|im_start|>system
{system_prompt}<|im_end|>
<|im_start|>user
{user_input}<|im_end|>
<|im_start|>assistant
{assistant_response}<|im_end|>"""

    return {"text": formatted_text}

# Format all training samples
print("🔄 Formatting training data...")
formatted_samples = []
for sample in tqdm(training_samples):
    formatted_sample = format_training_sample(sample)
    formatted_samples.append(formatted_sample)

print(f"✅ Formatted {len(formatted_samples):,} training samples")

# Show formatted example
print("\n📝 Formatted Training Example:")
print(formatted_samples[0]['text'][:500] + "...")


🔄 Formatting training data...


100%|██████████| 921/921 [00:00<00:00, 194225.65it/s]

✅ Formatted 921 training samples

📝 Formatted Training Example:
<|im_start|>system
You are a cybersecurity expert specializing in MITRE ATT&CK framework. Your task is to analyze threat intelligence descriptions and identify the corresponding MITRE ATT&CK techniques.

Given a description of adversary behavior, identify the most relevant MITRE ATT&CK technique and provide:
1. Technique ID (e.g., T1055.011)
2. Technique Name
3. Matrix (enterprise/mobile/ics)

Respond in JSON format.<|im_end|>
<|im_start|>user
Analyze this threat behavior and identify the MITRE ...





In [4]:
# Model configuration
MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"
MAX_LENGTH = 2048
BATCH_SIZE = 4
LEARNING_RATE = 2e-5
NUM_EPOCHS = 3
WARMUP_STEPS = 100

print(f"🤖 Setting up model: {MODEL_NAME}")

# Setup device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"🔧 Using device: {device}")

# Load tokenizer
print("📝 Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
    padding_side="right"
)

# Add padding token if not exists
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    print("🔧 Set pad_token = eos_token")

print(f"✅ Tokenizer loaded")
print(f"📊 Vocab size: {len(tokenizer):,}")
print(f"🔑 Special tokens: {tokenizer.special_tokens_map}")

# Load model
print("🤖 Loading model...")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
    torch_dtype=torch.float16,
    device_map="auto",
    low_cpu_mem_usage=True
)

print(f"✅ Model loaded")
print(f"📊 Model parameters: {sum(p.numel() for p in model.parameters()):,}")
print(f"🎯 Trainable parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}")


🤖 Setting up model: Qwen/Qwen2.5-1.5B-Instruct
🔧 Using device: cpu
📝 Loading tokenizer...
✅ Tokenizer loaded
📊 Vocab size: 151,665
🔑 Special tokens: {'eos_token': '<|im_end|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>', '<|object_ref_start|>', '<|object_ref_end|>', '<|box_start|>', '<|box_end|>', '<|quad_start|>', '<|quad_end|>', '<|vision_start|>', '<|vision_end|>', '<|vision_pad|>', '<|image_pad|>', '<|video_pad|>']}
🤖 Loading model...
✅ Model loaded
📊 Model parameters: 1,543,714,304
🎯 Trainable parameters: 1,543,714,304


In [5]:
def tokenize_function(examples):
    """
    Tokenize training examples
    """
    # Tokenize the text
    tokenized = tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",  # SỬA LỖI: Pad to max length
        max_length=MAX_LENGTH,
        return_tensors=None
    )

    # For causal language modeling, labels are the same as input_ids
    tokenized["labels"] = tokenized["input_ids"].copy()

    return tokenized

# Create train/validation split
print("📊 Creating train/validation split...")
train_samples, val_samples = train_test_split(
    formatted_samples,
    test_size=0.1,
    random_state=42,
    stratify=[sample['text'].split('"matrix": "')[1].split('"')[0] for sample in formatted_samples]
)

print(f"📚 Training samples: {len(train_samples):,}")
print(f"🔍 Validation samples: {len(val_samples):,}")

# Create HuggingFace datasets
print("🔄 Creating datasets...")
train_dataset = Dataset.from_list(train_samples)
val_dataset = Dataset.from_list(val_samples)

# Tokenize datasets
print("🔤 Tokenizing datasets...")
train_dataset = train_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"],
    desc="Tokenizing train data"
)

val_dataset = val_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"],
    desc="Tokenizing validation data"
)

print("✅ Datasets prepared")
print(f"📊 Train dataset: {len(train_dataset):,} samples")
print(f"📊 Validation dataset: {len(val_dataset):,} samples")

# Check tokenization
sample_tokens = train_dataset[0]
print(f"\n📝 Sample tokenization:")
print(f"Input IDs length: {len(sample_tokens['input_ids'])}")
print(f"Attention mask length: {len(sample_tokens['attention_mask'])}")
print(f"Labels length: {len(sample_tokens['labels'])}")


📊 Creating train/validation split...
📚 Training samples: 828
🔍 Validation samples: 93
🔄 Creating datasets...
🔤 Tokenizing datasets...


Tokenizing train data:   0%|          | 0/828 [00:00<?, ? examples/s]

Tokenizing validation data:   0%|          | 0/93 [00:00<?, ? examples/s]

✅ Datasets prepared
📊 Train dataset: 828 samples
📊 Validation dataset: 93 samples

📝 Sample tokenization:
Input IDs length: 2048
Attention mask length: 2048
Labels length: 2048


In [6]:
# Create output directory
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
output_dir = f"../models/qwen-ttp-classification-{timestamp}"
os.makedirs(output_dir, exist_ok=True)

print(f"📁 Output directory: {output_dir}")

# Training arguments
training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,

    # Training parameters
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=1,
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    warmup_steps=WARMUP_STEPS,

    # Optimization
    fp16=True,
    dataloader_pin_memory=False,
    gradient_checkpointing=True,

    # Logging and saving
    logging_dir=f"{output_dir}/logs",
    logging_steps=10,
    save_steps=100,
    save_total_limit=3,
    eval_steps=100,
    eval_strategy="steps",  # FIXED: was evaluation_strategy

    # Other settings
    remove_unused_columns=False,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,

    # Report
    report_to=None,  # Disable wandb/tensorboard
    run_name=f"qwen-ttp-classification-{timestamp}"
)

print("✅ Training arguments configured")
print(f"🎯 Batch size: {BATCH_SIZE}")
print(f"📈 Learning rate: {LEARNING_RATE}")
print(f"🔄 Epochs: {NUM_EPOCHS}")
print(f"🔥 FP16: {training_args.fp16}")
print(f"💾 Gradient checkpointing: {training_args.gradient_checkpointing}")

# Data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # No masking for causal LM
)

print("✅ Data collator configured")


📁 Output directory: ../models/qwen-ttp-classification-2025-08-08_14-38-31
✅ Training arguments configured
🎯 Batch size: 4
📈 Learning rate: 2e-05
🔄 Epochs: 3
🔥 FP16: True
💾 Gradient checkpointing: True
✅ Data collator configured


In [7]:
# ✅ CODE ĐÃ SỬA HOÀN CHỈNH CHO MPS (APPLE SILICON)
print("🔧 Initializing MPS-compatible training...")

import torch
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling
from datetime import datetime
import os

# Clear memory
if torch.backends.mps.is_available():
    torch.mps.empty_cache()
    print("🧹 MPS cache cleared")

# Configuration
MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"
BATCH_SIZE = 2  # Smaller for MPS
LEARNING_RATE = 2e-5
NUM_EPOCHS = 3
WARMUP_STEPS = 100

# Create output directory
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
output_dir = f"../models/qwen-ttp-mps-{timestamp}"
os.makedirs(output_dir, exist_ok=True)
print(f"📁 Output: {output_dir}")

# ✅ MPS-COMPATIBLE TrainingArguments (SỬA LỖI)
training_args_mps = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,

    # Training params
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=2,  # Tăng để bù đắp batch nhỏ
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    warmup_steps=WARMUP_STEPS,

    # ⚠️ QUAN TRỌNG: MPS settings
    fp16=False,              # PHẢI False cho MPS
    bf16=False,              # PHẢI False cho MPS
    dataloader_pin_memory=False,
    gradient_checkpointing=True,

    # Logging
    logging_steps=10,
    save_steps=100,
    eval_steps=100,
    eval_strategy="steps",   # SỬA: không phải evaluation_strategy
    save_total_limit=3,

    # Evaluation
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,

    # Disable external services
    report_to=None,
    dataloader_num_workers=0,  # Single thread cho MPS
)

# ✅ Data collator (SỬA LỖI)
data_collator_mps = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

print("✅ Configuration created!")
print(f"🔥 FP16: {training_args_mps.fp16} (phải False)")
print(f"🍎 BF16: {training_args_mps.bf16} (phải False)")

# ✅ Initialize Trainer (SỬA LỖI - KHÔNG có accelerator argument)
print("\n🏃‍♂️ Creating trainer...")

os.environ["ACCELERATE_MIXED_PRECISION"] = "no"
os.environ["ACCELERATE_DISABLE_RICH"] = "1"

# Khởi tạo trainer
trainer = Trainer(
    model=model,
    args=training_args_mps,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator_mps,
)

print("✅ Trainer initialized successfully!")
print("🍎 Ready for MPS training!")
print(f"📊 Batch size: {BATCH_SIZE}")
print(f"🔄 Effective batch: {BATCH_SIZE * 2} (với gradient accumulation)")
print(f"📈 Learning rate: {LEARNING_RATE}")

# ✅ Bây giờ có thể bắt đầu training
print("\n🚀 Ready to train! Run: trainer.train()")

🔧 Initializing MPS-compatible training...
🧹 MPS cache cleared
📁 Output: ../models/qwen-ttp-mps-2025-08-08_14-38-31
✅ Configuration created!
🔥 FP16: False (phải False)
🍎 BF16: False (phải False)

🏃‍♂️ Creating trainer...


  trainer = Trainer(
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


✅ Trainer initialized successfully!
🍎 Ready for MPS training!
📊 Batch size: 2
🔄 Effective batch: 4 (với gradient accumulation)
📈 Learning rate: 2e-05

🚀 Ready to train! Run: trainer.train()


In [8]:
# Evaluate the model
print("📊 Evaluating model...")
eval_result = trainer.evaluate()

print("\n📈 Evaluation Results:")
for key, value in eval_result.items():
    print(f"  {key}: {value:.4f}")

# Save evaluation results
eval_file = os.path.join(output_dir, "evaluation_results.json")
with open(eval_file, 'w') as f:
    json.dump(eval_result, f, indent=2)

print(f"✅ Evaluation results saved to: {eval_file}")


📊 Evaluating model...


KeyboardInterrupt: 

In [None]:
def test_ttp_classification(model, tokenizer, threat_description: str) -> Dict[str, Any]:
    """
    Test TTP classification on a threat description
    """
    system_prompt = """You are a cybersecurity expert specializing in MITRE ATT&CK framework. Your task is to analyze threat intelligence descriptions and identify the corresponding MITRE ATT&CK techniques.

Given a description of adversary behavior, identify the most relevant MITRE ATT&CK technique and provide:
1. Technique ID (e.g., T1055.011)
2. Technique Name
3. Matrix (enterprise/mobile/ics)

Respond in JSON format."""

    user_input = f"Analyze this threat behavior and identify the MITRE ATT&CK technique:\n\n{threat_description}"

    # Format input
    prompt = f"""<|im_start|>system
{system_prompt}<|im_end|>
<|im_start|>user
{user_input}<|im_end|>
<|im_start|>assistant
"""

    # Tokenize
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    # Generate response
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=256,
            temperature=0.1,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id
        )

    # Decode response
    full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    assistant_response = full_response.split("<|im_start|>assistant\n")[-1]

    return {
        "input": threat_description,
        "response": assistant_response,
        "full_prompt": prompt
    }

# Test examples
test_cases = [
    "Adversaries may inject malicious code into processes in order to evade process-based defenses or elevate privileges.",
    "Attackers send phishing emails with malicious attachments to gain initial access to the target system.",
    "The malware establishes persistence by creating scheduled tasks that execute at system startup.",
    "Adversaries may abuse elevation control mechanisms to gain higher-level permissions on a system."
]

print("🧪 Testing model inference...")
for i, test_case in enumerate(test_cases, 1):
    print(f"\n📝 Test Case {i}:")
    print(f"Input: {test_case}")

    result = test_ttp_classification(model, tokenizer, test_case)
    print(f"Output: {result['response']}")
    print("-" * 80)


In [None]:
# Create training summary
# This cell is now safe to run even if training/evaluation was skipped.

print("📄 Creating training summary...")

# Use locals() to check if variables were defined in the session
training_result_obj = locals().get('training_result')
eval_result_obj = locals().get('eval_result')

# Define all variables safely
safe_model_name = locals().get('MODEL_NAME', 'N/A')
safe_timestamp = locals().get('timestamp', datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
safe_output_dir = locals().get('output_dir', 'N/A')
safe_training_samples = locals().get('training_samples', [])
safe_train_dataset = locals().get('train_dataset', [])
safe_val_dataset = locals().get('val_dataset', [])
safe_technique_ids = locals().get('technique_ids', set())
safe_matrices = locals().get('matrices', {})
safe_batch_size = locals().get('BATCH_SIZE', 'N/A')
safe_lr = locals().get('LEARNING_RATE', 'N/A')
safe_epochs = locals().get('NUM_EPOCHS', 'N/A')
safe_max_length = locals().get('MAX_LENGTH', 'N/A')
safe_warmup_steps = locals().get('WARMUP_STEPS', 'N/A')


training_summary = {
    "model_name": safe_model_name,
    "training_timestamp": safe_timestamp,
    "output_directory": safe_output_dir,
    "dataset_info": {
        "total_samples": len(safe_training_samples),
        "train_samples": len(safe_train_dataset),
        "val_samples": len(safe_val_dataset),
        "unique_techniques": len(safe_technique_ids),
        "matrix_distribution": safe_matrices
    },
    "training_config": {
        "batch_size": safe_batch_size,
        "learning_rate": safe_lr,
        "num_epochs": safe_epochs,
        "max_length": safe_max_length,
        "warmup_steps": safe_warmup_steps
    },
    "training_results": {
        "final_loss": getattr(training_result_obj, 'training_loss', None),
        "training_time_seconds": getattr(training_result_obj, 'training_time', None)
    },
    "evaluation_results": eval_result_obj
}

# Save summary
summary_file = os.path.join(safe_output_dir, "training_summary.json") if safe_output_dir != "N/A" else "training_summary.json"
with open(summary_file, 'w', encoding='utf-8') as f:
    json.dump(training_summary, f, indent=2, ensure_ascii=False)

print(f"📄 Training summary saved to: {summary_file}")

# Display summary
print("\n" + "="*50)
print("📊 TRAINING SUMMARY 📊")
print("="*50)
print(f"  Model: {training_summary['model_name']}")
final_loss = training_summary['training_results']['final_loss']
eval_loss = training_summary['evaluation_results'].get('eval_loss') if training_summary['evaluation_results'] else None
training_time = training_summary['training_results']['training_time_seconds']

print(f"  Final training loss: {final_loss:.4f}" if final_loss is not None else "  Final training loss: N/A")
print(f"  Final validation loss: {eval_loss:.4f}" if eval_loss is not None else "  Final validation loss: N/A")
print(f"  Training time: {training_time:.1f} seconds" if training_time is not None else "  Training time: N/A")
print(f"  Model saved to: {training_summary['output_directory']}")
print("="*50)

if training_result_obj is None:
    print("\n⚠️ NOTE: 'training_result' not found. Run 'training_result = trainer.train()' to get training stats.")
if eval_result_obj is None:
    print("⚠️ NOTE: 'eval_result' not found. Run 'eval_result = trainer.evaluate()' to get evaluation stats.")

