In [1]:
import json
import os
import torch
import numpy as np
import pandas as pd
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Any, Tuple
import logging
import re
from tqdm import tqdm

# Transformers and training
from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

print("üìö Dependencies loaded successfully!")
print(f"üî• PyTorch version: {torch.__version__}")

# Check device availability
print(f"ü§ñ CUDA available: {torch.cuda.is_available()}")
print(f"üçé MPS available: {torch.backends.mps.is_available()}")

if torch.cuda.is_available():
    print(f"üéÆ GPU: {torch.cuda.get_device_name(0)}")
    print(f"üíæ GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
elif torch.backends.mps.is_available():
    print(f"üçé Using Apple Silicon MPS")
    print(f"üíª Device: Apple Silicon Mac")
else:
    print(f"‚ö†Ô∏è  Using CPU only")


üìö Dependencies loaded successfully!
üî• PyTorch version: 2.8.0
ü§ñ CUDA available: False
üçé MPS available: True
üçé Using Apple Silicon MPS
üíª Device: Apple Silicon Mac


In [2]:
# Load merged MITRE ATT&CK dataset
dataset_path = "../data/TTP-classification/merged_mitre_attack_dataset.json"

print("üìÇ Loading MITRE ATT&CK dataset...")
with open(dataset_path, 'r', encoding='utf-8') as f:
    mitre_data = json.load(f)

# Extract training samples
training_samples = mitre_data['dataset']
print(f"‚úÖ Loaded {len(training_samples):,} training samples")

# Analyze dataset structure
print("\nüìä Dataset Analysis:")
matrices = {'enterprise': 0, 'mobile': 0, 'ics': 0}
techniques_count = 0
sub_techniques_count = 0
technique_ids = set()

for sample in training_samples:
    technique = sample['output']['techniques'][0]
    matrix = technique['matrix']
    technique_id = technique['id']
    
    matrices[matrix] += 1
    technique_ids.add(technique_id)
    
    if '.' in technique_id:
        sub_techniques_count += 1
    else:
        techniques_count += 1

print(f"  ‚Ä¢ Enterprise: {matrices['enterprise']:,} samples")
print(f"  ‚Ä¢ Mobile: {matrices['mobile']:,} samples")
print(f"  ‚Ä¢ ICS: {matrices['ics']:,} samples")
print(f"  ‚Ä¢ Techniques: {techniques_count:,} samples")
print(f"  ‚Ä¢ Sub-techniques: {sub_techniques_count:,} samples")
print(f"  ‚Ä¢ Unique technique IDs: {len(technique_ids):,}")

# Show sample data
print("\nüìù Sample Training Example:")
sample = training_samples[0]
print(f"Instruction: {sample['instruction'][:100]}...")
print(f"Technique ID: {sample['output']['techniques'][0]['id']}")
print(f"Technique Name: {sample['output']['techniques'][0]['name']}")
print(f"Matrix: {sample['output']['techniques'][0]['matrix']}")


üìÇ Loading MITRE ATT&CK dataset...
‚úÖ Loaded 921 training samples

üìä Dataset Analysis:
  ‚Ä¢ Enterprise: 691 samples
  ‚Ä¢ Mobile: 135 samples
  ‚Ä¢ ICS: 95 samples
  ‚Ä¢ Techniques: 406 samples
  ‚Ä¢ Sub-techniques: 515 samples
  ‚Ä¢ Unique technique IDs: 921

üìù Sample Training Example:
Instruction: Adversaries may inject malicious code into process via Extra Window Memory (EWM) in order to evade p...
Technique ID: T1055.011
Technique Name: Extra Window Memory Injection
Matrix: enterprise


In [3]:
# Model configuration
MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"
MAX_LENGTH = 2048
BATCH_SIZE = 4
LEARNING_RATE = 2e-5
NUM_EPOCHS = 3
WARMUP_STEPS = 100

print(f"ü§ñ Setting up model: {MODEL_NAME}")

# Create output directory
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
output_dir = f"../models/qwen-ttp-classification-{timestamp}"
os.makedirs(output_dir, exist_ok=True)

print(f"üìÅ Output directory: {output_dir}")

# Training arguments (FIXED - eval_strategy instead of evaluation_strategy)
training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    
    # Training parameters
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=1,
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    warmup_steps=WARMUP_STEPS,
    
    # Optimization (MPS compatible - FIXED)
    fp16=torch.cuda.is_available(),  # Only use fp16 on CUDA
    bf16=False,  # Disable bf16 - MPS doesn't support it in TrainingArguments
    dataloader_pin_memory=False,
    gradient_checkpointing=True,
    
    # Logging and saving
    logging_dir=f"{output_dir}/logs",
    logging_steps=10,
    save_steps=100,
    save_total_limit=3,
    eval_steps=100,
    eval_strategy="steps",  # FIXED: was evaluation_strategy
    
    # Other settings
    remove_unused_columns=False,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    
    # Report
    report_to=None,  # Disable wandb/tensorboard
    run_name=f"qwen-ttp-classification-{timestamp}"
)

print("‚úÖ Training arguments configured (FIXED + MPS)")
print(f"üéØ Batch size: {BATCH_SIZE}")
print(f"üìà Learning rate: {LEARNING_RATE}")
print(f"üîÑ Epochs: {NUM_EPOCHS}")
print(f"üî• FP16 (CUDA only): {training_args.fp16}")
print(f"üçé BF16 (Disabled): {training_args.bf16}")
print(f"üíæ Gradient checkpointing: {training_args.gradient_checkpointing}")
print(f"üîß FIXED: eval_strategy + MPS compatibility")

# Show which precision will be used
if torch.cuda.is_available():
    print("‚ö° Using FP16 precision for CUDA training")
elif torch.backends.mps.is_available():
    print("üçé Using FP32 precision for MPS training (safest option)")
else:
    print("‚ö†Ô∏è  Using FP32 precision for CPU training")


ü§ñ Setting up model: Qwen/Qwen2.5-1.5B-Instruct
üìÅ Output directory: ../models/qwen-ttp-classification-2025-08-08_14-15-00
‚úÖ Training arguments configured (FIXED + MPS)
üéØ Batch size: 4
üìà Learning rate: 2e-05
üîÑ Epochs: 3
üî• FP16 (CUDA only): False
üçé BF16 (Disabled): False
üíæ Gradient checkpointing: True
üîß FIXED: eval_strategy + MPS compatibility
üçé Using FP32 precision for MPS training (safest option)


In [4]:
def format_training_sample(sample: Dict[str, Any]) -> Dict[str, str]:
    """
    Format training sample for instruction tuning
    """
    instruction = sample['instruction']
    technique = sample['output']['techniques'][0]
    
    # Create system prompt
    system_prompt = """You are a cybersecurity expert specializing in MITRE ATT&CK framework. Your task is to analyze threat intelligence descriptions and identify the corresponding MITRE ATT&CK techniques.

Given a description of adversary behavior, identify the most relevant MITRE ATT&CK technique and provide:
1. Technique ID (e.g., T1055.011)
2. Technique Name
3. Matrix (enterprise/mobile/ics)

Respond in JSON format."""
    
    # Create user input
    user_input = f"Analyze this threat behavior and identify the MITRE ATT&CK technique:\n\n{instruction}"
    
    # Create assistant response
    assistant_response = json.dumps({
        "technique_id": technique['id'],
        "technique_name": technique['name'],
        "matrix": technique['matrix'],
        "description": technique['description']
    }, ensure_ascii=False)
    
    # Format for Qwen chat template
    formatted_text = f"""<|im_start|>system
{system_prompt}<|im_end|>
<|im_start|>user
{user_input}<|im_end|>
<|im_start|>assistant
{assistant_response}<|im_end|>"""
    
    return {"text": formatted_text}

# Format all training samples
print("üîÑ Formatting training data...")
formatted_samples = []
for sample in tqdm(training_samples):
    formatted_sample = format_training_sample(sample)
    formatted_samples.append(formatted_sample)

print(f"‚úÖ Formatted {len(formatted_samples):,} training samples")

# Show formatted example
print("\nüìù Formatted Training Example:")
print(formatted_samples[0]['text'][:500] + "...")


üîÑ Formatting training data...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 921/921 [00:00<00:00, 72455.29it/s]

‚úÖ Formatted 921 training samples

üìù Formatted Training Example:
<|im_start|>system
You are a cybersecurity expert specializing in MITRE ATT&CK framework. Your task is to analyze threat intelligence descriptions and identify the corresponding MITRE ATT&CK techniques.

Given a description of adversary behavior, identify the most relevant MITRE ATT&CK technique and provide:
1. Technique ID (e.g., T1055.011)
2. Technique Name
3. Matrix (enterprise/mobile/ics)

Respond in JSON format.<|im_end|>
<|im_start|>user
Analyze this threat behavior and identify the MITRE ...





In [5]:
# Setup device with MPS support
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"üîß Using device: CUDA")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
    print(f"üîß Using device: MPS (Apple Silicon)")
else:
    device = torch.device("cpu")
    print(f"üîß Using device: CPU")

print(f"üì± Selected device: {device}")

# Load tokenizer
print("üìù Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
    padding_side="right"
)

# Add padding token if not exists
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    print("üîß Set pad_token = eos_token")

print(f"‚úÖ Tokenizer loaded")
print(f"üìä Vocab size: {len(tokenizer):,}")

# Load model with MPS support
print("ü§ñ Loading model...")

# Set appropriate dtype based on device (FIXED for MPS)
if torch.cuda.is_available():
    torch_dtype = torch.float16  # FP16 for CUDA
    device_map = "auto"
elif torch.backends.mps.is_available():
    torch_dtype = torch.float32  # FP32 for MPS (safest option)
    device_map = None  # MPS doesn't support device_map="auto"
else:
    torch_dtype = torch.float32  # FP32 for CPU
    device_map = None

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
    torch_dtype=torch_dtype,
    device_map=device_map,
    low_cpu_mem_usage=True
)

# Move to MPS if needed (device_map="auto" doesn't work with MPS)
if torch.backends.mps.is_available() and not torch.cuda.is_available():
    model = model.to("mps")
    print("üçé Model moved to MPS device")

print(f"‚úÖ Model loaded")
print(f"üìä Model parameters: {sum(p.numel() for p in model.parameters()):,}")
print(f"üéØ Trainable parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}")


üîß Using device: MPS (Apple Silicon)
üì± Selected device: mps
üìù Loading tokenizer...
‚úÖ Tokenizer loaded
üìä Vocab size: 151,665
ü§ñ Loading model...
üçé Model moved to MPS device
‚úÖ Model loaded
üìä Model parameters: 1,543,714,304
üéØ Trainable parameters: 1,543,714,304


In [6]:
# Create train/validation split and start training
print("üìä Creating train/validation split...")
train_samples, val_samples = train_test_split(
    formatted_samples, 
    test_size=0.1, 
    random_state=42,
    stratify=[sample['text'].split('"matrix": "')[1].split('"')[0] for sample in formatted_samples]
)

print(f"üìö Training samples: {len(train_samples):,}")
print(f"üîç Validation samples: {len(val_samples):,}")

# Ready to train message
print("‚úÖ Setup completed!")
print("üöÄ You can now run training by executing the trainer.train() command")
print("‚ö†Ô∏è  Note: eval_strategy parameter has been fixed in TrainingArguments")
print("üçé MPS (Apple Silicon) support added for Mac users")

# Device info summary
if torch.cuda.is_available():
    print("üéÆ Training will use CUDA GPU acceleration")
elif torch.backends.mps.is_available():
    print("üçé Training will use MPS (Apple Silicon) acceleration")
else:
    print("‚ö†Ô∏è  Training will use CPU (slower)")

# Memory management tip for MPS (UPDATED)
if torch.backends.mps.is_available() and not torch.cuda.is_available():
    print("\nüí° MPS Tips (UPDATED):")
    print("   ‚Ä¢ Using FP32 precision (BF16 not supported in TrainingArguments)")
    print("   ‚Ä¢ Reduce batch size if you encounter memory issues")
    print("   ‚Ä¢ Use torch.mps.empty_cache() to clear memory")
    print("   ‚Ä¢ Training will be slower than FP16 but more stable")


üìä Creating train/validation split...
üìö Training samples: 828
üîç Validation samples: 93
‚úÖ Setup completed!
üöÄ You can now run training by executing the trainer.train() command
‚ö†Ô∏è  Note: eval_strategy parameter has been fixed in TrainingArguments
üçé MPS (Apple Silicon) support added for Mac users
üçé Training will use MPS (Apple Silicon) acceleration

üí° MPS Tips (UPDATED):
   ‚Ä¢ Using FP32 precision (BF16 not supported in TrainingArguments)
   ‚Ä¢ Reduce batch size if you encounter memory issues
   ‚Ä¢ Use torch.mps.empty_cache() to clear memory
   ‚Ä¢ Training will be slower than FP16 but more stable


In [7]:
# Optional: Start training (uncomment to run)
# Uncomment the lines below to start training

"""
# Clear memory cache before training
if torch.cuda.is_available():
    torch.cuda.empty_cache()
elif torch.backends.mps.is_available():
    torch.mps.empty_cache()

# Create tokenizer function
def tokenize_function(examples):
    tokenized = tokenizer(
        examples["text"],
        truncation=True,
        padding=False,
        max_length=MAX_LENGTH,
        return_tensors=None
    )
    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized

# Create datasets
train_dataset = Dataset.from_list(train_samples)
val_dataset = Dataset.from_list(val_samples)

# Tokenize
train_dataset = train_dataset.map(tokenize_function, batched=True, remove_columns=["text"])
val_dataset = val_dataset.map(tokenize_function, batched=True, remove_columns=["text"])

# Data collator
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Start training
print("üöÄ Starting training...")
training_result = trainer.train()

print("üéâ Training completed!")
print(f"üìä Final loss: {training_result.training_loss:.4f}")
"""

print("üìù Training code is ready!")
print("üîì Uncomment the code above to start training")
print("‚ö° Optimized for MPS (Apple Silicon) and CUDA")


üìù Training code is ready!
üîì Uncomment the code above to start training
‚ö° Optimized for MPS (Apple Silicon) and CUDA


In [8]:
# CORRECTED Training Arguments - Run this if you get TypeError
print("üîß Creating CORRECTED training arguments...")

# Model configuration
MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"
MAX_LENGTH = 2048
BATCH_SIZE = 4
LEARNING_RATE = 2e-5
NUM_EPOCHS = 3
WARMUP_STEPS = 100

# Create output directory
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
output_dir = f"../models/qwen-ttp-classification-{timestamp}"
os.makedirs(output_dir, exist_ok=True)

print(f"üìÅ Output directory: {output_dir}")

# CORRECTED Training arguments (eval_strategy NOT evaluation_strategy)
training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    
    # Training parameters
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=1,
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    warmup_steps=WARMUP_STEPS,
    
    # Optimization (MPS compatible)
    fp16=torch.cuda.is_available(),  # Only FP16 on CUDA
    bf16=False,  # Disabled for MPS compatibility
    dataloader_pin_memory=False,
    gradient_checkpointing=True,
    
    # Logging and saving
    logging_dir=f"{output_dir}/logs",
    logging_steps=10,
    save_steps=100,
    save_total_limit=3,
    eval_steps=100,
    eval_strategy="steps",  # CORRECT: eval_strategy (NOT evaluation_strategy)
    
    # Other settings
    remove_unused_columns=False,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    
    # Report
    report_to=None,
    run_name=f"qwen-ttp-classification-{timestamp}"
)

print("‚úÖ CORRECTED Training arguments configured!")
print(f"üéØ Batch size: {BATCH_SIZE}")
print(f"üìà Learning rate: {LEARNING_RATE}")
print(f"üîÑ Epochs: {NUM_EPOCHS}")
print(f"üî• FP16 (CUDA only): {training_args.fp16}")
print(f"üçé BF16 (Disabled): {training_args.bf16}")
print(f"üíæ Gradient checkpointing: {training_args.gradient_checkpointing}")
print(f"‚úÖ FIXED: eval_strategy (NOT evaluation_strategy)")

# Show device-specific precision
if torch.cuda.is_available():
    print("‚ö° Using FP16 precision for CUDA training")
elif torch.backends.mps.is_available():
    print("üçé Using FP32 precision for MPS training")
else:
    print("‚ö†Ô∏è  Using FP32 precision for CPU training")


üîß Creating CORRECTED training arguments...
üìÅ Output directory: ../models/qwen-ttp-classification-2025-08-08_14-15-12
‚úÖ CORRECTED Training arguments configured!
üéØ Batch size: 4
üìà Learning rate: 2e-05
üîÑ Epochs: 3
üî• FP16 (CUDA only): False
üçé BF16 (Disabled): False
üíæ Gradient checkpointing: True
‚úÖ FIXED: eval_strategy (NOT evaluation_strategy)
üçé Using FP32 precision for MPS training


In [9]:
import json
import os
import torch
import numpy as np
import pandas as pd
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Any, Tuple
import logging
import re
from tqdm import tqdm

# Transformers and training
from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

print("üìö Dependencies loaded successfully!")
print(f"üî• PyTorch version: {torch.__version__}")
print(f"ü§ñ CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"üéÆ GPU: {torch.cuda.get_device_name(0)}")
    print(f"üíæ GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")


üìö Dependencies loaded successfully!
üî• PyTorch version: 2.8.0
ü§ñ CUDA available: False


In [10]:
# Load merged MITRE ATT&CK dataset
dataset_path = "../data/TTP-classification/merged_mitre_attack_dataset.json"

print("üìÇ Loading MITRE ATT&CK dataset...")
with open(dataset_path, 'r', encoding='utf-8') as f:
    mitre_data = json.load(f)

# Extract training samples
training_samples = mitre_data['dataset']
print(f"‚úÖ Loaded {len(training_samples):,} training samples")

# Analyze dataset structure
print("\nüìä Dataset Analysis:")
matrices = {'enterprise': 0, 'mobile': 0, 'ics': 0}
techniques_count = 0
sub_techniques_count = 0
technique_ids = set()

for sample in training_samples:
    technique = sample['output']['techniques'][0]
    matrix = technique['matrix']
    technique_id = technique['id']
    
    matrices[matrix] += 1
    technique_ids.add(technique_id)
    
    if '.' in technique_id:
        sub_techniques_count += 1
    else:
        techniques_count += 1

print(f"  ‚Ä¢ Enterprise: {matrices['enterprise']:,} samples")
print(f"  ‚Ä¢ Mobile: {matrices['mobile']:,} samples")
print(f"  ‚Ä¢ ICS: {matrices['ics']:,} samples")
print(f"  ‚Ä¢ Techniques: {techniques_count:,} samples")
print(f"  ‚Ä¢ Sub-techniques: {sub_techniques_count:,} samples")
print(f"  ‚Ä¢ Unique technique IDs: {len(technique_ids):,}")

# Show sample data
print("\nüìù Sample Training Example:")
sample = training_samples[0]
print(f"Instruction: {sample['instruction'][:100]}...")
print(f"Technique ID: {sample['output']['techniques'][0]['id']}")
print(f"Technique Name: {sample['output']['techniques'][0]['name']}")
print(f"Matrix: {sample['output']['techniques'][0]['matrix']}")


üìÇ Loading MITRE ATT&CK dataset...
‚úÖ Loaded 921 training samples

üìä Dataset Analysis:
  ‚Ä¢ Enterprise: 691 samples
  ‚Ä¢ Mobile: 135 samples
  ‚Ä¢ ICS: 95 samples
  ‚Ä¢ Techniques: 406 samples
  ‚Ä¢ Sub-techniques: 515 samples
  ‚Ä¢ Unique technique IDs: 921

üìù Sample Training Example:
Instruction: Adversaries may inject malicious code into process via Extra Window Memory (EWM) in order to evade p...
Technique ID: T1055.011
Technique Name: Extra Window Memory Injection
Matrix: enterprise


In [11]:
def format_training_sample(sample: Dict[str, Any]) -> Dict[str, str]:
    """
    Format training sample for instruction tuning
    
    Input format:
    {
        "instruction": "Adversary behavior description...",
        "output": {
            "techniques": [{
                "id": "T1055.011",
                "name": "Extra Window Memory Injection",
                "description": "...",
                "matrix": "enterprise"
            }]
        }
    }
    
    Output format for instruction tuning:
    {
        "text": "<|im_start|>system\nYou are a cybersecurity expert...\n<|im_end|>\n<|im_start|>user\n...\n<|im_end|>\n<|im_start|>assistant\n...\n<|im_end|>"
    }
    """
    instruction = sample['instruction']
    technique = sample['output']['techniques'][0]
    
    # Create system prompt
    system_prompt = """You are a cybersecurity expert specializing in MITRE ATT&CK framework. Your task is to analyze threat intelligence descriptions and identify the corresponding MITRE ATT&CK techniques.

Given a description of adversary behavior, identify the most relevant MITRE ATT&CK technique and provide:
1. Technique ID (e.g., T1055.011)
2. Technique Name
3. Matrix (enterprise/mobile/ics)

Respond in JSON format."""
    
    # Create user input
    user_input = f"Analyze this threat behavior and identify the MITRE ATT&CK technique:\n\n{instruction}"
    
    # Create assistant response
    assistant_response = json.dumps({
        "technique_id": technique['id'],
        "technique_name": technique['name'],
        "matrix": technique['matrix'],
        "description": technique['description']
    }, ensure_ascii=False)
    
    # Format for Qwen chat template
    formatted_text = f"""<|im_start|>system
{system_prompt}<|im_end|>
<|im_start|>user
{user_input}<|im_end|>
<|im_start|>assistant
{assistant_response}<|im_end|>"""
    
    return {"text": formatted_text}

# Format all training samples
print("üîÑ Formatting training data...")
formatted_samples = []
for sample in tqdm(training_samples):
    formatted_sample = format_training_sample(sample)
    formatted_samples.append(formatted_sample)

print(f"‚úÖ Formatted {len(formatted_samples):,} training samples")

# Show formatted example
print("\nüìù Formatted Training Example:")
print(formatted_samples[0]['text'][:500] + "...")


üîÑ Formatting training data...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 921/921 [00:00<00:00, 219598.32it/s]

‚úÖ Formatted 921 training samples

üìù Formatted Training Example:
<|im_start|>system
You are a cybersecurity expert specializing in MITRE ATT&CK framework. Your task is to analyze threat intelligence descriptions and identify the corresponding MITRE ATT&CK techniques.

Given a description of adversary behavior, identify the most relevant MITRE ATT&CK technique and provide:
1. Technique ID (e.g., T1055.011)
2. Technique Name
3. Matrix (enterprise/mobile/ics)

Respond in JSON format.<|im_end|>
<|im_start|>user
Analyze this threat behavior and identify the MITRE ...





In [12]:
# Model configuration
MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"
MAX_LENGTH = 2048
BATCH_SIZE = 4
LEARNING_RATE = 2e-5
NUM_EPOCHS = 3
WARMUP_STEPS = 100

print(f"ü§ñ Setting up model: {MODEL_NAME}")

# Setup device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"üîß Using device: {device}")

# Load tokenizer
print("üìù Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
    padding_side="right"
)

# Add padding token if not exists
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    print("üîß Set pad_token = eos_token")

print(f"‚úÖ Tokenizer loaded")
print(f"üìä Vocab size: {len(tokenizer):,}")
print(f"üîë Special tokens: {tokenizer.special_tokens_map}")

# Load model
print("ü§ñ Loading model...")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
    torch_dtype=torch.float16,
    device_map="auto",
    low_cpu_mem_usage=True
)

print(f"‚úÖ Model loaded")
print(f"üìä Model parameters: {sum(p.numel() for p in model.parameters()):,}")
print(f"üéØ Trainable parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}")


ü§ñ Setting up model: Qwen/Qwen2.5-1.5B-Instruct
üîß Using device: cpu
üìù Loading tokenizer...
‚úÖ Tokenizer loaded
üìä Vocab size: 151,665
üîë Special tokens: {'eos_token': '<|im_end|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>', '<|object_ref_start|>', '<|object_ref_end|>', '<|box_start|>', '<|box_end|>', '<|quad_start|>', '<|quad_end|>', '<|vision_start|>', '<|vision_end|>', '<|vision_pad|>', '<|image_pad|>', '<|video_pad|>']}
ü§ñ Loading model...
‚úÖ Model loaded
üìä Model parameters: 1,543,714,304
üéØ Trainable parameters: 1,543,714,304


In [13]:
def tokenize_function(examples):
    """
    Tokenize training examples
    """
    # Tokenize the text
    tokenized = tokenizer(
        examples["text"],
        truncation=True,
        padding=False,
        max_length=MAX_LENGTH,
        return_tensors=None
    )
    
    # For causal language modeling, labels are the same as input_ids
    tokenized["labels"] = tokenized["input_ids"].copy()
    
    return tokenized

# Create train/validation split
print("üìä Creating train/validation split...")
train_samples, val_samples = train_test_split(
    formatted_samples, 
    test_size=0.1, 
    random_state=42,
    stratify=[sample['text'].split('"matrix": "')[1].split('"')[0] for sample in formatted_samples]
)

print(f"üìö Training samples: {len(train_samples):,}")
print(f"üîç Validation samples: {len(val_samples):,}")

# Create HuggingFace datasets
print("üîÑ Creating datasets...")
train_dataset = Dataset.from_list(train_samples)
val_dataset = Dataset.from_list(val_samples)

# Tokenize datasets
print("üî§ Tokenizing datasets...")
train_dataset = train_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"],
    desc="Tokenizing train data"
)

val_dataset = val_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"],
    desc="Tokenizing validation data"
)

print("‚úÖ Datasets prepared")
print(f"üìä Train dataset: {len(train_dataset):,} samples")
print(f"üìä Validation dataset: {len(val_dataset):,} samples")

# Check tokenization
sample_tokens = train_dataset[0]
print(f"\nüìù Sample tokenization:")
print(f"Input IDs length: {len(sample_tokens['input_ids'])}")
print(f"Attention mask length: {len(sample_tokens['attention_mask'])}")
print(f"Labels length: {len(sample_tokens['labels'])}")


üìä Creating train/validation split...
üìö Training samples: 828
üîç Validation samples: 93
üîÑ Creating datasets...
üî§ Tokenizing datasets...


Tokenizing train data:   0%|          | 0/828 [00:00<?, ? examples/s]

Tokenizing validation data:   0%|          | 0/93 [00:00<?, ? examples/s]

‚úÖ Datasets prepared
üìä Train dataset: 828 samples
üìä Validation dataset: 93 samples

üìù Sample tokenization:
Input IDs length: 368
Attention mask length: 368
Labels length: 368


In [14]:
# Create output directory
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
output_dir = f"../models/qwen-ttp-classification-{timestamp}"
os.makedirs(output_dir, exist_ok=True)

print(f"üìÅ Output directory: {output_dir}")

# Training arguments
training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    
    # Training parameters
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=1,
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    warmup_steps=WARMUP_STEPS,
    
    # Optimization
    fp16=True,
    dataloader_pin_memory=False,
    gradient_checkpointing=True,
    
    # Logging and saving
    logging_dir=f"{output_dir}/logs",
    logging_steps=10,
    save_steps=100,
    save_total_limit=3,
    eval_steps=100,
    eval_strategy="steps",  # FIXED: was evaluation_strategy

    # Other settings
    remove_unused_columns=False,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    
    # Report
    report_to=None,  # Disable wandb/tensorboard
    run_name=f"qwen-ttp-classification-{timestamp}"
)

print("‚úÖ Training arguments configured")
print(f"üéØ Batch size: {BATCH_SIZE}")
print(f"üìà Learning rate: {LEARNING_RATE}")
print(f"üîÑ Epochs: {NUM_EPOCHS}")
print(f"üî• FP16: {training_args.fp16}")
print(f"üíæ Gradient checkpointing: {training_args.gradient_checkpointing}")

# Data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # No masking for causal LM
)

print("‚úÖ Data collator configured")


üìÅ Output directory: ../models/qwen-ttp-classification-2025-08-08_14-15-20
‚úÖ Training arguments configured
üéØ Batch size: 4
üìà Learning rate: 2e-05
üîÑ Epochs: 3
üî• FP16: True
üíæ Gradient checkpointing: True
‚úÖ Data collator configured


In [15]:
# Initialize trainer
print("üèÉ‚Äç‚ôÇÔ∏è Initializing trainer...")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

print("‚úÖ Trainer initialized")

# Start training
print("\nüöÄ Starting training...")
print(f"üìä Total training samples: {len(train_dataset):,}")
print(f"üìä Total validation samples: {len(val_dataset):,}")
print(f"‚è±Ô∏è Estimated training time: ~{(len(train_dataset) // BATCH_SIZE) * NUM_EPOCHS // 60} minutes")

# Clear cache before training
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Train the model
training_result = trainer.train()

print("\nüéâ Training completed!")
print(f"üìä Final training loss: {training_result.training_loss:.4f}")
print(f"‚è±Ô∏è Training time: {training_result.training_time:.2f} seconds")

# Save the final model
print("üíæ Saving final model...")
trainer.save_model()
tokenizer.save_pretrained(output_dir)

print(f"‚úÖ Model saved to: {output_dir}")


üèÉ‚Äç‚ôÇÔ∏è Initializing trainer...


  trainer = Trainer(


ValueError: fp16 mixed precision requires a GPU (not 'mps').

In [None]:
# Evaluate the model
print("üìä Evaluating model...")
eval_result = trainer.evaluate()

print("\nüìà Evaluation Results:")
for key, value in eval_result.items():
    print(f"  {key}: {value:.4f}")

# Save evaluation results
eval_file = os.path.join(output_dir, "evaluation_results.json")
with open(eval_file, 'w') as f:
    json.dump(eval_result, f, indent=2)

print(f"‚úÖ Evaluation results saved to: {eval_file}")


In [None]:
def test_ttp_classification(model, tokenizer, threat_description: str) -> Dict[str, Any]:
    """
    Test TTP classification on a threat description
    """
    system_prompt = """You are a cybersecurity expert specializing in MITRE ATT&CK framework. Your task is to analyze threat intelligence descriptions and identify the corresponding MITRE ATT&CK techniques.

Given a description of adversary behavior, identify the most relevant MITRE ATT&CK technique and provide:
1. Technique ID (e.g., T1055.011)
2. Technique Name
3. Matrix (enterprise/mobile/ics)

Respond in JSON format."""
    
    user_input = f"Analyze this threat behavior and identify the MITRE ATT&CK technique:\n\n{threat_description}"
    
    # Format input
    prompt = f"""<|im_start|>system
{system_prompt}<|im_end|>
<|im_start|>user
{user_input}<|im_end|>
<|im_start|>assistant
"""
    
    # Tokenize
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    
    # Generate response
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=256,
            temperature=0.1,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id
        )
    
    # Decode response
    full_response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    assistant_response = full_response.split("<|im_start|>assistant\n")[-1]
    
    return {
        "input": threat_description,
        "response": assistant_response,
        "full_prompt": prompt
    }

# Test examples
test_cases = [
    "Adversaries may inject malicious code into processes in order to evade process-based defenses or elevate privileges.",
    "Attackers send phishing emails with malicious attachments to gain initial access to the target system.",
    "The malware establishes persistence by creating scheduled tasks that execute at system startup.",
    "Adversaries may abuse elevation control mechanisms to gain higher-level permissions on a system."
]

print("üß™ Testing model inference...")
for i, test_case in enumerate(test_cases, 1):
    print(f"\nüìù Test Case {i}:")
    print(f"Input: {test_case}")
    
    result = test_ttp_classification(model, tokenizer, test_case)
    print(f"Output: {result['response']}")
    print("-" * 80)


In [None]:
# Create training summary
training_summary = {
    "model_name": MODEL_NAME,
    "training_timestamp": timestamp,
    "output_directory": output_dir,
    "dataset_info": {
        "total_samples": len(training_samples),
        "train_samples": len(train_dataset),
        "val_samples": len(val_dataset),
        "unique_techniques": len(technique_ids),
        "matrix_distribution": matrices
    },
    "training_config": {
        "batch_size": BATCH_SIZE,
        "learning_rate": LEARNING_RATE,
        "num_epochs": NUM_EPOCHS,
        "max_length": MAX_LENGTH,
        "warmup_steps": WARMUP_STEPS
    },
    "training_results": {
        "final_loss": training_result.training_loss,
        "training_time_seconds": training_result.training_time
    },
    "evaluation_results": eval_result
}

# Save summary
summary_file = os.path.join(output_dir, "training_summary.json")
with open(summary_file, 'w', encoding='utf-8') as f:
    json.dump(training_summary, f, indent=2, ensure_ascii=False)

print(f"üìÑ Training summary saved to: {summary_file}")

# Display summary
print("\nüìä Training Summary:")
print(f"  Model: {MODEL_NAME}")
print(f"  Training samples: {len(train_dataset):,}")
print(f"  Validation samples: {len(val_dataset):,}")
print(f"  Unique techniques: {len(technique_ids):,}")
print(f"  Final training loss: {training_result.training_loss:.4f}")
print(f"  Final validation loss: {eval_result['eval_loss']:.4f}")
print(f"  Training time: {training_result.training_time:.1f} seconds")
print(f"  Model saved to: {output_dir}")

print("\nüéâ TTP Classification training completed successfully!")
print(f"üìÅ All artifacts saved to: {output_dir}")
