# 🚀 A100 80GB - PMB Augmentation + QLoRA Training
## Enhanced with Comprehensive Validation & Semantic Preservation

**Complete Notebook untuk Augmentasi + Training**
- ⚡ Ultra-fast augmentation (30-40s untuk 1452 items)
- ✅ Multi-layer validation (Basic + Semantic + Domain + Dedup)
- 🧠 Semantic meaning preservation dengan embeddings
- 🎓 Full QLoRA training pipeline
- 📊 Comprehensive quality metrics


## 1️⃣ Setup & Installation

In [None]:
# Install all dependencies
# PyTorch is pulled from the CUDA 11.8 wheel index given by --index-url.
!pip install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
# Model loading, device placement and bitsandbytes quantization support.
!pip install -q transformers accelerate bitsandbytes
!pip install -q huggingface-hub datasets tqdm
# --no-build-isolation makes pip build flash-attn inside the current
# environment, so this line has to run after torch is installed above.
!pip install -q flash-attn --no-build-isolation
!pip install -q peft trl sentence-transformers
!pip install -q scikit-learn pandas

import torch
import json
import time
from pathlib import Path
from typing import List, Dict, Tuple, Optional
from tqdm.auto import tqdm
from difflib import SequenceMatcher
import re
from dataclasses import dataclass, asdict, field
from huggingface_hub import hf_hub_download
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments
from collections import defaultdict
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Report which accelerator this session is running on.
_cuda_ok = torch.cuda.is_available()
print(f'‚úÖ CUDA: {_cuda_ok}')
if _cuda_ok:
    _props = torch.cuda.get_device_properties(0)
    print(f'‚úÖ GPU: {torch.cuda.get_device_name(0)}')
    print(f'‚úÖ Memory: {_props.total_memory / 1e9:.1f}GB')
    print(f'‚úÖ Compute: {torch.cuda.get_device_capability(0)}')
else:
    print('‚úÖ GPU: CPU')


## 2️⃣ Configuration (A100 Optimized)

In [None]:
@dataclass
class A100_EnhancedConfig:
    '''Default settings for augmentation, validation and QLoRA training.

    The defaults target a single A100 80GB GPU. Every annotated attribute is
    a dataclass field and can be overridden with a keyword argument, e.g.
    ``A100_EnhancedConfig(batch_size=32)``.
    '''

    # GPU Configuration
    device: str = 'cuda:0'
    # No annotation, so this is a plain class attribute rather than a
    # dataclass field: it is not accepted by __init__ and is left out of
    # dataclasses.asdict().
    torch_dtype = torch.bfloat16

    # Model Configuration
    # Hugging Face Hub repo IDs. Gemma 3 1B is published as "-it"
    # (instruction-tuned) and "-pt" (pre-trained); the previous
    # "gemma-3-1b-instruct" / "gemma-3-1b" IDs do not resolve on the Hub.
    model_name: str = 'google/gemma-3-1b-it'
    training_model: str = 'google/gemma-3-1b-pt'
    load_in_4bit: bool = False
    use_cache: bool = True

    # Batch Processing
    batch_size: int = 128  # source items per generation batch
    num_workers: int = 8

    # Generation (sampling arguments forwarded to model.generate)
    max_new_tokens: int = 200
    temperature: float = 0.5
    top_p: float = 0.85
    top_k: int = 50
    repetition_penalty: float = 1.05

    # Quality Validation
    # Upper bound on the character-level SequenceMatcher ratio between a
    # variation and its source question (higher means "too similar").
    max_q_similarity: float = 0.85
    min_q_length: int = 10   # characters
    max_q_length: int = 250  # characters
    min_q_words: int = 4
    max_q_words: int = 25

    # Semantic Validation
    semantic_model: str = 'distiluse-base-multilingual-cased-v2'
    # NOTE(review): not referenced by the validators in this notebook;
    # only meaning_drift_threshold is compared against the cosine score.
    semantic_similarity_threshold: float = 0.75
    # Minimum cosine similarity a variation needs to be accepted.
    meaning_drift_threshold: float = 0.60

    # Deduplication
    # SequenceMatcher ratio at or above which two variations are duplicates.
    duplicate_threshold: float = 0.90

    # QLoRA Training
    lora_r: int = 16
    lora_alpha: int = 32
    lora_dropout: float = 0.05
    # default_factory gives every instance its own list.
    target_modules: List[str] = field(default_factory=lambda: ['q_proj', 'v_proj'])

    # Training Arguments
    num_train_epochs: int = 3
    per_device_train_batch_size: int = 4
    per_device_eval_batch_size: int = 4
    gradient_accumulation_steps: int = 2
    learning_rate: float = 2e-4
    warmup_steps: int = 100
    max_grad_norm: float = 1.0
    weight_decay: float = 0.01

    # Evaluation
    eval_steps: int = 100
    save_steps: int = 200
    save_total_limit: int = 3
    load_best_model_at_end: bool = True

    # Monitoring
    log_memory: bool = True
    # Interval (in batches) at which the pipeline frees cached GPU memory
    # and logs memory usage.
    save_every_n_batches: int = 10

config = A100_EnhancedConfig()

# Echo the settings that matter most for this run.
_banner = '=' * 70
print('\n' + _banner)
print('‚öôÔ∏è CONFIG - A100 80GB WITH ENHANCED VALIDATION')
print(_banner)
for _label, _value in (
    ('Model', config.model_name),
    ('Batch size', config.batch_size),
    ('Semantic model', config.semantic_model),
    ('Meaning drift threshold', config.meaning_drift_threshold),
    ('Training epochs', config.num_train_epochs),
):
    print(f'{_label}: {_value}')


## 3️⃣ GPU Monitor

In [None]:
class GPUMonitor:
    '''Record and print CUDA memory usage at named pipeline stages.

    Args:
        total_gb: Device memory capacity in GB, used as the denominator in
            ``summary()``. Defaults to 80 (A100 80GB).

    Raises:
        ValueError: If ``total_gb`` is not positive.
    '''

    def __init__(self, total_gb: float = 80.0):
        if total_gb <= 0:
            raise ValueError('total_gb must be positive')
        self.total_gb = total_gb
        self.max_allocated = 0
        self.max_reserved = 0
        self.stage_history = []

    def log(self, stage: str = '', detailed: bool = False):
        '''Snapshot current CUDA memory counters under the label ``stage``.

        Does nothing when CUDA is unavailable. The snapshot is appended to
        ``stage_history`` and printed when the module-level
        ``config.log_memory`` flag is set. ``detailed`` is accepted but
        currently unused.
        '''
        if not torch.cuda.is_available():
            return

        # torch reports bytes; convert to decimal GB.
        allocated = torch.cuda.memory_allocated() / 1e9
        reserved = torch.cuda.memory_reserved() / 1e9
        max_allocated = torch.cuda.max_memory_allocated() / 1e9

        self.max_allocated = max(self.max_allocated, max_allocated)
        self.max_reserved = max(self.max_reserved, reserved)

        self.stage_history.append({
            'stage': stage,
            'allocated': allocated,
            'reserved': reserved
        })

        # Reads the global config object rather than instance state.
        if config.log_memory:
            print(f'[{stage:20}] Alloc: {allocated:6.1f}GB | Reserved: {reserved:6.1f}GB | Peak: {max_allocated:6.1f}GB')

    def summary(self):
        '''Print the peak allocated/reserved memory seen by ``log`` against ``total_gb``.'''
        print(f'\nüìä GPU Memory Summary:')
        print(f' Peak allocated: {self.max_allocated:.1f}GB / {self.total_gb:g}GB')
        print(f' Peak reserved: {self.max_reserved:.1f}GB / {self.total_gb:g}GB')
        print(f' Utilization: {self.max_allocated/self.total_gb*100:.1f}%')

# Shared monitor instance used by the model-loading cell and the pipeline.
gpu_monitor = GPUMonitor()
print('‚úÖ GPU Monitor initialized')


## 4️⃣ Semantic Validator

In [None]:
class SemanticValidator:
    '''Embedding-based check that a variation keeps the meaning of its source question.'''

    def __init__(self, config):
        self.stats = defaultdict(int)
        self.config = config
        print(f'\nüì• Loading semantic model: {config.semantic_model}...')
        self.model = SentenceTransformer(config.semantic_model)

    def get_embeddings(self, texts: List[str]) -> np.ndarray:
        '''Encode ``texts`` with the sentence-embedding model (progress bar disabled).'''
        vectors = self.model.encode(texts, show_progress_bar=False)
        return vectors

    def check_meaning_preservation(self, original_q: str, variation_q: str,
                                   original_embedding) -> Tuple[bool, float]:
        '''Compare one variation against the pre-computed embedding of the original.

        Returns ``(is_valid, similarity)`` where ``similarity`` is the cosine
        similarity of the two embeddings and ``is_valid`` is true when it
        reaches ``config.meaning_drift_threshold``. The outcome is tallied in
        ``self.stats`` under ``'valid'`` or ``'rejected'``.
        '''
        candidate_vec = self.get_embeddings([variation_q])[0]
        score = cosine_similarity([original_embedding], [candidate_vec])[0][0]
        keeps_meaning = score >= self.config.meaning_drift_threshold
        self.stats['valid' if keeps_meaning else 'rejected'] += 1
        return keeps_meaning, score

    def validate_batch(self, original_q: str, variations: List[str]) -> Dict:
        '''Check every variation against ``original_q``.

        Returns a dict with ``'valid'`` (accepted texts), ``'scores'`` (their
        similarities, same order) and ``'rejected'`` (``(text, score)`` pairs).
        '''
        reference_vec = self.get_embeddings([original_q])[0]
        accepted, accepted_scores, dropped = [], [], []

        for candidate in variations:
            ok, score = self.check_meaning_preservation(original_q, candidate, reference_vec)
            if not ok:
                dropped.append((candidate, score))
                continue
            accepted.append(candidate)
            accepted_scores.append(score)

        return {'valid': accepted, 'scores': accepted_scores, 'rejected': dropped}

    def report(self):
        '''Return the acceptance tally formatted as ``"valid/total (pct%)"``.'''
        n_valid = self.stats['valid']
        n_total = n_valid + self.stats['rejected']
        if n_total == 0:
            return '0/0 (0%)'
        return f"{n_valid}/{n_total} ({n_valid / n_total * 100:.1f}%)"

# Constructing the validator loads the embedding model named in
# config.semantic_model.
semantic_validator = SemanticValidator(config)
print('‚úÖ Semantic Validator initialized')


## 5️⃣ Other Validators (Dedup + Domain + Quality)

In [None]:
class DeduplicationValidator:
    '''Detect near-duplicate questions with character-level fuzzy matching.

    Questions are lower-cased and stripped, then compared against every
    question accepted since the last ``reset()`` using
    ``difflib.SequenceMatcher``.
    '''

    def __init__(self, config):
        self.config = config
        # normalized text -> original text of each accepted question
        self.seen_questions = {}
        self.stats = defaultdict(int)

    def is_duplicate(self, question: str, threshold: Optional[float] = None) -> Tuple[bool, float]:
        '''Decide whether ``question`` duplicates one already seen.

        Args:
            question: Candidate question text.
            threshold: Similarity ratio at or above which the candidate is a
                duplicate; defaults to ``config.duplicate_threshold``.

        Returns:
            ``(is_dup, max_similarity)``. ``max_similarity`` is the highest
            ratio against the seen questions, or ``0.0`` when none have been
            seen yet. Non-duplicates are remembered for later calls, and every
            call is tallied in ``self.stats`` (``'unique'`` / ``'duplicates'``).
        '''
        if threshold is None:
            threshold = self.config.duplicate_threshold

        normalized = question.lower().strip()

        # default=0.0 covers the empty case: the first question has nothing
        # to be similar to.
        max_similarity = max(
            (SequenceMatcher(None, normalized, seen).ratio()
             for seen in self.seen_questions),
            default=0.0,
        )

        # With nothing seen yet a question can never be a duplicate,
        # whatever the threshold.
        is_dup = bool(self.seen_questions) and max_similarity >= threshold

        if is_dup:
            self.stats['duplicates'] += 1
        else:
            # The first question is counted too, so report() totals match
            # the number of calls.
            self.seen_questions[normalized] = question
            self.stats['unique'] += 1

        return is_dup, max_similarity

    def reset(self):
        '''Forget all seen questions; the counters in ``self.stats`` are kept.'''
        self.seen_questions = {}

    def report(self):
        '''Return the unique-vs-total tally as ``"unique/total unique (pct%)"``.'''
        total = self.stats['unique'] + self.stats['duplicates']
        if total == 0:
            return '0/0 (0%)'
        pct = self.stats['unique'] / total * 100
        return f"{self.stats['unique']}/{total} unique ({pct:.1f}%)"


class PMBDomainValidator:
    '''Keyword-based check that a variation keeps the admission (PMB) topics of its source.'''

    def __init__(self):
        self.stats = defaultdict(int)
        # topic category -> substrings whose presence marks that category
        self.pmb_keywords = {
            'biaya': ['biaya', 'bayar', 'cicilan', 'uang'],
            'program': ['program', 'jurusan', 'fakultas', 'prodi'],
            'persyaratan': ['syarat', 'requirement', 'kriteria'],
            'pendaftaran': ['daftar', 'registrasi', 'aplikasi'],
            'jadwal': ['jadwal', 'tanggal', 'kapan', 'waktu'],
            'dokumen': ['dokumen', 'berkas', 'file', 'surat']
        }

    def extract_entities(self, text: str) -> List[str]:
        '''Return the categories with at least one keyword occurring (as a
        case-insensitive substring) in ``text``.'''
        haystack = text.lower()
        hits = [
            category
            for category, keywords in self.pmb_keywords.items()
            if any(keyword in haystack for keyword in keywords)
        ]
        return list(set(hits))

    def validate_entity_preservation(self, original_q: str, variation_q: str) -> Tuple[bool, List[str]]:
        '''Check that the variation covers at least 80% of the original's categories.

        Returns ``(is_valid, lost_entities)`` where ``lost_entities`` lists the
        categories found in the original but not in the variation. An original
        with no recognised category is always valid. The outcome is tallied in
        ``self.stats``.
        '''
        source_topics = set(self.extract_entities(original_q))
        candidate_topics = set(self.extract_entities(variation_q))

        if source_topics:
            kept = source_topics & candidate_topics
            is_valid = len(kept) / len(source_topics) >= 0.8
            lost_entities = list(source_topics - candidate_topics)
        else:
            is_valid, lost_entities = True, []

        self.stats['valid' if is_valid else 'invalid'] += 1
        return is_valid, lost_entities

    def report(self):
        '''Return the validity tally formatted as ``"valid/total (pct%)"``.'''
        n_valid = self.stats['valid']
        n_total = n_valid + self.stats['invalid']
        if n_total == 0:
            return '0/0 (0%)'
        return f"{n_valid}/{n_total} ({n_valid / n_total * 100:.1f}%)"


class ComprehensiveQualityValidator:
    '''Surface-level checks on a generated question: length, word count,
    similarity to the source, trailing question mark and punctuation count.'''

    def __init__(self, config):
        self.stats = defaultdict(int)
        self.config = config

    def validate_question(self, q: str, original_q: str) -> Tuple[bool, List[str]]:
        '''Run every surface check on ``q``.

        Returns ``(is_valid, issues)``; ``issues`` holds one short tag per
        failed check (``len:N``, ``wc:N``, ``too_similar:R``, ``not_question``,
        ``punct:N``) and is empty when the question passes. The outcome is
        tallied in ``self.stats``.
        '''
        cfg = self.config
        problems = []

        n_chars = len(q)
        if not (cfg.min_q_length <= n_chars <= cfg.max_q_length):
            problems.append(f'len:{n_chars}')

        n_words = len(q.split())
        if not (cfg.min_q_words <= n_words <= cfg.max_q_words):
            problems.append(f'wc:{n_words}')

        # Case-insensitive character-level ratio against the source question.
        ratio = SequenceMatcher(None, original_q.lower(), q.lower()).ratio()
        if ratio > cfg.max_q_similarity:
            problems.append(f'too_similar:{ratio:.2f}')

        if not q.strip().endswith('?'):
            problems.append('not_question')

        n_punct = sum(q.count(mark) for mark in '!?.,;:')
        if n_punct > 4:
            problems.append(f'punct:{n_punct}')

        passed = not problems
        self.stats['valid' if passed else 'invalid'] += 1
        return passed, problems


# Initialize validators
# Module-level instances; they are handed to EnhancedAugmentationPipeline
# further down, so their stats accumulate across every pipeline call.
print('\nüì• Initializing validators...')
dedup_validator = DeduplicationValidator(config)
pmb_validator = PMBDomainValidator()
quality_validator = ComprehensiveQualityValidator(config)
print('‚úÖ All validators initialized')


## 6️⃣ Load Model & Dataset

In [None]:
print('\nüì• Loading Gemma-3-1B-Instruct...')
start = time.time()

tokenizer = AutoTokenizer.from_pretrained(
    config.model_name,
    trust_remote_code=True,
    use_fast=True
)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

gpu_monitor.log('Before model load')

model = AutoModelForCausalLM.from_pretrained(
    config.model_name,
    torch_dtype=config.torch_dtype,
    device_map=config.device,
    load_in_4bit=False,
    use_cache=config.use_cache,
    trust_remote_code=True,
    low_cpu_mem_usage=True,
    attn_implementation='flash_attention_2',
)

model.eval()
gpu_monitor.log('After model load')

elapsed = time.time() - start
print(f'‚úÖ Model loaded in {elapsed:.1f}s')

# Load dataset
# Download the JSONL file from the Hugging Face Hub and parse one JSON
# object per non-blank line.
print('\nüì• Loading PMB dataset...')
jsonl_path = hf_hub_download(
    repo_id='Pandusu/pmb-v2',
    filename='fix.jsonl',
    repo_type='dataset'
)

with open(jsonl_path, 'r', encoding='utf-8') as f:
    data = [json.loads(raw_line) for raw_line in f if raw_line.strip()]

print(f'‚úÖ Loaded {len(data)} entries')


## 7️⃣ Enhanced Augmentation Pipeline

In [None]:
class EnhancedAugmentationPipeline:
    '''Paraphrase dataset questions with an LLM and keep only validated variations.

    For each source Q/A pair the model is prompted for three rewordings of the
    question. Every rewording passes through four filters in order (basic
    quality, semantic similarity, domain keywords, near-duplicate); each
    survivor becomes a new chat-format entry paired with the original answer.
    Outcome counters are accumulated in ``self.stats``.
    '''

    def __init__(self, config, model, tokenizer, semantic_validator,
                 dedup_validator, pmb_validator, quality_validator, gpu_monitor):
        self.config = config
        self.model = model
        self.tokenizer = tokenizer
        self.semantic_val = semantic_validator
        self.dedup_val = dedup_validator
        self.pmb_val = pmb_validator
        self.quality_val = quality_validator
        self.gpu_monitor = gpu_monitor

        # 'success'/'failed' count source items; 'rejected_*' and
        # 'total_variations' count individual variations.
        self.stats = {
            'total_processed': 0,
            'success': 0,
            'failed': 0,
            'total_variations': 0,
            'rejected_basic': 0,
            'rejected_semantic': 0,
            'rejected_domain': 0,
            'rejected_duplicate': 0,
        }

    def create_prompt(self, question: str) -> str:
        '''Build the Indonesian instruction asking for three restructured
        variations of ``question`` in ``VARIATION n: ...`` format.'''
        return f'''Buat 3 variasi pertanyaan BERBEDA struktur tentang topik YANG SAMA:

Pertanyaan asli: {question}

PENTING:
- Setiap variasi HARUS berbeda struktur
- Tetap dalam konteks PMB/admisi universitas
- Jangan ubah maksud pertanyaan
- Format output WAJIB:

VARIATION 1: [pertanyaan]
VARIATION 2: [pertanyaan]
VARIATION 3: [pertanyaan]'''

    def parse_variations(self, text: str) -> List[str]:
        '''Extract the question text from every ``VARIATION n: ...`` line.

        Surrounding quotes, dashes and bullets are stripped, parenthesised
        asides are removed and whitespace is collapsed. Results of 10
        characters or fewer are dropped.
        '''
        variations = []
        text = text.replace('Jawaban:', '').replace('**', '').strip()
        pattern = r'VARIATION\s*\d+:\s*([^\n]+)'

        for match in re.findall(pattern, text, re.IGNORECASE):
            # Strip wrapping quotes / list markers; \u2022 is the bullet
            # character.
            q_text = match.strip().strip('"\'-\u2022')
            q_text = re.sub(r'\([^)]*\)', '', q_text).strip()
            q_text = ' '.join(q_text.split())

            if len(q_text) > 10:
                variations.append(q_text)

        return variations

    @torch.inference_mode()
    def generate_batch(self, prompts: List[str]) -> List[str]:
        '''Sample one completion per prompt and return only the generated text.

        On any generation error a message is printed and a list of empty
        strings (one per prompt) is returned, so callers treat the whole
        batch as failed rather than crashing.
        '''
        if not prompts:
            return []
        try:
            encoded = self.tokenizer(
                prompts,
                padding=True,
                truncation=True,
                max_length=512,
                return_tensors='pt'
            )

            input_ids = encoded['input_ids'].to(self.config.device)
            attention_mask = encoded['attention_mask'].to(self.config.device)

            with torch.autocast(device_type='cuda', dtype=torch.bfloat16):
                outputs = self.model.generate(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    max_new_tokens=self.config.max_new_tokens,
                    temperature=self.config.temperature,
                    top_p=self.config.top_p,
                    top_k=self.config.top_k,
                    repetition_penalty=self.config.repetition_penalty,
                    do_sample=True,
                    pad_token_id=self.tokenizer.pad_token_id,
                    eos_token_id=self.tokenizer.eos_token_id,
                    use_cache=True,
                )

            # generate() returns prompt + continuation. Drop the prompt
            # tokens, otherwise the template's own "VARIATION n: [pertanyaan]"
            # lines would be parsed as model output.
            completions = outputs[:, input_ids.shape[1]:]

            return self.tokenizer.batch_decode(
                completions,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=True
            )
        except Exception as e:
            print(f'‚ùå Generation error: {str(e)[:50]}')
            return [''] * len(prompts)

    def extract_qa(self, entry: Dict) -> Tuple[Optional[str], Optional[str]]:
        '''Return ``(question, answer)`` from a chat-format entry.

        The question is the content of the last ``user`` message and the
        answer that of the last ``assistant`` message. Returns
        ``(None, None)`` when the entry has no ``messages`` key or is
        malformed.
        '''
        try:
            if 'messages' not in entry:
                return None, None
            q, a = None, None
            for msg in entry['messages']:
                role = msg.get('role')
                if role == 'user':
                    q = msg.get('content')
                elif role == 'assistant':
                    a = msg.get('content')
            return q, a
        except (TypeError, AttributeError):
            # entry or one of its messages is not the expected dict/list.
            return None, None

    def create_output_entry(self, variation: str, answer: str,
                           orig_id: str, var_num: int, metadata: Dict = None) -> Dict:
        '''Wrap a validated variation and the original answer as a chat entry.

        The id is ``"{orig_id}_var{var_num}"``; ``metadata`` (if given) is
        merged over the default metadata fields.
        '''
        base_metadata = {
            'id': f'{orig_id}_var{var_num}',
            'format_type': 'variation',
            'category': 'augmented',
            'verified': True,
            'source_id': orig_id
        }
        if metadata:
            base_metadata.update(metadata)

        return {
            'messages': [
                {'role': 'user', 'content': variation},
                {'role': 'assistant', 'content': answer}
            ],
            'metadata': base_metadata
        }

    def _filter_variations(self, original_q: str,
                           variations: List[str]) -> List[Tuple[str, float]]:
        '''Run the four validators over ``variations`` and return the
        ``(text, semantic_score)`` pairs that pass all of them.'''
        # Deduplication is scoped to the variations of one source question.
        self.dedup_val.reset()
        original_embedding = None
        survivors = []

        for variation in variations:
            # 1. BASIC VALIDATION
            basic_valid, _ = self.quality_val.validate_question(variation, original_q)
            if not basic_valid:
                self.stats['rejected_basic'] += 1
                continue

            # 2. SEMANTIC VALIDATION
            # Embed the original once per item, and only if some variation
            # gets this far.
            if original_embedding is None:
                original_embedding = self.semantic_val.get_embeddings([original_q])[0]
            semantic_valid, semantic_score = self.semantic_val.check_meaning_preservation(
                original_q, variation, original_embedding
            )
            if not semantic_valid:
                self.stats['rejected_semantic'] += 1
                continue

            # 3. DOMAIN VALIDATION
            domain_valid, _ = self.pmb_val.validate_entity_preservation(
                original_q, variation
            )
            if not domain_valid:
                self.stats['rejected_domain'] += 1
                continue

            # 4. DEDUPLICATION
            is_duplicate, _ = self.dedup_val.is_duplicate(
                variation,
                threshold=self.config.duplicate_threshold
            )
            if is_duplicate:
                self.stats['rejected_duplicate'] += 1
                continue

            survivors.append((variation, semantic_score))

        return survivors

    def process_batch_items(self, items: List[Dict]) -> List[Dict]:
        '''Generate and validate variations for one batch of dataset entries.

        Entries without both a question and an answer are skipped. Returns
        the new chat-format entries; an item that yields no valid variation
        (or raises while being processed) is counted in ``stats['failed']``.
        '''
        entries = []
        prompts = []
        mapping = []

        for position, item in enumerate(items):
            q, a = self.extract_qa(item)
            if q and a:
                prompts.append(self.create_prompt(q))
                mapping.append((position, item, q, a))

        if not prompts:
            return entries

        outputs = self.generate_batch(prompts)

        for (position, item, original_q, answer), output in zip(mapping, outputs):
            try:
                variations = self.parse_variations(output)
                valid_entries = (
                    self._filter_variations(original_q, variations)
                    if variations else []
                )
                if not valid_entries:
                    self.stats['failed'] += 1
                    continue

                # total_processed only advances after the whole batch, so
                # add the in-batch position to keep fallback ids unique.
                fallback_id = f'entry_{self.stats["total_processed"] + position}'
                orig_id = (item.get('metadata') or {}).get('id', fallback_id)

                for var_num, (variation, sem_score) in enumerate(valid_entries, 1):
                    entries.append(self.create_output_entry(
                        variation, answer, orig_id, var_num,
                        metadata={'semantic_score': float(sem_score)}
                    ))

                self.stats['success'] += 1
                self.stats['total_variations'] += len(valid_entries)

            except Exception as exc:
                # Best effort: one bad item must not abort the batch, but
                # say what went wrong instead of failing silently.
                self.stats['failed'] += 1
                print(f'‚ùå Item error ({type(exc).__name__}): {str(exc)[:50]}')

        return entries

    def process_dataset(self, data: List[Dict],
                       output_file: str = 'pmb_augmented_validated.jsonl'):
        '''Augment the whole dataset batch by batch and write JSONL output.

        Args:
            data: Chat-format source entries.
            output_file: Path of the UTF-8 JSONL file to (over)write.

        Returns:
            The cumulative ``self.stats`` dict.
        '''
        # Built outside the f-strings: backslashes inside f-string
        # expressions are a syntax error before Python 3.12.
        rule = '=' * 80

        print(f'\n{rule}')
        print(f'üöÄ ENHANCED AUGMENTATION WITH VALIDATION')
        print(f'{rule}')
        print(f' Batch size: {self.config.batch_size}')
        print(f' Total items: {len(data)}')
        print(f' Validators: Basic + Semantic + Domain + Dedup\n')

        start_time = time.time()
        # Ceiling division so a final partial batch is included.
        num_batches = (len(data) + self.config.batch_size - 1) // self.config.batch_size

        with open(output_file, 'w', encoding='utf-8') as f:
            for batch_idx in tqdm(range(num_batches), desc='Processing', unit='batch'):
                batch_start = batch_idx * self.config.batch_size
                batch = data[batch_start:batch_start + self.config.batch_size]

                entries = self.process_batch_items(batch)
                self.stats['total_processed'] += len(batch)

                for entry in entries:
                    f.write(json.dumps(entry, ensure_ascii=False) + '\n')

                if batch_idx % self.config.save_every_n_batches == 0:
                    # Persist partial output and release cached GPU blocks.
                    f.flush()
                    torch.cuda.empty_cache()
                    if self.config.log_memory:
                        self.gpu_monitor.log(f'Batch {batch_idx}/{num_batches}')

        elapsed = time.time() - start_time

        print(f'\n{rule}')
        print(f'‚úÖ AUGMENTATION RESULTS')
        print(f'{rule}')
        print(f' Total processed: {self.stats["total_processed"]}')
        print(f' ‚úÖ Success: {self.stats["success"]}')
        print(f' ‚ùå Failed: {self.stats["failed"]}')
        print(f' üö´ Rejected (basic): {self.stats["rejected_basic"]}')
        print(f' üö´ Rejected (semantic): {self.stats["rejected_semantic"]}')
        print(f' üö´ Rejected (domain): {self.stats["rejected_domain"]}')
        print(f' üö´ Rejected (duplicate): {self.stats["rejected_duplicate"]}')
        print(f' üìä Total variations: {self.stats["total_variations"]}')
        print(f' ‚è±Ô∏è Time: {elapsed:.1f}s')
        if elapsed > 0:
            print(f' üèÉ Speed: {self.stats["total_processed"]/elapsed:.1f} items/sec')
        print(f' üíæ Output: {output_file}')
        print(f' üìà Quality - Semantic: {self.semantic_val.report()}')
        print(f' üìà Quality - Domain: {self.pmb_val.report()}')
        print(f' üìà Quality - Dedup: {self.dedup_val.report()}')
        print(f'{rule}\n')

        return self.stats


# Wire the loaded model, tokenizer and the module-level validators/monitor
# into a single pipeline instance.
pipeline = EnhancedAugmentationPipeline(
    config, model, tokenizer, semantic_validator,
    dedup_validator, pmb_validator, quality_validator, gpu_monitor
)
print('‚úÖ Enhanced Pipeline initialized')


## 8️⃣ Test Augmentation (5 items dulu)

In [None]:
# Smoke test: run the pipeline on five entries before the full dataset.
_rule = '=' * 80
print('\n' + _rule)
print('üß™ TEST: Augmentation Pipeline (5 items)')
print(_rule + '\n')

test_batch = data[50:55]
test_entries = pipeline.process_batch_items(test_batch)

_n_in, _n_out = len(test_batch), len(test_entries)
print('\n‚úÖ Test batch completed')
print(f' Input items: {_n_in}')
print(f' Output variations: {_n_out}')
if _n_in > 0:
    # The prompt requests 3 variations per item, so 3 * items is the maximum.
    _rate = _n_out / (_n_in * 3) * 100
    print(f' Success rate: {_rate:.1f}%\n')

# Show samples
for _idx, _sample in enumerate(test_entries[:3], start=1):
    _question = _sample['messages'][0]['content']
    _info = _sample['metadata']
    print(f'[Sample {_idx}]')
    print(f' ID: {_info["id"]}')
    print(f' Q: {_question[:70]}...')
    print(f' Words: {len(_question.split())}')
    print(f' Semantic: {_info.get("semantic_score", "N/A")}')
    print()


## 9️⃣ Run Full Augmentation

In [None]:
# FULL AUGMENTATION
# Processes every entry in `data` and overwrites the output file. The
# returned stats are cumulative for this pipeline instance, so they also
# include the counts from the earlier test run.
stats = pipeline.process_dataset(
    data,
    output_file='pmb_augmented_validated.jsonl'
)

gpu_monitor.summary()


## 🔟 Output Analysis & Export

In [None]:
# Read the augmented file back and print summary statistics.
print('\n' + '='*80)
print('üìä OUTPUT ANALYSIS')
print('='*80)

# The pipeline writes this file as UTF-8; read it the same way instead of
# relying on the platform default encoding.
with open('pmb_augmented_validated.jsonl', encoding='utf-8') as f:
    output_data = [json.loads(line) for line in f if line.strip()]

print(f'\nTotal entries: {len(output_data)}')

word_counts = []
semantic_scores = []
for entry in output_data:
    q = entry['messages'][0]['content']
    word_counts.append(len(q.split()))
    if 'semantic_score' in entry['metadata']:
        semantic_scores.append(entry['metadata']['semantic_score'])

print(f'\nüìù Question Statistics:')
if word_counts:
    print(f' Words: {min(word_counts)}-{max(word_counts)} (avg: {sum(word_counts)/len(word_counts):.1f})')
    print(f' All end with ?: {all(entry["messages"][0]["content"].strip().endswith("?") for entry in output_data)}')
else:
    # min()/max() and the average are undefined for an empty output file.
    print(' No augmented entries to analyse')

if semantic_scores:
    print(f'\nüß† Semantic Quality:')
    print(f' Mean: {np.mean(semantic_scores):.3f}')
    print(f' Min: {np.min(semantic_scores):.3f}')
    print(f' Max: {np.max(semantic_scores):.3f}')

print(f'\nüìà Dataset Growth:')
print(f' Original: {len(data)} entries')
print(f' Augmented: {len(output_data)} variations')
if data:
    # Guarded: the ratio is undefined for an empty source dataset.
    print(f' Ratio: {len(output_data)/len(data):.2f}x')

# Export CSV
# Flatten the augmented entries to a question/answer table and write a
# small JSON summary next to it.
print('\n' + '='*80)
print('üíæ EXPORTING FORMATS')
print('='*80)

qa_data = [
    {
        'question': entry['messages'][0]['content'],
        'answer': entry['messages'][1]['content'],
        'semantic_score': entry['metadata'].get('semantic_score', ''),
        'verified': entry['metadata'].get('verified', '')
    }
    for entry in output_data
]
df = pd.DataFrame(qa_data)
df.to_csv('pmb_augmented_validated.csv', index=False, encoding='utf-8')

summary = {
    'original_entries': len(data),
    'augmented_entries': len(output_data),
    # 0.0 instead of a ZeroDivisionError when the source dataset is empty.
    'augmentation_ratio': len(output_data) / len(data) if data else 0.0,
    'semantic_mean': float(np.mean(semantic_scores)) if semantic_scores else 0,
    # all() over nothing is True; an empty output must not claim to be verified.
    'all_verified': bool(output_data) and all(
        e['metadata'].get('verified', False) for e in output_data
    )
}

# ensure_ascii=False may emit non-ASCII characters, so fix the encoding
# explicitly rather than using the platform default.
with open('augmentation_summary.json', 'w', encoding='utf-8') as f:
    json.dump(summary, f, indent=2, ensure_ascii=False)

print(f'\n‚úÖ Files saved:')
print(f' 1. pmb_augmented_validated.jsonl')
print(f' 2. pmb_augmented_validated.csv')
print(f' 3. augmentation_summary.json')
print('\n' + '='*80)
print('‚úÖ AUGMENTATION PIPELINE COMPLETE')
print('='*80)


## 1️⃣1️⃣ QLoRA Training (Next Phase)

Augmented dataset sudah siap! Sekarang untuk QLoRA training, gunakan file: `qlora_training_guide.md`

```text
# Dataset sudah tersimpan di:
pmb_augmented_validated.jsonl  # Ready untuk SFTTrainer

# Training akan menggunakan:
- Gemma-3-1B base model (4-bit)
- LoRA rank 16, alpha 32
- 3 epochs training
- Full training pipeline included
```
