# 🧠 NPC AI — Complete Training & Integration Pipeline

**BD-NSCA: Behavior-Driven Neuro-Symbolic Cognitive Architecture**

| Step | Description |
|------|-------------|
| 1 | Environment Setup |
| 2 | Training Data Generation |
| 3 | QLoRA Fine-Tuning (checkpoint/resume) |
| 4 | GGUF Export |
| 5 | Ollama Serving |
| 6 | Integrated Demo |
| 7 | Quality Evaluation |
| 8 | C++ Engine Compilation |
| 9 | Performance Benchmarking |
| 10 | Ablation Study Visualization |

> **Checkpoint/Resume**: Training auto-detects and resumes from existing checkpoints.


---
## 1. 🔧 Environment Setup & Dependencies


In [None]:
# ============================================================
# Cell 1: Environment Setup (Auto-Clone & Install)
# ============================================================
import os, sys, subprocess, shutil

# Platform detection: Kaggle is recognised by its /kaggle directory, Colab by an
# already-imported google.colab module; anything else is treated as a local run.
IN_KAGGLE = os.path.exists('/kaggle')
IN_COLAB = 'google.colab' in sys.modules
ENV_NAME = 'Kaggle' if IN_KAGGLE else ('Colab' if IN_COLAB else 'Local')
print(f'🌍 Environment: {ENV_NAME}')

if IN_KAGGLE:
    if not os.path.exists('NPC-AI'):
        print('📥 Cloning NPC-AI repository...')
        subprocess.run(['git', 'clone', 'https://github.com/minhphuc477/NPC-AI.git'], check=True)

    # Mirror cpp/ and data/ from the clone into the working directory so later
    # cells can use short relative paths. Existing entries are never overwritten.
    for folder in ['cpp', 'data']:
        src = f'NPC-AI/{folder}'
        if os.path.exists(src):
            if not os.path.exists(folder):
                print(f'📂 Cloning {folder} to root...')
                shutil.copytree(src, folder)
            else:
                # Folder already exists: only fill in top-level items that are missing.
                for item in os.listdir(src):
                    s, d = os.path.join(src, item), os.path.join(folder, item)
                    if not os.path.exists(d):
                        if os.path.isdir(s): shutil.copytree(s, d)
                        else: shutil.copy2(s, d)

if IN_KAGGLE or IN_COLAB:
    print('📦 Installing Unsloth and dependencies...')
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', 'unsloth[kaggle-new] @ git+https://github.com/unslothai/unsloth.git'])
    # --no-deps here so pip does not pull in other versions of these packages' dependencies.
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', '--no-deps', 'trl>=0.12.0', 'peft>=0.7.1', 'accelerate>=0.26.0', 'bitsandbytes>=0.40.0'])
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', 'transformers>=4.45.0', 'datasets', 'sentencepiece', 'protobuf'])

    print('📦 Installing Ollama...')
    # Best effort: an Ollama failure is reported but must not abort the notebook.
    try:
        subprocess.run(['apt-get', 'update'], check=True, capture_output=True)
        subprocess.run(['apt-get', 'install', '-y', 'zstd'], check=True, capture_output=True)
        install_status = os.system('curl -fsSL https://ollama.com/install.sh | sh')
        if shutil.which('ollama'):
            print('✅ Ollama installed successfully!')
        else:
            # os.system does not raise, so a failed installer would otherwise go unreported.
            print(f'❌ Failed to install Ollama: installer exit status {install_status}, `ollama` is not on PATH')
    except Exception as e: print(f'❌ Failed to install Ollama: {e}')

# Imported only after the optional installs above.
import torch
if torch.cuda.is_available():
    print(f'🎮 GPU: {torch.cuda.get_device_name(0)}')
else: print('⚠️  No GPU detected!')


In [ ]:
# ============================================================
# Step 1.5: Patching C++ Engine for Linux compatibility
# ============================================================
import os, textwrap, re
print('🛠️ Patching C++ engine code for Linux...')

def apply_patch(filepath, old_str, new_str):
    """Replace every occurrence of ``old_str`` with ``new_str`` in a UTF-8 text file.

    A status line is printed for each outcome (file missing, patched, or
    nothing to patch). The file is rewritten only when ``old_str`` occurs in
    it. Always returns None.
    """
    if not os.path.exists(filepath):
        print(f'⚠️ File not found: {filepath}')
        return

    with open(filepath, 'r', encoding='utf-8') as source_file:
        original_text = source_file.read()

    if old_str not in original_text:
        print(f'☑️ No patch needed for {filepath}')
        return

    patched_text = original_text.replace(old_str, new_str)
    with open(filepath, 'w', encoding='utf-8') as target_file:
        target_file.write(patched_text)
    print(f'✅ Patched {filepath}')

def prepend_to_file(filepath, prepend_str, guard=None):
    """Insert ``prepend_str`` at the very start of a UTF-8 text file.

    When ``guard`` is a non-empty string that already occurs anywhere in the
    file, the file is left untouched. Without a guard the text is prepended on
    every call. A status line is printed for each outcome; returns None.
    """
    if not os.path.exists(filepath):
        print(f'⚠️ File not found: {filepath}')
        return

    with open(filepath, 'r', encoding='utf-8') as source_file:
        existing_text = source_file.read()

    guard_present = bool(guard) and guard in existing_text
    if guard_present:
        print(f'☑️ Already has headers: {filepath}')
        return

    combined_text = prepend_str + existing_text
    with open(filepath, 'w', encoding='utf-8') as target_file:
        target_file.write(combined_text)
    print(f'✅ Prepended to {filepath}')

# Fix 1: PythonBridge.cpp — Linux process management headers
prepend_to_file('cpp/src/PythonBridge.cpp', 
    '#include <thread>\n#include <chrono>\n#ifdef _WIN32\n#include <windows.h>\n#else\n#include <unistd.h>\n#include <sys/types.h>\n#include <sys/wait.h>\n#include <fcntl.h>\n#include <signal.h>\n#endif\n', 
    guard='unistd.h')

# Fix 2: NPCInference.cpp — missing thread/chrono
prepend_to_file('cpp/src/NPCInference.cpp', '#include <thread>\n#include <chrono>\n', guard='<thread>')

# Fix 3: HybridRetriever.h — signature fix
apply_patch('cpp/include/HybridRetriever.h', 
    'Search(const std::string& query, const RetrievalConfig& config = {});', 
    'Search(const std::string& query);\n    std::vector<RetrievalResult> Search(const std::string& query, const RetrievalConfig& config);')


def _find_cpp_block_end(text, start):
    """Return the index just past the ``}`` that closes the first brace block at or after ``start``.

    Braces inside string literals, character literals, comments and
    parenthesised parameter lists are ignored. Returns None when a top-level
    ``;`` is reached before any ``{`` (a declaration rather than a definition)
    or when the braces never balance. Raw string literals (``R"(...)"``) are
    not understood by this scanner.
    """
    depth = 0
    parens = 0
    i, n = start, len(text)
    while i < n:
        ch = text[i]
        if text.startswith('//', i):
            newline = text.find('\n', i)
            if newline == -1:
                return None
            i = newline
            continue
        if text.startswith('/*', i):
            close = text.find('*/', i + 2)
            if close == -1:
                return None
            i = close + 2
            continue
        if ch in '"\'':
            # Skip the whole literal, stepping over backslash escapes.
            i += 1
            while i < n and text[i] != ch:
                i += 2 if text[i] == '\\' else 1
            i += 1
            continue
        if ch == '(':
            parens += 1
        elif ch == ')':
            parens -= 1
        elif depth == 0 and parens > 0:
            pass  # still inside the parameter list (e.g. a `= {}` default)
        elif ch == '{':
            depth += 1
        elif ch == '}':
            depth -= 1
            if depth == 0:
                return i + 1
            if depth < 0:
                return None
        elif ch == ';' and depth == 0:
            return None
        i += 1
    return None


def replace_cpp_function(filepath, qualified_name, new_source):
    """Replace the definition of ``std::string <qualified_name>(...)`` with ``new_source``.

    The old definition is located by brace matching, so nested blocks inside
    either the old or the new body do not truncate the match and the
    operation is idempotent. ``new_source`` is inserted verbatim (a plain
    string splice, so backslash sequences such as ``\\n`` in the C++ text are
    preserved). Returns True when the file contains the new definition
    afterwards, False when the file or the definition could not be found.
    """
    if not os.path.exists(filepath):
        print(f'⚠️ File not found: {filepath}')
        return False
    with open(filepath, 'r', encoding='utf-8') as f: content = f.read()

    # Require the opening parenthesis so longer names sharing the prefix are not matched.
    signature = re.compile(r'std::string\s+' + re.escape(qualified_name) + r'\s*\(')
    for match in signature.finditer(content):
        end = _find_cpp_block_end(content, match.start())
        if end is None:
            continue  # declaration or unbalanced text; try the next occurrence
        patched = content[:match.start()] + new_source.strip() + content[end:]
        if patched == content:
            print(f'☑️ {qualified_name} already up to date in {filepath}')
        else:
            with open(filepath, 'w', encoding='utf-8') as f: f.write(patched)
            print(f'✅ Overwrote {qualified_name}')
        return True

    print(f'⚠️ Definition of {qualified_name} not found in {filepath}')
    return False


# Fix 4: PromptBuilder.cpp — Align with [NPC]/[PLAYER] format
# NOTE(review): this emits a single "\n" before "[NPC] " while the Python prompts in this
# notebook use "\n\n[NPC] " — confirm which form the model should see at inference time.
pb_new_source = textwrap.dedent("""
    std::string PromptBuilder::BuildAdvanced(const json& npcData, const json& gameState, const std::string& playerInput, const std::string& language, const json& tools) {
        bool isVi = (language == "vi");
        std::stringstream ss;
        std::string persona = npcData.value(isVi ? "persona_vi" : "persona_en", npcData.value("persona", isVi ? "Bạn là một NPC." : "You are an NPC."));
        ss << "[INSTRUCTION] " << (isVi ? "Trả lời bằng tiếng Việt. " : "Respond strictly in English. ") << persona << "\\n";
        if (!tools.is_null() && !tools.empty()) ss << (isVi ? "Công cụ khả dụng: " : "Available tools: ") << tools.dump() << "\\n";
        ss << "\\n[CONTEXT]\\n";
        json cog; cog["npc_info"] = {{"name", npcData.value("name", "NPC")}, {"mood", gameState.value("mood_state", "Neutral")}, {"health", gameState.value("health_state", "Healthy")}};
        if (gameState.contains("current_emotion")) cog["current_emotion"] = gameState["current_emotion"];
        if (gameState.contains("memories")) cog["memories"] = gameState["memories"];
        if (gameState.contains("relationships")) cog["relationships"] = gameState["relationships"];
        if (gameState.contains("knowledge")) cog["knowledge"] = gameState["knowledge"];
        if (gameState.contains("recent_history")) cog["recent_dialogue"] = gameState["recent_history"];
        if (gameState.contains("memory_context") && !gameState["memory_context"].get<std::string>().empty()) cog["historical_memories"] = gameState["memory_context"];
        ss << cog.dump() << "\\n\\n[PLAYER] " << playerInput << "\\n[NPC] ";
        return ss.str();
    }
""")
replace_cpp_function('cpp/src/PromptBuilder.cpp', 'PromptBuilder::BuildAdvanced', pb_new_source)

# Fix 5: NPCInference.cpp — Wire BuildAdvancedContext into Chat() with shared_ptr
chat_new_source = textwrap.dedent("""
    std::string NPCInferenceEngine::Chat(const std::string& session_id, const std::string& user_message) {
        if (!conversation_manager_) return "Error: No conversation manager";

        auto ctx = conversation_manager_->GetSession(session_id); // Now returns shared_ptr
        if (!ctx) return "Error: Invalid session ID";

        conversation_manager_->AddMessage(session_id, "user", user_message);
        json advanced_context = BuildAdvancedContext(ctx->npc_name, user_message);

        std::string history_str = "";
        auto history = conversation_manager_->GetHistory(session_id, 6);
        for (const auto& msg : history) {
            history_str += (msg.role == "user" ? ctx->player_name : ctx->npc_name) + ": " + msg.content + "\\n";
        }

        advanced_context["recent_history"] = history_str;
        advanced_context["npc_id"] = ctx->npc_name;
        advanced_context["player_id"] = ctx->player_name;
        advanced_context["conversation_id"] = session_id;

        std::string response = GenerateWithState(user_message, advanced_context, false);
        conversation_manager_->AddMessage(session_id, "assistant", response);

        if (config_.enable_graph) Learn(user_message);
        return response;
    }
""")
replace_cpp_function('cpp/src/NPCInference.cpp', 'NPCInferenceEngine::Chat', chat_new_source)

print('🎉 C++ patching complete!')


---
## 2. 📝 Training Data Generation (Enhanced)


In [None]:
# ============================================================
# Cell 2: Training Data Generation (Refined English)
# ============================================================
import json, random, os
# Input/output locations for the synthetic training set.
os.makedirs('data', exist_ok=True)
PERSONAS_PATH = 'data/personas.json'
UTTERANCES_PATH = 'data/player_utterances.json'
OUTPUT_PATH = 'data/npc_training_v2.json'

# Personas: start from the built-in single-merchant fallback and replace it
# with the JSON file's content when that file is present.
personas = {'merchant': {'persona_en': 'You are a Merchant.', 'traits': ['friendly'], 'id': 'merchant'}}
if os.path.exists(PERSONAS_PATH):
    with open(PERSONAS_PATH, 'r', encoding='utf-8') as f:
        personas = json.load(f)

# Player utterances: same scheme, with a one-greeting fallback.
utterances = {'greetings': {'en': ['Hello!']}}
if os.path.exists(UTTERANCES_PATH):
    with open(UTTERANCES_PATH, 'r', encoding='utf-8') as f:
        utterances = json.load(f)

def generate_heuristic_response(persona, category, player_input):
    """Build a templated NPC reply for one player utterance.

    Args:
        persona: dict read via ``.get``; ``'id'`` (default ``'NPC'``) supplies
            the display name after stripping ``'npc_'`` and capitalising, and
            ``'traits'`` supplies an optional list of trait words.
        category: utterance category; ``'greetings'`` and ``'trade_related'``
            have dedicated templates, anything else gets a generic reply.
        player_input: the player's text, echoed in the generic reply.

    Returns:
        The reply string. A trait is drawn at random whenever traits are
        present, and greetings pick one of four templates at random.
    """
    name = persona.get('id', 'NPC').replace('npc_', '').capitalize()
    traits = persona.get('traits', [])
    # Drawn before any template choice so the sequence of random draws is fixed.
    trait_str = random.choice(traits) if traits else 'friendly'

    if category == 'greetings':
        greeting_options = [
            f"{name} looks at you with a {trait_str} expression. 'Welcome, traveler. What brings you here?'",
            f"'Ah, a new face!' {name} exclaims. 'I hope your journey was smoother than mine.'",
            f"{name} pauses for a moment. 'I have much to share, but tell me, what is your business in this village?'",
            f"The {name} nods slowly. 'Greetings. I am here to help, if you have the coin.'",
        ]
        return random.choice(greeting_options)

    if category == 'trade_related':
        return f"{name} eyes your gear carefully. 'I deal in quality only. Are you buying or just looking?'"

    return f"{name} considers your words deeply. 'Interesting... {player_input} is not something I hear every day.'"

# Build 1200 synthetic prompt/completion pairs by sampling a persona, an
# utterance category and an English utterance, then answering heuristically.
dataset = []
persona_list = list(personas.values())
categories = list(utterances.keys())
for _ in range(1200):
    p = random.choice(persona_list)
    c = random.choice(categories)
    # `or` also covers a category whose 'en' list exists but is empty, which
    # random.choice would otherwise reject with IndexError.
    q = random.choice(utterances[c].get('en') or ['Hello'])
    a = generate_heuristic_response(p, c, q)
    ctx = {'memories': [], 'current_emotion': {'description': 'neutral', 'valence': 0.0}, 'knowledge': [], 'npc_info': {'name': p.get('id', 'NPC'), 'persona': p.get('persona_en', '')}}
    # The prompt ends with "[NPC] " so the completion is exactly the NPC's line.
    prompt = "[INSTRUCTION] Respond strictly in English.\n[CONTEXT]\n" + json.dumps(ctx, ensure_ascii=False) + "\n\n[PLAYER] " + q + "\n\n[NPC] "
    dataset.append({'prompt': prompt, 'completion': a})

with open(OUTPUT_PATH, 'w', encoding='utf-8') as f: json.dump(dataset, f, indent=1, ensure_ascii=False)
print(f'✅ Generated {len(dataset)} REFINED training samples at {OUTPUT_PATH}')


In [None]:
# ============================================================
# Cell 3: Write Standalone Training Script (120 Steps)
# ============================================================
import os  # imported here so this cell also runs when executed on its own

# Source of the standalone trainer that the fine-tuning cell launches through
# `accelerate launch`. It is kept as a string and written to scripts/train_unsloth.py.
script_content = """import torch, argparse, os, shutil, glob, gc
import torch.distributed as dist
from unsloth import FastLanguageModel
from trl import SFTTrainer, SFTConfig
from datasets import Dataset
import json

def train(dataset_path, output_dir):
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = 'unsloth/llama-3-8b-instruct-bnb-4bit', max_seq_length = 2048, load_in_4bit = True
    )
    model = FastLanguageModel.get_peft_model(model, r = 16, target_modules = ['q_proj', 'k_proj', 'v_proj', 'o_proj'], lora_alpha = 16, lora_dropout = 0, bias = 'none')
    
    with open(dataset_path, 'r', encoding='utf-8') as f: data = json.load(f)
    dataset = Dataset.from_list([{'text': d['prompt'] + d['completion'] + '<|end|>'} for d in data])

    # Resume only when a Trainer checkpoint exists; a saved adapter alone is not resumable.
    resume = len(glob.glob(os.path.join(output_dir, 'checkpoint-*'))) > 0
    trainer = SFTTrainer(
        model = model, tokenizer = tokenizer, train_dataset = dataset, dataset_text_field = 'text',
        max_seq_length = 2048, args = SFTConfig(
            per_device_train_batch_size = 2, gradient_accumulation_steps = 4, warmup_steps = 10,
            max_steps = 120, learning_rate = 2e-4, save_total_limit = 1, save_steps = 9999, logging_steps = 1, optim = 'adamw_8bit',
            output_dir = output_dir, report_to = 'none', ddp_find_unused_parameters=False, gradient_checkpointing_kwargs={'use_reentrant': False}, fp16 = not torch.cuda.is_bf16_supported(), bf16 = torch.cuda.is_bf16_supported(),
        ),
    )
    print('🚀 Training starting (120 steps)...')
    trainer.train(resume_from_checkpoint = resume)

    if dist.is_initialized(): dist.barrier()
    if not dist.is_initialized() or dist.get_rank() == 0:
        print('💾 Saving LoRA adapter...')
        model.save_pretrained(output_dir)
        tokenizer.save_pretrained(output_dir)
        for ckpt in glob.glob(os.path.join(output_dir, 'checkpoint-*')):
            shutil.rmtree(ckpt, ignore_errors=True)
        print(f'✅ Model adapter saved to {output_dir}')
    
    if dist.is_initialized(): dist.barrier()
    print('✅ Training process finished successfully.')

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--dataset', type=str, default='data/npc_training_v2.json')
    parser.add_argument('--output_dir', type=str, default='outputs/npc_model')
    args = parser.parse_args()
    train(args.dataset, args.output_dir)
"""
os.makedirs('scripts', exist_ok=True)
# Explicit UTF-8: the script contains emoji, which a non-UTF-8 default encoding cannot write.
with open('scripts/train_unsloth.py', 'w', encoding='utf-8') as f: f.write(script_content)
print('✅ Standalone training script written (120 steps)')



---
## 3. 🚀 QLoRA Fine-Tuning


In [None]:
# ============================================================
# Cell 4: Execute Fine-tuning & GGUF Export
# ============================================================
import subprocess, sys, os, torch, gc

print('🚀 Starting fine-tuning...')
# One training process per visible GPU (at least one), so the launch also works
# on single-GPU hosts; on a dual-GPU host this is the same 2 as before.
num_processes = max(1, torch.cuda.device_count())
subprocess.check_call(['accelerate', 'launch', '--num_processes', str(num_processes), 'scripts/train_unsloth.py', 
                    '--dataset', 'data/npc_training_v2.json',
                    '--output_dir', 'outputs/npc_model'])

gc.collect()
torch.cuda.empty_cache()
print('✅ Fine-tuning complete and VRAM cleared.')

print('📦 Writing GGUF Export Script...')
# The export runs as a separate single-process script (see the message below).
gguf_script = """import torch, os, glob, shutil
from unsloth import FastLanguageModel

output_dir = 'outputs/npc_model'
print('Loading saved LoRA model for export...')
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = output_dir, max_seq_length = 2048, load_in_4bit = True
)
temp_export = '/tmp/model_export'
os.makedirs(temp_export, exist_ok=True)
print('Converting to GGUF (q4_k_m)... This may take ~16 minutes.')
model.save_pretrained_gguf(temp_export, tokenizer, quantization_method='q4_k_m', maximum_memory_usage=0.55)

print('🚚 Moving GGUF to working directory...')
# /kaggle/working only exists on Kaggle; elsewhere fall back to the current directory.
dest_dir = '/kaggle/working' if os.path.isdir('/kaggle/working') else os.getcwd()
for f in glob.glob(f'{temp_export}*.gguf') + glob.glob(f'{temp_export}/**/*.gguf', recursive=True):
    shutil.move(f, os.path.join(dest_dir, os.path.basename(f)))
print('✅ GGUF export complete.')
"""
os.makedirs('scripts', exist_ok=True)
# Explicit UTF-8: the script contains emoji, which a non-UTF-8 default encoding cannot write.
with open('scripts/export_gguf.py', 'w', encoding='utf-8') as f:
    f.write(gguf_script)

print('🚀 Executing GGUF Export (Single Process to avoid NCCL Timeout)...')
subprocess.check_call([sys.executable, 'scripts/export_gguf.py'])
print('✅ All processes finished successfully.')



---
## 4. 📦 GGUF Export


In [None]:
# ============================================================
# Cell 5: GGUF Export Status
# ============================================================
import os, glob
# Look for an exported model: any .gguf in or below the working directory whose
# path does not mention one of the excluded tags and that is larger than 500 MiB.
_EXCLUDED_TAGS = ['vocab', 'embedding', 'bge', 'bert']
_MIN_GGUF_BYTES = 500 * 1024 * 1024

all_ggufs = glob.glob('*.gguf') + glob.glob('**/*.gguf', recursive=True)
candidates = []
for gguf_path in all_ggufs:
    lowered = gguf_path.lower()
    if any(tag in lowered for tag in _EXCLUDED_TAGS):
        continue
    if os.path.exists(gguf_path) and os.path.getsize(gguf_path) > _MIN_GGUF_BYTES:
        candidates.append(gguf_path)

if not candidates:
    print('⚠️ GGUF not found. Check training logs.')
    trained_model_path = 'unsloth/llama-3-8b-instruct-gguf'
else:
    trained_model_path = candidates[0]
    print(f'✅ GGUF found: {trained_model_path}')


---
## 5. 🤖 Ollama Serving


In [None]:
# ============================================================
# Cell 6: Ollama Serving (Safe Register & Stop Tokens)
# ============================================================
import subprocess, time, requests, os, glob
print('🚀 Starting Ollama server...')

def _ollama_is_up():
    """Return True when the local Ollama API answers /api/tags with HTTP 200."""
    try:
        return requests.get('http://localhost:11434/api/tags', timeout=1).status_code == 200
    except requests.RequestException:
        return False

if _ollama_is_up():
    print('✅ Ollama is ALREADY running.')
else:
    # Send server output to a log file: an unread PIPE can fill up and block
    # the long-running server. The child keeps its own handle after the `with`.
    with open('ollama_serve.log', 'ab') as server_log:
        subprocess.Popen(['ollama', 'serve'], stdout=server_log, stderr=subprocess.STDOUT)
    # Poll for readiness instead of sleeping a fixed time.
    for _ in range(30):
        if _ollama_is_up(): break
        time.sleep(1)
    else:
        print('⚠️ Ollama did not answer within 30s; see ollama_serve.log.')

# Prefer the path found by the GGUF status cell; otherwise search the usual locations.
tm_path = globals().get('trained_model_path')
if not tm_path or not os.path.exists(tm_path):
    all_ggufs = glob.glob('model_gguf/*.gguf') + glob.glob('*.gguf') + glob.glob('outputs/*.gguf')
    candidates = [f for f in all_ggufs if not any(x in f.lower() for x in ['vocab', 'embedding', 'bge', 'bert'])]
    candidates = [f for f in candidates if os.path.exists(f) and os.path.getsize(f) > 500 * 1024 * 1024]
    if candidates: tm_path = candidates[0]

if tm_path and os.path.exists(tm_path):
    # Stop sequences mirror the prompt section markers so generation ends after the NPC's turn.
    lines = [
        f'FROM {tm_path}', 'PARAMETER temperature 0.7',
        'PARAMETER stop "[PLAYER]"', 'PARAMETER stop "[INSTRUCTION]"', 'PARAMETER stop "[CONTEXT]"', 'PARAMETER stop "<|end|>"',
        'SYSTEM "You are an NPC. Always respond strictly in English as the [NPC] speaker. Do not repeat the prompt."'
    ]
    with open('Modelfile', 'w') as f: f.write('\n'.join(lines))
    print(f'📦 Registering model npc-ai from {tm_path}...')
    create_result = subprocess.run(['ollama', 'create', 'npc-ai', '-f', 'Modelfile'])
    if create_result.returncode != 0:
        print(f'❌ ollama create failed with exit code {create_result.returncode}.')
else: print('❌ Model file NOT FOUND.')


---
## 6. 🎮 Integrated Demo (Enhanced)


In [None]:
# ============================================================
# Cell 7: Integrated Demo (Clean Turns)
# ============================================================
import json, requests
def query_npc(player_input):
    """Send one player line to the local ``npc-ai`` Ollama model and return the reply text.

    The prompt uses the [INSTRUCTION]/[CONTEXT]/[PLAYER]/[NPC] layout with a
    fixed blacksmith context. On HTTP 200 the text after the last ``[NPC]``
    marker is returned, stripped. Any other status yields ``"[Error <code>]"``
    and any exception raised while calling the API yields ``"[Error: <exc>]"``.
    """
    ctx = {'memories': [], 'current_emotion': {'description': 'neutral', 'valence': 0.0}, 'npc_info': {'name': 'Blacksmith', 'persona': 'A friendly blacksmith.'}}
    prompt = (
        "[INSTRUCTION] Respond strictly in English.\n[CONTEXT]\n"
        + json.dumps(ctx)
        + "\n\n[PLAYER] "
        + player_input
        + "\n\n[NPC] "
    )
    try:
        stop_sequences = ["[PLAYER]", "[INSTRUCTION]", "<|end|>"]
        payload = {"model": "npc-ai", "prompt": prompt, "stream": False, "options": {"stop": stop_sequences}}
        res = requests.post("http://localhost:11434/api/generate", json=payload, timeout=60)
        if res.status_code != 200:
            return f"[Error {res.status_code}]"
        text = res.json().get('response', '[No response]')
        # Keep only what follows the last [NPC] marker in case the model echoed the prompt.
        return text.split('[NPC]')[-1].strip()
    except Exception as e:
        return f"[Error: {e}]"

# Run two sample player lines through the model and print each exchange.
demo_inputs = ["Hello! I am new here.", "What is the curse?"]
for inp in demo_inputs:
    npc_reply = query_npc(inp)
    print(f"👤 Player: {inp}\n🤖 NPC: {npc_reply}\n")


---
## 7. 📊 Quality Evaluation


In [None]:
print('📊 Evaluating responses...')
# Smoke test: send a few fixed prompts and show the first 50 characters of each reply.
test_queries = ["Hello!", "Who are you?", "Tell me a story."]
for q in test_queries:
    resp = query_npc(q)
    preview = resp[:50]
    print(f"Q: {q}\nA: {preview}...\n")


---
## 8. 🛠️ C++ Engine Compilation


In [None]:
# ============================================================
# Cell 10: C++ Engine Compilation (Optimized)
# ============================================================
import os, subprocess
# Configure and build the C++ engine out-of-source in cpp/build.
if os.path.exists('cpp'):
    os.makedirs('cpp/build', exist_ok=True)
    try:
        subprocess.check_call(['cmake', '..'], cwd='cpp/build')
        # Parallel job count from the standard library rather than an external `nproc` binary:
        # the CPUs this process may run on where the OS exposes that, else the total count.
        if hasattr(os, 'sched_getaffinity'):
            nproc = len(os.sched_getaffinity(0))
        else:
            nproc = os.cpu_count() or 1
        subprocess.check_call(['make', f'-j{nproc}'], cwd='cpp/build')
        print('✅ Compilation successful!')
    except subprocess.CalledProcessError as e: print(f'❌ Failed: {e}')
    # A missing cmake/make raises FileNotFoundError, not CalledProcessError.
    except FileNotFoundError as e: print(f'❌ Failed: build tool not found ({e})')
else: print('⚠️ cpp/ not found.')


---
## 9. 📈 Performance Benchmarking


In [None]:
# ============================================================
# Cell 11: C++ Engine Benchmarks
# ============================================================
import os, subprocess

# Run each known benchmark binary from cpp/build, echoing its output.
if os.path.exists('cpp/build'):
    print("🚀 Running C++ Engine Benchmarks...")
    benchmarks = ['bench_engine', 'bench_memory', 'bench_retrieval', 'ablation_suite']
    
    for bench in benchmarks:
        path = f'cpp/build/{bench}'
        if os.path.exists(path):
            print(f"\n📊 Executing {bench}...")
            print('-'*40)
            try:
                res = subprocess.run([path], capture_output=True, text=True, timeout=300)
                print(res.stdout)
                if res.stderr: print(f"⚠️ Stderr: {res.stderr}")
                # A crashed benchmark would otherwise be indistinguishable from a successful one.
                if res.returncode != 0:
                    print(f"❌ {bench} exited with code {res.returncode}.")
            except subprocess.TimeoutExpired:
                print(f"❌ {bench} timed out after 5 minutes.")
            except Exception as e:
                print(f"❌ Failed to run {bench}: {e}")
        else:
            print(f"⚠️ Benchmark binary not found: {path}")
else:
    print("❌ C++ build directory (cpp/build) not found! Please run Cell 10 first.")


---
## 10. 📊 Ablation Study Visualization


In [None]:
# ============================================================
# Cell 12: Visualize Ablation Results
# ============================================================
import os, json
import pandas as pd
import matplotlib.pyplot as plt

results_path = 'cpp/build/ablation_results.json'
if not os.path.exists(results_path):
    print(f"⚠️ Ablation results not found at {results_path}. Make sure Cell 11 ran successfully.")
else:
    print(f"📈 Loading ablation results from {results_path}...")
    with open(results_path, 'r') as f:
        data = json.load(f)

    # One table row per configuration; metrics missing from the JSON default to 0.
    records = [
        {
            'Configuration': config,
            'Latency p95 (ms)': metrics.get('latency_p95_ms', 0),
            'Throughput (tok/s)': metrics.get('throughput_tok_s', 0),
            'Memory (MB)': metrics.get('peak_memory_mb', 0),
        }
        for config, metrics in data.items()
    ]

    df = pd.DataFrame(records)
    # Printed rather than display()-ed so the cell also works outside a notebook.
    print(df)

    fig, axes = plt.subplots(1, 3, figsize=(18, 5))

    # (column, bar colour, panel title, y-axis label) for each of the three panels.
    panels = [
        ('Latency p95 (ms)', 'salmon', '95th Percentile Latency (Lower is Better)', 'Milliseconds (ms)'),
        ('Throughput (tok/s)', 'skyblue', 'Generation Throughput (Higher is Better)', 'Tokens per Second'),
        ('Memory (MB)', 'lightgreen', 'Peak Memory Usage (Lower is Better)', 'Megabytes (MB)'),
    ]
    for ax, (column, bar_color, title, y_label) in zip(axes, panels):
        df.plot(x='Configuration', y=column, kind='bar', ax=ax, color=bar_color, legend=False)
        ax.set_title(title)
        ax.set_ylabel(y_label)
        ax.tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.show()
