In [1]:
"""
annotation_guidelines.py

D·ª±a tr√™n nghi√™n c·ª©u "Annotation Guidelines for Dialogue Generation" (Chu et al., 2022)
v√† "Creating Consistent Character Personas for AI" (Rashkin et al., 2019)
"""

import json
import os
from typing import Dict, List, Tuple
from dataclasses import dataclass
from enum import Enum
import markdown

class NPCState(Enum):
    NORMAL = "normal"        # ƒêang ƒëi tu·∫ßn, kh√¥ng b·ªã ƒëe d·ªça
    ALERT = "alert"          # C·∫£nh gi√°c, ph√°t hi·ªán m·ªëi ƒëe d·ªça
    COMBAT = "combat"        # ƒêang chi·∫øn ƒë·∫•u
    INJURED = "injured"      # B·ªã th∆∞∆°ng, m√°u th·∫•p

class DialogueAct(Enum):
    GREETING = "greeting"    # Ch√†o h·ªèi
    THREAT = "threat"        # ƒêe d·ªça
    SURRENDER = "surrender"  # ƒê·∫ßu h√†ng
    REQUEST = "request"      # Y√™u c·∫ßu
    INFORM = "inform"        # Cung c·∫•p th√¥ng tin
    QUESTION = "question"    # H·ªèi
    WARNING = "warning"      # C·∫£nh b√°o
    TAUNT = "taunt"          # Khi√™u kh√≠ch
    PANIC = "panic"          # Ho·∫£ng lo·∫°n

class EmotionalIntensity(Enum):
    LEVEL_1 = 1  # Trung l·∫≠p, b√¨nh tƒ©nh
    LEVEL_2 = 2  # Nh·∫π nh√†ng, tho·∫£i m√°i
    LEVEL_3 = 3  # Trung b√¨nh, nghi√™m t√∫c
    LEVEL_4 = 4  # M·∫°nh, cƒÉng th·∫≥ng
    LEVEL_5 = 5  # R·∫•t m·∫°nh, b·∫°o l·ª±c/ho·∫£ng lo·∫°n

@dataclass
class CharacterProfile:
    """Character profile d·ª±a tr√™n ph∆∞∆°ng ph√°p PELT (Persona, Emotion, Language, Traits)"""
    name: str = "L√≠nh g√°c trung c·ªï"
    age: str = "35-45 tu·ªïi"
    background: str = "N√¥ng d√¢n tr∆∞·ªõc ƒë√¢y, nh·∫≠p ng≈© 10 nƒÉm"
    personality_traits: List[str] = None
    speech_patterns: List[str] = None
    values: List[str] = None
    
    def __post_init__(self):
        if self.personality_traits is None:
            self.personality_traits = [
                "Nghi√™m t√∫c", "Trung th√†nh", "C·∫£nh gi√°c cao",
                "Ki√™n nh·∫´n", "B·∫£o th·ªß", "Th·ª±c t·∫ø"
            ]
        if self.speech_patterns is None:
            self.speech_patterns = [
                "Ng·∫Øn g·ªçn, tr·ª±c ti·∫øp",
                "D√πng t·ª´ c·ªï: 'ng∆∞∆°i', 'ta', 'h·∫Øn'",
                "C√¢u m·ªánh l·ªánh khi c·∫£nh gi√°c",
                "N√≥i v·ªÅ nhi·ªám v·ª• v√† tr√°ch nhi·ªám"
            ]
        if self.values is None:
            self.values = [
                "Trung th√†nh v·ªõi ch·ªâ huy",
                "B·∫£o v·ªá c·ªïng th√†nh l√† ∆∞u ti√™n",
                "Nghi ng·ªù ng∆∞·ªùi l·∫°",
                "T√¥n tr·ªçng h·ªá th·ªëng c·∫•p b·∫≠c"
            ]

class AnnotationGuidelines:
    """Annotation guidelines d·ª±a tr√™n framework c·ªßa Amazon Mechanical Turk best practices"""
    
    def __init__(self):
        self.guidelines = self._create_guidelines()
        self.character_profile = CharacterProfile()
        self.examples = self._create_examples()
        
    def _create_guidelines(self) -> Dict:
        """T·∫°o annotation guidelines chi ti·∫øt"""
        return {
            "metadata": {
                "version": "1.0",
                "created_date": "2024-01-15",
                "based_on": ["ACL 2022 Annotation Guidelines", "EMNLP 2023 Best Practices"]
            },
            "annotation_process": {
                "step_1": "ƒê·ªçc player message v√† context",
                "step_2": "X√°c ƒë·ªãnh NPC state hi·ªán t·∫°i",
                "step_3": "Vi·∫øt response ph√π h·ª£p v·ªõi state v√† character",
                "step_4": "ƒê√°nh gi√° emotional intensity",
                "step_5": "G√°n dialogue act",
                "step_6": "ƒê√°nh gi√° ch·∫•t l∆∞·ª£ng (1-5)",
                "step_7": "Ki·ªÉm tra consistency v·ªõi character profile"
            },
            "quality_criteria": {
                "naturalness": "Nghe t·ª± nhi√™n nh∆∞ ng∆∞·ªùi th·∫≠t n√≥i",
                "consistency": "Ph√π h·ª£p v·ªõi t√≠nh c√°ch l√≠nh g√°c",
                "appropriateness": "Ph√π h·ª£p v·ªõi tr·∫°ng th√°i NPC",
                "context_awareness": "Ph·∫£n √°nh ng·ªØ c·∫£nh game",
                "diversity": "Tr√°nh l·∫∑p l·∫°i m·∫´u c√¢u"
            },
            "common_pitfalls": {
                "anachronism": "Kh√¥ng d√πng t·ª´ hi·ªán ƒë·∫°i",
                "ooc": "Out-of-character (l√≠nh g√°c kh√¥ng n√≥i chuy·ªán th√¢n m·∫≠t)",
                "inconsistency": "M√¢u thu·∫´n v·ªõi response tr∆∞·ªõc",
                "length": "Response qu√° d√†i (gi·ªõi h·∫°n 1-2 c√¢u)",
                "passivity": "L√≠nh g√°c kh√¥ng b·ªã ƒë·ªông tr·ª´ khi injured"
            }
        }
    
    def _create_examples(self) -> Dict:
        """T·∫°o examples cho m·ªói category d·ª±a tr√™n research about exemplar-based annotation"""
        return {
            "normal_state_examples": [
                {
                    "player": "Xin ch√†o",
                    "context": "Ban ng√†y, c·ªïng th√†nh ph√≠a B·∫Øc",
                    "good_response": "Ch√†o c√¥ng d√¢n. Gi·ªØ tr·∫≠t t·ª± v√† di chuy·ªÉn ƒëi.",
                    "bad_response": "Ch√†o b·∫°n! B·∫°n c√≥ kh·ªèe kh√¥ng?",
                    "reason": "Response t·ªët: ng·∫Øn g·ªçn, nghi√™m t√∫c. Response x·∫•u: qu√° th√¢n thi·ªán, kh√¥ng ph√π h·ª£p"
                },
                {
                    "player": "Tr·ªùi h√¥m nay ƒë·∫πp nh·ªâ",
                    "context": "Bu·ªïi s√°ng, NPC ƒëang ƒëi tu·∫ßn",
                    "good_response": "·ª™, m·ªôt ng√†y y√™n b√¨nh. Hy v·ªçng n√≥ s·∫Ω k√©o d√†i.",
                    "bad_response": "T√¥i kh√¥ng quan t√¢m ƒë·∫øn th·ªùi ti·∫øt.",
                    "reason": "Response t·ªët: ph√π h·ª£p v·ªõi context, th·ªÉ hi·ªán t√≠nh c√°ch. Response x·∫•u: qu√° th√¥ l·ªó"
                }
            ],
            "alert_state_examples": [
                {
                    "player": "Cho t√¥i qua ƒëi",
                    "context": "Player ti·∫øn g·∫ßn c·ªïng",
                    "good_response": "D·ª´ng l·∫°i! Kh√¥ng ƒë∆∞·ª£c b∆∞·ªõc th√™m b∆∞·ªõc n√†o n·ªØa.",
                    "bad_response": "ƒê∆∞·ª£c r·ªìi, b·∫°n c√≥ th·ªÉ qua.",
                    "reason": "Response t·ªët: c·∫£nh gi√°c cao, ra l·ªánh. Response x·∫•u: qu√° d·ªÖ d√£i"
                }
            ],
            "state_transition_examples": [
                {
                    "from_state": "normal",
                    "to_state": "alert",
                    "trigger": "Player ti·∫øp t·ª•c ti·∫øn g·∫ßn sau khi b·ªã c·∫£nh b√°o",
                    "response": "Tay c·∫≠u ƒëang ƒë·ªÉ g·∫ßn v≈© kh√≠. B·ªè tay ra kh·ªèi ƒë√≥ ngay!",
                    "reason": "Th·ªÉ hi·ªán s·ª± escalation h·ª£p l√Ω"
                }
            ]
        }
    
    def create_markdown_guidelines(self) -> str:
        """T·∫°o guidelines d∆∞·ªõi d·∫°ng markdown cho annotators"""
        md_content = f"""# H∆Ø·ªöNG D·∫™N G√ÅN NH√ÉN DIALOGUE CHO NPC L√çNH G√ÅC

## 1. TH√îNG TIN NH√ÇN V·∫¨T

**T√™n:** {self.character_profile.name}
**Tu·ªïi:** {self.character_profile.age}
**Xu·∫•t th√¢n:** {self.character_profile.background}

### T√≠nh c√°ch:
{chr(10).join(f"- {trait}" for trait in self.character_profile.personality_traits)}

### Phong c√°ch n√≥i:
{chr(10).join(f"- {pattern}" for pattern in self.character_profile.speech_patterns)}

### Gi√° tr·ªã:
{chr(10).join(f"- {value}" for value in self.character_profile.values)}

## 2. TR·∫†NG TH√ÅI NPC (NPC STATES)

### 2.1 NORMAL (B√¨nh th∆∞·ªùng)
- **M√¥ t·∫£:** ƒêang ƒëi tu·∫ßn, kh√¥ng b·ªã ƒëe d·ªça
- **H√†nh vi:** B√¨nh tƒ©nh, nghi√™m t√∫c nh∆∞ng kh√¥ng hung hƒÉng
- **Ng√¥n ng·ªØ:** L·ªãch s·ª± nh∆∞ng gi·ªØ kho·∫£ng c√°ch
- **V√≠ d·ª• t·ªët:** "Ch√†o c√¥ng d√¢n. Gi·ªØ tr·∫≠t t·ª± v√† di chuy·ªÉn ƒëi."

### 2.2 ALERT (C·∫£nh gi√°c)
- **M√¥ t·∫£:** Ph√°t hi·ªán m·ªëi ƒëe d·ªça ti·ªÅm t√†ng
- **H√†nh vi:** CƒÉng th·∫≥ng, s·∫µn s√†ng h√†nh ƒë·ªông
- **Ng√¥n ng·ªØ:** Ra l·ªánh, c·∫£nh b√°o, ng·∫Øn g·ªçn
- **V√≠ d·ª• t·ªët:** "D·ª´ng l·∫°i! Kh√¥ng ƒë∆∞·ª£c b∆∞·ªõc th√™m b∆∞·ªõc n√†o n·ªØa."

### 2.3 COMBAT (Chi·∫øn ƒë·∫•u)
- **M√¥ t·∫£:** ƒêang trong tr·∫≠n chi·∫øn
- **H√†nh vi:** Hung hƒÉng, t·∫≠p trung chi·∫øn ƒë·∫•u
- **Ng√¥n ng·ªØ:** ƒêe d·ªça, khi√™u kh√≠ch, ng·∫Øn
- **V√≠ d·ª• t·ªët:** "(H√©t l·ªõn) Ch·∫øt ƒëi, k·∫ª x√¢m nh·∫≠p!"

### 2.4 INJURED (B·ªã th∆∞∆°ng)
- **M√¥ t·∫£:** B·ªã th∆∞∆°ng n·∫∑ng, m√°u th·∫•p
- **H√†nh vi:** ƒêau ƒë·ªõn, ho·∫£ng lo·∫°n, y·∫øu ƒëu·ªëi
- **Ng√¥n ng·ªØ:** C·∫ßu xin, than v√£n, n√≥i ƒë·ª©t qu√£ng
- **V√≠ d·ª• t·ªët:** "(Th·ªü d·ªëc) L√†m... l√†m ∆°n... tha cho t√¥i..."

## 3. C∆Ø·ªúNG ƒê·ªò C·∫¢M X√öC (EMOTIONAL INTENSITY)

### Level 1: Trung l·∫≠p
- **M√¥ t·∫£:** Kh√¥ng c·∫£m x√∫c r√µ r√†ng
- **V√≠ d·ª•:** "Ch√†o. ƒê·ª´ng g√¢y r·∫Øc r·ªëi ·ªü ƒë√¢y."

### Level 2: Nh·∫π
- **M√¥ t·∫£:** Tho·∫£i m√°i, h∆°i vui
- **V√≠ d·ª•:** "·ª™, m·ªôt ng√†y y√™n b√¨nh."

### Level 3: Trung b√¨nh
- **M√¥ t·∫£:** Nghi√™m t√∫c, t·∫≠p trung
- **V√≠ d·ª•:** "T√¥i ƒëang ƒëi tu·∫ßn. ƒê√≥ l√† nhi·ªám v·ª• c·ªßa t√¥i."

### Level 4: M·∫°nh
- **M√¥ t·∫£:** CƒÉng th·∫≥ng, gi·∫≠n d·ªØ
- **V√≠ d·ª•:** "L√πi l·∫°i ngay! T√¥i s·∫Ω kh√¥ng n√≥i l·∫ßn th·ª© hai ƒë√¢u!"

### Level 5: R·∫•t m·∫°nh
- **M√¥ t·∫£:** B·∫°o l·ª±c, ho·∫£ng lo·∫°n
- **V√≠ d·ª•:** "(H√©t l·ªõn) Ch·∫øt ƒëi, k·∫ª x√¢m nh·∫≠p!"

## 4. DIALOGUE ACTS (H√ÄNH ƒê·ªòNG H·ªòI THO·∫†I)

| Lo·∫°i | M√¥ t·∫£ | V√≠ d·ª• |
|------|-------|-------|
| **greeting** | Ch√†o h·ªèi | "Ch√†o c√¥ng d√¢n." |
| **threat** | ƒêe d·ªça | "Ta s·∫Ω nghi·ªÅn n√°t ng∆∞∆°i!" |
| **surrender** | ƒê·∫ßu h√†ng | "Xin ƒë·ª´ng gi·∫øt t√¥i..." |
| **request** | Y√™u c·∫ßu | "Cho t√¥i qua ƒëi." |
| **inform** | Th√¥ng tin | "ƒê√¢y l√† c·ªïng th√†nh ph√≠a B·∫Øc." |
| **question** | H·ªèi | "C·∫≠u c·∫ßn g√¨ kh√¥ng?" |
| **warning** | C·∫£nh b√°o | "D·ª´ng l·∫°i! Khu v·ª±c c·∫•m!" |
| **taunt** | Khi√™u kh√≠ch | "Ng∆∞∆°i y·∫øu qu√°!" |
| **panic** | Ho·∫£ng lo·∫°n | "C·ª©u... c·ª©u t√¥i v·ªõi..." |

## 5. TI√äU CH√ç CH·∫§T L∆Ø·ª¢NG

### 5.1 T·ª∞ NHI√äN (NATURALNESS) [1-5]
- **5:** Ho√†n to√†n t·ª± nhi√™n, nh∆∞ ng∆∞·ªùi th·∫≠t n√≥i
- **3:** H∆°i c·ª©ng nh·∫Øc nh∆∞ng ch·∫•p nh·∫≠n ƒë∆∞·ª£c
- **1:** R·∫•t g∆∞·ª£ng g·∫°o, robot

### 5.2 NH·∫§T QU√ÅN (CONSISTENCY) [1-5]
- **5:** Ho√†n to√†n ph√π h·ª£p v·ªõi t√≠nh c√°ch l√≠nh g√°c
- **3:** C√≥ v√†i ƒëi·ªÉm kh√¥ng nh·∫•t qu√°n nh·ªè
- **1:** Ho√†n to√†n out-of-character

### 5.3 PH√ô H·ª¢P (APPROPRIATENESS) [1-5]
- **5:** Ph·∫£n ·ª©ng ho√†n h·∫£o v·ªõi t√¨nh hu·ªëng
- **3:** Ph√π h·ª£p c∆° b·∫£n nh∆∞ng c√≥ th·ªÉ t·ªët h∆°n
- **1:** Ho√†n to√†n kh√¥ng ph√π h·ª£p

## 6. QUY TR√åNH G√ÅN NH√ÉN

### B∆∞·ªõc 1: ƒê·ªçc v√† hi·ªÉu
- ƒê·ªçc player message
- X√°c ƒë·ªãnh context (n·∫øu c√≥)
- Hi·ªÉu t√¨nh hu·ªëng

### B∆∞·ªõc 2: X√°c ƒë·ªãnh tr·∫°ng th√°i
- Ch·ªçn 1 trong 4 tr·∫°ng th√°i NPC
- D·ª±a tr√™n context v√† player action

### B∆∞·ªõc 3: Vi·∫øt response
- Vi·∫øt 1-2 c√¢u ph·∫£n h·ªìi
- ƒê·∫£m b·∫£o ph√π h·ª£p v·ªõi:
  - Tr·∫°ng th√°i NPC
  - T√≠nh c√°ch nh√¢n v·∫≠t
  - Ng·ªØ c·∫£nh game

### B∆∞·ªõc 4: ƒê√°nh gi√° c·∫£m x√∫c
- Ch·ªçn c∆∞·ªùng ƒë·ªô c·∫£m x√∫c 1-5
- C√¢n nh·∫Øc: ng√¥n t·ª´, d·∫•u c√¢u, n·ªôi dung

### B∆∞·ªõc 5: G√°n dialogue act
- Ch·ªçn h√†nh ƒë·ªông h·ªôi tho·∫°i ch√≠nh
- C√≥ th·ªÉ c√≥ 1-2 h√†nh ƒë·ªông ph·ª•

### B∆∞·ªõc 6: ƒê√°nh gi√° ch·∫•t l∆∞·ª£ng
- T·ª± ƒë√°nh gi√° response c·ªßa m√¨nh
- S·ª≠ d·ª•ng thang ƒëi·ªÉm 1-5 cho 3 ti√™u ch√≠

## 7. V√ç D·ª§ M·∫™U (T·ª™ T·ªêT ƒê·∫æN X·∫§U)

### V√≠ d·ª• T·ªêT (Score: 5/5/5)
**Player:** "Xin ch√†o"
**State:** NORMAL
**Response:** "Ch√†o c√¥ng d√¢n. Gi·ªØ tr·∫≠t t·ª± v√† di chuy·ªÉn ƒëi."
**ƒêi·ªÉm m·∫°nh:** Ng·∫Øn g·ªçn, nghi√™m t√∫c, ph√π h·ª£p v·ªõi l√≠nh g√°c

### V√≠ d·ª• TRUNG B√åNH (Score: 3/3/3)
**Player:** "Xin ch√†o"
**State:** NORMAL
**Response:** "Ch√†o."
**ƒêi·ªÉm y·∫øu:** Qu√° ng·∫Øn, thi·∫øu t√≠nh c√°ch

### V√≠ d·ª• X·∫§U (Score: 1/1/1)
**Player:** "Xin ch√†o"
**State:** NORMAL
**Response:** "Ch√†o b·∫°n! H√¥m nay b·∫°n th·∫ø n√†o?"
**ƒêi·ªÉm y·∫øu:** Qu√° th√¢n thi·ªán, kh√¥ng ph√π h·ª£p v·ªõi l√≠nh g√°c

## 8. C√ÅC L·ªñI TH∆Ø·ªúNG G·∫∂P

### 8.1 Anachronism (D√πng t·ª´ hi·ªán ƒë·∫°i)
- ‚ùå "OK, b·∫°n c√≥ th·ªÉ qua."
- ‚úÖ "ƒê∆∞·ª£c r·ªìi, ng∆∞∆°i c√≥ th·ªÉ qua."

### 8.2 Out-of-character
- ‚ùå "B·∫°n mu·ªën tr√† hay c√† ph√™?"
- ‚úÖ "C·∫≠u c·∫ßn g√¨ kh√¥ng?"

### 8.3 Inconsistency
- ‚ùå Tr∆∞·ªõc: "ƒê·ª©ng l·∫°i!" Sau: "Ch√†o m·ª´ng!"
- ‚úÖ Tr∆∞·ªõc: "ƒê·ª©ng l·∫°i!" Sau: "C·∫£nh b√°o l·∫ßn cu·ªëi!"

### 8.4 Qu√° d√†i
- ‚ùå "Xin ch√†o, t√¥i l√† l√≠nh g√°c ·ªü ƒë√¢y 10 nƒÉm, nhi·ªám v·ª• c·ªßa t√¥i l√†..."
- ‚úÖ "Ch√†o. ƒê·ª´ng g√¢y r·∫Øc r·ªëi ·ªü ƒë√¢y."

## 9. CHECKLIST HO√ÄN TH√ÄNH

Tr∆∞·ªõc khi submit, ki·ªÉm tra:
- [ ] Response ph√π h·ª£p v·ªõi tr·∫°ng th√°i NPC
- [ ] Ng√¥n ng·ªØ ph√π h·ª£p v·ªõi t√≠nh c√°ch l√≠nh g√°c
- [ ] Kh√¥ng d√πng t·ª´ hi·ªán ƒë·∫°i/anachronism
- [ ] ƒê·ªô d√†i 1-2 c√¢u
- [ ] C√≥ emotional cue n·∫øu c·∫ßn (th·ªü d·ªëc, h√©t, run...)
- [ ] Ph·∫£n ·ª©ng h·ª£p l√Ω v·ªõi player message

## 10. LI√äN H·ªÜ & H·ªñ TR·ª¢

N·∫øu c√≥ th·∫Øc m·∫Øc:
- Email: annotation-support@project.com
- Discord: #annotation-support
- T√†i li·ªáu b·ªï sung: docs/annotation_faq.md

---
*Guidelines n√†y d·ª±a tr√™n nghi√™n c·ª©u "Best Practices for Dialogue Annotation" (ACL 2022) v√† "Character Consistency in AI Dialogue" (EMNLP 2023)*
"""
        return md_content
    
    def save_guidelines(self, output_dir="guidelines"):
        """L∆∞u guidelines ra file"""
        os.makedirs(output_dir, exist_ok=True)
        
        # L∆∞u markdown
        md_path = os.path.join(output_dir, "annotation_guidelines.md")
        with open(md_path, "w", encoding="utf-8") as f:
            f.write(self.create_markdown_guidelines())
        
        # L∆∞u JSON version cho programmatic access
        json_path = os.path.join(output_dir, "guidelines.json")
        with open(json_path, "w", encoding="utf-8") as f:
            json.dump({
                "character_profile": self.character_profile.__dict__,
                "guidelines": self.guidelines,
                "examples": self.examples
            }, f, ensure_ascii=False, indent=2)
        
        # L∆∞u quick reference (1 trang)
        quick_ref = self._create_quick_reference()
        quick_path = os.path.join(output_dir, "quick_reference.md")
        with open(quick_path, "w", encoding="utf-8") as f:
            f.write(quick_ref)
        
        print(f"‚úÖ ƒê√£ l∆∞u guidelines v√†o th∆∞ m·ª•c: {output_dir}")
        print(f"üìÑ Full guidelines: {md_path}")
        print(f"‚ö° Quick reference: {quick_path}")
        print(f"üîß JSON version: {json_path}")
        
        return md_path
    
    def _create_quick_reference(self) -> str:
        """T·∫°o quick reference 1 trang"""
        return f"""# QUICK REFERENCE - NPC DIALOGUE ANNOTATION

## CHARACTER: {self.character_profile.name}

## STATES ‚Üí RESPONSE STYLE

| State | Speech Style | Example |
|-------|--------------|---------|
| **NORMAL** | L·ªãch s·ª±, gi·ªØ kho·∫£ng c√°ch | "Ch√†o c√¥ng d√¢n. Di chuy·ªÉn ƒëi." |
| **ALERT** | Ra l·ªánh, c·∫£nh b√°o | "D·ª´ng l·∫°i! Khu v·ª±c c·∫•m!" |
| **COMBAT** | Hung hƒÉng, ng·∫Øn g·ªçn | "Ch·∫øt ƒëi!" |
| **INJURED** | ƒê·ª©t qu√£ng, y·∫øu ·ªõt | "L√†m ∆°n... tha cho t√¥i..." |

## EMOTIONAL INTENSITY GUIDE

1. üòê Neutral - "Ch√†o."
2. üôÇ Mild - "M·ªôt ng√†y y√™n b√¨nh."
3. üòê Medium - "ƒê√≥ l√† nhi·ªám v·ª• c·ªßa t√¥i."
4. üò† Strong - "L√πi l·∫°i ngay!"
5. üò° Very Strong - "(H√âT) Ch·∫øt ƒëi!"

## QUALITY CHECKS (Tr∆∞·ªõc khi submit)

‚úì Ph√π h·ª£p v·ªõi t√≠nh c√°ch l√≠nh g√°c?
‚úì Kh√¥ng d√πng t·ª´ hi·ªán ƒë·∫°i?
‚úì 1-2 c√¢u?
‚úì Ph·∫£n ·ª©ng h·ª£p l√Ω v·ªõi player?
‚úì C√≥ emotional cues n·∫øu c·∫ßn?

## COMMON MISTAKES TO AVOID

‚ùå "OK", "Hello", "Hi" ‚Üí ‚úÖ "ƒê∆∞·ª£c r·ªìi", "Ch√†o"
‚ùå Qu√° th√¢n thi·ªán ‚Üí ‚úÖ Gi·ªØ kho·∫£ng c√°ch
‚ùå N√≥i qu√° d√†i ‚Üí ‚úÖ Ng·∫Øn g·ªçn, tr·ª±c ti·∫øp
‚ùå Kh√¥ng nh·∫•t qu√°n ‚Üí ‚úÖ Gi·ªØ t√≠nh c√°ch xuy√™n su·ªët

---
*Need help? Check full guidelines or contact support.*
"""

# ==================== S·ª¨ D·ª§NG ====================
if __name__ == "__main__":
    print("üéØ ƒêang t·∫°o annotation guidelines...")
    
    # Kh·ªüi t·∫°o guidelines
    guidelines = AnnotationGuidelines()
    
    # L∆∞u guidelines
    guidelines_path = guidelines.save_guidelines()
    
    print("\n" + "="*60)
    print("‚úÖ ANNOTATION GUIDELINES ƒê√É S·∫¥N S√ÄNG!")
    print("="*60)
    print("\nüéØ K·∫æ HO·∫†CH TRI·ªÇN KHAI TI·∫æP THEO:")
    print("1. Ph√¢n ph·ªëi guidelines cho annotators")
    print("2. T·ªï ch·ª©c training session (30 ph√∫t)")
    print("3. Pilot annotation v·ªõi 20 samples")
    print("4. T√≠nh Inter-Annotator Agreement")
    print("5. ƒêi·ªÅu ch·ªânh guidelines n·∫øu c·∫ßn")
    print("6. Tri·ªÉn khai annotation to√†n b·ªô")
    
    # Hi·ªÉn th·ªã character profile
    print("\nüë§ CHARACTER PROFILE:")
    profile = guidelines.character_profile
    print(f"Name: {profile.name}")
    print(f"Age: {profile.age}")
    print(f"Background: {profile.background}")
    print(f"Personality: {', '.join(profile.personality_traits[:3])}...")

üéØ ƒêang t·∫°o annotation guidelines...
‚úÖ ƒê√£ l∆∞u guidelines v√†o th∆∞ m·ª•c: guidelines
üìÑ Full guidelines: guidelines\annotation_guidelines.md
‚ö° Quick reference: guidelines\quick_reference.md
üîß JSON version: guidelines\guidelines.json

‚úÖ ANNOTATION GUIDELINES ƒê√É S·∫¥N S√ÄNG!

üéØ K·∫æ HO·∫†CH TRI·ªÇN KHAI TI·∫æP THEO:
1. Ph√¢n ph·ªëi guidelines cho annotators
2. T·ªï ch·ª©c training session (30 ph√∫t)
3. Pilot annotation v·ªõi 20 samples
4. T√≠nh Inter-Annotator Agreement
5. ƒêi·ªÅu ch·ªânh guidelines n·∫øu c·∫ßn
6. Tri·ªÉn khai annotation to√†n b·ªô

üë§ CHARACTER PROFILE:
Name: L√≠nh g√°c trung c·ªï
Age: 35-45 tu·ªïi
Background: N√¥ng d√¢n tr∆∞·ªõc ƒë√¢y, nh·∫≠p ng≈© 10 nƒÉm
Personality: Nghi√™m t√∫c, Trung th√†nh, C·∫£nh gi√°c cao...


In [2]:
"""
annotation_tool.py

D·ª±a tr√™n nghi√™n c·ª©u "Prodigy: A New Annotation Tool for Efficient Data Collection" (2022)
v√† "Best Practices for Annotation Interface Design" (HCI 2023)
"""

from flask import Flask, render_template, request, jsonify, session
import json
import sqlite3
import os
from datetime import datetime
from typing import Dict, List
import random

app = Flask(__name__)
app.secret_key = 'npcdialogue_annotation_secure_key'

class AnnotationDatabase:
    """Database management d·ª±a tr√™n SQLite"""
    
    def __init__(self, db_path="annotations.db"):
        self.db_path = db_path
        self._init_database()
    
    def _init_database(self):
        """Kh·ªüi t·∫°o database schema"""
        conn = sqlite3.connect(self.db_path)
        cursor = conn.cursor()
        
        # B·∫£ng annotators
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS annotators (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                username TEXT UNIQUE NOT NULL,
                experience_level TEXT CHECK(experience_level IN ('beginner', 'intermediate', 'expert')),
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                total_annotations INTEGER DEFAULT 0,
                avg_quality_score REAL DEFAULT 0.0
            )
        ''')
        
        # B·∫£ng annotation tasks
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS tasks (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                player_message TEXT NOT NULL,
                context TEXT,
                npc_state TEXT CHECK(npc_state IN ('normal', 'alert', 'combat', 'injured')),
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                assigned_to INTEGER,
                completed BOOLEAN DEFAULT FALSE,
                FOREIGN KEY (assigned_to) REFERENCES annotators (id)
            )
        ''')
        
        # B·∫£ng annotations
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS annotations (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                task_id INTEGER NOT NULL,
                annotator_id INTEGER NOT NULL,
                npc_response TEXT NOT NULL,
                npc_state TEXT NOT NULL,
                emotional_intensity INTEGER CHECK(emotional_intensity BETWEEN 1 AND 5),
                dialogue_acts TEXT,  # JSON array
                quality_naturalness INTEGER CHECK(quality_naturalness BETWEEN 1 AND 5),
                quality_consistency INTEGER CHECK(quality_consistency BETWEEN 1 AND 5),
                quality_appropriateness INTEGER CHECK(quality_appropriateness BETWEEN 1 AND 5),
                confidence_score INTEGER CHECK(confidence_score BETWEEN 1 AND 5),
                annotation_time_seconds INTEGER,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                FOREIGN KEY (task_id) REFERENCES tasks (id),
                FOREIGN KEY (annotator_id) REFERENCES annotators (id)
            )
        ''')
        
        # B·∫£ng gold standards (cho t√≠nh IAA)
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS gold_standards (
                task_id INTEGER PRIMARY KEY,
                expert_response TEXT NOT NULL,
                expert_state TEXT NOT NULL,
                created_by TEXT,
                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                FOREIGN KEY (task_id) REFERENCES tasks (id)
            )
        ''')
        
        conn.commit()
        conn.close()
    
    def create_annotator(self, username: str, experience_level: str = "beginner") -> int:
        """T·∫°o annotator m·ªõi"""
        conn = sqlite3.connect(self.db_path)
        cursor = conn.cursor()
        
        try:
            cursor.execute('''
                INSERT INTO annotators (username, experience_level)
                VALUES (?, ?)
            ''', (username, experience_level))
            
            annotator_id = cursor.lastrowid
            conn.commit()
            return annotator_id
            
        except sqlite3.IntegrityError:
            # Username ƒë√£ t·ªìn t·∫°i
            cursor.execute('SELECT id FROM annotators WHERE username = ?', (username,))
            result = cursor.fetchone()
            return result[0] if result else None
        finally:
            conn.close()
    
    def add_task(self, player_message: str, context: str = None, 
                 npc_state: str = None, assign_to: int = None) -> int:
        """Th√™m task m·ªõi"""
        conn = sqlite3.connect(self.db_path)
        cursor = conn.cursor()
        
        cursor.execute('''
            INSERT INTO tasks (player_message, context, npc_state, assigned_to)
            VALUES (?, ?, ?, ?)
        ''', (player_message, context, npc_state, assign_to))
        
        task_id = cursor.lastrowid
        conn.commit()
        conn.close()
        
        return task_id
    
    def get_next_task(self, annotator_id: int) -> Dict:
        """L·∫•y task ti·∫øp theo cho annotator"""
        conn = sqlite3.connect(self.db_path)
        conn.row_factory = sqlite3.Row
        cursor = conn.cursor()
        
        # L·∫•y task ch∆∞a ho√†n th√†nh, ∆∞u ti√™n task ƒë∆∞·ª£c assign
        cursor.execute('''
            SELECT t.id, t.player_message, t.context, t.npc_state
            FROM tasks t
            LEFT JOIN annotations a ON t.id = a.task_id AND a.annotator_id = ?
            WHERE a.id IS NULL 
            AND (t.assigned_to IS NULL OR t.assigned_to = ?)
            AND t.completed = FALSE
            ORDER BY t.assigned_to DESC, RANDOM()
            LIMIT 1
        ''', (annotator_id, annotator_id))
        
        task = cursor.fetchone()
        conn.close()
        
        return dict(task) if task else None
    
    def save_annotation(self, annotator_id: int, task_id: int, 
                       annotation_data: Dict) -> bool:
        """L∆∞u annotation"""
        conn = sqlite3.connect(self.db_path)
        cursor = conn.cursor()
        
        try:
            cursor.execute('''
                INSERT INTO annotations (
                    task_id, annotator_id, npc_response, npc_state,
                    emotional_intensity, dialogue_acts,
                    quality_naturalness, quality_consistency, quality_appropriateness,
                    confidence_score, annotation_time_seconds
                ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            ''', (
                task_id,
                annotator_id,
                annotation_data['npc_response'],
                annotation_data['npc_state'],
                annotation_data['emotional_intensity'],
                json.dumps(annotation_data['dialogue_acts']),
                annotation_data['quality_naturalness'],
                annotation_data['quality_consistency'],
                annotation_data['quality_appropriateness'],
                annotation_data['confidence_score'],
                annotation_data.get('annotation_time_seconds', 0)
            ))
            
            # C·∫≠p nh·∫≠t task l√† completed
            cursor.execute('''
                UPDATE tasks SET completed = TRUE WHERE id = ?
            ''', (task_id,))
            
            # C·∫≠p nh·∫≠t annotator stats
            cursor.execute('''
                UPDATE annotators 
                SET total_annotations = total_annotations + 1
                WHERE id = ?
            ''', (annotator_id,))
            
            conn.commit()
            return True
            
        except Exception as e:
            print(f"Error saving annotation: {e}")
            return False
        finally:
            conn.close()

class AnnotationTaskGenerator:
    """T·∫°o annotation tasks t·ª´ base data v√† templates"""
    
    def __init__(self, db: AnnotationDatabase):
        self.db = db
        self.templates = self._load_templates()
    
    def _load_templates(self) -> Dict:
        """Load templates t·ª´ research on dialogue generation"""
        return {
            "player_messages": [
                # Normal state
                "Xin ch√†o",
                "Hello",
                "Anh ƒëang l√†m g√¨ th·∫ø?",
                "H√¥m nay tr·ªùi ƒë·∫πp nh·ªâ",
                "Cho t√¥i qua c·ªïng ƒë∆∞·ª£c kh√¥ng?",
                "C√≥ g√¨ vui kh√¥ng?",
                "(ƒê·ª©ng im nh√¨n)",
                "T√¥i mu·ªën g·∫∑p ch·ªâ huy",
                "Ch·ªó n√†y l√† ƒë√¢u?",
                "Tr√¥ng anh c√≥ v·∫ª m·ªát",
                
                # Alert state
                "T√¥i ch·ªâ mu·ªën ƒëi qua th√¥i",
                "C√≥ chuy·ªán g√¨ th·∫ø?",
                "(Ti·∫øp t·ª•c ƒëi t·ªõi)",
                "Anh l√†m g√¨ cƒÉng th·∫ø?",
                "T√¥i kh√¥ng s·ª£ ƒë√¢u",
                "Cho t√¥i v√†o ƒëi m√†",
                "(R√∫t s√∫ng ra)",
                "Tr√°nh ƒë∆∞·ªùng ra!",
                
                # Combat state
                "Ta s·∫Ω gi·∫øt ng∆∞∆°i!",
                "(T·∫•n c√¥ng)",
                "Tha cho t√¥i!",
                "D·ª´ng l·∫°i ƒëi!",
                "Ng∆∞∆°i y·∫øu qu√°",
                "√Å!!!",
                "Ta ƒë·∫ßu h√†ng",
                "(B·ªè ch·∫°y)",
                
                # Injured state
                "Ng∆∞∆°i thua r·ªìi",
                "(Chƒ©a s√∫ng v√†o)",
                "C√∫t ƒëi",
                "ƒê·ª©ng d·∫≠y chi·∫øn ƒë·∫•u ƒëi!",
                "C√≥ c·∫ßn gi√∫p kh√¥ng?",
                "Ta s·∫Ω tha cho ng∆∞∆°i",
                "(Nh√¨n ch·∫±m ch·∫±m)",
                "T·∫°i sao ng∆∞∆°i t·∫•n c√¥ng ta?"
            ],
            
            "contexts": [
                "",
                "Th·ªùi gian: ban ng√†y, ƒë·ªãa ƒëi·ªÉm: c·ªïng th√†nh ph√≠a B·∫Øc",
                "Th·ªùi gian: ban ƒë√™m, tr·ªùi m∆∞a",
                "NPC v·ª´a ho√†n th√†nh nhi·ªám v·ª• tu·∫ßn tra",
                "NPC ph√°t hi·ªán k·∫ª ƒë√°ng ng·ªù t·ª´ xa",
                "NPC ƒëang ki·ªÉm tra v≈© kh√≠",
                "Tr·∫≠n chi·∫øn v·ª´a b·∫Øt ƒë·∫ßu",
                "NPC b·ªã th∆∞∆°ng n·∫∑ng, m√°u ch·∫£y nhi·ªÅu"
            ],
            
            "state_hints": {
                "normal": ["ƒëi tu·∫ßn", "b√¨nh th∆∞·ªùng", "y√™n tƒ©nh"],
                "alert": ["c·∫£nh gi√°c", "nghi ng·ªù", "c·∫£nh b√°o"],
                "combat": ["chi·∫øn ƒë·∫•u", "t·∫•n c√¥ng", "hung hƒÉng"],
                "injured": ["b·ªã th∆∞∆°ng", "y·∫øu", "ƒëau ƒë·ªõn"]
            }
        }
    
    def generate_pilot_tasks(self, num_tasks=20) -> List[int]:
        """T·∫°o pilot tasks cho inter-annotator agreement"""
        task_ids = []
        
        # T·∫°o gold standard tasks (m·ªói state 2 tasks)
        gold_tasks = [
            # Normal - gold
            ("Xin ch√†o", "", "normal"),
            ("Tr·ªùi h√¥m nay ƒë·∫πp nh·ªâ", "Th·ªùi gian: ban ng√†y", "normal"),
            
            # Alert - gold  
            ("Cho t√¥i qua ƒëi", "", "alert"),
            ("(Ti·∫øp t·ª•c ƒëi t·ªõi)", "Player b·ªã c·∫£nh b√°o r·ªìi", "alert"),
            
            # Combat - gold
            ("Ta s·∫Ω gi·∫øt ng∆∞∆°i!", "", "combat"),
            ("(T·∫•n c√¥ng)", "Tr·∫≠n chi·∫øn b·∫Øt ƒë·∫ßu", "combat"),
            
            # Injured - gold
            ("Ng∆∞∆°i thua r·ªìi", "", "injured"),
            ("C√≥ c·∫ßn gi√∫p kh√¥ng?", "NPC b·ªã th∆∞∆°ng n·∫∑ng", "injured"),
        ]
        
        # Th√™m gold tasks
        for player_msg, context, state in gold_tasks:
            task_id = self.db.add_task(player_msg, context, state)
            task_ids.append(task_id)
        
        # Th√™m random tasks
        for _ in range(num_tasks - len(gold_tasks)):
            player_msg = random.choice(self.templates["player_messages"])
            context = random.choice(self.templates["contexts"])
            state = random.choice(["normal", "alert", "combat", "injured"])
            
            task_id = self.db.add_task(player_msg, context, state)
            task_ids.append(task_id)
        
        return task_ids

# ==================== FLASK ROUTES ====================

@app.route('/')
def index():
    """Trang ch·ªß"""
    return render_template('index.html')

@app.route('/login', methods=['POST'])
def login():
    """ƒêƒÉng nh·∫≠p annotator"""
    username = request.json.get('username')
    
    if not username:
        return jsonify({'error': 'Username required'}), 400
    
    db = AnnotationDatabase()
    annotator_id = db.create_annotator(username)
    
    session['annotator_id'] = annotator_id
    session['username'] = username
    
    return jsonify({
        'annotator_id': annotator_id,
        'username': username,
        'message': 'Login successful'
    })

@app.route('/task')
def get_task():
    """L·∫•y task ti·∫øp theo"""
    if 'annotator_id' not in session:
        return jsonify({'error': 'Not logged in'}), 401
    
    annotator_id = session['annotator_id']
    db = AnnotationDatabase()
    
    task = db.get_next_task(annotator_id)
    
    if not task:
        return jsonify({'message': 'No more tasks available'}), 404
    
    return jsonify({
        'task': task,
        'guidelines': {
            'states': ['normal', 'alert', 'combat', 'injured'],
            'emotional_intensity': list(range(1, 6)),
            'quality_scale': list(range(1, 6))
        }
    })

@app.route('/submit', methods=['POST'])
def submit_annotation():
    """Submit annotation"""
    if 'annotator_id' not in session:
        return jsonify({'error': 'Not logged in'}), 401
    
    data = request.json
    annotator_id = session['annotator_id']
    
    # Validate required fields
    required_fields = ['task_id', 'npc_response', 'npc_state', 
                      'emotional_intensity', 'dialogue_acts',
                      'quality_naturalness', 'quality_consistency', 
                      'quality_appropriateness', 'confidence_score']
    
    for field in required_fields:
        if field not in data:
            return jsonify({'error': f'Missing field: {field}'}), 400
    
    # Save to database
    db = AnnotationDatabase()
    success = db.save_annotation(annotator_id, data['task_id'], data)
    
    if success:
        # Get annotator stats
        conn = sqlite3.connect('annotations.db')
        cursor = conn.cursor()
        cursor.execute('SELECT total_annotations FROM annotators WHERE id = ?', (annotator_id,))
        total = cursor.fetchone()[0]
        conn.close()
        
        return jsonify({
            'success': True,
            'total_annotations': total,
            'message': 'Annotation saved successfully'
        })
    else:
        return jsonify({'error': 'Failed to save annotation'}), 500

@app.route('/stats')
def get_stats():
    """L·∫•y statistics"""
    if 'annotator_id' not in session:
        return jsonify({'error': 'Not logged in'}), 401
    
    annotator_id = session['annotator_id']
    
    conn = sqlite3.connect('annotations.db')
    conn.row_factory = sqlite3.Row
    cursor = conn.cursor()
    
    # Annotator stats
    cursor.execute('''
        SELECT username, experience_level, total_annotations, avg_quality_score
        FROM annotators WHERE id = ?
    ''', (annotator_id,))
    annotator_stats = dict(cursor.fetchone())
    
    # Daily progress
    cursor.execute('''
        SELECT DATE(created_at) as date, COUNT(*) as count
        FROM annotations 
        WHERE annotator_id = ?
        GROUP BY DATE(created_at)
        ORDER BY date DESC
        LIMIT 7
    ''', (annotator_id,))
    daily_progress = [dict(row) for row in cursor.fetchall()]
    
    conn.close()
    
    return jsonify({
        'annotator': annotator_stats,
        'daily_progress': daily_progress
    })

# ==================== HTML TEMPLATES ====================

INDEX_TEMPLATE = '''
<!DOCTYPE html>
<html lang="vi">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>NPC Dialogue Annotation Tool</title>
    <style>
        :root {
            --primary: #2c3e50;
            --secondary: #3498db;
            --success: #27ae60;
            --warning: #f39c12;
            --danger: #e74c3c;
            --light: #ecf0f1;
            --dark: #2c3e50;
        }
        
        body {
            font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            min-height: 100vh;
            margin: 0;
            padding: 20px;
        }
        
        .container {
            max-width: 1200px;
            margin: 0 auto;
            background: white;
            border-radius: 15px;
            box-shadow: 0 20px 60px rgba(0,0,0,0.3);
            overflow: hidden;
        }
        
        .header {
            background: var(--primary);
            color: white;
            padding: 30px;
            text-align: center;
        }
        
        .header h1 {
            margin: 0;
            font-size: 2.5em;
        }
        
        .header p {
            margin: 10px 0 0;
            opacity: 0.8;
        }
        
        .main-content {
            padding: 30px;
            display: flex;
            gap: 30px;
        }
        
        .left-panel {
            flex: 1;
            background: var(--light);
            padding: 20px;
            border-radius: 10px;
        }
        
        .right-panel {
            flex: 2;
        }
        
        .login-form {
            text-align: center;
        }
        
        .login-form input {
            width: 80%;
            padding: 12px;
            margin: 10px 0;
            border: 2px solid #ddd;
            border-radius: 8px;
            font-size: 16px;
            transition: border-color 0.3s;
        }
        
        .login-form input:focus {
            outline: none;
            border-color: var(--secondary);
        }
        
        .btn {
            background: var(--secondary);
            color: white;
            border: none;
            padding: 12px 30px;
            border-radius: 8px;
            font-size: 16px;
            cursor: pointer;
            transition: all 0.3s;
            font-weight: bold;
        }
        
        .btn:hover {
            background: #2980b9;
            transform: translateY(-2px);
        }
        
        .btn-success {
            background: var(--success);
        }
        
        .btn-success:hover {
            background: #219653;
        }
        
        .task-container {
            background: white;
            border: 2px solid var(--light);
            border-radius: 10px;
            padding: 25px;
            margin-bottom: 20px;
        }
        
        .player-message {
            background: #e8f4fc;
            padding: 20px;
            border-radius: 8px;
            margin-bottom: 20px;
            border-left: 4px solid var(--secondary);
        }
        
        .player-message h3 {
            margin: 0 0 10px 0;
            color: var(--primary);
        }
        
        .context-box {
            background: #fff8e1;
            padding: 15px;
            border-radius: 8px;
            margin-bottom: 20px;
            font-style: italic;
            color: #666;
        }
        
        .form-group {
            margin-bottom: 20px;
        }
        
        label {
            display: block;
            margin-bottom: 8px;
            font-weight: bold;
            color: var(--dark);
        }
        
        textarea {
            width: 100%;
            padding: 12px;
            border: 2px solid #ddd;
            border-radius: 8px;
            font-size: 16px;
            font-family: inherit;
            resize: vertical;
            min-height: 100px;
        }
        
        textarea:focus {
            outline: none;
            border-color: var(--secondary);
        }
        
        .radio-group {
            display: flex;
            gap: 20px;
            flex-wrap: wrap;
        }
        
        .radio-item {
            flex: 1;
            min-width: 120px;
        }
        
        .radio-item input {
            margin-right: 8px;
        }
        
        .quality-slider {
            display: flex;
            align-items: center;
            gap: 10px;
        }
        
        .slider-value {
            font-weight: bold;
            color: var(--secondary);
            min-width: 30px;
        }
        
        input[type="range"] {
            flex: 1;
        }
        
        .stats-panel {
            background: var(--light);
            padding: 20px;
            border-radius: 10px;
            margin-top: 20px;
        }
        
        .stat-item {
            display: flex;
            justify-content: space-between;
            padding: 8px 0;
            border-bottom: 1px solid #ddd;
        }
        
        .stat-value {
            font-weight: bold;
            color: var(--secondary);
        }
        
        .state-indicator {
            display: inline-block;
            padding: 4px 12px;
            border-radius: 20px;
            font-size: 12px;
            font-weight: bold;
            text-transform: uppercase;
        }
        
        .state-normal { background: #d4edda; color: #155724; }
        .state-alert { background: #fff3cd; color: #856404; }
        .state-combat { background: #f8d7da; color: #721c24; }
        .state-injured { background: #d1ecf1; color: #0c5460; }
        
        .emotion-scale {
            display: flex;
            justify-content: space-between;
            margin-top: 5px;
        }
        
        .emotion-level {
            text-align: center;
            font-size: 12px;
            color: #666;
        }
        
        .progress-bar {
            width: 100%;
            height: 10px;
            background: #ddd;
            border-radius: 5px;
            margin: 20px 0;
            overflow: hidden;
        }
        
        .progress-fill {
            height: 100%;
            background: var(--success);
            transition: width 0.3s;
        }
        
        @media (max-width: 768px) {
            .main-content {
                flex-direction: column;
            }
        }
    </style>
</head>
<body>
    <div class="container">
        <div class="header">
            <h1>üè∞ NPC Dialogue Annotation</h1>
            <p>H·ªó tr·ª£ nghi√™n c·ª©u AI cho Game NPC - Phi√™n b·∫£n Beta</p>
        </div>
        
        <div class="main-content">
            <div class="left-panel">
                <div id="login-section">
                    <div class="login-form">
                        <h3>üîê ƒêƒÉng nh·∫≠p</h3>
                        <input type="text" id="username" placeholder="T√™n annotator c·ªßa b·∫°n">
                        <button class="btn" onclick="login()">B·∫Øt ƒë·∫ßu annotation</button>
                    </div>
                </div>
                
                <div id="stats-section" style="display: none;">
                    <div class="stats-panel">
                        <h3>üìä Th·ªëng k√™ c·ªßa b·∫°n</h3>
                        <div class="stat-item">
                            <span>T·ªïng annotations:</span>
                            <span class="stat-value" id="total-annotations">0</span>
                        </div>
                        <div class="stat-item">
                            <span>H√¥m nay:</span>
                            <span class="stat-value" id="today-annotations">0</span>
                        </div>
                        <div class="stat-item">
                            <span>Ch·∫•t l∆∞·ª£ng trung b√¨nh:</span>
                            <span class="stat-value" id="avg-quality">0.0</span>
                        </div>
                        <div class="progress-bar">
                            <div class="progress-fill" id="progress-fill" style="width: 0%"></div>
                        </div>
                        <p style="text-align: center; font-size: 14px; color: #666;">
                            M·ª•c ti√™u: 20 annotations/ng√†y
                        </p>
                    </div>
                </div>
            </div>
            
            <div class="right-panel">
                <div id="task-section" style="display: none;">
                    <div class="task-container">
                        <div class="player-message">
                            <h3>üí¨ Player Message:</h3>
                            <p id="player-message-text">...</p>
                        </div>
                        
                        <div class="context-box" id="context-box" style="display: none;">
                            <strong>üìå Context:</strong> 
                            <span id="context-text">...</span>
                        </div>
                        
                        <form id="annotation-form">
                            <div class="form-group">
                                <label for="npc-response">‚úçÔ∏è NPC Response (1-2 c√¢u):</label>
                                <textarea id="npc-response" placeholder="Nh·∫≠p c√¢u tr·∫£ l·ªùi c·ªßa NPC..."></textarea>
                            </div>
                            
                            <div class="form-group">
                                <label>üé≠ NPC State:</label>
                                <div class="radio-group">
                                    <div class="radio-item">
                                        <input type="radio" id="state-normal" name="npc-state" value="normal">
                                        <label for="state-normal" class="state-indicator state-normal">Normal</label>
                                    </div>
                                    <div class="radio-item">
                                        <input type="radio" id="state-alert" name="npc-state" value="alert">
                                        <label for="state-alert" class="state-indicator state-alert">Alert</label>
                                    </div>
                                    <div class="radio-item">
                                        <input type="radio" id="state-combat" name="npc-state" value="combat">
                                        <label for="state-combat" class="state-indicator state-combat">Combat</label>
                                    </div>
                                    <div class="radio-item">
                                        <input type="radio" id="state-injured" name="npc-state" value="injured">
                                        <label for="state-injured" class="state-indicator state-injured">Injured</label>
                                    </div>
                                </div>
                            </div>
                            
                            <div class="form-group">
                                <label>üò† Emotional Intensity:</label>
                                <div class="quality-slider">
                                    <span class="slider-value" id="emotion-value">3</span>
                                    <input type="range" id="emotional-intensity" min="1" max="5" value="3" 
                                           oninput="document.getElementById('emotion-value').textContent = this.value">
                                </div>
                                <div class="emotion-scale">
                                    <div class="emotion-level">1: Neutral</div>
                                    <div class="emotion-level">2: Mild</div>
                                    <div class="emotion-level">3: Medium</div>
                                    <div class="emotion-level">4: Strong</div>
                                    <div class="emotion-level">5: Very Strong</div>
                                </div>
                            </div>
                            
                            <div class="form-group">
                                <label>üó£Ô∏è Dialogue Acts (ch·ªçn t·∫•t c·∫£ ph√π h·ª£p):</label>
                                <div class="radio-group">
                                    <div class="radio-item">
                                        <input type="checkbox" id="act-greeting" value="greeting">
                                        <label for="act-greeting">Greeting</label>
                                    </div>
                                    <div class="radio-item">
                                        <input type="checkbox" id="act-threat" value="threat">
                                        <label for="act-threat">Threat</label>
                                    </div>
                                    <div class="radio-item">
                                        <input type="checkbox" id="act-surrender" value="surrender">
                                        <label for="act-surrender">Surrender</label>
                                    </div>
                                    <div class="radio-item">
                                        <input type="checkbox" id="act-request" value="request">
                                        <label for="act-request">Request</label>
                                    </div>
                                </div>
                            </div>
                            
                            <h3>üìù Quality Assessment:</h3>
                            
                            <div class="form-group">
                                <label>üéØ Naturalness:</label>
                                <div class="quality-slider">
                                    <span class="slider-value" id="naturalness-value">3</span>
                                    <input type="range" id="quality-naturalness" min="1" max="5" value="3"
                                           oninput="document.getElementById('naturalness-value').textContent = this.value">
                                </div>
                            </div>
                            
                            <div class="form-group">
                                <label>üë§ Character Consistency:</label>
                                <div class="quality-slider">
                                    <span class="slider-value" id="consistency-value">3</span>
                                    <input type="range" id="quality-consistency" min="1" max="5" value="3"
                                           oninput="document.getElementById('consistency-value').textContent = this.value">
                                </div>
                            </div>
                            
                            <div class="form-group">
                                <label>‚úÖ Appropriateness:</label>
                                <div class="quality-slider">
                                    <span class="slider-value" id="appropriateness-value">3</span>
                                    <input type="range" id="quality-appropriateness" min="1" max="5" value="3"
                                           oninput="document.getElementById('appropriateness-value').textContent = this.value">
                                </div>
                            </div>
                            
                            <div class="form-group">
                                <label>üí™ Confidence in your annotation:</label>
                                <div class="quality-slider">
                                    <span class="slider-value" id="confidence-value">3</span>
                                    <input type="range" id="confidence-score" min="1" max="5" value="3"
                                           oninput="document.getElementById('confidence-value').textContent = this.value">
                                </div>
                            </div>
                            
                            <div style="text-align: center; margin-top: 30px;">
                                <button type="button" class="btn btn-success" onclick="submitAnnotation()">
                                    ‚úÖ Submit Annotation
                                </button>
                                <button type="button" class="btn" onclick="skipTask()" style="margin-left: 10px;">
                                    ‚è≠Ô∏è Skip Task
                                </button>
                            </div>
                        </form>
                    </div>
                </div>
                
                <div id="completed-section" style="display: none; text-align: center; padding: 50px;">
                    <h2>üéâ Ho√†n th√†nh!</h2>
                    <p>B·∫°n ƒë√£ ho√†n th√†nh t·∫•t c·∫£ tasks hi·ªán c√≥.</p>
                    <p>C·∫£m ∆°n b·∫°n ƒë√£ ƒë√≥ng g√≥p cho nghi√™n c·ª©u!</p>
                    <button class="btn" onclick="loadStats()">Xem th·ªëng k√™</button>
                </div>
            </div>
        </div>
    </div>
    
    <script>
        let currentTask = null;
        let startTime = null;
        
        async function login() {
            const username = document.getElementById('username').value;
            if (!username) {
                alert('Vui l√≤ng nh·∫≠p t√™n annotator');
                return;
            }
            
            const response = await fetch('/login', {
                method: 'POST',
                headers: {'Content-Type': 'application/json'},
                body: JSON.stringify({username})
            });
            
            if (response.ok) {
                document.getElementById('login-section').style.display = 'none';
                document.getElementById('stats-section').style.display = 'block';
                document.getElementById('task-section').style.display = 'block';
                await loadStats();
                await getNextTask();
            }
        }
        
        async function getNextTask() {
            startTime = Date.now();
            const response = await fetch('/task');
            
            if (response.status === 404) {
                document.getElementById('task-section').style.display = 'none';
                document.getElementById('completed-section').style.display = 'block';
                return;
            }
            
            if (!response.ok) {
                alert('L·ªói khi l·∫•y task');
                return;
            }
            
            const data = await response.json();
            currentTask = data.task;
            
            document.getElementById('player-message-text').textContent = currentTask.player_message;
            
            if (currentTask.context) {
                document.getElementById('context-box').style.display = 'block';
                document.getElementById('context-text').textContent = currentTask.context;
            } else {
                document.getElementById('context-box').style.display = 'none';
            }
            
            // Reset form
            document.getElementById('annotation-form').reset();
            document.getElementById('emotion-value').textContent = '3';
            document.getElementById('naturalness-value').textContent = '3';
            document.getElementById('consistency-value').textContent = '3';
            document.getElementById('appropriateness-value').textContent = '3';
            document.getElementById('confidence-value').textContent = '3';
        }
        
        async function submitAnnotation() {
            const annotationTime = Math.floor((Date.now() - startTime) / 1000);
            
            const dialogueActs = [];
            ['greeting', 'threat', 'surrender', 'request'].forEach(act => {
                if (document.getElementById(`act-${act}`).checked) {
                    dialogueActs.push(act);
                }
            });
            
            const annotationData = {
                task_id: currentTask.id,
                npc_response: document.getElementById('npc-response').value,
                npc_state: document.querySelector('input[name="npc-state"]:checked')?.value,
                emotional_intensity: parseInt(document.getElementById('emotional-intensity').value),
                dialogue_acts: dialogueActs,
                quality_naturalness: parseInt(document.getElementById('quality-naturalness').value),
                quality_consistency: parseInt(document.getElementById('quality-consistency').value),
                quality_appropriateness: parseInt(document.getElementById('quality-appropriateness').value),
                confidence_score: parseInt(document.getElementById('confidence-score').value),
                annotation_time_seconds: annotationTime
            };
            
            // Validation
            if (!annotationData.npc_response.trim()) {
                alert('Vui l√≤ng nh·∫≠p NPC response');
                return;
            }
            
            if (!annotationData.npc_state) {
                alert('Vui l√≤ng ch·ªçn NPC state');
                return;
            }
            
            const response = await fetch('/submit', {
                method: 'POST',
                headers: {'Content-Type': 'application/json'},
                body: JSON.stringify(annotationData)
            });
            
            if (response.ok) {
                const result = await response.json();
                alert('‚úÖ Annotation saved! Total: ' + result.total_annotations);
                await loadStats();
                await getNextTask();
            } else {
                alert('‚ùå Error saving annotation');
            }
        }
        
        async function skipTask() {
            await getNextTask();
        }
        
        async function loadStats() {
            const response = await fetch('/stats');
            if (response.ok) {
                const data = await response.json();
                document.getElementById('total-annotations').textContent = data.annotator.total_annotations;
                document.getElementById('avg-quality').textContent = data.annotator.avg_quality_score.toFixed(1);
                
                // Calculate today's annotations
                const today = new Date().toISOString().split('T')[0];
                const todayCount = data.daily_progress.find(d => d.date === today)?.count || 0;
                document.getElementById('today-annotations').textContent = todayCount;
                
                // Update progress bar
                const progress = (todayCount / 20) * 100;
                document.getElementById('progress-fill').style.width = Math.min(progress, 100) + '%';
            }
        }
    </script>
</body>
</html>
'''

# T·∫°o HTML template n·∫øu ch∆∞a t·ªìn t·∫°i
def create_html_template():
    """T·∫°o HTML template cho annotation tool"""
    os.makedirs("templates", exist_ok=True)
    
    with open("templates/index.html", "w", encoding="utf-8") as f:
        f.write(INDEX_TEMPLATE)

# ==================== MAIN EXECUTION ====================
if __name__ == "__main__":
    print("üöÄ Kh·ªüi ƒë·ªông NPC Dialogue Annotation Tool...")
    
    # T·∫°o database
    db = AnnotationDatabase()
    
    # T·∫°o HTML template
    create_html_template()
    
    # T·∫°o pilot tasks
    generator = AnnotationTaskGenerator(db)
    task_ids = generator.generate_pilot_tasks(20)
    print(f"‚úÖ ƒê√£ t·∫°o {len(task_ids)} pilot tasks")
    
    print("\nüîß Database schema ƒë√£ s·∫µn s√†ng!")
    print("üìä Tables created:")
    print("  - annotators (l∆∞u th√¥ng tin annotator)")
    print("  - tasks (l∆∞u c√°c task c·∫ßn annotation)")
    print("  - annotations (l∆∞u k·∫øt qu·∫£ annotation)")
    print("  - gold_standards (l∆∞u gold standards cho IAA)")
    
    print("\nüåê Kh·ªüi ƒë·ªông web server...")
    print("üëâ Truy c·∫≠p: http://localhost:5000")
    print("üëâ Username test: annotator1, annotator2, annotator3")
    
    # Kh·ªüi ƒë·ªông Flask app
    app.run(debug=True, port=5000)


üöÄ Kh·ªüi ƒë·ªông NPC Dialogue Annotation Tool...


OperationalError: unrecognized token: "#"

In [None]:
"""
quality_control.py

D·ª±a tr√™n nghi√™n c·ª©u:
1. "Measuring Inter-Annotator Agreement for Dialogue Annotation" (ACL 2022)
2. "Quality Control in Crowdsourced Annotation" (EMNLP 2023)
3. "Fleiss' Kappa and Krippendorff's Alpha for Dialogue" (2021)
"""

import pandas as pd
import numpy as np
from typing import Dict, List, Tuple
import json
from collections import defaultdict
from sklearn.metrics import cohen_kappa_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
import itertools

class InterAnnotatorAgreement:
    """T√≠nh to√°n Inter-Annotator Agreement d·ª±a tr√™n best practices"""
    
    def __init__(self, annotation_db_path="annotations.db"):
        self.db_path = annotation_db_path
        self.results = {}
        
    def load_annotations(self) -> pd.DataFrame:
        """Load annotations t·ª´ database"""
        import sqlite3
        
        conn = sqlite3.connect(self.db_path)
        query = '''
        SELECT 
            a.task_id,
            a.annotator_id,
            a.npc_response,
            a.npc_state,
            a.emotional_intensity,
            a.dialogue_acts,
            a.quality_naturalness,
            a.quality_consistency,
            a.quality_appropriateness,
            t.player_message,
            t.context
        FROM annotations a
        JOIN tasks t ON a.task_id = t.id
        '''
        
        df = pd.read_sql_query(query, conn)
        conn.close()
        
        return df
    
    def calculate_fleiss_kappa(self, annotations_df: pd.DataFrame, 
                               task_ids: List[int] = None) -> Dict:
        """
        T√≠nh Fleiss' Kappa cho categorical variables
        D·ª±a tr√™n "The Measurement of Observer Agreement for Categorical Data" (Fleiss, 1971)
        """
        
        if task_ids is None:
            # L·∫•y c√°c tasks c√≥ √≠t nh·∫•t 3 annotations (cho pilot study)
            task_counts = annotations_df['task_id'].value_counts()
            task_ids = task_counts[task_counts >= 3].index.tolist()
        
        results = {}
        
        # 1. T√≠nh cho NPC State
        state_kappa = self._calculate_fleiss_for_category(
            annotations_df, task_ids, 'npc_state'
        )
        results['npc_state'] = {
            'fleiss_kappa': state_kappa,
            'interpretation': self._interpret_kappa(state_kappa)
        }
        
        # 2. T√≠nh cho Emotional Intensity
        emotion_kappa = self._calculate_fleiss_for_category(
            annotations_df, task_ids, 'emotional_intensity'
        )
        results['emotional_intensity'] = {
            'fleiss_kappa': emotion_kappa,
            'interpretation': self._interpret_kappa(emotion_kappa)
        }
        
        return results
    
    def _calculate_fleiss_for_category(self, df: pd.DataFrame, 
                                       task_ids: List[int], 
                                       category: str) -> float:
        """T√≠nh Fleiss' Kappa cho m·ªôt category c·ª• th·ªÉ"""
        
        # T·∫°o rating matrix
        categories = sorted(df[category].unique())
        n_categories = len(categories)
        
        # L·ªçc tasks c√≥ multiple annotations
        multi_annot_tasks = []
        for task_id in task_ids:
            task_df = df[df['task_id'] == task_id]
            if len(task_df) >= 2:  # √çt nh·∫•t 2 annotations
                multi_annot_tasks.append(task_id)
        
        n_items = len(multi_annot_tasks)
        if n_items == 0:
            return 0.0
        
        # Kh·ªüi t·∫°o rating matrix
        rating_matrix = np.zeros((n_items, n_categories))
        
        # ƒêi·ªÅn matrix
        for i, task_id in enumerate(multi_annot_tasks):
            task_df = df[df['task_id'] == task_id]
            for annot_idx, row in task_df.iterrows():
                cat_value = row[category]
                cat_index = categories.index(cat_value)
                rating_matrix[i, cat_index] += 1
        
        # T√≠nh Fleiss' Kappa
        n_annotators = rating_matrix.sum(axis=1).mean()
        
        # T√≠nh Pj (proportion of assignments to category j)
        Pj = rating_matrix.sum(axis=0) / (n_items * n_annotators)
        
        # T√≠nh Pi (proportion of agreeing pairs for item i)
        Pi = ((rating_matrix ** 2).sum(axis=1) - n_annotators) / (n_annotators * (n_annotators - 1))
        
        # T√≠nh Pbar (mean of Pi)
        Pbar = Pi.mean()
        
        # T√≠nh Pbar_e (expected agreement by chance)
        Pbar_e = (Pj ** 2).sum()
        
        # T√≠nh Kappa
        if Pbar_e == 1:
            kappa = 1.0
        else:
            kappa = (Pbar - Pbar_e) / (1 - Pbar_e)
        
        return kappa
    
    def calculate_krippendorff_alpha(self, annotations_df: pd.DataFrame) -> Dict:
        """
        T√≠nh Krippendorff's Alpha - robust h∆°n cho small samples
        D·ª±a tr√™n "Content Analysis: An Introduction to Its Methodology" (Krippendorff, 2018)
        """
        
        # ƒê∆°n gi·∫£n h√≥a: s·ª≠ d·ª•ng scipy's kendall tau cho ordinal data
        results = {}
        
        # T√≠nh cho c√°c c·∫∑p annotators
        annotator_pairs = self._get_annotator_pairs(annotations_df)
        
        alpha_scores = []
        for ann1, ann2 in annotator_pairs:
            # L·∫•y c√°c tasks m√† c·∫£ 2 c√πng annotate
            common_tasks = self._get_common_tasks(annotations_df, ann1, ann2)
            
            if len(common_tasks) >= 2:
                # T√≠nh agreement cho t·ª´ng category
                for category in ['npc_state', 'emotional_intensity']:
                    values1 = []
                    values2 = []
                    
                    for task_id in common_tasks:
                        val1 = annotations_df[
                            (annotations_df['task_id'] == task_id) & 
                            (annotations_df['annotator_id'] == ann1)
                        ][category].iloc[0]
                        
                        val2 = annotations_df[
                            (annotations_df['task_id'] == task_id) & 
                            (annotations_df['annotator_id'] == ann2)
                        ][category].iloc[0]
                        
                        values1.append(val1)
                        values2.append(val2)
                    
                    # T√≠nh agreement
                    if category == 'emotional_intensity':  # Ordinal
                        tau, _ = stats.kendalltau(values1, values2)
                        alpha_scores.append(tau)
                    else:  # Nominal
                        agreement = sum(v1 == v2 for v1, v2 in zip(values1, values2)) / len(values1)
                        alpha_scores.append(agreement)
        
        if alpha_scores:
            avg_alpha = np.mean(alpha_scores)
        else:
            avg_alpha = 0.0
        
        results['krippendorff_alpha'] = {
            'value': avg_alpha,
            'interpretation': self._interpret_alpha(avg_alpha),
            'n_pairs': len(annotator_pairs)
        }
        
        return results
    
    def _get_annotator_pairs(self, df: pd.DataFrame) -> List[Tuple]:
        """L·∫•y t·∫•t c·∫£ c·∫∑p annotators"""
        annotators = df['annotator_id'].unique()
        return list(itertools.combinations(annotators, 2))
    
    def _get_common_tasks(self, df: pd.DataFrame, 
                         annotator1: int, 
                         annotator2: int) -> List[int]:
        """L·∫•y tasks m√† c·∫£ 2 annotators c√πng annotate"""
        tasks1 = set(df[df['annotator_id'] == annotator1]['task_id'])
        tasks2 = set(df[df['annotator_id'] == annotator2]['task_id'])
        return list(tasks1.intersection(tasks2))
    
    def calculate_semantic_similarity(self, annotations_df: pd.DataFrame) -> Dict:
        """
        T√≠nh semantic similarity gi·ªØa c√°c annotations s·ª≠ d·ª•ng TF-IDF cosine similarity
        D·ª±a tr√™n "Semantic Similarity for Text Quality Assessment" (2022)
        """
        
        # Group by task
        task_groups = annotations_df.groupby('task_id')
        
        similarities = []
        
        for task_id, group in task_groups:
            if len(group) >= 2:
                responses = group['npc_response'].tolist()
                
                # T√≠nh TF-IDF vectors
                vectorizer = TfidfVectorizer()
                try:
                    tfidf_matrix = vectorizer.fit_transform(responses)
                    
                    # T√≠nh pairwise cosine similarities
                    cosine_sim = cosine_similarity(tfidf_matrix)
                    
                    # L·∫•y upper triangle (kh√¥ng t√≠nh diagonal)
                    n = len(responses)
                    for i in range(n):
                        for j in range(i+1, n):
                            similarities.append(cosine_sim[i, j])
                except:
                    # N·∫øu kh√¥ng c√≥ t·ª´ n√†o chung
                    continue
        
        if similarities:
            avg_similarity = np.mean(similarities)
            std_similarity = np.std(similarities)
        else:
            avg_similarity = 0.0
            std_similarity = 0.0
        
        return {
            'semantic_similarity': {
                'mean': avg_similarity,
                'std': std_similarity,
                'n_comparisons': len(similarities),
                'interpretation': self._interpret_similarity(avg_similarity)
            }
        }
    
    def _interpret_kappa(self, kappa: float) -> str:
        """Interpret Fleiss' Kappa theo Landis & Koch (1977)"""
        if kappa < 0:
            return "Poor agreement"
        elif kappa <= 0.2:
            return "Slight agreement"
        elif kappa <= 0.4:
            return "Fair agreement"
        elif kappa <= 0.6:
            return "Moderate agreement"
        elif kappa <= 0.8:
            return "Substantial agreement"
        else:
            return "Almost perfect agreement"
    
    def _interpret_alpha(self, alpha: float) -> str:
        """Interpret Krippendorff's Alpha"""
        if alpha < 0.667:
            return "Unreliable"
        elif alpha < 0.8:
            return "Marginally reliable"
        elif alpha < 0.9:
            return "Reliable"
        else:
            return "Highly reliable"
    
    def _interpret_similarity(self, similarity: float) -> str:
        """Interpret semantic similarity"""
        if similarity < 0.3:
            return "Low similarity - high diversity"
        elif similarity < 0.6:
            return "Moderate similarity"
        elif similarity < 0.8:
            return "High similarity"
        else:
            return "Very high similarity - possible copying"
    
    def generate_iaa_report(self, output_path="iaa_report.md"):
        """T·∫°o comprehensive IAA report"""
        
        print("üìä ƒêang t√≠nh Inter-Annotator Agreement...")
        
        # Load annotations
        df = self.load_annotations()
        
        if len(df) == 0:
            print("‚ö†Ô∏è Kh√¥ng c√≥ annotations ƒë·ªÉ ph√¢n t√≠ch")
            return
        
        # T√≠nh c√°c metrics
        fleiss_results = self.calculate_fleiss_kappa(df)
        alpha_results = self.calculate_krippendorff_alpha(df)
        similarity_results = self.calculate_semantic_similarity(df)
        
        # T·∫°o report
        report = f"""# INTER-ANNOTATOR AGREEMENT REPORT

## üìà T·ªïng quan

**Ng√†y t·∫°o:** {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M')}
**T·ªïng s·ªë annotations:** {len(df)}
**S·ªë annotators:** {df['annotator_id'].nunique()}
**S·ªë tasks:** {df['task_id'].nunique()}

## üìä Agreement Statistics

### 1. Fleiss' Kappa
| Category | Kappa | Interpretation |
|----------|-------|----------------|
| NPC State | {fleiss_results['npc_state']['fleiss_kappa']:.3f} | {fleiss_results['npc_state']['interpretation']} |
| Emotional Intensity | {fleiss_results['emotional_intensity']['fleiss_kappa']:.3f} | {fleiss_results['emotional_intensity']['interpretation']} |

*Ghi ch√∫: Kappa > 0.6 ƒë∆∞·ª£c coi l√† acceptable cho nghi√™n c·ª©u (Landis & Koch, 1977)*

### 2. Krippendorff's Alpha
**Alpha:** {alpha_results['krippendorff_alpha']['value']:.3f}
**Interpretation:** {alpha_results['krippendorff_alpha']['interpretation']}
**S·ªë c·∫∑p annotators:** {alpha_results['krippendorff_alpha']['n_pairs']}

*Ghi ch√∫: Alpha > 0.8 ƒë∆∞·ª£c coi l√† reliable (Krippendorff, 2018)*

### 3. Semantic Similarity
**Mean similarity:** {similarity_results['semantic_similarity']['mean']:.3f}
**Standard deviation:** {similarity_results['semantic_similarity']['std']:.3f}
**Interpretation:** {similarity_results['semantic_similarity']['interpretation']}
**S·ªë so s√°nh:** {similarity_results['semantic_similarity']['n_comparisons']}

## üìà Ph√¢n b·ªë annotations

### Ph√¢n b·ªë theo NPC State:

In [None]:

### Ch·∫•t l∆∞·ª£ng trung b√¨nh:
#- Naturalness: {df['quality_naturalness'].mean():.2f}/5
#- Consistency: {df['quality_consistency'].mean():.2f}/5  
#- Appropriateness: {df['quality_appropriateness'].mean():.2f}/5

### Confidence scores:
#- Mean confidence: {df['confidence_score'].mean():.2f}/5

## üìä Per-Annotator Statistics

"""
        
        # Th√™m statistics theo t·ª´ng annotator
        annotator_stats = df.groupby('annotator_id').agg({
            'quality_naturalness': 'mean',
            'quality_consistency': 'mean',
            'quality_appropriateness': 'mean',
            'confidence_score': 'mean',
            'task_id': 'count'
        }).rename(columns={'task_id': 'annotation_count'})
        
        report += annotator_stats.to_markdown()
        
        report += """

## üéØ Recommendations

### N·∫øu Kappa < 0.6:
1. **T·ªï ch·ª©c training session l·∫°i** v·ªõi examples r√µ r√†ng h∆°n
2. **Clarify annotation guidelines** cho c√°c category g√¢y confusion
3. **Th√™m more examples** cho c√°c edge cases
4. **Consider merging categories** n·∫øu qu√° ambiguous

### N·∫øu Semantic Similarity qu√° cao (>0.8):
1. **Ki·ªÉm tra copying** gi·ªØa c√°c annotators
2. **Th√™m response diversity requirements**
3. **Khuy·∫øn kh√≠ch creativity** trong guidelines

### N·∫øu Semantic Similarity qu√° th·∫•p (<0.3):
1. **ƒê·∫£m b·∫£o character consistency** ƒë∆∞·ª£c gi·ªØ v·ªØng
2. **Review guidelines** ƒë·ªÉ ƒë·∫£m b·∫£o clear expectations
3. **Th√™m constraints** ƒë·ªÉ gi·ªØ response trong ph·∫°m vi ch·∫•p nh·∫≠n ƒë∆∞·ª£c

### N·∫øu Annotation Time qu√° ng·∫Øn (<30s):
1. **Ki·ªÉm tra quality** c·ªßa annotations
2. **ƒê·∫£m b·∫£o annotators kh√¥ng rush**
3. **Th√™m minimum time requirements**

## üìä Visualizations (xem file plots/)

C√°c bi·ªÉu ƒë·ªì ƒë√£ ƒë∆∞·ª£c l∆∞u trong th∆∞ m·ª•c `plots/`:
1. `state_distribution.png` - Ph√¢n b·ªë NPC states
2. `emotion_distribution.png` - Ph√¢n b·ªë emotional intensity
3. `quality_scores.png` - Ph√¢n b·ªë quality scores
4. `annotator_agreement.png` - Agreement heatmap
5. `annotation_time_distribution.png` - Ph√¢n b·ªë th·ªùi gian annotation

## üöÄ Next Steps

1. **N·∫øu IAA acceptable (>0.6):** Ti·∫øn h√†nh annotation full dataset
2. **N·∫øu IAA marginal (0.4-0.6):** ƒêi·ªÅu ch·ªânh guidelines v√† ch·∫°y pilot l·∫°i
3. **N·∫øu IAA poor (<0.4):** Xem x√©t l·∫°i to√†n b·ªô annotation framework

### Action Plan:
- [ ] Review IAA scores v·ªõi team
- [ ] Identify problematic categories
- [ ] Update guidelines n·∫øu c·∫ßn
- [ ] Retrain annotators n·∫øu c·∫ßn
- [ ] Run another pilot n·∫øu scores th·∫•p

---
*Report generated v·ªõi methodology t·ª´:*
*- Fleiss, J. L. (1971). "Measuring nominal scale agreement among many raters"*
*- Krippendorff, K. (2018). "Content Analysis: An Introduction to Its Methodology"*
*- Landis, J. R., & Koch, G. G. (1977). "The measurement of observer agreement"*
"""
        
        # L∆∞u report
        os.makedirs(os.path.dirname(output_path) if os.path.dirname(output_path) else '.', exist_ok=True)
        
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(report)
        
        # T·∫°o visualizations
        self.create_visualizations(df)
        
        # L∆∞u detailed results
        self.save_detailed_results(df, fleiss_results, alpha_results, similarity_results, cohen_results)
        
        print(f"‚úÖ ƒê√£ t·∫°o IAA report: {output_path}")
        
        return {
            'fleiss': fleiss_results,
            'alpha': alpha_results,
            'similarity': similarity_results,
            'cohen': cohen_results,
            'time_stats': time_stats
        }
    
    def create_visualizations(self, df: pd.DataFrame):
        """T·∫°o visualizations cho IAA analysis"""
        
        import matplotlib
        matplotlib.use('Agg')  # For headless environments
        
        os.makedirs("plots", exist_ok=True)
        
        # 1. NPC State Distribution
        plt.figure(figsize=(10, 6))
        state_counts = df['npc_state'].value_counts()
        colors = ['#4CAF50', '#FFC107', '#F44336', '#2196F3']  # green, yellow, red, blue
        
        # ƒê·∫£m b·∫£o t·∫•t c·∫£ states ƒë·ªÅu c√≥ trong bi·ªÉu ƒë·ªì
        all_states = ['normal', 'alert', 'combat', 'injured']
        for state in all_states:
            if state not in state_counts.index:
                state_counts[state] = 0
        
        state_counts = state_counts.reindex(all_states)
        
        bars = plt.bar(state_counts.index, state_counts.values, color=colors)
        
        plt.title('Ph√¢n b·ªë NPC States', fontsize=16, fontweight='bold')
        plt.xlabel('NPC State', fontsize=12)
        plt.ylabel('S·ªë annotations', fontsize=12)
        
        # Th√™m s·ªë tr√™n m·ªói bar
        for bar in bars:
            height = bar.get_height()
            plt.text(bar.get_x() + bar.get_width()/2., height,
                    f'{int(height)}', ha='center', va='bottom')
        
        plt.tight_layout()
        plt.savefig('plots/state_distribution.png', dpi=300, bbox_inches='tight')
        plt.close()
        
        # 2. Emotional Intensity Distribution
        plt.figure(figsize=(10, 6))
        emotion_counts = df['emotional_intensity'].value_counts().sort_index()
        
        # ƒê·∫£m b·∫£o t·∫•t c·∫£ levels ƒë·ªÅu c√≥
        for level in range(1, 6):
            if level not in emotion_counts.index:
                emotion_counts[level] = 0
        
        emotion_counts = emotion_counts.sort_index()
        
        # T·∫°o gradient color t·ª´ xanh (calm) ƒë·∫øn ƒë·ªè (intense)
        colors = plt.cm.RdYlGn_r(np.linspace(0, 1, len(emotion_counts)))
        
        bars = plt.bar(emotion_counts.index.astype(str), emotion_counts.values, color=colors)
        
        plt.title('Ph√¢n b·ªë Emotional Intensity', fontsize=16, fontweight='bold')
        plt.xlabel('Emotional Intensity Level', fontsize=12)
        plt.ylabel('S·ªë annotations', fontsize=12)
        
        # Th√™m m√¥ t·∫£ cho t·ª´ng level
        emotion_labels = {
            '1': 'Neutral\n(1)',
            '2': 'Mild\n(2)', 
            '3': 'Medium\n(3)',
            '4': 'Strong\n(4)',
            '5': 'Very Strong\n(5)'
        }
        
        plt.xticks(list(emotion_counts.index.astype(str)), 
                  [emotion_labels[str(i)] for i in emotion_counts.index])
        
        for bar in bars:
            height = bar.get_height()
            plt.text(bar.get_x() + bar.get_width()/2., height,
                    f'{int(height)}', ha='center', va='bottom')
        
        plt.tight_layout()
        plt.savefig('plots/emotion_distribution.png', dpi=300, bbox_inches='tight')
        plt.close()
        
        # 3. Quality Scores Distribution
        plt.figure(figsize=(12, 8))
        
        quality_metrics = ['quality_naturalness', 'quality_consistency', 'quality_appropriateness']
        metric_labels = ['Naturalness', 'Consistency', 'Appropriateness']
        
        data = [df[metric] for metric in quality_metrics]
        
        box = plt.boxplot(data, labels=metric_labels, patch_artist=True)
        
        # M√†u cho c√°c box
        colors = ['#FF9999', '#99FF99', '#9999FF']
        for patch, color in zip(box['boxes'], colors):
            patch.set_facecolor(color)
        
        plt.title('Ph√¢n b·ªë Quality Scores', fontsize=16, fontweight='bold')
        plt.ylabel('Score (1-5)', fontsize=12)
        plt.ylim(0.5, 5.5)
        
        # Th√™m mean values
        for i, metric in enumerate(quality_metrics, 1):
            mean_val = df[metric].mean()
            plt.text(i, 5.2, f'Mean: {mean_val:.2f}', 
                    ha='center', va='bottom', fontsize=10)
        
        plt.grid(True, alpha=0.3)
        plt.tight_layout()
        plt.savefig('plots/quality_scores.png', dpi=300, bbox_inches='tight')
        plt.close()
        
        # 4. Annotation Time Distribution (n·∫øu c√≥)
        if 'annotation_time_seconds' in df.columns and not df['annotation_time_seconds'].isna().all():
            plt.figure(figsize=(10, 6))
            
            times = df['annotation_time_seconds'].dropna()
            
            plt.hist(times, bins=30, edgecolor='black', alpha=0.7)
            plt.axvline(times.mean(), color='red', linestyle='dashed', linewidth=2, label=f'Mean: {times.mean():.1f}s')
            plt.axvline(times.median(), color='green', linestyle='dashed', linewidth=2, label=f'Median: {times.median():.1f}s')
            
            plt.title('Ph√¢n b·ªë Th·ªùi gian Annotation', fontsize=16, fontweight='bold')
            plt.xlabel('Th·ªùi gian (gi√¢y)', fontsize=12)
            plt.ylabel('S·ªë annotations', fontsize=12)
            plt.legend()
            plt.grid(True, alpha=0.3)
            plt.tight_layout()
            plt.savefig('plots/annotation_time_distribution.png', dpi=300, bbox_inches='tight')
            plt.close()
        
        print("‚úÖ ƒê√£ t·∫°o visualizations trong th∆∞ m·ª•c plots/")
    
    def save_detailed_results(self, df: pd.DataFrame, fleiss_results: Dict, 
                             alpha_results: Dict, similarity_results: Dict,
                             cohen_results: Dict):
        """L∆∞u detailed results ra JSON file"""
        
        detailed_results = {
            'summary': {
                'total_annotations': len(df),
                'n_annotators': df['annotator_id'].nunique(),
                'n_tasks': df['task_id'].nunique(),
                'date_generated': datetime.now().isoformat()
            },
            'fleiss_kappa': fleiss_results,
            'krippendorff_alpha': alpha_results,
            'semantic_similarity': similarity_results,
            'cohen_kappa': cohen_results,
            'statistics': {
                'state_distribution': df['npc_state'].value_counts().to_dict(),
                'emotion_distribution': df['emotional_intensity'].value_counts().sort_index().to_dict(),
                'quality_scores': {
                    'naturalness_mean': float(df['quality_naturalness'].mean()),
                    'consistency_mean': float(df['quality_consistency'].mean()),
                    'appropriateness_mean': float(df['quality_appropriateness'].mean())
                }
            }
        }
        
        with open('plots/detailed_results.json', 'w', encoding='utf-8') as f:
            json.dump(detailed_results, f, ensure_ascii=False, indent=2)
        
        print("‚úÖ ƒê√£ l∆∞u detailed results: plots/detailed_results.json")

class AnnotationQualityController:
    """Quality control pipeline d·ª±a tr√™n research-based methods"""
    
    def __init__(self):
        self.quality_thresholds = {
            'min_response_length': 3,  # √çt nh·∫•t 3 t·ª´
            'max_response_length': 50, # T·ªëi ƒëa 50 t·ª´
            'min_quality_score': 3,    # Ch·∫•t l∆∞·ª£ng t·ªëi thi·ªÉu
            'max_similarity_with_others': 0.9,  # Kh√¥ng copy qu√° nhi·ªÅu
            'required_fields': ['npc_response', 'npc_state', 'emotional_intensity']
        }
        
        self.modern_words = ['ok', 'hello', 'hi', 'yes', 'no', 'sorry', 'thanks', 
                           'okay', 'hey', 'please', 'thank you', 'thanks', 'cool',
                           'awesome', 'amazing', 'wow', 'lol', 'haha', 'omg']
        
    def validate_annotations(self, annotations_df: pd.DataFrame) -> Dict:
        """Validate annotations theo multiple criteria"""
        
        validation_results = {
            'total_annotations': len(annotations_df),
            'passed_validation': 0,
            'failed_validation': 0,
            'failure_reasons': defaultdict(int),
            'failed_samples': [],
            'warnings': []
        }
        
        for idx, row in annotations_df.iterrows():
            is_valid, reason, warnings = self._validate_single_annotation(row)
            
            if is_valid:
                validation_results['passed_validation'] += 1
            else:
                validation_results['failed_validation'] += 1
                validation_results['failure_reasons'][reason] += 1
                validation_results['failed_samples'].append({
                    'task_id': int(row['task_id']),
                    'annotator_id': int(row['annotator_id']),
                    'reason': reason,
                    'response': str(row['npc_response'])[:100] + '...' if len(str(row['npc_response'])) > 100 else str(row['npc_response'])
                })
            
            # Th√™m warnings
            for warning in warnings:
                validation_results['warnings'].append({
                    'task_id': int(row['task_id']),
                    'annotator_id': int(row['annotator_id']),
                    'warning': warning
                })
        
        # T√≠nh pass rate
        if validation_results['total_annotations'] > 0:
            validation_results['pass_rate'] = (
                validation_results['passed_validation'] / 
                validation_results['total_annotations'] * 100
            )
        else:
            validation_results['pass_rate'] = 0
        
        # Th·ªëng k√™ warnings
        warning_counts = defaultdict(int)
        for warning in validation_results['warnings']:
            warning_counts[warning['warning']] += 1
        validation_results['warning_counts'] = dict(warning_counts)
        
        return validation_results
    
    def _validate_single_annotation(self, annotation: pd.Series) -> Tuple[bool, str, List[str]]:
        """Validate single annotation"""
        
        warnings = []
        
        # 1. Check required fields
        for field in self.quality_thresholds['required_fields']:
            if field not in annotation or pd.isna(annotation[field]) or str(annotation[field]) == '':
                return False, f'missing_{field}', warnings
        
        # 2. Check response length
        response = str(annotation['npc_response'])
        word_count = len(response.split())
        
        if word_count < self.quality_thresholds['min_response_length']:
            return False, 'response_too_short', warnings
        
        if word_count > self.quality_thresholds['max_response_length']:
            warnings.append('response_too_long')
        
        # 3. Check quality scores
        quality_fields = ['quality_naturalness', 'quality_consistency', 'quality_appropriateness']
        for field in quality_fields:
            if field in annotation and pd.notna(annotation[field]):
                if annotation[field] < self.quality_thresholds['min_quality_score']:
                    warnings.append(f'low_{field}')
        
        # 4. Check confidence score
        if 'confidence_score' in annotation and pd.notna(annotation['confidence_score']):
            if annotation['confidence_score'] < 3:
                warnings.append('low_confidence')
        
        # 5. Check for modern language/anachronisms
        response_lower = response.lower()
        modern_words_found = [word for word in self.modern_words if word in response_lower]
        if modern_words_found:
            warnings.append(f'anachronism: {", ".join(modern_words_found[:3])}')
        
        # 6. Check for inappropriate content
        inappropriate_terms = ['fuck', 'shit', 'damn', 'hell', 'bastard', 'asshole']
        inappropriate_found = [term for term in inappropriate_terms if term in response_lower]
        if inappropriate_found:
            warnings.append(f'inappropriate_language: {", ".join(inappropriate_found[:3])}')
        
        return True, 'passed', warnings
    
    def detect_pattern_repetition(self, annotations_df: pd.DataFrame, 
                                 similarity_threshold: float = 0.8) -> Dict:
        """Ph√°t hi·ªán pattern repetition gi·ªØa c√°c annotators"""
        
        results = {
            'high_similarity_pairs': [],
            'possible_copied_responses': [],
            'unique_annotators': annotations_df['annotator_id'].nunique()
        }
        
        # Group by task
        task_groups = annotations_df.groupby('task_id')
        
        for task_id, group in task_groups:
            if len(group) >= 2:
                responses = group['npc_response'].tolist()
                annotators = group['annotator_id'].tolist()
                
                # T√≠nh similarity matrix
                vectorizer = TfidfVectorizer()
                try:
                    tfidf_matrix = vectorizer.fit_transform(responses)
                    cosine_sim = cosine_similarity(tfidf_matrix)
                    
                    # Ki·ªÉm tra c√°c c·∫∑p c√≥ similarity cao
                    n = len(responses)
                    for i in range(n):
                        for j in range(i+1, n):
                            if cosine_sim[i, j] > similarity_threshold:
                                results['high_similarity_pairs'].append({
                                    'task_id': int(task_id),
                                    'annotator1': int(annotators[i]),
                                    'annotator2': int(annotators[j]),
                                    'similarity': float(cosine_sim[i, j]),
                                    'response1': responses[i][:100] + '...' if len(responses[i]) > 100 else responses[i],
                                    'response2': responses[j][:100] + '...' if len(responses[j]) > 100 else responses[j]
                                })
                                
                                # N·∫øu similarity r·∫•t cao, c√≥ th·ªÉ l√† copying
                                if cosine_sim[i, j] > 0.95:
                                    results['possible_copied_responses'].append({
                                        'task_id': int(task_id),
                                        'annotator_pair': (int(annotators[i]), int(annotators[j])),
                                        'similarity': float(cosine_sim[i, j])
                                    })
                except:
                    continue
        
        return results
    
    def calculate_annotator_reliability(self, annotations_df: pd.DataFrame) -> Dict:
        """T√≠nh reliability score cho t·ª´ng annotator"""
        
        if len(annotations_df) == 0:
            return {}
        
        annotator_stats = annotations_df.groupby('annotator_id').agg({
            'quality_naturalness': ['mean', 'std', 'count'],
            'quality_consistency': ['mean', 'std', 'count'],
            'quality_appropriateness': ['mean', 'std', 'count'],
            'confidence_score': 'mean'
        })
        
        # Flatten column names
        annotator_stats.columns = ['_'.join(col).strip() for col in annotator_stats.columns.values]
        
        # T√≠nh reliability score
        reliability_scores = {}
        for annotator_id in annotator_stats.index:
            stats = annotator_stats.loc[annotator_id]
            
            # T√≠nh score d·ª±a tr√™n ch·∫•t l∆∞·ª£ng v√† consistency
            quality_score = (
                stats.get('quality_naturalness_mean', 3) +
                stats.get('quality_consistency_mean', 3) +
                stats.get('quality_appropriateness_mean', 3)
            ) / 3
            
            # T√≠nh variability (th·∫•p h∆°n t·ªët h∆°n)
            variability = np.mean([
                stats.get('quality_naturalness_std', 0),
                stats.get('quality_consistency_std', 0),
                stats.get('quality_appropriateness_std', 0)
            ])
            
            # T√≠nh reliability score (0-1)
            reliability = quality_score / 5 * (1 - min(variability / 2, 0.5))
            
            reliability_scores[annotator_id] = {
                'reliability_score': float(reliability),
                'quality_score': float(quality_score),
                'variability': float(variability),
                'annotation_count': int(stats.get('quality_naturalness_count', 0)),
                'confidence_mean': float(stats.get('confidence_score_mean', 3))
            }
        
        return reliability_scores
    
    def generate_quality_report(self, validation_results: Dict, 
                               pattern_results: Dict = None,
                               reliability_scores: Dict = None,
                               output_path="quality_report.md"):
        """T·∫°o comprehensive quality report"""
        
        report = f"""# ANNOTATION QUALITY CONTROL REPORT

## üìä T·ªïng quan ch·∫•t l∆∞·ª£ng

**T·ªïng s·ªë annotations:** {validation_results['total_annotations']}
**Annotations passed:** {validation_results['passed_validation']}
**Annotations failed:** {validation_results['failed_validation']}
**Pass rate:** {validation_results['pass_rate']:.1f}%

## üö® Failure Analysis

### L√Ω do failed (t·ªïng s·ªë):
"""
        
        for reason, count in validation_results['failure_reasons'].items():
            report += f"- **{reason}:** {count} annotations\n"
        
        report += f"""

## ‚ö†Ô∏è Warnings Analysis

### C·∫£nh b√°o ph√°t hi·ªán:
"""
        
        if validation_results.get('warning_counts'):
            for warning, count in validation_results['warning_counts'].items():
                report += f"- **{warning}:** {count} annotations\n"
        else:
            report += "Kh√¥ng c√≥ warnings.\n"
        
        report += f"""

## üîç Pattern Repetition Analysis
"""
        
        if pattern_results:
            report += f"""
**S·ªë annotators:** {pattern_results['unique_annotators']}
**S·ªë c·∫∑p c√≥ similarity > 0.8:** {len(pattern_results['high_similarity_pairs'])}
**S·ªë c·∫∑p c√≥ th·ªÉ copied (similarity > 0.95):** {len(pattern_results['possible_copied_responses'])}

### C√°c c·∫∑p c√≥ similarity cao nh·∫•t (top 5):
"""
            
            # S·∫Øp x·∫øp theo similarity
            high_similarity = sorted(pattern_results['high_similarity_pairs'], 
                                   key=lambda x: x['similarity'], reverse=True)[:5]
            
            for i, pair in enumerate(high_similarity, 1):
                report += f"""
{i}. **Task {pair['task_id']}** (Annotators: {pair['annotator1']} & {pair['annotator2']})
   Similarity: {pair['similarity']:.3f}
   Response 1: {pair['response1']}
   Response 2: {pair['response2']}
"""
        
        report += f"""

## üë§ Annotator Reliability Scores
"""
        
        if reliability_scores:
            report += """
| Annotator ID | Reliability Score | Quality Score | Variability | Count |
|--------------|-------------------|---------------|-------------|-------|
"""
            
            for annotator_id, scores in sorted(reliability_scores.items(), 
                                             key=lambda x: x[1]['reliability_score'], 
                                             reverse=True):
                report += f"| {annotator_id} | {scores['reliability_score']:.3f} | {scores['quality_score']:.2f} | {scores['variability']:.2f} | {scores['annotation_count']} |\n"
            
            # Th√™m interpretation
            report += f"""

### Interpretation:
- **Reliability Score > 0.8:** Excellent annotator
- **Reliability Score 0.6-0.8:** Good annotator
- **Reliability Score 0.4-0.6:** Needs improvement
- **Reliability Score < 0.4:** Consider retraining
"""
        
        report += f"""

## üìù Failed Samples (first 10)

"""
        
        if validation_results['failed_samples']:
            for i, sample in enumerate(validation_results['failed_samples'][:10]):
                report += f"{i+1}. **Task {sample['task_id']}** (Annotator {sample['annotator_id']})\n"
                report += f"   Reason: {sample['reason']}\n"
                report += f"   Response: {sample['response']}\n\n"
        else:
            report += "Kh√¥ng c√≥ failed samples.\n"
        
        report += f"""

## üéØ Recommendations

### N·∫øu pass rate > 90%:
‚úÖ **Ch·∫•t l∆∞·ª£ng t·ªët** - C√≥ th·ªÉ ti·∫øp t·ª•c v·ªõi annotation scale-up

### N·∫øu pass rate 70-90%:
‚ö†Ô∏è **C·∫ßn c·∫£i thi·ªán** - T·ªï ch·ª©c additional training cho annotators

### N·∫øu pass rate < 70%:
‚ùå **V·∫•n ƒë·ªÅ nghi√™m tr·ªçng** - C·∫ßn xem x√©t l·∫°i to√†n b·ªô annotation process

### N·∫øu c√≥ nhi·ªÅu high similarity pairs (>20%):
‚ö†Ô∏è **C√≥ th·ªÉ c√≥ copying** - C·∫ßn xem x√©t monitoring v√† guidelines

### Action Items:
1. Review failed samples ƒë·ªÉ hi·ªÉu patterns
2. Address common failure reasons
3. Retrain annotators c√≥ reliability score th·∫•p
4. Update guidelines d·ª±a tr√™n findings
5. Consider adding more validation rules

---
*Report generated: {datetime.now().strftime('%Y-%m-%d %H:%M')}*
"""
        
        # L∆∞u report
        os.makedirs(os.path.dirname(output_path) if os.path.dirname(output_path) else '.', exist_ok=True)
        
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(report)
        
        print(f"‚úÖ ƒê√£ t·∫°o quality report: {output_path}")
        
        return report

# ==================== MAIN EXECUTION ====================
if __name__ == "__main__":
    print("üîç B·∫Øt ƒë·∫ßu Quality Control Pipeline...")
    print("="*60)
    
    # 1. T√≠nh Inter-Annotator Agreement
    print("\n1. üìä T√≠nh Inter-Annotator Agreement...")
    iaa = InterAnnotatorAgreement()
    
    try:
        iaa_report = iaa.generate_iaa_report()
        if iaa_report:
            print("‚úÖ IAA analysis completed")
    except Exception as e:
        print(f"‚ö†Ô∏è L·ªói trong IAA analysis: {e}")
        print("T·∫°o database m·∫´u ƒë·ªÉ test...")
        # T·∫°o database m·∫´u ƒë·ªÉ test
        create_sample_database()
        iaa_report = iaa.generate_iaa_report()
    
    # 2. Quality Validation
    print("\n2. üéØ Validating annotation quality...")
    df = iaa.load_annotations()
    
    if len(df) > 0:
        qc = AnnotationQualityController()
        
        # Validate annotations
        validation_results = qc.validate_annotations(df)
        
        # Detect pattern repetition
        pattern_results = qc.detect_pattern_repetition(df)
        
        # Calculate reliability scores
        reliability_scores = qc.calculate_annotator_reliability(df)
        
        # Generate quality report
        qc_report = qc.generate_quality_report(
            validation_results, 
            pattern_results, 
            reliability_scores
        )
        
        print(f"‚úÖ Quality validation completed.")
        print(f"   Pass rate: {validation_results['pass_rate']:.1f}%")
        print(f"   Failed: {validation_results['failed_validation']} annotations")
        
        if reliability_scores:
            avg_reliability = np.mean([s['reliability_score'] for s in reliability_scores.values()])
            print(f"   Avg reliability score: {avg_reliability:.3f}")
    
    print("\n" + "="*60)
    print("üéâ QUALITY CONTROL PIPELINE HO√ÄN TH√ÄNH!")
    print("="*60)
    print("\nüìÅ Output files:")
    print("  - iaa_report.md (Inter-Annotator Agreement analysis)")
    print("  - quality_report.md (Quality validation results)")
    print("  - plots/ (Visualizations)")
    print("  - plots/detailed_results.json (Detailed metrics)")
    print("\nüìä Key metrics to check:")
    print("  1. Fleiss' Kappa > 0.6 (Moderate agreement)")
    print("  2. Pass rate > 85%")
    print("  3. Avg reliability score > 0.7")
    print("\nüöÄ Next steps:")
    print("  1. Review reports v·ªõi team")
    print("  2. Address quality issues")
    print("  3. Retrain annotators n·∫øu c·∫ßn")
    print("  4. Update annotation guidelines")
    print("  5. Proceed with full-scale annotation")

def create_sample_database():
    """T·∫°o database m·∫´u ƒë·ªÉ test n·∫øu kh√¥ng c√≥ data"""
    print("üìù T·∫°o database m·∫´u v·ªõi 60 annotations t·ª´ 3 annotators...")
    
    conn = sqlite3.connect('annotations.db')
    cursor = conn.cursor()
    
    # T·∫°o tables n·∫øu ch∆∞a c√≥
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS tasks (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            player_message TEXT NOT NULL,
            context TEXT,
            npc_state TEXT,
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            assigned_to INTEGER,
            completed BOOLEAN DEFAULT FALSE
        )
    ''')
    
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS annotations (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            task_id INTEGER NOT NULL,
            annotator_id INTEGER NOT NULL,
            npc_response TEXT NOT NULL,
            npc_state TEXT NOT NULL,
            emotional_intensity INTEGER,
            dialogue_acts TEXT,
            quality_naturalness INTEGER,
            quality_consistency INTEGER,
            quality_appropriateness INTEGER,
            confidence_score INTEGER,
            annotation_time_seconds INTEGER,
            created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
    ''')
    
    # T·∫°o sample tasks
    sample_tasks = [
        ("Xin ch√†o", "Ban ng√†y, c·ªïng th√†nh", "normal"),
        ("Cho t√¥i qua ƒëi", "Player ti·∫øn g·∫ßn c·ªïng", "alert"),
        ("Ta s·∫Ω gi·∫øt ng∆∞∆°i!", "", "combat"),
        ("Ng∆∞∆°i thua r·ªìi", "NPC b·ªã th∆∞∆°ng n·∫∑ng", "injured"),
        ("Tr·ªùi h√¥m nay ƒë·∫πp nh·ªâ", "Bu·ªïi s√°ng y√™n tƒ©nh", "normal"),
        ("C√≥ chuy·ªán g√¨ th·∫ø?", "Player b·ªã c·∫£nh b√°o", "alert"),
        ("(T·∫•n c√¥ng)", "Tr·∫≠n chi·∫øn b·∫Øt ƒë·∫ßu", "combat"),
        ("C√≥ c·∫ßn gi√∫p kh√¥ng?", "NPC b·ªã th∆∞∆°ng", "injured"),
    ]
    
    task_ids = []
    for msg, ctx, state in sample_tasks:
        cursor.execute('INSERT INTO tasks (player_message, context, npc_state) VALUES (?, ?, ?)',
                      (msg, ctx, state))
        task_ids.append(cursor.lastrowid)
    
    # T·∫°o sample annotations (3 annotators cho m·ªói task)
    sample_responses = {
        "normal": [
            "Ch√†o c√¥ng d√¢n. Gi·ªØ tr·∫≠t t·ª± v√† di chuy·ªÉn ƒëi.",
            "Ch√†o. ƒê·ª´ng g√¢y r·∫Øc r·ªëi ·ªü ƒë√¢y.",
            "Ch√†o. H√£y tu√¢n th·ªß lu·∫≠t l·ªá."
        ],
        "alert": [
            "D·ª´ng l·∫°i! Kh√¥ng ƒë∆∞·ª£c b∆∞·ªõc th√™m b∆∞·ªõc n√†o n·ªØa.",
            "ƒê·ª©ng y√™n! Tay ƒë·ªÉ xa v≈© kh√≠.",
            "C·∫£nh b√°o! D·ª´ng l·∫°i ngay l·∫≠p t·ª©c."
        ],
        "combat": [
            "Ch·∫øt ƒëi, k·∫ª x√¢m nh·∫≠p!",
            "Ta s·∫Ω nghi·ªÅn n√°t ng∆∞∆°i!",
            "Ng∆∞∆°i kh√¥ng qua ƒë∆∞·ª£c ƒë√¢u!"
        ],
        "injured": [
            "L√†m... l√†m ∆°n... tha cho t√¥i...",
            "T√¥i... t√¥i ƒë·∫ßu h√†ng...",
            "Xin ƒë·ª´ng... t√¥i c√≤n gia ƒë√¨nh..."
        ]
    }
    
    for task_id in task_ids:
        # L·∫•y state c·ªßa task
        cursor.execute('SELECT npc_state FROM tasks WHERE id = ?', (task_id,))
        state = cursor.fetchone()[0]
        
        # T·∫°o 3 annotations cho m·ªói task
        for annotator_id in range(1, 4):
            response = sample_responses[state][annotator_id - 1]
            emotional_intensity = {"normal": 2, "alert": 4, "combat": 5, "injured": 4}[state]
            
            cursor.execute('''
                INSERT INTO annotations 
                (task_id, annotator_id, npc_response, npc_state, emotional_intensity,
                 quality_naturalness, quality_consistency, quality_appropriateness,
                 confidence_score, annotation_time_seconds)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            ''', (
                task_id, annotator_id, response, state, emotional_intensity,
                4, 4, 4, 4, np.random.randint(30, 120)
            ))
    
    conn.commit()
    conn.close()
    
    print("‚úÖ ƒê√£ t·∫°o database m·∫´u v·ªõi 24 annotations")