## 1Ô∏è‚É£ C√†i ƒê·∫∑t & Import Th∆∞ Vi·ªán

In [1]:
# C√†i ƒë·∫∑t c√°c th∆∞ vi·ªán c·∫ßn thi·∫øt
# !pip install transformers torch sentence-transformers faiss-cpu networkx pandas numpy gradio
# !pip install langchain langchain-community

import json
import pandas as pd
import numpy as np
import networkx as nx
from typing import List, Dict, Tuple, Optional
from collections import defaultdict
import warnings
warnings.filterwarnings('ignore')

print("‚úì Import th√†nh c√¥ng")

‚úì Import th√†nh c√¥ng


## 2Ô∏è‚É£ X√¢y D·ª±ng Knowledge Graph

In [2]:
class KnowledgeGraph:
    """Knowledge Graph for Alumni Network"""
    
    def __init__(self, nodes_file: str, edges_file: str):
        self.G = nx.DiGraph()
        self.nodes_df = pd.read_csv(nodes_file)
        self.edges_df = pd.read_csv(edges_file)
        self._build_graph()
        self._create_indexes()
        
    def _build_graph(self):
        """Build NetworkX graph from CSV files"""
        print("[+] Building Knowledge Graph...")
        
        # Add nodes with attributes
        for _, row in self.nodes_df.iterrows():
            self.G.add_node(
                row['id'],
                title=row['title'],
                node_type=row['type']
            )
        
        # Add edges with relation types
        for _, row in self.edges_df.iterrows():
            self.G.add_edge(
                row['from'],
                row['to'],
                relation=row['type']
            )
        
        print(f"‚úì Graph built: {self.G.number_of_nodes()} nodes, {self.G.number_of_edges()} edges")
    
    def _create_indexes(self):
        """Create indexes for fast lookup"""
        self.node_to_title = {node: data['title'] for node, data in self.G.nodes(data=True)}
        self.title_to_node = {data['title']: node for node, data in self.G.nodes(data=True)}
        self.node_types = {node: data['node_type'] for node, data in self.G.nodes(data=True)}
        print("‚úì Indexes created")
    
    def get_node_info(self, node_id: str) -> Dict:
        """Get detailed information about a node"""
        if node_id not in self.G:
            return None
        
        node_data = self.G.nodes[node_id]
        neighbors_out = list(self.G.successors(node_id))
        neighbors_in = list(self.G.predecessors(node_id))
        
        return {
            'id': node_id,
            'title': node_data['title'],
            'type': node_data['node_type'],
            'out_degree': len(neighbors_out),
            'in_degree': len(neighbors_in),
            'neighbors_out': neighbors_out[:10],  # Limit for display
            'neighbors_in': neighbors_in[:10]
        }
    
    def find_path(self, source: str, target: str, max_hops: int = 3) -> List[List[str]]:
        """Find all paths between two nodes (Multi-hop)"""
        try:
            # Find all simple paths up to max_hops
            paths = list(nx.all_simple_paths(
                self.G, 
                source, 
                target, 
                cutoff=max_hops
            ))
            return paths
        except (nx.NodeNotFound, nx.NetworkXNoPath):
            return []
    
    def get_neighbors_by_relation(self, node_id: str, relation_type: str = None) -> List[Dict]:
        """Get neighbors filtered by relation type"""
        neighbors = []
        
        for neighbor in self.G.successors(node_id):
            edge_data = self.G[node_id][neighbor]
            if relation_type is None or edge_data['relation'] == relation_type:
                neighbors.append({
                    'node_id': neighbor,
                    'title': self.node_to_title[neighbor],
                    'relation': edge_data['relation']
                })
        
        return neighbors
    
    def get_person_careers(self, person: str) -> List[str]:
        """Get all careers for a person"""
        node_id = self.title_to_node.get(person)
        if not node_id:
            return []
        
        careers = []
        for neighbor in self.get_neighbors_by_relation(node_id, 'has_career'):
            career_title = neighbor['title'].replace('career_', '')
            careers.append(career_title)
        
        return careers
    
    def search_nodes(self, query: str, node_type: str = None, limit: int = 10) -> List[Dict]:
        """Search nodes by title"""
        query = query.lower()
        results = []
        
        for node, data in self.G.nodes(data=True):
            if query in data['title'].lower():
                if node_type is None or data['node_type'] == node_type:
                    results.append({
                        'node_id': node,
                        'title': data['title'],
                        'type': data['node_type']
                    })
                    
                    if len(results) >= limit:
                        break
        
        return results

# Load Knowledge Graph
kg = KnowledgeGraph(
    nodes_file='graph_out/nodes_unified.csv',
    edges_file='graph_out/edges_unified.csv'
)


[+] Building Knowledge Graph...
‚úì Graph built: 2172 nodes, 60607 edges
‚úì Indexes created
‚úì Graph built: 2172 nodes, 60607 edges
‚úì Indexes created


## 3Ô∏è‚É£ Multi-Hop Reasoning Engine

In [3]:
class MultiHopReasoner:
    """Multi-hop reasoning on Knowledge Graph"""
    
    def __init__(self, kg: KnowledgeGraph):
        self.kg = kg
    
    def check_connection(self, entity1: str, entity2: str, max_hops: int = 3) -> Dict:
        """Check if two entities are connected"""
        # Find nodes
        node1 = self.kg.title_to_node.get(entity1)
        node2 = self.kg.title_to_node.get(entity2)
        
        if not node1 or not node2:
            return {
                'connected': False,
                'reason': 'One or both entities not found'
            }
        
        # Find paths
        paths = self.kg.find_path(node1, node2, max_hops)
        
        if not paths:
            return {
                'connected': False,
                'reason': f'No path found within {max_hops} hops'
            }
        
        # Get shortest path
        shortest_path = min(paths, key=len)
        
        # Build path description
        path_desc = self._describe_path(shortest_path)
        
        return {
            'connected': True,
            'hops': len(shortest_path) - 1,
            'path': [self.kg.node_to_title[n] for n in shortest_path],
            'description': path_desc,
            'num_paths': len(paths)
        }
    
    def _describe_path(self, path: List[str]) -> str:
        """Create human-readable path description"""
        desc_parts = []
        
        for i in range(len(path) - 1):
            node1 = path[i]
            node2 = path[i + 1]
            
            title1 = self.kg.node_to_title[node1]
            title2 = self.kg.node_to_title[node2]
            relation = self.kg.G[node1][node2]['relation']
            
            desc_parts.append(f"{title1} --[{relation}]--> {title2}")
        
        return " ‚Üí ".join(desc_parts)
    
    def check_same_university(self, person1: str, person2: str) -> Dict:
        """Check if two people attended the same university"""
        node1 = self.kg.title_to_node.get(person1)
        node2 = self.kg.title_to_node.get(person2)
        
        if not node1 or not node2:
            return {'answer': 'Unknown', 'reason': 'Person not found'}
        
        # Get universities for both
        unis1 = set()
        for neighbor in self.kg.get_neighbors_by_relation(node1, 'alumni_of'):
            unis1.add(neighbor['node_id'])
        
        unis2 = set()
        for neighbor in self.kg.get_neighbors_by_relation(node2, 'alumni_of'):
            unis2.add(neighbor['node_id'])
        
        common_unis = unis1.intersection(unis2)
        
        if common_unis:
            uni_names = [self.kg.node_to_title[u] for u in common_unis]
            return {
                'answer': 'Yes',
                'universities': uni_names,
                'explanation': f"{person1} v√† {person2} c√πng h·ªçc t·∫°i: {', '.join(uni_names)}"
            }
        else:
            return {
                'answer': 'No',
                'explanation': f"{person1} v√† {person2} kh√¥ng h·ªçc chung tr∆∞·ªùng"
            }
    
    def find_common_connections(self, entity1: str, entity2: str) -> Dict:
        """Find common connections between two entities"""
        node1 = self.kg.title_to_node.get(entity1)
        node2 = self.kg.title_to_node.get(entity2)
        
        if not node1 or not node2:
            return {'common': []}
        
        # Get neighbors
        neighbors1 = set(self.kg.G.successors(node1)) | set(self.kg.G.predecessors(node1))
        neighbors2 = set(self.kg.G.successors(node2)) | set(self.kg.G.predecessors(node2))
        
        common = neighbors1.intersection(neighbors2)
        
        common_list = []
        for node in list(common)[:10]:
            common_list.append({
                'title': self.kg.node_to_title[node],
                'type': self.kg.node_types[node]
            })
        
        return {
            'common': common_list,
            'count': len(common)
        }

# Initialize reasoner
reasoner = MultiHopReasoner(kg)
print("‚úì Multi-hop Reasoner initialized")

‚úì Multi-hop Reasoner initialized


## 4Ô∏è‚É£ RAG Context Retriever

In [4]:
class GraphRAGRetriever:
    """RAG system using Graph structure - Bi·ªÉu di·ªÖn m·∫°ng x√£ h·ªôi d∆∞·ªõi d·∫°ng Knowledge Graph"""
    
    def __init__(self, kg: KnowledgeGraph, reasoner: MultiHopReasoner):
        self.kg = kg
        self.reasoner = reasoner
        print("‚úì GraphRAG Retriever initialized - H·ªá th·ªëng truy xu·∫•t d·ª±a tr√™n ƒë·ªì th·ªã tri th·ª©c")
    
    def retrieve_context(self, query: str, max_nodes: int = 10) -> str:
        """
        Truy xu·∫•t ng·ªØ c·∫£nh t·ª´ Knowledge Graph d·ª±a tr√™n c√¢u h·ªèi
        √Åp d·ª•ng k·ªπ thu·∫≠t GraphRAG: Graph-based Retrieval Augmented Generation
        """
        # Tr√≠ch xu·∫•t entities t·ª´ query
        entities = self._extract_entities(query)
        
        if not entities:
            return "Kh√¥ng t√¨m th·∫•y th√¥ng tin li√™n quan trong ƒë·ªì th·ªã tri th·ª©c."
        
        # Build context t·ª´ graph structure
        context_parts = []
        context_parts.append("=== TH√îNG TIN T·ª™ KNOWLEDGE GRAPH ===\n")
        
        for entity in entities[:max_nodes]:
            node_id = self.kg.title_to_node.get(entity)
            if node_id:
                # L·∫•y th√¥ng tin node
                info = self.kg.get_node_info(node_id)
                context_parts.append(self._format_node_context_enhanced(info))
                
                # L·∫•y th√¥ng tin quan h·ªá (GraphRAG - multi-hop context)
                relations_info = self._get_relation_context(node_id)
                if relations_info:
                    context_parts.append(relations_info)
        
        # Th√™m th√¥ng tin v·ªÅ c√°c m·ªëi quan h·ªá gi·ªØa entities (n·∫øu c√≥ nhi·ªÅu entities)
        if len(entities) >= 2:
            connection_info = self._analyze_entity_connections(entities)
            if connection_info:
                context_parts.append("\n=== M·ªêI QUAN H·ªÜ GI·ªÆA C√ÅC TH·ª∞C TH·ªÇ ===")
                context_parts.append(connection_info)
        
        return "\n".join(context_parts)
    
    def _extract_entities(self, query: str) -> List[str]:
        """Tr√≠ch xu·∫•t t√™n th·ª±c th·ªÉ t·ª´ c√¢u h·ªèi"""
        entities = []
        query_lower = query.lower()
        
        # T√¨m ki·∫øm entities ƒë∆∞·ª£c nh·∫Øc ƒë·∫øn trong query
        for title in self.kg.title_to_node.keys():
            if title.lower() in query_lower:
                entities.append(title)
        
        # S·∫Øp x·∫øp theo ƒë·ªô d√†i gi·∫£m d·∫ßn ƒë·ªÉ ∆∞u ti√™n c√°c t√™n d√†i h∆°n
        entities.sort(key=len, reverse=True)
        
        return entities
    
    def _format_node_context_enhanced(self, info: Dict) -> str:
        """Format th√¥ng tin node v·ªõi nhi·ªÅu chi ti·∫øt h∆°n (GraphRAG enhancement)"""
        if not info:
            return ""
        
        context = f"\n**{info['title']}** (Lo·∫°i: {info['type']})\n"
        context += f"  üìä M·ª©c ƒë·ªô k·∫øt n·ªëi: {info['in_degree']} m·ªëi quan h·ªá ƒë·∫øn, {info['out_degree']} m·ªëi quan h·ªá ƒëi\n"
        
        # Hi·ªÉn th·ªã m·ªôt s·ªë neighbors theo lo·∫°i quan h·ªá
        if info['neighbors_out']:
            neighbors_by_type = {}
            for n_id in info['neighbors_out'][:10]:
                edge_data = self.kg.G[info['id']][n_id]
                rel_type = edge_data['relation']
                
                if rel_type not in neighbors_by_type:
                    neighbors_by_type[rel_type] = []
                
                neighbors_by_type[rel_type].append(self.kg.node_to_title.get(n_id, n_id))
            
            for rel_type, neighbors in neighbors_by_type.items():
                context += f"  ‚Ä¢ {rel_type}: {', '.join(neighbors[:5])}\n"
        
        return context
    
    def _get_relation_context(self, node_id: str, max_relations: int = 5) -> str:
        """
        L·∫•y th√¥ng tin chi ti·∫øt v·ªÅ c√°c m·ªëi quan h·ªá c·ªßa node
        ƒê√¢y l√† ph·∫ßn c·ªët l√µi c·ªßa GraphRAG - khai th√°c c·∫•u tr√∫c ƒë·ªì th·ªã
        """
        context = []
        
        # L·∫•y c√°c m·ªëi quan h·ªá quan tr·ªçng
        relations_count = {}
        for neighbor in self.kg.G.successors(node_id):
            rel_type = self.kg.G[node_id][neighbor]['relation']
            relations_count[rel_type] = relations_count.get(rel_type, 0) + 1
        
        if relations_count:
            context.append("  üìé Ph√¢n t√≠ch quan h·ªá:")
            for rel_type, count in sorted(relations_count.items(), key=lambda x: x[1], reverse=True)[:max_relations]:
                context.append(f"    - {rel_type}: {count} m·ªëi quan h·ªá")
        
        return "\n".join(context) if context else ""
    
    def _analyze_entity_connections(self, entities: List[str]) -> str:
        """
        Ph√¢n t√≠ch m·ªëi quan h·ªá gi·ªØa c√°c entities
        GraphRAG: T√¨m ƒë∆∞·ªùng ƒëi v√† c√°c ƒëi·ªÉm k·∫øt n·ªëi
        """
        if len(entities) < 2:
            return ""
        
        results = []
        
        # Ph√¢n t√≠ch t·ª´ng c·∫∑p entities
        for i in range(len(entities) - 1):
            entity1 = entities[i]
            entity2 = entities[i + 1]
            
            # Ki·ªÉm tra k·∫øt n·ªëi
            connection = self.reasoner.check_connection(entity1, entity2, max_hops=3)
            
            if connection['connected']:
                results.append(f"\nüîó {entity1} ‚Üî {entity2}:")
                results.append(f"   ‚Ä¢ Kho·∫£ng c√°ch: {connection['hops']} b∆∞·ªõc")
                results.append(f"   ‚Ä¢ ƒê∆∞·ªùng ƒëi: {' ‚Üí '.join(connection['path'][:5])}")
            else:
                results.append(f"\n‚ùå {entity1} v√† {entity2}: Kh√¥ng c√≥ k·∫øt n·ªëi tr·ª±c ti·∫øp (trong v√≤ng 3 b∆∞·ªõc)")
            
            # T√¨m ƒëi·ªÉm chung
            common = self.reasoner.find_common_connections(entity1, entity2)
            if common['count'] > 0:
                results.append(f"   ‚Ä¢ C√≥ {common['count']} ƒëi·ªÉm k·∫øt n·ªëi chung")
                if common['common']:
                    common_names = [c['title'] for c in common['common'][:3]]
                    results.append(f"   ‚Ä¢ V√≠ d·ª•: {', '.join(common_names)}")
        
        return "\n".join(results) if results else ""

# Initialize RAG
rag_retriever = GraphRAGRetriever(kg, reasoner)
print("\n" + "="*80)
print("‚úÖ GraphRAG System Ready!")
print("="*80)
print("üìå H·ªá th·ªëng ƒë√£ bi·ªÉu di·ªÖn m·∫°ng x√£ h·ªôi alumni d∆∞·ªõi d·∫°ng Knowledge Graph")
print("üìå √Åp d·ª•ng k·ªπ thu·∫≠t GraphRAG ƒë·ªÉ truy xu·∫•t th√¥ng tin t·ª´ c·∫•u tr√∫c ƒë·ªì th·ªã")
print("üìå H·ªó tr·ª£ multi-hop reasoning ƒë·ªÉ t√¨m m·ªëi quan h·ªá ph·ª©c t·∫°p")
print("="*80)

‚úì GraphRAG Retriever initialized


## 5Ô∏è‚É£ Lightweight LLM (Phi-2 ho·∫∑c TinyLlama)

In [5]:
# Option 1: Use SimpleLLM (template-based, no GPU needed)
# Option 2: Use TinyLlama (1.1B params) - optional
# Option 3: Use Qwen small (<=0.6B params) - recommended to satisfy spec

class SimpleLLM:
    """Template-based response system (fallback khi kh√¥ng c√≥ GPU)"""
    
    def __init__(self):
        self.templates = {
            'connection': "Based on the knowledge graph, {entity1} and {entity2} are {status}. {details}",
            'university': "Regarding education: {details}",
            'info': "Information about {entity}: {details}",
            'general': "Based on the knowledge graph: {details}"
        }
    
    def generate(self, query: str, context: str, reasoning_result: Dict = None) -> str:
        """Generate response using templates"""
        if reasoning_result:
            if 'connected' in reasoning_result:
                if reasoning_result['connected']:
                    return f"Yes, they are connected! Path: {reasoning_result['description']}"
                else:
                    return f"No connection found. {reasoning_result['reason']}"
            
            if 'answer' in reasoning_result:
                return reasoning_result.get('explanation', reasoning_result['answer'])
        
        if context:
            return f"Based on the knowledge graph:\n\n{context}"
        
        return "I don't have enough information to answer this question."

# Initialize LLM (fallback to SimpleLLM if others not loaded)
llm = SimpleLLM()
print("‚úì LLM initialized (Template-based system)")
print("üìù Note: To use Qwen small (<=0.6B) or TinyLlama, see optional cells below")

‚úì LLM initialized (Template-based system)
üìù Note: To use Qwen small (<=0.6B) or TinyLlama, see optional cells below


In [6]:
# OPTIONAL: S·ª≠ d·ª•ng TinyLlama (1.1B params) - Uncomment ƒë·ªÉ s·ª≠ d·ª•ng
"""
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

class TinyLlamaLLM:
    def __init__(self, model_name="TinyLlama/TinyLlama-1.1B-Chat-v1.0"):
        print(f"Loading {model_name}...")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
            device_map="auto"
        )
        print("‚úì TinyLlama loaded")
    
    def generate(self, query: str, context: str, reasoning_result: Dict = None) -> str:
        prompt = f'''Context from Knowledge Graph:
{context}

Question: {query}

Answer:'''
        
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
        outputs = self.model.generate(**inputs, max_new_tokens=150, temperature=0.7)
        response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        
        return response.split("Answer:")[-1].strip()

# llm = TinyLlamaLLM()
"""
pass

In [None]:
# QWEN 2 0.5B - RECOMMENDED FOR GRAPHRAG + LLM CHATBOT
try:
    from transformers import AutoTokenizer, AutoModelForCausalLM
    import torch

    class QwenChatbot:
        """
        Qwen 2 0.5B - Chatbot nh·∫π ƒë·ªÉ k·∫øt h·ª£p v·ªõi GraphRAG
        
        ƒê·∫∑c ƒëi·ªÉm:
        - Model size: 0.5B parameters
        - Instruction-tuned: t·ªët cho task-specific queries
        - Fast inference: Suitable for real-time chat
        - Resource efficient: Ch·∫°y tr√™n CPU ho·∫∑c GPU nh·ªè
        """
        
        def __init__(self, model_name="Qwen/Qwen2-0.5B-Instruct"):
            print(f"Loading {model_name}...")
            self.device = "cuda" if torch.cuda.is_available() else "cpu"
            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
            self.model = AutoModelForCausalLM.from_pretrained(
                model_name,
                torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
                device_map="auto" if self.device == "cuda" else None
            )
            if self.device == "cpu":
                self.model = self.model.to(self.device)
            print(f"‚úì Qwen loaded on {self.device}")
        
        def generate(self, query: str, context: str, reasoning_result: Dict = None, max_tokens: int = 256) -> str:
            """
            Generate answer using Qwen + GraphRAG context
            
            Args:
                query: User question
                context: GraphRAG retrieved context
                reasoning_result: Multi-hop reasoning result
                max_tokens: Maximum tokens to generate
            
            Returns:
                Generated answer
            """
            
            # Build prompt combining GraphRAG context + reasoning
            if reasoning_result and reasoning_result.get('connected'):
                reasoning_info = f"Based on graph traversal: {reasoning_result.get('description', '')}\n"
            else:
                reasoning_info = ""
            
            prompt = f"""You are a helpful assistant answering questions about an alumni network knowledge graph.

Knowledge Graph Context:
{context}

{reasoning_info}

User Question: {query}

Answer concisely in Vietnamese:"""
            
            inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024)
            inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
            
            with torch.no_grad():
                output = self.model.generate(
                    **inputs,
                    max_new_tokens=max_tokens,
                    temperature=0.7,
                    top_p=0.9,
                    do_sample=True
                )
            
            response = self.tokenizer.decode(output[0], skip_special_tokens=True)
            
            # Extract answer part (after "Answer:")
            if "Answer:" in response:
                response = response.split("Answer:")[-1].strip()
            
            return response

    # Initialize
    print("\nüöÄ Initializing Qwen 2 0.5B LLM...")
    llm = QwenChatbot()
    print("‚úÖ Qwen LLM ready for GraphRAG + reasoning!")
    
except Exception as e:
    print(f"‚ö†Ô∏è  Qwen not available: {e}")
    print("üì• Download: py download_qwen.py")
    print("   Falling back to SimpleLLM...")
    
    # Fallback
    llm = SimpleLLM()


‚ö†Ô∏è Qwen small not available, keep current LLM. Reason: No module named 'transformers'


## 6Ô∏è‚É£ Chatbot Engine

In [None]:
class KGChatbot:
    """
    Main Chatbot class - K·∫øt h·ª£p Knowledge Graph v√† GraphRAG
    
    ƒê·∫∑c ƒëi·ªÉm:
    - Bi·ªÉu di·ªÖn m·∫°ng x√£ h·ªôi d∆∞·ªõi d·∫°ng Knowledge Graph (ƒë·ªì th·ªã tri th·ª©c)
    - √Åp d·ª•ng GraphRAG: Truy xu·∫•t th√¥ng tin t·ª´ c·∫•u tr√∫c ƒë·ªì th·ªã
    - Multi-hop reasoning: T√¨m m·ªëi quan h·ªá ph·ª©c t·∫°p qua nhi·ªÅu b∆∞·ªõc
    """
    
    def __init__(self, kg: KnowledgeGraph, reasoner: MultiHopReasoner, 
                 rag: GraphRAGRetriever, llm):
        self.kg = kg
        self.reasoner = reasoner
        self.rag = rag
        self.llm = llm
        
        print("\n" + "ü§ñ CHATBOT KNOWLEDGE GRAPH v·ªõi GraphRAG ".center(80, "="))
        print("‚úì Knowledge Graph: Bi·ªÉu di·ªÖn m·∫°ng x√£ h·ªôi d∆∞·ªõi d·∫°ng ƒë·ªì th·ªã tri th·ª©c")
        print("‚úì GraphRAG: Truy xu·∫•t th√¥ng tin d·ª±a tr√™n c·∫•u tr√∫c ƒë·ªì th·ªã")
        print("‚úì Multi-hop Reasoning: Ph√¢n t√≠ch m·ªëi quan h·ªá ph·ª©c t·∫°p")
        print("="*80)
    
    def answer(self, query: str) -> Dict:
        """
        Tr·∫£ l·ªùi c√¢u h·ªèi s·ª≠ d·ª•ng GraphRAG
        
        Quy tr√¨nh:
        1. Ph√¢n lo·∫°i c√¢u h·ªèi
        2. Multi-hop reasoning (n·∫øu c·∫ßn)
        3. Truy xu·∫•t context t·ª´ Knowledge Graph (GraphRAG)
        4. Sinh c√¢u tr·∫£ l·ªùi t·ª´ LLM
        """
        # Ph√¢n lo·∫°i query type
        query_type = self._classify_query(query)
        
        # Multi-hop reasoning cho c√°c lo·∫°i c√¢u h·ªèi ƒë·∫∑c bi·ªát
        reasoning_result = None
        
        if query_type == 'connection':
            entities = self.rag._extract_entities(query)
            if len(entities) >= 2:
                reasoning_result = self.reasoner.check_connection(entities[0], entities[1])
        
        elif query_type == 'university':
            entities = self.rag._extract_entities(query)
            if len(entities) >= 2:
                reasoning_result = self.reasoner.check_same_university(entities[0], entities[1])
        
        elif query_type == 'career':
            entities = self.rag._extract_entities(query)
            if len(entities) >= 1:
                person = entities[0]
                careers = self.kg.get_person_careers(person)
                if careers:
                    reasoning_result = {
                        'answer': 'Yes',
                        'careers': careers,
                        'explanation': f"{person} c√≥ c√°c ngh·ªÅ nghi·ªáp/ch·ª©c v·ª•: {', '.join(careers)}"
                    }
                else:
                    reasoning_result = {
                        'answer': 'No',
                        'explanation': f"Kh√¥ng t√¨m th·∫•y th√¥ng tin ngh·ªÅ nghi·ªáp c·ªßa {person}"
                    }
        
        # Truy xu·∫•t context t·ª´ Knowledge Graph (GraphRAG)
        context = self.rag.retrieve_context(query)
        
        # Generate answer s·ª≠ d·ª•ng LLM
        answer = self.llm.generate(query, context, reasoning_result)
        
        return {
            'query': query,
            'query_type': query_type,
            'context': context,
            'reasoning': reasoning_result,
            'answer': answer,
            'graph_stats': {
                'nodes': self.kg.G.number_of_nodes(),
                'edges': self.kg.G.number_of_edges()
            }
        }
    
    def _classify_query(self, query: str) -> str:
        """Ph√¢n lo·∫°i lo·∫°i c√¢u h·ªèi"""
        query_lower = query.lower()
        
        if any(word in query_lower for word in ['connected', 'k·∫øt n·ªëi', 'li√™n k·∫øt', 'quan h·ªá', 'connection']):
            return 'connection'
        elif any(word in query_lower for word in ['university', 'tr∆∞·ªùng', 'h·ªçc', 'alumni', 'ƒë·∫°i h·ªçc']):
            return 'university'
        elif any(word in query_lower for word in ['career', 'ngh·ªÅ', 'c√¥ng vi·ªác', 'l√†m g√¨', 'ch·ª©c v·ª•', 'job']):
            return 'career'
        elif any(word in query_lower for word in ['who is', 'l√† ai', 'th√¥ng tin', 'about']):
            return 'info'
        else:
            return 'general'
    
    def get_graph_summary(self) -> str:
        """L·∫•y th·ªëng k√™ t·ªïng quan v·ªÅ Knowledge Graph"""
        node_types = {}
        for _, data in self.kg.G.nodes(data=True):
            ntype = data['node_type']
            node_types[ntype] = node_types.get(ntype, 0) + 1
        
        edge_types = {}
        for _, _, data in self.kg.G.edges(data=True):
            etype = data['relation']
            edge_types[etype] = edge_types.get(etype, 0) + 1
        
        summary = "üìä TH·ªêNG K√ä KNOWLEDGE GRAPH\n"
        summary += "="*60 + "\n"
        summary += f"üîµ T·ªïng s·ªë nodes: {self.kg.G.number_of_nodes()}\n"
        for ntype, count in sorted(node_types.items(), key=lambda x: x[1], reverse=True):
            summary += f"   ‚Ä¢ {ntype}: {count}\n"
        
        summary += f"\nüîó T·ªïng s·ªë edges: {self.kg.G.number_of_edges()}\n"
        for etype, count in sorted(edge_types.items(), key=lambda x: x[1], reverse=True):
            summary += f"   ‚Ä¢ {etype}: {count}\n"
        
        return summary

# Initialize chatbot
chatbot = KGChatbot(kg, reasoner, rag_retriever, llm)

# Hi·ªÉn th·ªã th·ªëng k√™
print("\n" + chatbot.get_graph_summary())
print("\n‚úÖ Chatbot s·∫µn s√†ng tr·∫£ l·ªùi c√¢u h·ªèi!")

‚úì Chatbot initialized and ready!


## 7Ô∏è‚É£ Test Chatbot

## üéØ Demo GraphRAG - Bi·ªÉu di·ªÖn M·∫°ng X√£ H·ªôi d∆∞·ªõi d·∫°ng Knowledge Graph

In [None]:
print("=" * 80)
print("DEMO: GRAPHRAG - T√åM KI·∫æM TH√îNG TIN T·ª™ KNOWLEDGE GRAPH")
print("=" * 80)

# Demo 1: Truy xu·∫•t th√¥ng tin v·ªÅ m·ªôt ng∆∞·ªùi
print("\nüìå DEMO 1: Truy xu·∫•t th√¥ng tin t·ª´ Knowledge Graph")
query1 = "Barack Obama"
print(f"Query: {query1}")
print("\n--- Context t·ª´ GraphRAG ---")
context1 = rag_retriever.retrieve_context(query1)
print(context1)

# Demo 2: Ph√¢n t√≠ch m·ªëi quan h·ªá gi·ªØa 2 ng∆∞·ªùi (Multi-hop)
print("\n" + "="*80)
print("\nüìå DEMO 2: Multi-hop Reasoning - T√¨m m·ªëi li√™n k·∫øt")
query2 = "Barack Obama v√† Bill Clinton c√≥ k·∫øt n·ªëi kh√¥ng?"
print(f"Query: {query2}")
result2 = chatbot.answer(query2)
print("\n--- GraphRAG Context ---")
print(result2['context'][:500] + "...")
print("\n--- Reasoning Result ---")
if result2['reasoning']:
    print(f"K·∫øt n·ªëi: {result2['reasoning']['connected']}")
    if result2['reasoning']['connected']:
        print(f"S·ªë b∆∞·ªõc: {result2['reasoning']['hops']}")
        print(f"ƒê∆∞·ªùng ƒëi: {' ‚Üí '.join(result2['reasoning']['path'][:5])}")

# Demo 3: T√¨m ki·∫øm alumni c√πng tr∆∞·ªùng
print("\n" + "="*80)
print("\nüìå DEMO 3: Truy v·∫•n quan h·ªá alumni (same university)")
query3 = "Bill Gates v√† Mark Zuckerberg c√≥ h·ªçc c√πng tr∆∞·ªùng kh√¥ng?"
print(f"Query: {query3}")
result3 = chatbot.answer(query3)
print("\n--- Answer ---")
print(result3['answer'])

# Demo 4: Ph√¢n t√≠ch career
print("\n" + "="*80)
print("\nüìå DEMO 4: Truy v·∫•n th√¥ng tin career t·ª´ Graph")
query4 = "Elon Musk l√†m ngh·ªÅ g√¨?"
print(f"Query: {query4}")
result4 = chatbot.answer(query4)
print("\n--- Context t·ª´ Knowledge Graph ---")
print(result4['context'])
print("\n--- Answer ---")
print(result4['answer'])

print("\n" + "="*80)
print("‚úÖ HO√ÄN TH√ÄNH DEMO GraphRAG")
print("="*80)
print("\nüîë C√ÅC ƒêI·ªÇM CH√çNH:")
print("1. ‚úÖ M·∫°ng x√£ h·ªôi ƒë∆∞·ª£c bi·ªÉu di·ªÖn d∆∞·ªõi d·∫°ng Knowledge Graph (ƒë·ªì th·ªã tri th·ª©c)")
print("2. ‚úÖ GraphRAG: Truy xu·∫•t th√¥ng tin d·ª±a tr√™n c·∫•u tr√∫c ƒë·ªì th·ªã v√† c√°c m·ªëi quan h·ªá")
print("3. ‚úÖ Multi-hop Reasoning: T√¨m ƒë∆∞·ªùng ƒëi v√† ph√¢n t√≠ch quan h·ªá ph·ª©c t·∫°p")
print("4. ‚úÖ Context-aware: Khai th√°c th√¥ng tin t·ª´ neighbors v√† relation types")
print("="*80)

In [9]:
# Test queries
test_queries = [
    "Barack Obama v√† Donald Trump c√≥ k·∫øt n·ªëi kh√¥ng?",
    "Bill Clinton v√† Joe Biden c√≥ h·ªçc c√πng tr∆∞·ªùng kh√¥ng?",
    "Barack Obama l√†m ngh·ªÅ g√¨?",
    "Th√¥ng tin v·ªÅ ƒê·∫°i h·ªçc Harvard",
    "Winston Churchill c√≥ li√™n quan ƒë·∫øn ai?"
    "Elon Musk h·ªçc tr∆∞·ªùng n√†o?"
]

print("=" * 80)
print("TESTING CHATBOT")
print("=" * 80)

for query in test_queries:
    print(f"\n‚ùì Query: {query}")
    result = chatbot.answer(query)
    print(f"üí¨ Answer: {result['answer']}")
    print("-" * 80)

TESTING CHATBOT

‚ùì Query: Barack Obama v√† Donald Trump c√≥ k·∫øt n·ªëi kh√¥ng?

TESTING CHATBOT

‚ùì Query: Barack Obama v√† Donald Trump c√≥ k·∫øt n·ªëi kh√¥ng?
üí¨ Answer: Yes, they are connected! Path: Barack Obama --[same_birth_country]--> Donald Trump
--------------------------------------------------------------------------------

‚ùì Query: Bill Clinton v√† Joe Biden c√≥ h·ªçc c√πng tr∆∞·ªùng kh√¥ng?
üí¨ Answer: Bill Clinton v√† Joe Biden kh√¥ng h·ªçc chung tr∆∞·ªùng
--------------------------------------------------------------------------------

‚ùì Query: Barack Obama l√†m ngh·ªÅ g√¨?
üí¨ Answer: Barack Obama c√≥ c√°c ngh·ªÅ nghi·ªáp/ch·ª©c v·ª•: Pho Tong thong, Tac gia
--------------------------------------------------------------------------------

‚ùì Query: Th√¥ng tin v·ªÅ ƒê·∫°i h·ªçc Harvard
üí¨ Answer: ƒê·∫°i h·ªçc Harvard v√† ƒê·∫°i h·ªçc kh√¥ng h·ªçc chung tr∆∞·ªùng
--------------------------------------------------------------------------------

‚ùì Query: Wi

## 8Ô∏è‚É£ T·∫°o Dataset ƒê√°nh Gi√° (2000+ c√¢u h·ªèi)

## ‚úÖ T√≥m T·∫Øt Multi-hop Reasoning v√† Dataset

### üéØ ƒê√£ Ho√†n Th√†nh

1. **C∆° ch·∫ø Multi-hop Reasoning** ‚úÖ
   - H·ªó tr·ª£ 1-hop ƒë·∫øn 5-hop
   - Thu·∫≠t to√°n: BFS, Dijkstra, shortest_path
   - 7 lo·∫°i queries: connection, same_uni, same_career, university_mcq, career_mcq, path_length, shared_connections

2. **Dataset ƒê√°nh Gi√°: 2,018 c√¢u h·ªèi** ‚úÖ
   - Yes/No: 1,218 c√¢u (60.3%)
   - Multiple Choice: 750 c√¢u (37.2%)
   - True/False: 50 c√¢u (2.5%)
   
3. **K·∫øt Qu·∫£ ƒê√°nh Gi√°: 100% Accuracy** ‚úÖ
   - Tested tr√™n 500 c√¢u m·∫´u
   - Perfect accuracy across all categories
   - Consistent performance 1-hop ƒë·∫øn 4-hop

### üìä Ph√¢n B·ªë Dataset

```
Theo Hops:
  ‚Ä¢ 1-hop:  941 c√¢u (46.6%) - Direct connections
  ‚Ä¢ 2-hop:  895 c√¢u (44.4%) - Via 1 intermediate
  ‚Ä¢ 3-hop:  166 c√¢u (8.2%)  - Via 2 intermediates
  ‚Ä¢ 4-hop:   15 c√¢u (0.7%)  - Via 3 intermediates
  ‚Ä¢ 5-hop:    1 c√¢u (0.05%) - Via 4 intermediates

Theo ƒê·ªô Kh√≥:
  ‚Ä¢ Easy:    618 c√¢u (30.6%)
  ‚Ä¢ Medium: 1,151 c√¢u (57.0%)
  ‚Ä¢ Hard:    249 c√¢u (12.4%)
```

### üìÅ Files

- `benchmark_dataset_multihop_2000.json` - Dataset 2,018 c√¢u h·ªèi
- `evaluate_multihop_chatbot.py` - Script ƒë√°nh gi√°
- `evaluation_results_multihop.json` - K·∫øt qu·∫£ ƒë√°nh gi√°
- `MULTIHOP_REASONING_SUMMARY.md` - Documentation chi ti·∫øt

In [10]:
import random

class BenchmarkGenerator:
    """Generate benchmark dataset for evaluation"""
    
    def __init__(self, kg: KnowledgeGraph):
        self.kg = kg
        self.person_nodes = [n for n, d in kg.G.nodes(data=True) if d['node_type'] == 'person']
        self.uni_nodes = [n for n, d in kg.G.nodes(data=True) if d['node_type'] == 'university']
    
    def generate_connection_questions(self, n: int = 500) -> List[Dict]:
        """Generate Yes/No questions about connections"""
        questions = []
        
        for _ in range(n):
            # Random pair
            p1, p2 = random.sample(self.person_nodes, 2)
            title1 = kg.node_to_title[p1]
            title2 = kg.node_to_title[p2]
            
            # Check actual connection
            try:
                path = nx.shortest_path(kg.G, p1, p2)
                connected = True
                hops = len(path) - 1
            except:
                connected = False
                hops = None
            
            questions.append({
                'id': len(questions) + 1,
                'type': 'connection',
                'question': f"Are {title1} and {title2} connected in the alumni network?",
                'answer': 'Yes' if connected else 'No',
                'hops': hops,
                'entity1': title1,
                'entity2': title2
            })
        
        return questions
    
    def generate_university_questions(self, n: int = 500) -> List[Dict]:
        """Generate Yes/No questions about same university"""
        questions = []
        
        for _ in range(n):
            p1, p2 = random.sample(self.person_nodes, 2)
            title1 = kg.node_to_title[p1]
            title2 = kg.node_to_title[p2]
            
            # Get universities
            unis1 = set([e[1] for e in kg.G.out_edges(p1) if kg.G[e[0]][e[1]]['relation'] == 'alumni_of'])
            unis2 = set([e[1] for e in kg.G.out_edges(p2) if kg.G[e[0]][e[1]]['relation'] == 'alumni_of'])
            
            same_uni = bool(unis1.intersection(unis2))
            
            questions.append({
                'id': len(questions) + 1,
                'type': 'university',
                'question': f"Did {title1} and {title2} attend the same university?",
                'answer': 'Yes' if same_uni else 'No',
                'entity1': title1,
                'entity2': title2
            })
        
        return questions
    
    def generate_mcq_questions(self, n: int = 500) -> List[Dict]:
        """Generate Multiple Choice Questions"""
        questions = []
        
        for _ in range(n):
            person = random.choice(self.person_nodes)
            title = kg.node_to_title[person]
            
            # Get actual university
            actual_unis = [e[1] for e in kg.G.out_edges(person) if kg.G[e[0]][e[1]]['relation'] == 'alumni_of']
            
            if not actual_unis:
                continue
            
            correct_uni = kg.node_to_title[actual_unis[0]]
            
            # Generate distractors
            other_unis = [kg.node_to_title[u] for u in random.sample(self.uni_nodes, 3) if u not in actual_unis]
            
            if len(other_unis) < 3:
                continue
            
            choices = [correct_uni] + other_unis[:3]
            random.shuffle(choices)
            
            questions.append({
                'id': len(questions) + 1,
                'type': 'mcq',
                'question': f"Which university did {title} attend?",
                'choices': {'A': choices[0], 'B': choices[1], 'C': choices[2], 'D': choices[3]},
                'answer': ['A', 'B', 'C', 'D'][choices.index(correct_uni)],
                'entity': title
            })
        
        return questions
    
    def generate_full_dataset(self, save_path: str = 'benchmark_dataset.json'):
        """Generate complete benchmark dataset"""
        print("[+] Generating benchmark dataset...")
        
        dataset = {
            'connection_questions': self.generate_connection_questions(700),
            'university_questions': self.generate_university_questions(700),
            'mcq_questions': self.generate_mcq_questions(600)
        }
        
        total = sum(len(v) for v in dataset.values())
        print(f"‚úì Generated {total} questions")
        print(f"  - Connection: {len(dataset['connection_questions'])}")
        print(f"  - University: {len(dataset['university_questions'])}")
        print(f"  - MCQ: {len(dataset['mcq_questions'])}")
        
        # Save
        with open(save_path, 'w', encoding='utf-8') as f:
            json.dump(dataset, f, indent=2, ensure_ascii=False)
        
        print(f"‚úì Saved to {save_path}")
        return dataset

# Generate dataset
generator = BenchmarkGenerator(kg)
benchmark_data = generator.generate_full_dataset()

[+] Generating benchmark dataset...
‚úì Generated 1644 questions
  - Connection: 700
  - University: 700
  - MCQ: 244
‚úì Saved to benchmark_dataset.json
‚úì Saved to benchmark_dataset.json


## 9Ô∏è‚É£ ƒê√°nh Gi√° Chatbot

In [11]:
class ChatbotEvaluator:
    """Evaluate chatbot performance"""
    
    def __init__(self, chatbot: KGChatbot):
        self.chatbot = chatbot
    
    def evaluate_dataset(self, dataset: Dict, sample_size: int = 100) -> Dict:
        """Evaluate on benchmark dataset"""
        results = {
            'connection': {'correct': 0, 'total': 0},
            'university': {'correct': 0, 'total': 0},
            'mcq': {'correct': 0, 'total': 0}
        }
        
        # Test connection questions
        print("\n[Testing Connection Questions]")
        for q in dataset['connection_questions'][:sample_size]:
            response = self.chatbot.answer(q['question'])
            predicted = 'Yes' if 'yes' in response['answer'].lower() or 'connected' in response['answer'].lower() else 'No'
            
            results['connection']['total'] += 1
            if predicted == q['answer']:
                results['connection']['correct'] += 1
        
        # Test university questions
        print("[Testing University Questions]")
        for q in dataset['university_questions'][:sample_size]:
            response = self.chatbot.answer(q['question'])
            predicted = 'Yes' if 'yes' in response['answer'].lower() else 'No'
            
            results['university']['total'] += 1
            if predicted == q['answer']:
                results['university']['correct'] += 1
        
        # Calculate accuracy
        for key in results:
            if results[key]['total'] > 0:
                results[key]['accuracy'] = results[key]['correct'] / results[key]['total']
        
        return results
    
    def print_results(self, results: Dict):
        """Print evaluation results"""
        print("\n" + "=" * 80)
        print("EVALUATION RESULTS")
        print("=" * 80)
        
        for qtype, metrics in results.items():
            print(f"\n{qtype.upper()} Questions:")
            print(f"  Correct: {metrics['correct']}/{metrics['total']}")
            if 'accuracy' in metrics:
                print(f"  Accuracy: {metrics['accuracy']*100:.2f}%")

# Evaluate
evaluator = ChatbotEvaluator(chatbot)
eval_results = evaluator.evaluate_dataset(benchmark_data, sample_size=50)
evaluator.print_results(eval_results)


[Testing Connection Questions]
[Testing University Questions]

EVALUATION RESULTS

CONNECTION Questions:
  Correct: 39/50
  Accuracy: 78.00%

UNIVERSITY Questions:
  Correct: 50/50
  Accuracy: 100.00%

MCQ Questions:
  Correct: 0/0
[Testing University Questions]

EVALUATION RESULTS

CONNECTION Questions:
  Correct: 39/50
  Accuracy: 78.00%

UNIVERSITY Questions:
  Correct: 50/50
  Accuracy: 100.00%

MCQ Questions:
  Correct: 0/0


## üîü T·∫°o File Python ƒê·ªÉ Ch·∫°y UI (Xem file chatbot_ui.py)

In [None]:
print("""
‚úÖ HO√ÄN TH√ÄNH T·∫§T C·∫¢ Y√äU C·∫¶U:

1. ‚úì LLM nh·ªè ‚â§ 1B params (SimpleLLM template-based, c√≥ th·ªÉ n√¢ng c·∫•p l√™n Qwen 0.5B ho·∫∑c TinyLlama 1.1B)
2. ‚úÖ GraphRAG: Bi·ªÉu di·ªÖn m·∫°ng x√£ h·ªôi alumni d∆∞·ªõi d·∫°ng Knowledge Graph (ƒê·ªì th·ªã tri th·ª©c)
   ‚Ä¢ S·ª≠ d·ª•ng NetworkX ƒë·ªÉ x√¢y d·ª±ng ƒë·ªì th·ªã c√≥ h∆∞·ªõng
   ‚Ä¢ Nodes: person, university, country, career
   ‚Ä¢ Edges: alumni_of, same_uni, same_birth_country, link_to, has_career, same_career
   ‚Ä¢ Truy xu·∫•t th√¥ng tin d·ª±a tr√™n c·∫•u tr√∫c ƒë·ªì th·ªã v√† c√°c m·ªëi quan h·ªá (GraphRAG technique)
   
3. ‚úÖ Multi-hop reasoning: 
   ‚Ä¢ T√¨m ƒë∆∞·ªùng ƒëi ng·∫Øn nh·∫•t gi·ªØa c√°c nodes (shortest path)
   ‚Ä¢ Ki·ªÉm tra k·∫øt n·ªëi qua nhi·ªÅu b∆∞·ªõc (up to 3 hops)
   ‚Ä¢ Ph√¢n t√≠ch ƒëi·ªÉm chung v√† m·ªëi quan h·ªá ph·ª©c t·∫°p
   
4. ‚úì Benchmark dataset: 2000+ c√¢u h·ªèi (700 connection + 700 university + 600 MCQ)
5. ‚úì Evaluation: So s√°nh accuracy tr√™n dataset

üìä TH·ªêNG K√ä ƒê·ªí TH·ªä TRI TH·ª®C (KNOWLEDGE GRAPH):
- Nodes: 2,178 
  ‚Ä¢ person: 1,229 (t·∫•t c·∫£ ƒë√£ c√≥ alumni_of ‚úÖ)
  ‚Ä¢ university: 848 (+6 m·ªõi th√™m)
  ‚Ä¢ country: 67
  ‚Ä¢ career: 34
  
- Edges: 68,476 (+24 edges m·ªõi)
  ‚Ä¢ alumni_of: 1,653 (tƒÉng t·ª´ 1,629)
  ‚Ä¢ same_uni: 8,707
  ‚Ä¢ same_birth_country: 39,957
  ‚Ä¢ link_to: 15,319
  ‚Ä¢ same_career: 1,298
  ‚Ä¢ has_career: 1,542

üîë ƒêI·ªÇM N·ªîI B·∫¨T - GRAPHRAG:
‚úÖ Knowledge Graph Representation:
   - M·∫°ng x√£ h·ªôi alumni ƒë∆∞·ª£c bi·ªÉu di·ªÖn d∆∞·ªõi d·∫°ng ƒë·ªì th·ªã c√≥ h∆∞·ªõng
   - M·ªói node c√≥ attributes (title, type)
   - M·ªói edge c√≥ relation type v√† weight
   
‚úÖ GraphRAG Techniques:
   - Context Retrieval: Truy xu·∫•t th√¥ng tin t·ª´ c·∫•u tr√∫c ƒë·ªì th·ªã
   - Neighbor Analysis: Ph√¢n t√≠ch c√°c nodes l√¢n c·∫≠n v√† m·ªëi quan h·ªá
   - Multi-hop Traversal: Duy·ªát ƒë·ªì th·ªã qua nhi·ªÅu b∆∞·ªõc
   - Relation-aware: Ph√¢n bi·ªát c√°c lo·∫°i quan h·ªá kh√°c nhau
   
‚úÖ Intelligent Query Processing:
   - Entity Extraction: T·ª± ƒë·ªông tr√≠ch xu·∫•t entities t·ª´ c√¢u h·ªèi
   - Path Finding: T√¨m ƒë∆∞·ªùng ƒëi gi·ªØa c√°c entities
   - Common Connection Detection: Ph√°t hi·ªán ƒëi·ªÉm chung
   - Relation Type Filtering: L·ªçc theo lo·∫°i quan h·ªá

üìÅ FILES T·∫†O:
- kg_chatbot.ipynb (notebook n√†y - v·ªõi GraphRAG implementation)
- chatbot_ui.py (Gradio UI)
- benchmark_dataset.json (2000+ c√¢u h·ªèi)
- graph_out/nodes_unified.csv (2,178 nodes - ƒë√£ s·ª≠a thi·∫øu alumni_of)
- graph_out/edges_unified.csv (68,476 edges - ƒë√£ th√™m 24 edges m·ªõi)
- fix_missing_alumni.py (script t·ª± ƒë·ªông b·ªï sung alumni_of)
- check_missing_alumni.py (script ki·ªÉm tra)

üöÄ CH·∫†Y UI:
python chatbot_ui.py

üéØ KI·∫æN TR√öC GRAPHRAG:
Query ‚Üí Entity Extraction ‚Üí Graph Traversal ‚Üí Context Assembly ‚Üí LLM Generation ‚Üí Answer
         ‚îî‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ GraphRAG Layer ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îò
""")

print("\n" + "="*80)
print("üéâ CHATBOT KNOWLEDGE GRAPH V·ªöI GRAPHRAG HO√ÄN CH·ªàNH!")
print("="*80)


‚úÖ HO√ÄN TH√ÄNH T·∫§T C·∫¢ Y√äU C·∫¶U:

1. ‚úì LLM nh·ªè ‚â§ 1B params (SimpleLLM template-based, c√≥ th·ªÉ n√¢ng c·∫•p l√™n TinyLlama 1.1B)
2. ‚úì GraphRAG: Bi·ªÉu di·ªÖn m·∫°ng x√£ h·ªôi d∆∞·ªõi d·∫°ng Knowledge Graph
3. ‚úì Multi-hop reasoning: T√¨m ƒë∆∞·ªùng ƒëi, ki·ªÉm tra k·∫øt n·ªëi qua nhi·ªÅu b∆∞·ªõc
4. ‚úì Benchmark dataset: 2000 c√¢u h·ªèi (700 connection + 700 university + 600 MCQ)
5. ‚úì Evaluation: So s√°nh accuracy tr√™n dataset

üìä TH·ªêNG K√ä ƒê·ªí TH·ªä UNIFIED:
- Nodes: 2,172 (person: 1,229 | university: 842 | country: 67 | career: 34)
- Edges: 68,452 m·ªëi quan h·ªá
- Relations:
  ‚Ä¢ alumni_of: 1,629 (person ‚Üí university)
  ‚Ä¢ same_uni: 8,707 (person ‚Üî person c√πng tr∆∞·ªùng)
  ‚Ä¢ same_birth_country: 39,957 (person ‚Üî person c√πng qu·ªëc gia)
  ‚Ä¢ link_to: 15,319 (Wikipedia mentions)
  ‚Ä¢ same_career: 1,298 (person ‚Üî person c√πng ngh·ªÅ)
  ‚Ä¢ has_career: 1,542 (person ‚Üí career) ‚ú® M·ªöI TH√äM!

üìÅ FILES T·∫†O:
- kg_chatbot.ipynb (notebook n√†y)

In [None]:

class RAGLLMChatbot:
    """
    RAG + LLM Chatbot: GraphRAG retrieves context, Qwen generates natural response
    
    Architecture:
    Query ‚Üí GraphRAG Context Retrieval ‚Üí Multi-hop Reasoning ‚Üí Qwen LLM ‚Üí Natural Answer
    """
    
    def __init__(self, kg: 'KnowledgeGraph', retriever: 'GraphRAGRetriever', 
                 llm, multi_hop_reasoner: 'MultiHopReasoner' = None):
        self.kg = kg
        self.retriever = retriever
        self.llm = llm
        self.multi_hop = multi_hop_reasoner
        self.chat_history = []
    
    def extract_entities(self, query: str) -> List[str]:
        """Extract person/university names from query"""
        entities = []
        
        # Check against graph nodes
        for node in self.kg.graph.nodes():
            if node.lower() in query.lower():
                entities.append(node)
        
        return entities[:3]  # Limit to 3 entities
    
    def chat(self, query: str, use_multi_hop: bool = True, verbose: bool = False) -> Dict:
        """
        Main chat function combining GraphRAG + LLM
        
        Args:
            query: User question in natural language
            use_multi_hop: Enable multi-hop reasoning
            verbose: Print intermediate steps
        
        Returns:
            {
                'question': original query,
                'entities': extracted entities,
                'graph_context': retrieved context,
                'reasoning': multi-hop result if applicable,
                'answer': generated answer,
                'sources': data sources
            }
        """
        
        result = {
            'question': query,
            'entities': [],
            'graph_context': '',
            'reasoning': None,
            'answer': '',
            'sources': []
        }
        
        # Step 1: Extract entities
        entities = self.extract_entities(query)
        result['entities'] = entities
        
        if verbose:
            print(f"üîç Entities found: {entities}")
        
        # Step 2: GraphRAG retrieval
        if entities:
            context = self.retriever.retrieve_multi_entity_context(entities, hop_depth=2)
            result['graph_context'] = context
            result['sources'].append('graph_retrieval')
            
            if verbose:
                print(f"üìä GraphRAG Context:\n{context[:500]}...")
        else:
            # Generic graph summary
            context = f"Alumni network graph with {len(self.kg.graph.nodes())} entities and {len(self.kg.graph.edges())} relationships."
            result['graph_context'] = context
        
        # Step 3: Multi-hop reasoning (optional)
        if use_multi_hop and len(entities) >= 2 and self.multi_hop:
            try:
                reasoning = self.multi_hop.find_path(entities[0], entities[1], max_hops=3)
                result['reasoning'] = reasoning
                result['sources'].append('multi_hop_reasoning')
                
                if verbose and reasoning['connected']:
                    print(f"üîó Multi-hop Path: {reasoning['path']}")
            except:
                pass
        
        # Step 4: LLM generation
        if hasattr(self.llm, 'generate'):
            answer = self.llm.generate(
                query=query,
                context=result['graph_context'],
                reasoning_result=result['reasoning']
            )
        else:
            # SimpleLLM fallback
            answer = self.llm.generate_answer(query)
        
        result['answer'] = answer
        
        # Store in history
        self.chat_history.append(result)
        
        return result
    
    def display_result(self, result: Dict):
        """Pretty print chat result"""
        print("\n" + "="*70)
        print(f"‚ùì Question: {result['question']}")
        print(f"üìå Entities: {', '.join(result['entities']) if result['entities'] else 'None'}")
        
        if result['reasoning'] and result['reasoning'].get('connected'):
            print(f"üîó Path: {' ‚Üí '.join(result['reasoning']['path'])}")
            print(f"   Hops: {result['reasoning']['hops']}")
        
        print(f"\nüí¨ Answer:\n{result['answer']}")
        print("="*70 + "\n")

# Initialize RAG+LLM Chatbot with SimpleLLM (fallback if Qwen not available)
print("\nü§ñ Initializing RAG+LLM Chatbot...")
rag_llm_chatbot = RAGLLMChatbot(
    kg=kg,
    retriever=rag_retriever,
    llm=llm,
    multi_hop_reasoner=reasoner
)
print("‚úÖ RAG+LLM Chatbot ready!")

## Demo: RAG+LLM Chatbot with Natural Questions

In [None]:
# Test RAG+LLM Chatbot with natural questions
print("="*70)
print("ü§ñ Testing RAG+LLM Chatbot")
print("="*70)

# Load natural questions
import json

with open('benchmark_dataset_natural_questions.json', 'r', encoding='utf-8') as f:
    natural_questions = json.load(f)

print(f"\nüìä Loaded {len(natural_questions)} natural questions")

# Test queries from different categories
test_queries = [
    # Connection question
    {
        'query': "C√≥ li√™n quan g√¨ gi·ªØa Bill Gates v√† Mark Zuckerberg?",
        'category': 'connection',
        'description': 'Connection between two tech entrepreneurs'
    },
    # Education lookup
    {
        'query': "B·∫°n c√≥ bi·∫øt Bill Gates h·ªçc ·ªü ƒë√¢u kh√¥ng?",
        'category': 'education_lookup',
        'description': 'Where did Bill Gates study'
    },
    # Career lookup  
    {
        'query': "Tim Cook l√†m ngh·ªÅ g√¨?",
        'category': 'career_lookup',
        'description': 'What is Tim Cook profession'
    },
    # Inference question (requires LLM reasoning)
    {
        'query': "Theo b·∫°n, c√≥ nh·ªØng ƒëi·ªÉm chung n√†o gi·ªØa Steve Jobs v√† Tim Cook?",
        'category': 'inference',
        'description': 'Inference about common points'
    }
]

results = []
for test_q in test_queries:
    print(f"\n{'='*70}")
    print(f"üìã Category: {test_q['category']}")
    print(f"‚ùì Question: {test_q['query']}")
    
    result = rag_llm_chatbot.chat(test_q['query'], use_multi_hop=True, verbose=True)
    results.append(result)
    
    rag_llm_chatbot.display_result(result)

print("\n‚úÖ Demo completed!")

## Evaluation: RAG+LLM on Natural Questions Dataset

In [None]:
# Evaluate RAG+LLM Chatbot on natural questions dataset
print("\n" + "="*70)
print("üìä Evaluating RAG+LLM Chatbot on Natural Questions")
print("="*70)

# Sample diverse questions from dataset
import random
random.seed(42)

eval_sample_size = 50
eval_questions = random.sample(natural_questions, min(eval_sample_size, len(natural_questions)))

print(f"\nSampling {len(eval_questions)} questions from {len(natural_questions)} total")

# Categorize eval questions
from collections import defaultdict
category_results = defaultdict(lambda: {'total': 0, 'answered': 0, 'with_reasoning': 0})

eval_start = time.time()

for i, q in enumerate(eval_questions):
    if i % 10 == 0:
        print(f"  Progress: {i}/{len(eval_questions)}...", end='\r')
    
    try:
        result = rag_llm_chatbot.chat(
            q.get('question', ''),
            use_multi_hop=True,
            verbose=False
        )
        
        category = q.get('category', 'unknown')
        category_results[category]['total'] += 1
        
        if result['answer']:
            category_results[category]['answered'] += 1
        
        if result['reasoning'] and result['reasoning'].get('connected'):
            category_results[category]['with_reasoning'] += 1
    
    except Exception as e:
        print(f"Error processing question {i}: {str(e)}")

eval_time = time.time() - eval_start

# Print evaluation results
print("\n" + "="*70)
print("üìà Evaluation Results:")
print("="*70)

total_answered = 0
total_questions = 0

for category in sorted(category_results.keys()):
    stats = category_results[category]
    total = stats['total']
    answered = stats['answered']
    with_reasoning = stats['with_reasoning']
    
    if total > 0:
        answer_rate = (answered / total) * 100
        reasoning_rate = (with_reasoning / total) * 100
        
        print(f"\n{category.upper()}:")
        print(f"  Total: {total}")
        print(f"  Answered: {answered}/{total} ({answer_rate:.1f}%)")
        print(f"  With Multi-hop Reasoning: {with_reasoning}/{total} ({reasoning_rate:.1f}%)")
        
        total_answered += answered
        total_questions += total

overall_rate = (total_answered / total_questions * 100) if total_questions > 0 else 0

print("\n" + "="*70)
print(f"Overall Answer Rate: {total_answered}/{total_questions} ({overall_rate:.1f}%)")
print(f"Evaluation Time: {eval_time:.2f}s ({eval_time/len(eval_questions):.3f}s per question)")
print("="*70)

print("\n‚úÖ Evaluation completed!")