In [1]:
import pandas as pd
import networkx as nx
import heapq
import math
import time
import os
import gc

# ==============================================================================
# 1. CONFIGURATION & DATA
# ==============================================================================
# File paths (adjust them for your machine).
# The paths are expanded up front because os.path.exists() does not understand
# "~"; left unexpanded, the existence checks further down would always fail.
INPUT_CSV = os.path.expanduser('~/data/DataForCmp_GIS.csv')      # Eps / MinPts parameter sets to test
DATA_FILE = os.path.expanduser('~/data/6_DSCanhKQ2_CanTho_XoaCon3Cot_XoaDongTrung.txt')  # Graph edge list (IdStart, IdEnd, Length)
OUTPUT_CSV = os.path.expanduser('~/data/KetQua_Raw_Benchmark.csv')

NUM_RUNS = 3  # Number of runs averaged per parameter set (reduces random error)

def load_graph_data(path):
    """Load an undirected weighted edge list into an adjacency dictionary.

    The file is first read as tab-separated; if that yields fewer than three
    columns it is re-read as whitespace-separated. The columns ``IdStart``
    (or the misspelt ``IdStar``), ``IdEnd`` and ``Length`` are required.

    Args:
        path: Path of the edge-list file.

    Returns:
        ``(adj, nodes)`` where ``adj`` maps each node id to a list of
        ``(neighbour_id, length)`` tuples (every edge is stored in both
        directions) and ``nodes`` is a list of all node ids.
        ``(None, None)`` if anything goes wrong while reading or parsing;
        the error is printed rather than raised.
    """
    print(f"--- ƒêANG N·∫†P D·ªÆ LI·ªÜU: {os.path.basename(path)} ---")
    try:
        # Read the file (supports both tab and whitespace separators).
        df = pd.read_csv(path, sep='\t')
        if df.shape[1] < 3:
            # sep=r'\s+' replaces the deprecated delim_whitespace=True.
            df = pd.read_csv(path, sep=r'\s+')

        df.columns = df.columns.str.strip()
        if 'IdStar' in df.columns:
            df = df.rename(columns={'IdStar': 'IdStart'})

        # Convert to a dictionary for O(1) neighbour lookup.
        print("üîÑ ƒêang chuy·ªÉn ƒë·ªïi sang Adjacency Dictionary...")
        adj = {}
        nodes = set()
        # Column-wise iteration avoids building one Series per row and keeps
        # the id columns in their own dtype instead of routing them via float.
        for raw_u, raw_v, raw_w in zip(df['IdStart'], df['IdEnd'], df['Length']):
            u, v, w = int(raw_u), int(raw_v), float(raw_w)
            adj.setdefault(u, []).append((v, w))
            adj.setdefault(v, []).append((u, w))
            nodes.add(u)
            nodes.add(v)

        print(f"‚úÖ ƒê√£ n·∫°p: {len(nodes)} ƒë·ªânh.")
        return adj, list(nodes)
    except Exception as e:
        # Deliberate best-effort: callers test for the (None, None) result.
        print(f"‚ùå L·ªói n·∫°p d·ªØ li·ªáu: {e}")
        return None, None

# ==============================================================================
# 2. ORIGINAL ALGORITHM (NS-DBSCAN STANDARD)
# ==============================================================================

# --- Algorithm 1: original LSPD (no edge pruning) ---
def alg1_LSPD_original(adj, start_node, eps):
    """Collect every node whose shortest-path distance from start_node is <= eps.

    This is a Dijkstra search bounded by ``eps``.

    Args:
        adj: Mapping ``node -> list of (neighbour, weight)`` tuples.
        start_node: Node the search starts from.
        eps: Maximum accumulated path length.

    Returns:
        List of reachable nodes in the order they were settled. The start
        node itself is included (when ``eps >= 0``) and every node appears
        at most once.
    """
    distances = {start_node: 0}
    queue = [(0, start_node)]
    neighbors = []

    while queue:
        d, u = heapq.heappop(queue)

        # The eps check happens only after the entry leaves the queue.
        if d > eps:
            continue
        # Skip stale heap entries: a shorter path to u was found after this
        # entry was pushed. Without this guard u would be reported twice and
        # the density count derived from the result would be inflated.
        if d > distances[u]:
            continue
        neighbors.append(u)

        for v, weight in adj.get(u, ()):
            # No per-edge pruning here: the distance is always accumulated,
            # even for edges that are longer than eps on their own.
            new_d = d + weight
            if new_d <= eps and new_d < distances.get(v, float('inf')):
                distances[v] = new_d
                heapq.heappush(queue, (new_d, v))
    return neighbors

# --- Algorithm 2: original density table (keeps every point) ---
def alg2_density_ordering_original(adj, points, eps):
    """Rank all points by the size of their eps-neighbourhood.

    Runs the original LSPD search for every point and keeps every result,
    however small the neighbourhood is.

    Returns:
        ``(ordered_list, neighbors_cache)``: a list of ``(density, point)``
        tuples sorted by density in descending order, and a dict mapping each
        point to its neighbour list.
    """
    cache = {}
    ranking = []

    for point in points:
        # LSPD is executed for ALL points.
        reachable = cache[point] = alg1_LSPD_original(adj, point, eps)
        # Every point goes into the ranking, including the sparsest ones.
        ranking.append((len(reachable), point))

    # Sort the complete list (expensive when the list is long).
    ranking = sorted(ranking, key=lambda entry: entry[0], reverse=True)
    return ranking, cache

# --- Algorithm 3: cluster formation (with explicit noise labelling) ---
def alg3_clustering_original(ordered_list, neighbors_cache, min_pts):
    """Grow clusters from core points, visiting points in ranking order.

    Args:
        ordered_list: ``(density, point)`` tuples in processing order.
        neighbors_cache: Mapping ``point -> list of neighbours``.
        min_pts: Minimum neighbourhood size for a point to be a core point.

    Returns:
        Dict mapping each visited point to a cluster id (1, 2, ...) or to
        ``-1`` for points explicitly marked as noise.
    """
    assignment = {}  # point -> cluster label
    last_cluster = 0

    for entry in ordered_list:
        point = entry[1]
        if point in assignment:
            continue  # already handled

        reachable = neighbors_cache.get(point, [])
        if len(reachable) < min_pts:
            # Not a core point: label it as noise explicitly.
            assignment[point] = -1
            continue

        last_cluster += 1
        assignment[point] = last_cluster

        # Expand the cluster; the frontier grows while it is being walked.
        frontier = list(reachable)
        cursor = 0
        while cursor < len(frontier):
            candidate = frontier[cursor]
            cursor += 1
            if candidate in assignment:
                if assignment[candidate] == -1:
                    # Previously marked as noise: rescue it as a border point.
                    assignment[candidate] = last_cluster
                continue
            assignment[candidate] = last_cluster
            candidate_reach = neighbors_cache.get(candidate, [])
            if len(candidate_reach) >= min_pts:
                frontier.extend(candidate_reach)

    return assignment

# Wrapper that runs the whole original pipeline
def run_ns_dbscan_original(adj, points, eps, min_pts):
    """Run density ordering followed by clustering, original variant.

    Returns the label dict produced by ``alg3_clustering_original``.
    """
    # Steps 1 & 2 yield (ordered_list, neighbors_cache); step 3 consumes both.
    density_table = alg2_density_ordering_original(adj, points, eps)
    return alg3_clustering_original(*density_table, min_pts)

# ==============================================================================
# 3. IMPROVED ALGORITHM (iNS-DBSCAN)
# ==============================================================================

# --- Algorithm 1: improved LSPD (edge pruning) ---
def alg1_LSPD_improved(adj, start_node, eps):
    """Collect every node whose shortest-path distance from start_node is <= eps.

    Same bounded Dijkstra search as the original variant, except that an
    edge longer than ``eps`` is discarded before any distance is accumulated.

    Args:
        adj: Mapping ``node -> list of (neighbour, weight)`` tuples.
        start_node: Node the search starts from.
        eps: Maximum accumulated path length.

    Returns:
        List of reachable nodes in the order they were settled. The start
        node itself is included (when ``eps >= 0``) and every node appears
        at most once.
    """
    distances = {start_node: 0}
    queue = [(0, start_node)]
    neighbors = []

    while queue:
        d, u = heapq.heappop(queue)
        if d > eps:
            continue
        # Skip stale heap entries: a shorter path to u was found after this
        # entry was pushed. Without this guard u would be reported twice and
        # the density count derived from the result would be inflated.
        if d > distances[u]:
            continue
        neighbors.append(u)

        for v, weight in adj.get(u, ()):
            # IMPROVEMENT 1: prune over-long edges immediately.
            if weight > eps:
                continue

            new_d = d + weight
            if new_d <= eps and new_d < distances.get(v, float('inf')):
                distances[v] = new_d
                heapq.heappush(queue, (new_d, v))
    return neighbors

# --- Algorithm 2: improved density table (logarithmic threshold filter) ---
def alg2_density_ordering_improved(adj, points, eps):
    """Rank points by eps-neighbourhood size, dropping sparse points.

    A point is kept only if its neighbourhood size is at least
    ``math.log(len(points))`` (the threshold is 0 for an empty input).

    Returns:
        ``(ordered_list, neighbors_cache)``: a list of ``(density, point)``
        tuples sorted by density in descending order, and a dict mapping each
        kept point to its neighbour list.
    """
    # IMPROVEMENT 2A: heuristic filter threshold.
    total = len(points)
    cutoff = 0 if total <= 0 else math.log(total)

    kept = {}
    ranking = []
    for point in points:
        reachable = alg1_LSPD_improved(adj, point, eps)
        density = len(reachable)

        # IMPROVEMENT 2B: points below the threshold are discarded entirely,
        # which saves memory and shortens the list that has to be sorted.
        if density < cutoff:
            continue
        kept[point] = reachable
        ranking.append((density, point))

    # Sort the (usually much shorter) list by density, densest first.
    ranking = sorted(ranking, key=lambda entry: entry[0], reverse=True)
    return ranking, kept

# --- Algorithm 3: cluster formation (implicit noise) ---
def alg3_clustering_improved(ordered_list, neighbors_cache, min_pts):
    """Grow clusters from core points without ever writing a noise label.

    Args:
        ordered_list: ``(density, point)`` tuples in processing order.
        neighbors_cache: Mapping ``point -> list of neighbours``.
        min_pts: Minimum neighbourhood size for a point to be a core point.

    Returns:
        Dict mapping clustered points to their cluster id (1, 2, ...).
        Points that never join a cluster are simply absent from the dict.
    """
    assignment = {}
    last_cluster = 0

    for entry in ordered_list:
        point = entry[1]
        if point in assignment:
            continue

        reachable = neighbors_cache.get(point, [])
        if len(reachable) < min_pts:
            # No "-1" label is written here: noise stays implicit.
            continue

        last_cluster += 1
        assignment[point] = last_cluster

        frontier = list(reachable)
        cursor = 0
        while cursor < len(frontier):
            candidate = frontier[cursor]
            cursor += 1
            # IMPROVEMENT 3: only cluster labels are assigned; no time is
            # spent checking for or overwriting a noise (-1) label.
            if candidate in assignment:
                continue
            assignment[candidate] = last_cluster
            candidate_reach = neighbors_cache.get(candidate, [])
            if len(candidate_reach) >= min_pts:
                frontier.extend(candidate_reach)

    return assignment

# Wrapper that runs the whole improved pipeline
def run_ns_dbscan_improved(adj, points, eps, min_pts):
    """Run density ordering followed by clustering, improved variant.

    Returns the label dict produced by ``alg3_clustering_improved``.
    """
    # The ordering step yields (ordered_list, neighbors_cache) for clustering.
    density_table = alg2_density_ordering_improved(adj, points, eps)
    return alg3_clustering_improved(*density_table, min_pts)

# ==============================================================================
# 4. BENCHMARK ENGINE (RAW - NOTHING HIDDEN)
# ==============================================================================
def measure_time(func, *args):
    """Call ``func(*args)`` once and return the elapsed time in milliseconds.

    A garbage collection is forced before the measurement starts so that
    garbage left over from earlier work is not collected during the timed call.
    ``time.perf_counter()`` is used instead of ``time.time()`` because it is
    monotonic and has the highest available resolution, which is what an
    interval measurement needs.
    """
    gc.collect()
    start = time.perf_counter()
    func(*args)
    elapsed = time.perf_counter() - start
    return elapsed * 1000  # seconds -> milliseconds

def run_raw_benchmark():
    """Benchmark the original against the improved pipeline and save the result.

    Loads the graph from ``DATA_FILE``, reads the ``Eps`` / ``MinPts``
    parameter sets from ``INPUT_CSV`` (a sample file is created when it does
    not exist), times both pipelines ``NUM_RUNS`` times per parameter set,
    prints a table of the averages and writes them to ``OUTPUT_CSV``.

    Returns ``None``. It returns early, after printing a message, when the
    data file is missing or cannot be loaded.
    """
    # os.path.exists() does not expand "~", so resolve the paths first.
    data_path = os.path.expanduser(DATA_FILE)
    params_path = os.path.expanduser(INPUT_CSV)
    output_path = os.path.expanduser(OUTPUT_CSV)

    # 1. Load the data
    if not os.path.exists(data_path):
        print(f"‚ùå Kh√¥ng t√¨m th·∫•y file d·ªØ li·ªáu: {DATA_FILE}")
        return

    adj, nodes = load_graph_data(data_path)
    if not adj:
        return

    # 2. Create a sample parameter file if none exists yet
    if not os.path.exists(params_path):
        pd.DataFrame({
            'Eps': [200, 300, 400, 500],
            'MinPts': [10, 15, 20, 25]
        }).to_csv(params_path, index=False)
        print("‚ö†Ô∏è ƒê√£ t·∫°o file tham s·ªë m·∫´u.")

    df_params = pd.read_csv(params_path)
    results = []

    print("\n" + "="*85)
    print(f"{'BENCHMARK TRUNG TH·ª∞C (RAW RESULTS)':^85}")
    print("="*85)
    print(f"| {'Eps':<6} | {'MinPts':<6} | {'G·ªëc (ms)':<15} | {'C·∫£i ti·∫øn (ms)':<15} | {'Nhanh h∆°n (%)':<15} |")
    print("-" * 85)

    for raw_eps, raw_min_pts in zip(df_params['Eps'], df_params['MinPts']):
        eps = float(raw_eps)
        min_pts = int(raw_min_pts)

        total_orig = 0
        total_imp = 0

        # Run several times and average the timings
        for _ in range(NUM_RUNS):
            total_orig += measure_time(run_ns_dbscan_original, adj, nodes, eps, min_pts)
            total_imp += measure_time(run_ns_dbscan_improved, adj, nodes, eps, min_pts)

        avg_orig = total_orig / NUM_RUNS
        avg_imp = total_imp / NUM_RUNS

        # Improvement in percent (positive = faster, negative = slower).
        # A baseline of 0 ms (below timer resolution) would divide by zero,
        # so it is reported as 0 % instead.
        if avg_orig:
            improvement = ((avg_orig - avg_imp) / avg_orig) * 100
        else:
            improvement = 0.0

        print(f"| {int(eps):<6} | {min_pts:<6} | {avg_orig:<15.2f} | {avg_imp:<15.2f} | {improvement:<15.2f} |")

        results.append({
            'Eps': eps,
            'MinPts': min_pts,
            'Original_Time': avg_orig,
            'Improved_Time': avg_imp,
            'Improvement_Percent': improvement
        })

    # Save the results
    pd.DataFrame(results).to_csv(output_path, index=False)
    print("="*85)
    print(f"‚úÖ K·∫øt qu·∫£ ƒë√£ l∆∞u t·∫°i: {OUTPUT_CSV}")

# Entry point: run the benchmark only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    run_raw_benchmark()

❌ Không tìm thấy file dữ liệu: ~/data/CanTho_RoadNetwork.txt
