In [1]:
import pandas as pd
import networkx as nx
import heapq
import math
import time
import os
import gc

# ==============================================================================
# 1. CONFIGURATION & DATA
# ==============================================================================

# --- DATA PATHS ---
# Edge-list text file passed to load_graph_data() by run_raw_benchmark().
DATA_FILE = r'D:/GISS/data/6_DSCanhKQ2_CanTho_XoaCon3Cot_XoaDongTrung.txt'

# Note: INPUT and OUTPUT may be given as relative or absolute paths.
# The code creates the parent directory automatically if it does not exist.
# Parameter table with 'Eps' and 'MinPts' columns; a sample file is written
# by run_raw_benchmark() when this path does not exist yet.
INPUT_CSV = r'D:/GISS/data/DataForCmp_GIS.csv'

# 3. Output file that receives the averaged timing results.
OUTPUT_CSV = r'D:/GISS/data/KetQua_Raw_Benchmark.csv'

# Number of timed repetitions averaged for every (Eps, MinPts) row.
NUM_RUNS = 3

# Helper that safely creates the parent directory of a file path
def ensure_dir(file_path):
    """Create the parent directory of *file_path* if it is missing.

    Args:
        file_path: Path to a file; only its directory component is used.

    Returns:
        None. A message is printed when a directory is actually created.
    """
    parent = os.path.dirname(file_path)
    # A bare file name has no directory component, so there is nothing to do.
    if not parent:
        return
    if os.path.exists(parent):
        return
    os.makedirs(parent, exist_ok=True)
    print(f"üìÅ ƒê√£ t·∫°o th∆∞ m·ª•c: {parent}")

def load_graph_data(path):
    """Load a weighted edge list into an undirected adjacency dictionary.

    The file is first parsed as tab-separated. If that yields fewer than
    three columns it is parsed again as whitespace-separated. Header names
    are stripped of surrounding whitespace and a header spelled ``IdStar``
    is accepted as ``IdStart``.

    Args:
        path: Path to a delimited text file with ``IdStart``, ``IdEnd`` and
            ``Length`` columns.

    Returns:
        ``(adj, nodes)`` where ``adj`` maps each node id to a list of
        ``(neighbour, length)`` tuples (every edge is stored in both
        directions) and ``nodes`` is a list of all node ids.
        ``(None, None)`` is returned when a required column is missing or
        when any error occurs while loading; the error is printed, not raised.
    """
    print(f"--- ƒêANG N·∫†P D·ªÆ LI·ªÜU T·ª™: {os.path.basename(path)} ---")
    try:
        # Read the file (both tab and space separated layouts are supported).
        df = pd.read_csv(path, sep='\t', engine='python')
        if df.shape[1] < 3:
            # `delim_whitespace=True` is deprecated (pandas 2.2) and removed
            # in pandas 3.0; the regex separator is the supported equivalent.
            df = pd.read_csv(path, sep=r'\s+', engine='python')

        df.columns = df.columns.str.strip()
        if 'IdStar' in df.columns:
            df = df.rename(columns={'IdStar': 'IdStart'})

        # Check the required columns.
        if not {'IdStart', 'IdEnd', 'Length'}.issubset(df.columns):
            print(f"‚ùå File thi·∫øu c·ªôt! C√°c c·ªôt t√¨m th·∫•y: {list(df.columns)}")
            return None, None

        # Convert to an adjacency dictionary.
        print("üîÑ ƒêang chuy·ªÉn ƒë·ªïi sang Adjacency Dictionary...")
        adj = {}
        nodes = set()
        # Zipping the three columns avoids building a Series per row, which
        # is what makes DataFrame.iterrows() slow on large edge lists.
        for raw_u, raw_v, raw_w in zip(df['IdStart'], df['IdEnd'], df['Length']):
            u, v, w = int(raw_u), int(raw_v), float(raw_w)
            adj.setdefault(u, []).append((v, w))
            adj.setdefault(v, []).append((u, w))
            nodes.add(u)
            nodes.add(v)

        print(f"‚úÖ ƒê√£ n·∫°p th√†nh c√¥ng: {len(nodes)} ƒë·ªânh.")
        return adj, list(nodes)
    except Exception as e:
        # Best-effort loader: report the problem and let the caller bail out.
        print(f"‚ùå L·ªói n·∫°p d·ªØ li·ªáu: {e}")
        return None, None

# ==============================================================================
# 2. ORIGINAL ALGORITHM (NS-DBSCAN STANDARD)
# ==============================================================================

def alg1_LSPD_original(adj, start_node, eps):
    """Collect every node whose shortest-path distance from *start_node* is <= eps.

    This is a Dijkstra-style search over ``adj`` that never queues a path
    longer than ``eps``.

    Args:
        adj: Mapping ``node -> list of (neighbour, weight)`` tuples.
        start_node: Node the search starts from; it need not be a key of ``adj``.
        eps: Maximum accumulated path length.

    Returns:
        List of reachable nodes in non-decreasing distance order. Each node
        appears exactly once and ``start_node`` is included when ``eps >= 0``.
    """
    distances = {start_node: 0}
    queue = [(0, start_node)]
    neighbors = []

    while queue:
        d, u = heapq.heappop(queue)
        if d > eps:
            continue
        # A node is re-queued whenever a shorter path to it is found, so older
        # (longer) entries stay in the heap. Skip them; otherwise the node is
        # reported more than once and the neighbour count is inflated.
        if d > distances[u]:
            continue
        neighbors.append(u)

        for v, weight in adj.get(u, ()):
            new_d = d + weight
            if new_d <= eps and new_d < distances.get(v, float('inf')):
                distances[v] = new_d
                heapq.heappush(queue, (new_d, v))
    return neighbors

def alg2_density_ordering_original(adj, points, eps):
    """Compute every point's eps-neighbourhood and rank points by its size.

    Args:
        adj: Adjacency mapping passed through to ``alg1_LSPD_original``.
        points: Iterable of node ids to evaluate.
        eps: Distance bound passed through to ``alg1_LSPD_original``.

    Returns:
        ``(ordered_list, neighbors_cache)``: ``ordered_list`` holds
        ``(neighbour_count, point)`` tuples sorted by count, largest first
        (ties keep their input order); ``neighbors_cache`` maps each point to
        its neighbour list.
    """
    neighbors_cache = {}
    density_pairs = []
    for point in points:
        reachable = alg1_LSPD_original(adj, point, eps)
        neighbors_cache[point] = reachable
        density_pairs.append((len(reachable), point))
    # sorted() is stable, so equal counts stay in their original relative order.
    ordered_list = sorted(density_pairs, key=lambda pair: pair[0], reverse=True)
    return ordered_list, neighbors_cache

def alg3_clustering_original(ordered_list, neighbors_cache, min_pts):
    """Assign cluster labels by expanding from core points in the given order.

    Args:
        ordered_list: Sequence of ``(count, point)`` entries; only the point
            (index 1) is read, and entries are visited in the given order.
        neighbors_cache: Mapping ``point -> list of neighbours``; a missing
            point is treated as having no neighbours.
        min_pts: Minimum neighbour-list length for a point to seed or extend
            a cluster.

    Returns:
        Dict ``point -> label``. Cluster ids start at 1; ``-1`` marks a
        visited point that was not a core point and was never absorbed by a
        cluster.
    """
    labels = {}
    cluster_id = 0
    for entry in ordered_list:
        point = entry[1]
        if point in labels:
            continue

        reachable = neighbors_cache.get(point, [])
        if len(reachable) < min_pts:
            # Not dense enough to start a cluster: provisionally noise.
            labels[point] = -1
            continue

        cluster_id += 1
        labels[point] = cluster_id

        # Breadth-first expansion; the frontier grows while it is scanned.
        frontier = list(reachable)
        cursor = 0
        while cursor < len(frontier):
            candidate = frontier[cursor]
            cursor += 1
            if candidate in labels:
                # A point previously marked as noise is absorbed by the cluster.
                if labels[candidate] == -1:
                    labels[candidate] = cluster_id
                continue
            labels[candidate] = cluster_id
            candidate_nbrs = neighbors_cache.get(candidate, [])
            if len(candidate_nbrs) >= min_pts:
                frontier.extend(candidate_nbrs)
    return labels

def run_ns_dbscan_original(adj, points, eps, min_pts):
    """Run the baseline pipeline: density ordering followed by clustering.

    Args:
        adj: Adjacency mapping of the network.
        points: Node ids to cluster.
        eps: Distance bound used for the neighbourhood search.
        min_pts: Core-point threshold used by the clustering step.

    Returns:
        The label dict produced by ``alg3_clustering_original``.
    """
    density_order, neighbourhoods = alg2_density_ordering_original(adj, points, eps)
    return alg3_clustering_original(density_order, neighbourhoods, min_pts)

# ==============================================================================
# 3. IMPROVED ALGORITHM (iNS-DBSCAN)
# ==============================================================================

def alg1_LSPD_improved(adj, start_node, eps):
    """Collect every node whose shortest-path distance from *start_node* is <= eps.

    Same bounded Dijkstra-style search as the baseline variant, with an extra
    early skip for edges whose own weight already exceeds ``eps``.

    Args:
        adj: Mapping ``node -> list of (neighbour, weight)`` tuples.
        start_node: Node the search starts from; it need not be a key of ``adj``.
        eps: Maximum accumulated path length.

    Returns:
        List of reachable nodes in non-decreasing distance order. Each node
        appears exactly once and ``start_node`` is included when ``eps >= 0``.
    """
    distances = {start_node: 0}
    queue = [(0, start_node)]
    neighbors = []
    while queue:
        d, u = heapq.heappop(queue)
        if d > eps:
            continue
        # A node is re-queued whenever a shorter path to it is found, so older
        # (longer) entries stay in the heap. Skip them; otherwise the node is
        # reported more than once and the neighbour count is inflated.
        if d > distances[u]:
            continue
        neighbors.append(u)
        for v, weight in adj.get(u, ()):
            # IMPROVEMENT 1: pruning - an edge longer than eps can never be used.
            if weight > eps:
                continue
            new_d = d + weight
            if new_d <= eps and new_d < distances.get(v, float('inf')):
                distances[v] = new_d
                heapq.heappush(queue, (new_d, v))
    return neighbors

def alg2_density_ordering_improved(adj, points, eps):
    """Rank points by eps-neighbourhood size, dropping sparse points up front.

    Args:
        adj: Adjacency mapping passed through to ``alg1_LSPD_improved``.
        points: Sized collection of node ids to evaluate.
        eps: Distance bound passed through to ``alg1_LSPD_improved``.

    Returns:
        ``(ordered_list, neighbors_cache)`` restricted to points whose
        neighbour count is at least ``ln(len(points))``. ``ordered_list``
        holds ``(neighbour_count, point)`` tuples sorted by count, largest
        first (ties keep their input order).

    Note:
        The filter depends only on ``len(points)``, not on the ``min_pts``
        used later, so filtered points are absent from both outputs.
    """
    total = len(points)
    # IMPROVEMENT 2A: threshold - natural log of the number of points.
    threshold = math.log(total) if total > 0 else 0

    neighbors_cache = {}
    density_pairs = []
    for point in points:
        reachable = alg1_LSPD_improved(adj, point, eps)
        # IMPROVEMENT 2B: filtering - skip points below the threshold.
        if len(reachable) < threshold:
            continue
        neighbors_cache[point] = reachable
        density_pairs.append((len(reachable), point))

    # sorted() is stable, so equal counts stay in their original relative order.
    ordered_list = sorted(density_pairs, key=lambda pair: pair[0], reverse=True)
    return ordered_list, neighbors_cache

def alg3_clustering_improved(ordered_list, neighbors_cache, min_pts):
    """Assign cluster labels without ever writing explicit noise labels.

    Args:
        ordered_list: Sequence of ``(count, point)`` entries; only the point
            (index 1) is read, and entries are visited in the given order.
        neighbors_cache: Mapping ``point -> list of neighbours``; a missing
            point is treated as having no neighbours.
        min_pts: Minimum neighbour-list length for a point to seed or extend
            a cluster.

    Returns:
        Dict ``point -> cluster id`` (ids start at 1). Points that belong to
        no cluster are simply absent from the dict.
    """
    labels = {}
    cluster_id = 0
    for entry in ordered_list:
        point = entry[1]
        if point in labels:
            continue

        reachable = neighbors_cache.get(point, [])
        if len(reachable) < min_pts:
            # Not a core point: leave it unlabeled.
            continue

        cluster_id += 1
        labels[point] = cluster_id

        # Breadth-first expansion; the frontier grows while it is scanned.
        frontier = list(reachable)
        cursor = 0
        while cursor < len(frontier):
            candidate = frontier[cursor]
            cursor += 1
            # IMPROVEMENT 3: implicit noise - only unlabeled points need work.
            if candidate in labels:
                continue
            labels[candidate] = cluster_id
            candidate_nbrs = neighbors_cache.get(candidate, [])
            if len(candidate_nbrs) >= min_pts:
                frontier.extend(candidate_nbrs)
    return labels

def run_ns_dbscan_improved(adj, points, eps, min_pts):
    """Run the improved pipeline: filtered density ordering, then clustering.

    Args:
        adj: Adjacency mapping of the network.
        points: Node ids to cluster.
        eps: Distance bound used for the neighbourhood search.
        min_pts: Core-point threshold used by the clustering step.

    Returns:
        The label dict produced by ``alg3_clustering_improved``.
    """
    density_order, neighbourhoods = alg2_density_ordering_improved(adj, points, eps)
    return alg3_clustering_improved(density_order, neighbourhoods, min_pts)

# ==============================================================================
# 4. BENCHMARK ENGINE
# ==============================================================================
def measure_time(func, *args):
    """Time a single call of ``func(*args)`` and return the duration in ms.

    Args:
        func: Callable to benchmark; its return value is discarded.
        *args: Positional arguments forwarded to ``func``.

    Returns:
        Elapsed time of the call in milliseconds, as a float.
    """
    # Collect garbage first so leftovers from a previous run are not charged
    # to this measurement.
    gc.collect()
    # perf_counter() is monotonic and uses the highest-resolution clock
    # available; time.time() is a wall clock that can jump and may be coarse.
    start = time.perf_counter()
    func(*args)
    elapsed = time.perf_counter() - start
    return elapsed * 1000

def run_raw_benchmark():
    """Benchmark the baseline and improved pipelines over a parameter table.

    Steps:
      1. Load the graph from ``DATA_FILE``; stop if it is missing or empty.
      2. Read ``INPUT_CSV`` (columns ``Eps`` and ``MinPts``), writing a sample
         file first when it does not exist.
      3. For each row, time both pipelines ``NUM_RUNS`` times, average the
         timings and print one table line.
      4. Save all averaged results to ``OUTPUT_CSV``.

    Returns:
        None. Results are printed and written to disk.
    """
    if not os.path.exists(DATA_FILE):
        print(f"‚ùå Kh√¥ng t√¨m th·∫•y file d·ªØ li·ªáu: {DATA_FILE}")
        return

    graph, vertices = load_graph_data(DATA_FILE)
    if not graph:
        return

    # Make sure the directory exists before the parameter file is created.
    ensure_dir(INPUT_CSV)

    if not os.path.exists(INPUT_CSV):
        sample_params = pd.DataFrame({
            'Eps': [100, 200, 300, 400, 500],
            'MinPts': [5, 10, 15, 20, 25]
        })
        sample_params.to_csv(INPUT_CSV, index=False)
        print("‚ö†Ô∏è ƒê√£ t·∫°o file tham s·ªë m·∫´u.")

    param_table = pd.read_csv(INPUT_CSV)

    rule = "=" * 95
    print("\n" + rule)
    print(f"{'BENCHMARK TRUNG TH·ª∞C (RAW RESULTS) - KH√îNG R√ÄNG BU·ªòC':^95}")
    print(rule)
    print(f"| {'Eps':<6} | {'MinPts':<6} | {'G·ªëc (ms)':<15} | {'C·∫£i ti·∫øn (ms)':<15} | {'Nhanh h∆°n (%)':<15} |")
    print("-" * 95)

    records = []
    for _, params in param_table.iterrows():
        eps = float(params['Eps'])
        min_pts = int(params['MinPts'])

        baseline_ms = 0
        improved_ms = 0
        # Alternate the two pipelines within every repetition.
        for _run in range(NUM_RUNS):
            baseline_ms += measure_time(run_ns_dbscan_original, graph, vertices, eps, min_pts)
            improved_ms += measure_time(run_ns_dbscan_improved, graph, vertices, eps, min_pts)

        avg_orig = baseline_ms / NUM_RUNS
        avg_imp = improved_ms / NUM_RUNS

        # Relative speed-up in percent; guarded against a zero baseline.
        if avg_orig > 0:
            improvement = ((avg_orig - avg_imp) / avg_orig) * 100
        else:
            improvement = 0

        print(f"| {int(eps):<6} | {min_pts:<6} | {avg_orig:<15.2f} | {avg_imp:<15.2f} | {improvement:<15.2f} |")

        record = {
            'Eps': eps,
            'MinPts': min_pts,
            'Original_Time_ms': round(avg_orig, 3),
            'Improved_Time_ms': round(avg_imp, 3),
            'Improvement_Percent': round(improvement, 2)
        }
        records.append(record)

    # Make sure the directory exists before the results are saved.
    ensure_dir(OUTPUT_CSV)
    pd.DataFrame(records).to_csv(OUTPUT_CSV, index=False)
    print(rule)
    print(f"‚úÖ K·∫øt qu·∫£ chi ti·∫øt ƒë√£ ƒë∆∞·ª£c l∆∞u t·∫°i: {os.path.abspath(OUTPUT_CSV)}")

# Run the benchmark only when this file is executed directly, not on import.
if __name__ == "__main__":
    run_raw_benchmark()

--- ƒêANG N·∫†P D·ªÆ LI·ªÜU T·ª™: 6_DSCanhKQ2_CanTho_XoaCon3Cot_XoaDongTrung.txt ---
üîÑ ƒêang chuy·ªÉn ƒë·ªïi sang Adjacency Dictionary...
‚úÖ ƒê√£ n·∫°p th√†nh c√¥ng: 2434 ƒë·ªânh.

                     BENCHMARK TRUNG TH·ª∞C (RAW RESULTS) - KH√îNG R√ÄNG BU·ªòC                      
| Eps    | MinPts | G·ªëc (ms)        | C·∫£i ti·∫øn (ms)   | Nhanh h∆°n (%)   |
-----------------------------------------------------------------------------------------------
| 200    | 20     | 69.07           | 67.66           | 2.05            |
| 200    | 25     | 68.62           | 74.49           | -8.55           |
| 200    | 30     | 66.73           | 69.27           | -3.81           |
| 200    | 35     | 64.85           | 65.52           | -1.03           |
| 200    | 40     | 65.00           | 66.14           | -1.75           |
| 250    | 20     | 87.24           | 88.40           | -1.33           |
| 250    | 25     | 89.09           | 85.52           | 4.00            |
| 250    | 30     