In [1]:
import networkx as nx
import numpy as np
import os
from collections import Counter
import random
import math
import io

# --- CONFIGURATION: ASSUMED LOCAL DATASET FILE ---
FILENAME = "ca-netscience.mtx"
# The content of this .mtx file is treated as a plain whitespace-separated edge list.

# Lines whose first non-blank character is one of these are header/comment lines and are skipped.
SKIP_CHARS = ('%', '#', '*')

# --- 1. LOAD NETWORKX GRAPH (ROBUST EDGE LIST READING) ---
print(f"Loading graph from local file ({FILENAME} by skipping header lines)...")
if not os.path.exists(FILENAME):
    raise FileNotFoundError(f"Error: The file '{FILENAME}' was not found. Please check the file name and path.")

try:
    # Keep only non-empty, non-comment lines (already stripped of surrounding whitespace).
    with open(FILENAME, 'r', encoding='utf-8') as f:
        edge_lines = [
            stripped
            for stripped in (raw.strip() for raw in f)
            if stripped and not stripped.startswith(SKIP_CHARS)
        ]

    # NOTE(review): a Matrix Market coordinate file normally has a "rows cols nnz"
    # size line right after the '%' header. It is not filtered here and would be
    # parsed as an edge (rows, cols) - confirm the local file has no such line.

    # Parse the cleaned lines directly; only the first two columns (the node ids)
    # are used because data=False, and node ids are converted to int.
    G_full = nx.parse_edgelist(
        edge_lines,
        create_using=nx.Graph(),
        nodetype=int,
        data=False,
    )

except (OSError, ValueError, TypeError) as e:
    # Raise instead of print + exit(): exit() shuts the notebook kernel down and
    # hides the traceback, while a chained exception stops the cell and shows the cause.
    raise RuntimeError(
        f"Failed to read '{FILENAME}' as an edge list. Check file content. Details: {e}"
    ) from e

# --- 2. FILTER TO LARGEST CONNECTED COMPONENT (LCC) ---
if len(G_full) == 0:
    raise ValueError("The graph loaded zero nodes. Check the file content.")

# The component with the most nodes becomes the working graph; .copy() detaches
# it from G_full so it is an independent Graph rather than a subgraph view.
largest_cc = max(nx.connected_components(G_full), key=len)
G = G_full.subgraph(largest_cc).copy()

# --- 3. CREATE INDEX MAPPING ---
# Nodes in ascending order get consecutive indices 0..N-1; both lookup
# directions are kept.
node_list = sorted(G)
index_to_node = dict(enumerate(node_list))
node_to_index = {node: idx for idx, node in index_to_node.items()}
NUM_AUTHORS = len(node_list)
DIMENSION = NUM_AUTHORS  # one entry per node in a particle's position vector

print(f"Data Loaded Successfully: NetScience LCC has {NUM_AUTHORS} authors and {G.number_of_edges()} edges.")

Loading graph from local file (ca-netscience.mtx by skipping header lines)...
Data Loaded Successfully: NetScience LCC has 379 authors and 914 edges.


In [3]:
def calculate_modularity(G, community_assignment_array):
    """
    Return the modularity Q of the partition encoded by a label array.

    Entry ``i`` of ``community_assignment_array`` is the community label of the
    node ``index_to_node[i]``; the first ``NUM_AUTHORS`` entries are read.
    Nodes sharing a label form one community, and the resulting partition is
    scored on ``G`` with ``nx.community.modularity``. A ``ZeroDivisionError``
    raised by that call is reported as a score of 0.0.
    """
    # Resolve every index to its node and label up front.
    labelled_nodes = [(index_to_node[idx], community_assignment_array[idx])
                      for idx in range(NUM_AUTHORS)]

    # Group nodes by label; groups appear in order of first occurrence.
    groups = {}
    for node, label in labelled_nodes:
        groups.setdefault(label, set()).add(node)

    node_sets = list(groups.values())

    try:
        return nx.community.modularity(G, node_sets)
    except ZeroDivisionError:
        return 0.0

In [4]:
# --- PSO HYPERPARAMETERS ---
# Tunable settings for the discrete PSO run below.
NUM_PARTICLES = 30  # number of Particle objects in the swarm
MAX_ITERATIONS = 50  # number of evaluate-then-move rounds of the main loop
NUM_CLUSTERS = 10  # K: each node's community label is drawn from 0..K-1
W_MAX = 0.9  # inertia weight used at the first iteration
W_MIN = 0.4  # value the inertia weight decreases linearly toward
C1 = 2.0  # multiplier of the cognitive term (pull toward the particle's own best)
C2 = 2.0  # multiplier of the social term (pull toward the global best)
V_MAX = 6.0  # velocities are clipped to [-V_MAX, V_MAX]

# --- PARTICLE CLASS ---
class Particle:
    """One candidate labelling of the nodes, plus its PSO bookkeeping.

    Attributes set on construction:
        dimension, K      -- length of the position vector and number of labels.
        position          -- integer array of ``dimension`` labels drawn uniformly from 0..K-1.
        velocity          -- float array of zeros, same length as ``position``.
        pbest_position    -- independent copy of the starting position.
        pbest_fitness,
        current_fitness   -- both start at the sentinel -1.0.
    """

    def __init__(self, dimension, K):
        self.dimension, self.K = dimension, K

        # Random starting labels; the personal best begins as a separate copy
        # so later in-place edits of `position` do not alter it.
        start = np.random.randint(0, K, size=dimension)
        self.position = start
        self.velocity = np.zeros(dimension, dtype=float)
        self.pbest_position = start.copy()

        self.pbest_fitness = self.current_fitness = -1.0

# --- INITIALIZATION & MAIN DPSO LOOP ---
def _evaluate_swarm(swarm, gbest_position, gbest_fitness):
    """Score every particle and refresh the personal and global bests.

    Each particle's current position is scored with ``calculate_modularity``.
    A particle that beats its own best stores a copy of the position as its new
    personal best; if that also beats the global best, the global best is
    replaced too.

    Returns the (possibly updated) ``(gbest_position, gbest_fitness)`` pair.
    """
    for particle in swarm:
        particle.current_fitness = calculate_modularity(G, particle.position)

        if particle.current_fitness > particle.pbest_fitness:
            particle.pbest_fitness = particle.current_fitness
            particle.pbest_position = particle.position.copy()

            if particle.pbest_fitness > gbest_fitness:
                gbest_fitness = particle.pbest_fitness
                gbest_position = particle.pbest_position.copy()

    return gbest_position, gbest_fitness


print(f"\n--- Starting Discrete PSO for NetScience Community Detection (K={NUM_CLUSTERS}) ---")
# Both RNGs are used below (numpy for r1/r2, `random` for the switch draws),
# so both are seeded for a reproducible run.
random.seed(42)
np.random.seed(42)

swarm = [Particle(DIMENSION, NUM_CLUSTERS) for _ in range(NUM_PARTICLES)]
gbest_position = swarm[0].position.copy()
gbest_fitness = -1.0

# `iteration` rather than `iter`, so the builtin iter() is not shadowed for later cells.
for iteration in range(MAX_ITERATIONS):
    # Inertia weight decreases linearly from W_MAX toward W_MIN.
    w = W_MAX - (W_MAX - W_MIN) * iteration / MAX_ITERATIONS

    # Phase 1: score current positions and update the bests.
    gbest_position, gbest_fitness = _evaluate_swarm(swarm, gbest_position, gbest_fitness)

    # Phase 2: move every particle.
    for particle in swarm:
        r1 = np.random.rand(DIMENSION)
        r2 = np.random.rand(DIMENSION)

        # NOTE(review): these terms subtract community label ids arithmetically,
        # although the labels are categorical - confirm this is the intended update.
        cognitive_term = C1 * r1 * (particle.pbest_position - particle.position)
        social_term = C2 * r2 * (gbest_position - particle.position)

        particle.velocity = np.clip(
            w * particle.velocity + cognitive_term + social_term, -V_MAX, V_MAX
        )

        for i in range(DIMENSION):
            # Sigmoid of the velocity gives the probability of re-labelling node i.
            # The velocity is clipped to +/-V_MAX above, so exp() cannot overflow.
            p_switch = 1.0 / (1.0 + math.exp(-particle.velocity[i]))

            if random.random() < p_switch:
                # Copy the label from the global best or the personal best, 50/50.
                # Labels are only ever copied from existing positions, so they
                # stay inside 0..NUM_CLUSTERS-1 without further wrapping.
                if random.random() < 0.5:
                    particle.position[i] = gbest_position[i]
                else:
                    particle.position[i] = particle.pbest_position[i]

    print(f"Iteration {iteration+1}/{MAX_ITERATIONS}: Global Best Modularity = {gbest_fitness:.4f}")

# Score the positions produced by the last move step; without this pass they
# would never be evaluated (and with MAX_ITERATIONS == 0 nothing would be scored).
gbest_position, gbest_fitness = _evaluate_swarm(swarm, gbest_position, gbest_fitness)

# --- FINAL RESULT ---
print("\n--- OPTIMIZATION COMPLETE ---")
print(f"Final Best Modularity Score: {gbest_fitness:.4f}")

# Convert labels to plain ints so the Counter prints as (label, size) pairs
# rather than numpy scalar reprs.
final_assignments = {index_to_node[i]: int(gbest_position[i]) for i in range(DIMENSION)}
community_counts = Counter(final_assignments.values())
print(f"Number of distinct communities found: {len(community_counts)}")
print("Top 5 largest communities (size):", community_counts.most_common(5))


--- Starting Discrete PSO for NetScience Community Detection (K=10) ---
Iteration 1/50: Global Best Modularity = 0.0188
Iteration 2/50: Global Best Modularity = 0.0188
Iteration 3/50: Global Best Modularity = 0.0188
Iteration 4/50: Global Best Modularity = 0.0200
Iteration 5/50: Global Best Modularity = 0.0200
Iteration 6/50: Global Best Modularity = 0.0276
Iteration 7/50: Global Best Modularity = 0.0276
Iteration 8/50: Global Best Modularity = 0.0286
Iteration 9/50: Global Best Modularity = 0.0286
Iteration 10/50: Global Best Modularity = 0.0293
Iteration 11/50: Global Best Modularity = 0.0293
Iteration 12/50: Global Best Modularity = 0.0342
Iteration 13/50: Global Best Modularity = 0.0345
Iteration 14/50: Global Best Modularity = 0.0345
Iteration 15/50: Global Best Modularity = 0.0345
Iteration 16/50: Global Best Modularity = 0.0438
Iteration 17/50: Global Best Modularity = 0.0438
Iteration 18/50: Global Best Modularity = 0.0438
Iteration 19/50: Global Best Modularity = 0.0445
Itera