In [4]:
import networkx as nx
import numpy as np
import os
from collections import Counter
import random
import math
import gzip # Import for GZIP file handling
import io

# --- CONFIGURATION: ASSUMED LOCAL DATASET FILE ---
FILENAME_ARCHIVE = "email-Eu-core.txt.gz"
FILENAME_EDGE_LIST = "email-eu-core.txt" # The name of the file after extraction

# Define a list of expected header/comment characters to skip
SKIP_CHARS = ('%', '#', '*')

# --- 1. EXTRACT DATA FROM ARCHIVE (GZIP) ---
print(f"Extracting {FILENAME_EDGE_LIST} from {FILENAME_ARCHIVE}...")
if not os.path.exists(FILENAME_ARCHIVE):
    raise FileNotFoundError(f"Error: The archive file '{FILENAME_ARCHIVE}' was not found. Please check the filename and path.")

if not os.path.exists(FILENAME_EDGE_LIST):
    try:
        # Use gzip to open the compressed file and write to the uncompressed file
        with gzip.open(FILENAME_ARCHIVE, 'rb') as f_in:
            with open(FILENAME_EDGE_LIST, 'wb') as f_out:
                f_out.write(f_in.read())
        print("Extraction successful.")
    except Exception as e:
        print(f"Error during GZIP extraction: {e}")
        exit()
else:
    print("Edge list file already extracted.")

# --- 2. LOAD NETWORKX GRAPH (ROBUST EDGE LIST READING) ---
print(f"Loading graph from local file ({FILENAME_EDGE_LIST})...")

try:
    # Filter file content to skip header/comment lines before loading
    with open(FILENAME_EDGE_LIST, 'r') as f:
        # Filter out lines starting with comment characters AND ensure lines are not empty
        edge_lines = [line for line in f if not line.strip().startswith(SKIP_CHARS) and line.strip()]

    # Write the clean edge data to an in-memory file for NetworkX to read
    clean_data = "\n".join(edge_lines)

    G_full = nx.read_edgelist(
        io.StringIO(clean_data), # Use the in-memory string as a file
        create_using=nx.Graph(),
        nodetype=int,
        data=False,
        encoding='utf-8'
    )

except Exception as e:
    G_full = nx.Graph()
    print(f"FATAL ERROR: Failed to read file as an edge list. Details: {e}")
    exit()

# --- 3. FILTER TO LARGEST CONNECTED COMPONENT (LCC) ---
if G_full.number_of_nodes() == 0:
    raise ValueError("The graph loaded zero nodes. Check the file content.")

largest_cc = max(nx.connected_components(G_full), key=len)
G = G_full.subgraph(largest_cc).copy()

# --- 4. CREATE INDEX MAPPING ---
node_list = sorted(G.nodes())
node_to_index = {node: i for i, node in enumerate(node_list)}
index_to_node = {i: node for node, i in node_to_index.items()}
NUM_USERS = G.number_of_nodes()
DIMENSION = NUM_USERS # Define DIMENSION for subsequent PSO blocks

print(f"Data Loaded Successfully: Email-Eu-core LCC has {NUM_USERS} users and {G.number_of_edges()} edges.")

Extracting email-eu-core.txt from email-Eu-core.txt.gz...
Extraction successful.
Loading graph from local file (email-eu-core.txt)...
Data Loaded Successfully: Email-Eu-core LCC has 986 users and 16687 edges.


In [5]:
def calculate_modularity(G, community_assignment_array):
    """
    Calculates the Modularity (Q) of a community assignment for the Facebook graph.
    """
    node_assignment = {index_to_node[i]: community_assignment_array[i]
                       for i in range(NUM_USERS)}

    communities = {}
    for node, c_id in node_assignment.items():
        if c_id not in communities:
            communities[c_id] = set()
        communities[c_id].add(node)

    partition = list(communities.values())

    try:
        Q = nx.community.modularity(G, partition)
    except ZeroDivisionError:
        Q = 0.0

    return Q

In [6]:
# --- PSO HYPERPARAMETERS ---
NUM_PARTICLES = 50
MAX_ITERATIONS = 100
NUM_CLUSTERS = 30       # Target number of communities (K)
W_MAX = 0.9
W_MIN = 0.4
C1 = 2.0
C2 = 2.0
V_MAX = 6.0

# --- PARTICLE CLASS ---
class Particle:
    def __init__(self, dimension, K):
        self.dimension = dimension
        self.K = K
        self.position = np.random.randint(0, self.K, size=self.dimension)
        self.velocity = np.zeros(self.dimension, dtype=float)
        self.pbest_position = self.position.copy()
        self.pbest_fitness = -1.0
        self.current_fitness = -1.0

# --- INITIALIZATION & MAIN DPSO LOOP ---
print(f"\n--- Starting Discrete PSO for Facebook Community Detection (K={NUM_CLUSTERS}) ---")
random.seed(42)
np.random.seed(42)

swarm = [Particle(DIMENSION, NUM_CLUSTERS) for _ in range(NUM_PARTICLES)]
gbest_position = swarm[0].position.copy()
gbest_fitness = -1.0

for iter in range(MAX_ITERATIONS):
    # Dynamic Inertia Weight
    w = W_MAX - (W_MAX - W_MIN) * iter / MAX_ITERATIONS

    for particle in swarm:
        particle.current_fitness = calculate_modularity(G, particle.position)

        if particle.current_fitness > particle.pbest_fitness:
            particle.pbest_fitness = particle.current_fitness
            particle.pbest_position = particle.position.copy()

            if particle.pbest_fitness > gbest_fitness:
                gbest_fitness = particle.pbest_fitness
                gbest_position = particle.pbest_position.copy()

    for particle in swarm:
        r1 = np.random.rand(DIMENSION)
        r2 = np.random.rand(DIMENSION)

        cognitive_term = C1 * r1 * (particle.pbest_position - particle.position)
        social_term = C2 * r2 * (gbest_position - particle.position)

        new_velocity = w * particle.velocity + cognitive_term + social_term
        new_velocity = np.clip(new_velocity, -V_MAX, V_MAX)
        particle.velocity = new_velocity

        for i in range(DIMENSION):
            p_switch = 1.0 / (1.0 + math.exp(-particle.velocity[i]))

            if random.random() < p_switch:
                pbest_assignment = particle.pbest_position[i]
                gbest_assignment = gbest_position[i]

                if random.random() < 0.5:
                    particle.position[i] = gbest_assignment
                else:
                    particle.position[i] = pbest_assignment

            particle.position[i] = int(particle.position[i] % NUM_CLUSTERS)


    print(f"Iteration {iter+1}/{MAX_ITERATIONS}: Global Best Modularity = {gbest_fitness:.4f}")

# --- FINAL RESULT ---
print("\n--- OPTIMIZATION COMPLETE ---")
print(f"Final Best Modularity Score: {gbest_fitness:.4f}")

final_assignments = {index_to_node[i]: gbest_position[i] for i in range(DIMENSION)}
community_counts = Counter(final_assignments.values())
print(f"Number of distinct communities found: {len(community_counts)}")
print("Top 5 largest communities (size):", community_counts.most_common(5))


--- Starting Discrete PSO for Facebook Community Detection (K=30) ---
Iteration 1/100: Global Best Modularity = 0.0364
Iteration 2/100: Global Best Modularity = 0.0364
Iteration 3/100: Global Best Modularity = 0.0366
Iteration 4/100: Global Best Modularity = 0.0372
Iteration 5/100: Global Best Modularity = 0.0379
Iteration 6/100: Global Best Modularity = 0.0385
Iteration 7/100: Global Best Modularity = 0.0386
Iteration 8/100: Global Best Modularity = 0.0386
Iteration 9/100: Global Best Modularity = 0.0386
Iteration 10/100: Global Best Modularity = 0.0388
Iteration 11/100: Global Best Modularity = 0.0388
Iteration 12/100: Global Best Modularity = 0.0395
Iteration 13/100: Global Best Modularity = 0.0395
Iteration 14/100: Global Best Modularity = 0.0395
Iteration 15/100: Global Best Modularity = 0.0395
Iteration 16/100: Global Best Modularity = 0.0395
Iteration 17/100: Global Best Modularity = 0.0395
Iteration 18/100: Global Best Modularity = 0.0395
Iteration 19/100: Global Best Modulari