In [40]:
import networkx as nx
import pandas as pd
from tqdm import tqdm
import math

In [2]:
# Which graph and which community-detection output this notebook analyses.
graph = "stexpanded"
community_algorithm = "bigclam"

# Input locations derived from the two settings above.
graph_file = f"data/filtered_triples_weighted/{graph}.triples"
communities_file = f"./results/communities_{community_algorithm}/{graph}_10000.txt"

In [3]:
# Load the edge list. Fields are separated by the literal "###"; a
# multi-character separator requires the Python parsing engine.
graph_df = pd.read_csv(graph_file, sep="###", engine="python", header=None)

# Build an undirected, unweighted graph from the first two columns.
# NOTE(review): the third column presumably holds an edge weight (the path says
# "weighted"); it is ignored here. Use G.add_edge(u, v, weight=...) instead if
# weighted measures are ever needed.
G = nx.Graph()
G.add_edges_from(
    (int(source), int(target))
    for source, target, *_ in graph_df.itertuples(index=False)
)

# One community per line, member node ids separated by tabs.
communities = []
with open(communities_file) as cf:
    for line in cf:
        stripped = line.strip()
        if not stripped:
            # A blank line would otherwise reach int("") and raise ValueError.
            continue
        communities.append({int(token) for token in stripped.split("\t")})

In [None]:
# One row per graph node, ordered by node id; the cells below add one feature
# column each.
community_vector = pd.DataFrame(data={"id": sorted(G.nodes())})

### Community degree centrality
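Community degree centrality measures the importance of a node within its own community by counting the number of connections it has with other nodes in the same community, highlighting its influence locally rather than across the entire network. Because communities may overlap, the score is computed separately inside each community the node belongs to and then averaged.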

In [None]:
def overlapping_community_degree_centrality(G, communities):
    """Average each node's within-community degree centrality.

    For every community, degree centrality is computed on the subgraph of
    ``G`` induced by that community. A node that appears in several
    communities receives the arithmetic mean of its per-community scores.

    Args:
        G: networkx graph.
        communities: iterable of node collections (one per community).

    Returns:
        dict mapping node -> mean score. Nodes that appear in no community
        subgraph are absent from the result.
    """
    per_node = {}
    for members in communities:
        induced = G.subgraph(members)
        for vertex, value in nx.degree_centrality(induced).items():
            if vertex not in per_node:
                per_node[vertex] = []
            per_node[vertex].append(value)

    averaged = {}
    for vertex, values in per_node.items():
        averaged[vertex] = sum(values) / len(values)
    return averaged

In [None]:
# Node -> within-community degree centrality, averaged over the node's communities.
# Note: the name `centrality_dict` is reassigned by each later metric cell.
centrality_dict = overlapping_community_degree_centrality(G, communities)

In [None]:
# Attach the scores as a column keyed by node id; ids missing from
# centrality_dict (nodes that are in no community) become NaN.
community_vector['degree_centrality'] = community_vector['id'].map(centrality_dict)

### Community betweenness centrality
Community betweenness centrality measures how often a node acts as a bridge within its community, connecting different parts of the community by appearing on the shortest paths between other nodes within the same community. For a node that belongs to several overlapping communities, the per-community scores are averaged.

In [None]:
def overlapping_community_betweenness_centrality(G, communities):
    """Average each node's within-community betweenness centrality.

    Betweenness centrality is computed on the subgraph of ``G`` induced by
    each community (progress is reported through tqdm). A node that appears
    in several communities receives the mean of its per-community scores.

    Args:
        G: networkx graph.
        communities: iterable of node collections (one per community).

    Returns:
        dict mapping node -> mean score. Nodes that appear in no community
        subgraph are absent from the result.
    """
    per_node = {}
    for members in tqdm(communities):
        induced = G.subgraph(members)
        for vertex, value in nx.betweenness_centrality(induced).items():
            if vertex not in per_node:
                per_node[vertex] = []
            per_node[vertex].append(value)

    averaged = {}
    for vertex, values in per_node.items():
        averaged[vertex] = sum(values) / len(values)
    return averaged

In [None]:
# Node -> within-community betweenness centrality, averaged over the node's communities.
# Overwrites the `centrality_dict` produced by the previous metric.
centrality_dict = overlapping_community_betweenness_centrality(G, communities)

In [None]:
# Attach the scores as a column keyed by node id; ids missing from
# centrality_dict (nodes that are in no community) become NaN.
community_vector['betweenness_centrality'] = community_vector['id'].map(centrality_dict)

### Community closeness centrality
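Community closeness centrality measures how quickly a node can reach other nodes within its own community, calculated from the average shortest path distance to all other nodes in that community. For a node that belongs to several overlapping communities, the per-community scores are averaged.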

In [None]:
def overlapping_community_closeness_centrality(G, communities):
    """Average each node's within-community closeness centrality.

    Closeness centrality is computed on the subgraph of ``G`` induced by
    each community (progress is reported through tqdm). A node that appears
    in several communities receives the mean of its per-community scores.

    Args:
        G: networkx graph.
        communities: iterable of node collections (one per community).

    Returns:
        dict mapping node -> mean score. Nodes that appear in no community
        subgraph are absent from the result.
    """
    per_node = {}
    for members in tqdm(communities):
        induced = G.subgraph(members)
        for vertex, value in nx.closeness_centrality(induced).items():
            if vertex not in per_node:
                per_node[vertex] = []
            per_node[vertex].append(value)

    averaged = {}
    for vertex, values in per_node.items():
        averaged[vertex] = sum(values) / len(values)
    return averaged

In [None]:
# Node -> within-community closeness centrality, averaged over the node's communities.
# Overwrites the `centrality_dict` produced by the previous metric.
centrality_dict = overlapping_community_closeness_centrality(G, communities)

In [None]:
# Attach the scores as a column keyed by node id; ids missing from
# centrality_dict (nodes that are in no community) become NaN.
community_vector['closeness_centrality'] = community_vector['id'].map(centrality_dict)

### Community vector v1

In [None]:
# Persist the v1 feature table (id + the three centrality columns) under the
# same file name that the following cell reads back, so a fresh
# Restart & Run All round-trips without any manual renaming.
community_vector.to_csv('results/community_vectors/community_vector_v1.csv', sep=';', index=False)

In [41]:
# Reload the v1 feature table from disk (semicolon-separated). This replaces
# the in-memory frame, so the cells below can start from this checkpoint
# without recomputing the centrality columns.
community_vector = pd.read_csv('results/community_vectors/community_vector_v1.csv', sep=';')

### Overlapping participation coefficient
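The overlapping participation coefficient measures the extent to which a node is connected across multiple communities, indicating whether it plays a bridging role by participating in several communities or is primarily focused within one. Here it is computed as the fraction of a node's neighbours that belong to at least one community the node itself is not a member of.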

In [42]:
def overlapping_participation_coefficient(G, communities):
    """Fraction of each node's neighbours that reach outside its communities.

    A neighbour counts as "external" when it belongs to at least one community
    that the node itself is not a member of. Note that a neighbour with no
    community membership at all is therefore never counted as external.

    Args:
        G: graph exposing ``nodes()`` and ``neighbors(node)`` (e.g. a
            networkx graph).
        communities: iterable of node collections; the position of a
            community in the iterable is used as its id.

    Returns:
        dict mapping every node of ``G`` to its score:
        ``nan`` if the node is in no community, ``0.0`` if it has no
        neighbours, otherwise ``external_neighbours / all_neighbours``.
    """
    node_community_map = {node: set() for node in G.nodes()}
    for i, community in enumerate(communities):
        for node in community:
            # Community files may list ids that are not in G. Skip them rather
            # than raising KeyError; G.subgraph() ignores such ids as well.
            if node in node_community_map:
                node_community_map[node].add(i)

    participation_scores = {}
    for node, community_set in node_community_map.items():
        if not community_set:
            participation_scores[node] = math.nan
            continue
        # Materialise the neighbours once; they are needed for both counts.
        neighbors = list(G.neighbors(node))
        if not neighbors:
            participation_scores[node] = 0.0
            continue
        external_links = sum(
            1
            for neighbor in neighbors
            if not node_community_map[neighbor].issubset(community_set)
        )
        participation_scores[node] = external_links / len(neighbors)
    return participation_scores

In [43]:
# Node -> overlapping participation coefficient. The variable keeps the name
# `centrality_dict` used by the earlier cells although this is not a centrality.
centrality_dict = overlapping_participation_coefficient(G, communities)

In [44]:
# Attach the scores as a column keyed by node id; ids missing from
# centrality_dict become NaN.
community_vector['participation_coefficient'] = community_vector['id'].map(centrality_dict)

### Community affiliation index
The community affiliation index quantifies the strength of a node’s membership within its communities, based on its connections to other nodes within those communities relative to those outside them. Here it is computed as the fraction of a node’s neighbours that share at least one community with it.

In [45]:
def community_affiliation_index(G, communities):
    """Fraction of each node's neighbours that share a community with it.

    Args:
        G: graph exposing ``nodes()`` and ``neighbors(node)`` (e.g. a
            networkx graph).
        communities: iterable of node collections; the position of a
            community in the iterable is used as its id.

    Returns:
        dict mapping every node of ``G`` to its score:
        ``nan`` if the node is in no community, ``0.0`` if it has no
        neighbours, otherwise ``neighbours_sharing_a_community / all_neighbours``.
    """
    node_community_map = {node: set() for node in G.nodes()}
    for i, community in enumerate(communities):
        for node in community:
            # Community files may list ids that are not in G. Skip them rather
            # than raising KeyError; G.subgraph() ignores such ids as well.
            if node in node_community_map:
                node_community_map[node].add(i)

    cai_scores = {}
    for node, community_set in node_community_map.items():
        if not community_set:
            cai_scores[node] = math.nan
            continue
        # Materialise the neighbours once; they are needed for both counts.
        neighbors = list(G.neighbors(node))
        if not neighbors:
            cai_scores[node] = 0.0
            continue
        intra_edges = sum(
            1
            for neighbor in neighbors
            if not community_set.isdisjoint(node_community_map[neighbor])
        )
        cai_scores[node] = intra_edges / len(neighbors)
    return cai_scores

In [46]:
# Node -> community affiliation index. The variable keeps the name
# `centrality_dict` used by the earlier cells although this is not a centrality.
centrality_dict = community_affiliation_index(G, communities)

In [47]:
# Attach the scores as a column keyed by node id; ids missing from
# centrality_dict become NaN.
community_vector['community_affiliation_index'] = community_vector['id'].map(centrality_dict)

### Overlapping node modularity
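Overlapping node modularity assesses how well a node’s connections are grouped within its multiple community affiliations, measuring the extent to which its connections are dense within each community compared to between communities. Here each community’s modularity contribution is computed once, and a node’s score is the average of the contributions of the communities it belongs to.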

In [48]:
def overlapping_node_modularity(G, communities):
    """Average, per node, the modularity contribution of its communities.

    For a community ``c`` the contribution is ``L_c / m - (d_c / (2 * m)) ** 2``
    where ``m`` is the number of edges of ``G``, ``L_c`` the number of edges
    inside ``c`` and ``d_c`` the sum of the degrees (in ``G``) of the members
    of ``c``. A node's score is the mean contribution of the communities it
    belongs to.

    Args:
        G: graph exposing ``nodes()``, ``number_of_edges()``,
            ``subgraph(nodes)`` and ``degree(nodes)`` (e.g. a networkx graph).
        communities: iterable of node collections; the position of a
            community in the iterable is used as its id.

    Returns:
        dict mapping every node of ``G`` to its score. Nodes in no community
        get ``nan``; if ``G`` has no edges the measure is undefined and every
        node gets ``nan``.
    """
    node_community_map = {node: set() for node in G.nodes()}
    members_by_community = []
    for i, community in enumerate(communities):
        # Community files may list ids that are not in G. Drop them rather
        # than raising KeyError; G.subgraph() ignores such ids as well.
        members = [node for node in community if node in node_community_map]
        members_by_community.append(members)
        for node in members:
            node_community_map[node].add(i)

    total_edges = G.number_of_edges()
    if total_edges == 0:
        # Modularity divides by the edge count; avoid ZeroDivisionError.
        return {node: math.nan for node in node_community_map}

    modularity_scores = {
        node: (0.0 if memberships else math.nan)
        for node, memberships in node_community_map.items()
    }
    for members in members_by_community:
        internal_edges = G.subgraph(members).number_of_edges()
        # The null-model term needs the community's total degree, not its
        # node count: 2 * total_edges is the degree sum of the whole graph.
        degree_sum = sum(degree for _, degree in G.degree(members))
        community_modularity = (
            internal_edges / total_edges - (degree_sum / (2 * total_edges)) ** 2
        )
        for node in members:
            modularity_scores[node] += community_modularity / len(node_community_map[node])
    return modularity_scores

In [49]:
# Node -> overlapping node modularity. The variable keeps the name
# `centrality_dict` used by the earlier cells although this is not a centrality.
centrality_dict = overlapping_node_modularity(G, communities)

In [50]:
# Attach the scores as a column keyed by node id; ids missing from
# centrality_dict become NaN.
community_vector['overlapping_node_modularity'] = community_vector['id'].map(centrality_dict)

### Community vector v2

In [51]:
# Persist the v2 feature table under the same file name that the following
# cell reads back. Writing to a version-specific file also keeps this export
# from overwriting the output of the earlier "v1" save cell.
community_vector.to_csv('results/community_vectors/community_vector_v2.csv', sep=';', index=False)

In [52]:
# Reload the v2 feature table from disk (semicolon-separated); this replaces
# the in-memory frame.
community_vector = pd.read_csv('results/community_vectors/community_vector_v2.csv', sep=';')