# Centrality Algorithms

参照：https://neo4j.com/docs/graph-data-science-client/current/tutorials/centrality-algorithms/

In [2]:
from graphdatascience import GraphDataScience
import pandas as pd
# import dotenv 
import os
import sys 
from neo4j import GraphDatabase
from dotenv import load_dotenv
import networkx as nx
import matplotlib.pyplot as plt

In [3]:
# Load settings from a .env file into the process environment (python-dotenv);
# the NEO4J_* variables read further down are expected to come from there.
load_dotenv()

True

In [7]:
# Connection settings are read from the environment.
# Fall back to a local Bolt endpoint when NEO4J_URI is not set.
URI = os.getenv("NEO4J_URI", "bolt://localhost:7687")

# Accept both spellings of the user variable: NEO4J_USERNAME and NEO4J_USER.
_neo4j_user = os.getenv("NEO4J_USERNAME") or os.getenv("NEO4J_USER")
_neo4j_password = os.getenv("NEO4J_PASSWORD")

# Only send credentials when both parts are present; a (None, None) tuple is
# never a valid login, so connect without auth instead.
AUTH = (_neo4j_user, _neo4j_password) if _neo4j_user and _neo4j_password else None
NEO4J_AUTH = AUTH  # legacy alias kept for code that referenced the old name

try:
    gds = GraphDataScience(URI, auth=AUTH)
except Exception as e:
    print(f"Error: {e}", file=sys.stderr)
    # Re-raise: every later cell needs `gds`, so continuing would only turn
    # this failure into a confusing NameError further down.
    raise
else:
    print("Connection established.😺")

Connection established.😺


In [8]:
from graphdatascience.server_version.server_version import ServerVersion

# Stop early, with a readable message, if the connected GDS library is older
# than 1.8.0.
_gds_server_version = gds.server_version()
assert _gds_server_version >= ServerVersion(1, 8, 0), (
    f"GDS 1.8.0 or newer is required, but the server reports {_gds_server_version}"
)

## 2. Importing the dataset
- `reachability-meta.csv.gz`: 都市名とその情報
- `reachability.txt.gz`: リレーションデータ
    - 都市`i`から都市`j`への有向エッジ。（`i`から`j`への推定移動時間がある閾値より短いときにエッジが張られる）

In [9]:
# City metadata (one row per node), downloaded as a gzip-compressed CSV.
nodes_info_df = pd.read_csv(
    "https://snap.stanford.edu/data/reachability-meta.csv.gz",
    compression="gzip",
)
nodes_info_df.head()

Unnamed: 0,node_id,name,metro_pop,latitude,longitude
0,0,"Abbotsford, BC",133497.0,49.051575,-122.328849
1,1,"Aberdeen, SD",40878.0,45.45909,-98.487324
2,2,"Abilene, TX",166416.0,32.449175,-99.741424
3,3,"Akron/Canton, OH",701456.0,40.79781,-81.371567
4,4,"Alamosa, CO",9433.0,37.46818,-105.873599


In [10]:
# Edge list: space-separated rows read into the columns Origin, Destination, Weight.
routes_df = pd.read_csv(
    "https://snap.stanford.edu/data/reachability.txt.gz",
    compression="gzip",
    sep=" ",
    header=None,  # the file carries no column-name row; names are supplied below
    names=["Origin", "Destination", "Weight"],
    skiprows=6,  # skip the first 6 lines (presumably a comment preamble; confirm against the file)
)
routes_df.head()

Unnamed: 0,Origin,Destination,Weight
0,27,0,-757
1,57,0,-84
2,70,0,-1290
3,74,0,-465
4,86,0,-700


グラフはとても小さいので、`UNWIND`を使うのが最初のグラフを作る上で最もシンプルで分かりやすい。

グラフが大きい場合はもっと他のやり方の方がいいらしい。（`neo4j-admin import`や`CREATE DATABASE`など）

※ `.to_dict("records")`はpandasの標準的なメソッドで、DataFrameの各行を「列名→値」の辞書にしたリストを返す。これをそのままCypherのパラメータとして渡している。

In [11]:
# Create one City node per metadata row; metro_pop is stored as `population`.
gds.run_cypher(
    query="UNWIND $nodes AS node CREATE (n:City {node_id: node.node_id, name: node.name, population: node.metro_pop})",
    params={"nodes": nodes_info_df.to_dict(orient="records")},
)

# For every route row, look up both endpoint cities by node_id and connect them
# with a directed HAS_FLIGHT_TO relationship. The Weight column is not stored.
gds.run_cypher(
    query="""
    UNWIND $rels AS rel
    MATCH (source:City {node_id: rel.Origin}), (target:City {node_id: rel.Destination})
    CREATE (source)-[:HAS_FLIGHT_TO]->(target)
    """,
    params={"rels": routes_df.to_dict(orient="records")},
)

In [12]:
# Project the City nodes and HAS_FLIGHT_TO relationships into a GDS graph named
# "airline". `G` is the graph handle, `result` holds the projection statistics.
G, result = gds.graph.project("airline", "City", "HAS_FLIGHT_TO")

print(f"The projection took {result['projectMillis']} ms")

# Sanity-check the projection with the convenience methods on `G`:
# node count, node labels and relationship count.
print(f"Graph '{G.name()}' node count: {G.node_count()}")
print(f"Graph '{G.name()}' node labels: {G.node_labels()}")
print(f"Graph '{G.name()}' relationship count: {G.relationship_count()}")

The projection took 130 ms
Graph 'airline' node count: 456
Graph 'airline' node labels: ['City']
Graph 'airline' relationship count: 71959


## 3. Eigenvector Centrality
- ノードの重要度をコネクションネットワークから計る
    - スコアが高いほど、他のノードへの影響力が高いとみなされる

今回のairportsデータセットで実施すると、次のような空港を見つけられる。
- 他とよく繋がっている空港
- 他の**重要な**空港と繋がっている空港

In [13]:
# Run Eigenvector Centrality in mutate mode (at most 100 iterations); the scores
# are stored on the projected graph under the property "eigenvectorCentrality".
eigenvector_centrality_result = gds.eigenvector.mutate(
    G,
    maxIterations=100,
    mutateProperty="eigenvectorCentrality",
)

In [14]:
# List the node properties on the projected graph to confirm that
# eigenvectorCentrality was added by the mutate step.
G.node_properties()

City    [eigenvectorCentrality]
dtype: object

↓上のアルゴリズムが収束していれば、収束までにかかったイテレーション数が表示される

In [15]:
 if eigenvector_centrality_result.didConverge:
    print(
        f"The number of iterations taken by Eigenvector Centrality to run is {eigenvector_centrality_result.ranIterations}."
    )
else:
    print("Algorithm did not converge!")

The number of iterations taken by Eigenvector Centrality to run is 13.


In [19]:
# 固有ベクトルの中心性スコア分布
display(eigenvector_centrality_result.centralityDistribution)

# DBにこのスコアを書き込む
gds.graph.nodeProperties.write(G, ["eigenvectorCentrality"])

{'min': 0.0012630745768547058,
 'max': 0.08567857742309569,
 'p90': 0.07575749605894089,
 'p999': 0.0856785699725151,
 'p99': 0.08469437807798386,
 'p50': 0.0386042520403862,
 'p75': 0.05822562426328659,
 'p95': 0.08167456835508347,
 'mean': 0.041094380038741385}

writeMillis                                                         21
graphName                                                      airline
nodeProperties                                 [eigenvectorCentrality]
propertiesWritten                                                  456
configuration        {'jobId': '3ce4558c-a8ae-42d2-a4f0-6222e36b2c4...
Name: 0, dtype: object

固有ベクトル中心性スコアを使って、次のような空港を持つ上位20都市を見られる。
- 主要ハブ
- 広大なネットワークを持つ空港

In [20]:
def display_top_20_cities(centrality_measure):
    """
    Print the 20 City nodes with the highest value of a centrality property.

    Args:
        centrality_measure: Name of the node property to rank by, e.g.
            "eigenvectorCentrality". The name is spliced into the Cypher text
            (property names cannot be passed as query parameters), so it must
            be a plain identifier.

    Raises:
        ValueError: If ``centrality_measure`` is not a string that forms a
            valid identifier.
    """
    # Reject anything that is not a bare identifier before it reaches the
    # query text; this blocks Cypher injection and gives a clear error for typos.
    if not (isinstance(centrality_measure, str) and centrality_measure.isidentifier()):
        raise ValueError(
            f"centrality_measure must be a valid property name, got {centrality_measure!r}"
        )

    # Cypher sorts nulls first under DESC, so nodes that lack the property are
    # filtered out; otherwise they would occupy the top of the ranking.
    query = f"""
    MATCH (n:City)
    WHERE n.{centrality_measure} IS NOT NULL
    RETURN n.node_id AS node_id, n.name AS name, n.population AS population, n.{centrality_measure} AS {centrality_measure}
    ORDER BY n.{centrality_measure} DESC
    LIMIT 20
    """
    result = gds.run_cypher(query=query)

    print(result)

# Top 20 cities ranked by the eigenvectorCentrality property stored in the database.
display_top_20_cities("eigenvectorCentrality")




    node_id                     name  population  eigenvectorCentrality
0       246          Los Angeles, CA  12940000.0               0.085678
1       368        San Francisco, CA   4391000.0               0.085393
2        94    Dallas/Fort Worth, TX   6527000.0               0.085097
3       230            Las Vegas, NV   1970000.0               0.084824
4        74              Chicago, IL   9505000.0               0.084694
5       100               Denver, CO   2600000.0               0.084620
6       324              Phoenix, AZ   4263000.0               0.084607
7       383       Seattle/Tacoma, WA   3500000.0               0.084396
8       434           Washington, DC   5704000.0               0.084002
9       269  Minneapolis/St Paul, MN   3318000.0               0.083769
10      294             New York, NY  19020000.0               0.083696
11      323         Philadelphia, PA   5992000.0               0.083678
12      102              Detroit, MI   4286000.0               0

## 4. Betweenness Centrality
ネットワーク中における当該ノードが、橋渡し・中間ノードとしてどれほど重要かを定量する。  
あるノードが、**他のノードのペア間の最短経路（shortest path）上に登場する頻度を定量する。**

今回のデータセットでは、
- 媒介中心性の高い都市／空港は、直行便が就航していない空港間の重要な乗り換えポイントや接続ハブとして機能
- これらの都市・空港は、航空旅行の流れを促進する上で重要な役割を果たしており、ネットワーク全体の接続性にとっても不可欠



In [21]:
# Run Betweenness Centrality in mutate mode; the scores are stored on the
# projected graph under the property "betweennessCentrality".
betweenness_centrality_result = gds.betweenness.mutate(G, mutateProperty="betweennessCentrality")

In [22]:
# Verify that betweennessCentrality now appears among the projected graph's
# node properties.
G.node_properties()

City    [betweennessCentrality, eigenvectorCentrality]
dtype: object

In [23]:
# Summary statistics (min, max, mean, percentiles) of the betweenness scores.
betweenness_centrality_result.centralityDistribution

{'min': 0.04446244239807129,
 'max': 3628.7656249999995,
 'p90': 1006.8867185115814,
 'p999': 3628.7656247615814,
 'p99': 3265.2968747615814,
 'p50': 21.20861792564392,
 'p75': 184.29199194908142,
 'p95': 1823.7578122615814,
 'mean': 298.0898581821668}

In [24]:
# Write the betweennessCentrality property from the projected graph to the database.
gds.graph.nodeProperties.write(G, ["betweennessCentrality"])

writeMillis                                                         37
graphName                                                      airline
nodeProperties                                 [betweennessCentrality]
propertiesWritten                                                  456
configuration        {'jobId': 'daa27d82-402d-4c06-83e4-817123b5c8f...
Name: 0, dtype: object

In [25]:
# Transfer points (connecting airports between which there is no direct flight)
display_top_20_cities("betweennessCentrality")

    node_id                     name  population  betweennessCentrality
0       246          Los Angeles, CA  12940000.0            3628.755650
1       100               Denver, CO   2600000.0            3435.693657
2       294             New York, NY  19020000.0            3297.750685
3       416              Toronto, ON   6324000.0            3295.316691
4       368        San Francisco, CA   4391000.0            3265.288064
5        74              Chicago, IL   9505000.0            3250.731388
6       230            Las Vegas, NV   1970000.0            3156.983626
7       434           Washington, DC   5704000.0            3108.796793
8        94    Dallas/Fort Worth, TX   6527000.0            3094.760797
9       324              Phoenix, AZ   4263000.0            2815.294080
10      383       Seattle/Tacoma, WA   3500000.0            2600.388338
11      269  Minneapolis/St Paul, MN   3318000.0            2386.544069
12      367            San Diego, CA   3140000.0            2225

## 5. Degree Centrality
あるノードが持つエッジの数を定量する。（GDSのデフォルト設定では、ノードから出ていくリレーションシップの数を数える）

空港の例では、数＝他の都市との接続数を表す。

In [26]:
# Run Degree Centrality in mutate mode; the scores are stored on the projected
# graph under the property "degreeCentrality".
degree_centrality_result = gds.degree.mutate(G, mutateProperty="degreeCentrality")

In [27]:
# List the node properties on the projected graph after the degree mutate step.
G.node_properties()

City    [betweennessCentrality, eigenvectorCentrality,...
dtype: object

In [28]:
# Summary statistics (min, max, mean, percentiles) of the degree scores.
degree_centrality_result.centralityDistribution

{'min': 5.0,
 'max': 443.00195312499994,
 'p90': 329.0019226074219,
 'p999': 443.0019226074219,
 'p99': 429.0019226074219,
 'p50': 126.00045776367188,
 'p75': 217.00094604492188,
 'p95': 384.0019226074219,
 'mean': 157.80525314598754}

In [29]:
# Write the degreeCentrality property from the projected graph to the database.
gds.graph.nodeProperties.write(G, ["degreeCentrality"])

writeMillis                                                         33
graphName                                                      airline
nodeProperties                                      [degreeCentrality]
propertiesWritten                                                  456
configuration        {'jobId': 'afd9d27b-2bcc-48ec-bfae-d8dcbe02b32...
Name: 0, dtype: object

In [30]:
# Top 20 cities ranked by the degreeCentrality property stored in the database.
display_top_20_cities("degreeCentrality")

    node_id                      name  population  degreeCentrality
0       246           Los Angeles, CA  12940000.0             443.0
1       230             Las Vegas, NV   1970000.0             432.0
2       368         San Francisco, CA   4391000.0             430.0
3        74               Chicago, IL   9505000.0             430.0
4       100                Denver, CO   2600000.0             429.0
5       294              New York, NY  19020000.0             428.0
6        94     Dallas/Fort Worth, TX   6527000.0             427.0
7       434            Washington, DC   5704000.0             427.0
8       324               Phoenix, AZ   4263000.0             424.0
9       416               Toronto, ON   6324000.0             409.0
10      269   Minneapolis/St Paul, MN   3318000.0             406.0
11      383        Seattle/Tacoma, WA   3500000.0             406.0
12       46                Boston, MA   4591000.0             404.0
13      323          Philadelphia, PA   5992000.

## 7. クリーンアップ
以下を削除して後片付けをする。
- GDSのインメモリグラフ
- DBに作成したノードとリレーションシップ

In [31]:
# Clean up GDS: drop the projected graph that `G` refers to.
G.drop()

graphName                                                          airline
database                                                             neo4j
databaseLocation                                                     local
memoryUsage                                                               
sizeInBytes                                                             -1
nodeCount                                                              456
relationshipCount                                                    71959
configuration            {'relationshipProjection': {'HAS_FLIGHT_TO': {...
density                                                           0.346824
creationTime                           2024-09-22T10:12:33.831870000+00:00
modificationTime                       2024-09-22T12:07:11.831926000+00:00
schema                   {'graphProperties': {}, 'nodes': {'City': {'be...
schemaWithOrientation    {'graphProperties': {}, 'nodes': {'City': {'be...
Name: 0, dtype: object

In [32]:
# Clean up the database: delete every City node together with its relationships.
gds.run_cypher("MATCH (n:City) DETACH DELETE n")