In [1]:
import spacy

nlp = spacy.load("en_core_sci_sm")
doc = nlp("Alterations in the hypocretin receptor 2 and preprohypocretin genes produce narcolepsy in some animals.")
for token in doc:
    print(token.text, token.lemma_, token.pos_)

Alterations alteration NOUN
in in ADP
the the DET
hypocretin hypocretin NOUN
receptor receptor NOUN
2 2 NUM
and and CCONJ
preprohypocretin preprohypocretin NOUN
genes gene NOUN
produce produce VERB
narcolepsy narcolepsy NOUN
in in ADP
some some DET
animals animal NOUN
. . PUNCT


  deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(  # type: ignore[union-attr]


In [None]:
import pickle
import networkx as nx
import matplotlib.pyplot as plt

# 讀取圖檔
with open("../data/graphs_nx/23210975.gpickle", "rb") as f:
    G = pickle.load(f)

# 畫圖（用優化 spring layout）
plt.figure(figsize=(14, 12))
pos = nx.spring_layout(G, seed=42, k=4, iterations=200)

# 畫節點
nx.draw_networkx_nodes(G, pos, node_size=400, node_color="lightblue")

# 畫邊
nx.draw_networkx_edges(G, pos, width=1.0, edge_color="gray", alpha=0.6)

# 畫標籤
nx.draw_networkx_labels(G, pos, font_size=10)

plt.title("Semantic Graph for PMID 23210975", fontsize=14)
plt.axis("off")
plt.tight_layout()
plt.show()


In [2]:
import pickle
import networkx as nx

path = "../data/graphs_pmi/train/27901430.gpickle"

with open(path, "rb") as f:
    G = pickle.load(f)

print(f"✔️ 圖類型: {type(G)}")
print(f"🔢 節點數: {len(G.nodes)}")
print(f"🔗 邊數: {len(G.edges)}")

# 檢查第一些條邊的內容
for i, (u, v, d) in enumerate(G.edges(data=True)):
    print(f"{i}: {u} -- {v} | data: {d}")
    if i >= 5: break

✔️ 圖類型: <class 'networkx.classes.graph.Graph'>
🔢 節點數: 0
🔗 邊數: 0


In [4]:
import os
import pickle
import networkx as nx
from pathlib import Path
import numpy as np
from tqdm import tqdm

GRAPH_DIR = "../data/graphs_pmi/train"  # 修改為你的路徑

def analyze_graphs(graph_dir):
    stats = {
        "num_graphs": 0,
        "node_counts": [],
        "edge_counts": [],
        "empty_graphs": [],
        "tiny_graphs": [],
    }

    for path in tqdm(list(Path(graph_dir).glob("*.gpickle"))):
        try:
            with open(path, "rb") as f:
                G = pickle.load(f)
            if not isinstance(G, nx.Graph):
                continue

            num_nodes = G.number_of_nodes()
            num_edges = G.number_of_edges()

            stats["num_graphs"] += 1
            stats["node_counts"].append(num_nodes)
            stats["edge_counts"].append(num_edges)

            if num_nodes == 0 or num_edges == 0:
                stats["empty_graphs"].append(path.name)
            elif num_nodes <= 1:
                stats["tiny_graphs"].append(path.name)

        except Exception as e:
            print(f"[錯誤] 讀取失敗 {path.name}: {e}")

    return stats


def summarize(stats):
    print("====== PubMed Graph Dataset 統計報告 ======")
    print(f"📦 總圖數：{stats['num_graphs']}")
    print(f"🪫 空圖（節點或邊為 0）：{len(stats['empty_graphs'])}")
    print(f"⚠️ 節點數 ≤ 1 的圖：{len(stats['tiny_graphs'])}")

    if stats["node_counts"]:
        print(f"📈 節點數 - 平均: {np.mean(stats['node_counts']):.2f} | 中位數: {np.median(stats['node_counts'])} | 最大: {np.max(stats['node_counts'])} | 最小: {np.min(stats['node_counts'])}")
    if stats["edge_counts"]:
        print(f"📉 邊數   - 平均: {np.mean(stats['edge_counts']):.2f} | 中位數: {np.median(stats['edge_counts'])} | 最大: {np.max(stats['edge_counts'])} | 最小: {np.min(stats['edge_counts'])}")
    print("\n✅ 完成")


if __name__ == "__main__":
    stats = analyze_graphs(GRAPH_DIR)
    summarize(stats)

100%|██████████| 2268/2268 [00:00<00:00, 12450.94it/s]

📦 總圖數：2268
🪫 空圖（節點或邊為 0）：93
⚠️ 節點數 ≤ 1 的圖：0
📈 節點數 - 平均: 26.71 | 中位數: 27.0 | 最大: 172 | 最小: 0
📉 邊數   - 平均: 404.39 | 中位數: 338.0 | 最大: 14378 | 最小: 0

✅ 完成



