In [1]:
from py2neo import Graph
import networkx as nx

# 连接数据库
graph = Graph("bolt://localhost:7687", auth=("neo4j", "password"))

# 查询所有节点和边
query = """
MATCH (a)-[r]->(b)
RETURN id(a) as source, id(b) as target, type(r) as relation, properties(r) as edge_attr
"""
edges = graph.run(query).data()

# 构建 NetworkX 图
G = nx.DiGraph()
for edge in edges:
    G.add_edge(edge["source"], edge["target"], **(edge["edge_attr"] or {}))

# 添加节点属性
node_query = "MATCH (n) RETURN id(n) as id, labels(n) as labels, properties(n) as attr"
nodes = graph.run(node_query).data()

for node in nodes:
    G.add_node(node["id"], **node["attr"])


In [2]:
# -----------------------------
# 图结构基本统计信息输出
# -----------------------------

print("✅ 图结构统计信息")
print(f"👉 节点总数：{G.number_of_nodes()}")
print(f"👉 边总数：{G.number_of_edges()}")

# 节点入度、出度
in_degrees = dict(G.in_degree())
out_degrees = dict(G.out_degree())

avg_in_degree = sum(in_degrees.values()) / len(in_degrees)
avg_out_degree = sum(out_degrees.values()) / len(out_degrees)

print(f"👉 平均入度：{avg_in_degree:.2f}")
print(f"👉 平均出度：{avg_out_degree:.2f}")

# 节点标签统计（如果有标签）
label_count = {}
for node in nodes:
    for label in node["labels"]:
        label_count[label] = label_count.get(label, 0) + 1
print("👉 各类型节点数量：")
for label, count in label_count.items():
    print(f"   - {label}: {count} 个")

# 关系类型统计
relation_count = {}
for edge in edges:
    rel = edge['relation']
    relation_count[rel] = relation_count.get(rel, 0) + 1
print("👉 各类型关系数量：")
for rel, count in relation_count.items():
    print(f"   - {rel}: {count} 条")


✅ 图结构统计信息
👉 节点总数：44112
👉 边总数：276586
👉 平均入度：6.27
👉 平均出度：6.27
👉 各类型节点数量：
   - Disease: 8808 个
   - Drug: 3828 个
   - Food: 4870 个
   - Check: 3353 个
   - Department: 54 个
   - Producer: 17201 个
   - Symptom: 5998 个
👉 各类型关系数量：
   - recommand_eat: 40236 条
   - no_eat: 22247 条
   - do_eat: 22238 条
   - belongs_to: 8844 条
   - common_drug: 14649 条
   - drugs_of: 17315 条
   - recommand_drug: 59467 条
   - need_check: 39423 条
   - has_symptom: 54717 条
   - acompany_with: 12029 条


In [3]:
# 收集所有 Department 类型的部门名
department_names = []
for node in nodes:
    if "Department" in node["labels"]:
        dept_name = node["attr"].get("name") or node["attr"].get("department_name")
        if dept_name:
            department_names.append(dept_name)

# 按字典序排序部门名
department_names.sort()

# 拼接成一个用顿号分隔的字符串
departments_str = "、".join(department_names)

print(f"👉 部门列表（共 {len(department_names)} 个）：{departments_str}")


👉 部门列表（共 54 个）：不孕不育、中医科、中医综合、五官科、产科、传染科、儿科、儿科综合、其他科室、其他综合、内分泌科、内科、减肥、口腔科、呼吸内科、外科、妇产科、妇科、小儿内科、小儿外科、康复科、心内科、心理科、心胸外科、急诊科、性病科、感染科、整形美容科、普外科、泌尿内科、泌尿外科、消化内科、烧伤科、生殖健康、男科、皮肤性病科、皮肤科、眼科、神经内科、神经外科、精神科、耳鼻喉科、肛肠科、肝病、肝胆外科、肾内科、肿瘤内科、肿瘤外科、肿瘤科、营养科、血液科、遗传病科、风湿免疫科、骨外科


In [1]:
# 全流程：症状 → 疾病 → 科室预测模型构建
# 使用 PyTorch Geometric + Neo4j + BERT 特征
import torch
import torch.nn as nn
import torch.nn.functional as F
from py2neo import Graph
from tqdm import tqdm
from transformers import BertTokenizer, BertModel
from torch_geometric.data import HeteroData
from torch_geometric.transforms import ToUndirected
from torch_geometric.nn import HeteroConv, GATConv, Linear
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import os

# ========== 1. 连接 Neo4j 并提取数据 ==========
graph = Graph("bolt://localhost:7687", auth=("neo4j", "12345678"))

# 提取症状-疾病-科室三跳路径
sym_dis_dept_query = """
MATCH (s:Symptom)<-[:has_symptom]-(d:Disease)-[:belongs_to]->(dept:Department)
RETURN id(s) as sid, s.name as sname, id(d) as did, d.name as dname, id(dept) as dept_id, dept.name as dept_name
"""
results = graph.run(sym_dis_dept_query).data()

# 构建实体字典与映射
symptoms, diseases, departments = {}, {}, {}
edges_sym_dis, edges_dis_dept = [], []

for row in results:
    sid, sname = row["sid"], row["sname"]
    did, dname = row["did"], row["dname"]
    dept_id, dept_name = row["dept_id"], row["dept_name"]
    symptoms[sid] = sname
    diseases[did] = dname
    departments[dept_id] = dept_name
    edges_sym_dis.append((sid, did))
    edges_dis_dept.append((did, dept_id))

symptom_id_map = {nid: i for i, nid in enumerate(symptoms)}
disease_id_map = {nid: i for i, nid in enumerate(diseases)}
department_id_map = {nid: i for i, nid in enumerate(departments)}

# ========= 统计信息 =========
print("数据统计")
print(f"症状 (Symptom)节点数：{len(symptoms)}")
print(f"疾病 (Disease)节点数：{len(diseases)}")
print(f"科室 (Department)节点数：{len(departments)}")
print(f"症状-疾病边数：{len(edges_sym_dis)}")
print(f"疾病-科室边数：{len(edges_dis_dept)}\n")



  from .autonotebook import tqdm as notebook_tqdm


数据统计
症状 (Symptom)节点数：5998
疾病 (Disease)节点数：8765
科室 (Department)节点数：48
症状-疾病边数：54717
疾病-科室边数：54717



In [20]:
# 构建 department 索引 → 名称 映射
department_idx2name = {
    idx: departments[nid]  # nid 是 Neo4j 中的节点 ID，departments[nid] 是名称
    for nid, idx in department_id_map.items()
}

import pickle
with open("department_idx2name.pkl", "wb") as f:
    pickle.dump(department_idx2name, f)


In [2]:
# ========== 2. 构建 HeteroData 异构图 ==========
data = HeteroData()
data['symptom'].num_nodes = len(symptoms)
data['disease'].num_nodes = len(diseases)
data['department'].num_nodes = len(departments)

# 边：symptom <-> disease
edge_index_sd = torch.tensor([[symptom_id_map[s], disease_id_map[d]] for s, d in edges_sym_dis], dtype=torch.long).t()
data['symptom', 'has_symptom', 'disease'].edge_index = edge_index_sd

# 边：disease <-> department
edge_index_dd = torch.tensor([[disease_id_map[d], department_id_map[dept]] for d, dept in edges_dis_dept], dtype=torch.long).t()
data['disease', 'belongs_to', 'department'].edge_index = edge_index_dd

# 自动添加反向边
data = ToUndirected()(data)


In [15]:
torch.save(data, "data.pt")

In [6]:
import pickle

# ========== 3. 生成 BERT 特征 ==========
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"使用设备: {device}")

tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
model = BertModel.from_pretrained('bert-base-chinese').to(device)
model.eval()  # 关闭 dropout 等训练态

@torch.no_grad()
def get_bert_embedding(text: str) -> torch.Tensor:
    """对单个文本返回 [CLS] 向量 (1, 768)。"""
    inputs  = tokenizer(text, return_tensors='pt', truncation=True,
                        padding=True, max_length=10).to(device)
    outputs = model(**inputs)
    return outputs.last_hidden_state[:, 0, :]    # shape: (1, 768)

def build_or_load(name_dict, save_path, desc, name2idx_path):
    if os.path.exists(save_path) and os.path.exists(name2idx_path):
        print(f"载入缓存特征：{save_path}")
        feats = torch.load(save_path)
        with open(name2idx_path, "rb") as f:
            name2idx = pickle.load(f)
        return feats, name2idx

    print(f"编码 {desc} ...")
    feats = []
    name2idx = {}
    for i, (nid, name) in enumerate(tqdm(name_dict.items(), total=len(name_dict), desc=desc)):
        vec = get_bert_embedding(name).cpu()
        feats.append(vec.squeeze(0))
        name2idx[name] = i

    feats = torch.stack(feats)
    torch.save(feats, save_path)
    with open(name2idx_path, "wb") as f:
        pickle.dump(name2idx, f)
    print(f"已保存到 {save_path} 和 {name2idx_path}")
    return feats, name2idx

symptom_feat, symptom_name2idx = build_or_load(symptoms, "symptom_feat.pt", "症状", "symptom_name2idx.pkl")
disease_feat, disease_name2idx = build_or_load(diseases, "disease_feat.pt", "疾病", "disease_name2idx.pkl")
department_feat, department_name2idx = build_or_load(departments, "department_feat.pt", "科室", "department_name2idx.pkl")


data['symptom'].x = symptom_feat
data['disease'].x = disease_feat
data['department'].x = department_feat


使用设备: cuda




编码 症状 ...


  attn_output = torch.nn.functional.scaled_dot_product_attention(
症状: 100%|██████████| 5998/5998 [00:38<00:00, 156.57it/s]


已保存到 symptom_feat.pt 和 symptom_name2idx.pkl
编码 疾病 ...


疾病: 100%|██████████| 8765/8765 [00:56<00:00, 156.00it/s]


已保存到 disease_feat.pt 和 disease_name2idx.pkl
编码 科室 ...


科室: 100%|██████████| 48/48 [00:00<00:00, 146.64it/s]

已保存到 department_feat.pt 和 department_name2idx.pkl





In [7]:
# 构造 symptom → department 的标签列表
symptom_label = torch.full((len(symptoms),), -1, dtype=torch.long)  # -1 表示无标签（跳过）

# 反向构建：通过疾病连接的科室
for (s_id, d_id) in edges_sym_dis:
    dept_ids = [dept for (did, dept) in edges_dis_dept if did == d_id]
    if len(dept_ids) == 0:
        continue
    s_idx = symptom_id_map[s_id]
    # 注意：存在多个疾病连接到不同科室的情况，仅取第一个（也可以改为多标签 one-hot）
    dept_idx = department_id_map[dept_ids[0]]
    symptom_label[s_idx] = dept_idx

# 去除无标签的样本
mask = symptom_label != -1
labels = symptom_label[mask]
print(f"共构建有效标签样本数：{labels.size(0)}")

共构建有效标签样本数：5998


In [8]:
torch.save(symptom_label, "symptom_label.pt")
torch.save(mask, "mask.pt")


In [9]:
symptom_label = torch.load("symptom_label.pt")
mask = torch.load("mask.pt")
labels = symptom_label[mask]

  symptom_label = torch.load("symptom_label.pt")
  mask = torch.load("mask.pt")


In [10]:
from sklearn.model_selection import train_test_split

# 有效症状索引（mask=True 处）
valid_sym_idx = torch.where(mask)[0]        # 5998 全部有效
train_idx, test_idx = train_test_split(valid_sym_idx.numpy(),
                                       test_size=0.2,
                                       random_state=42,
                                       shuffle=True)

train_idx = torch.tensor(train_idx, dtype=torch.long)
test_idx  = torch.tensor(test_idx,  dtype=torch.long)

print(f"训练集样本: {len(train_idx)}，测试集样本: {len(test_idx)}")


训练集样本: 4798，测试集样本: 1200


In [11]:
class SymptomToDeptGNN(nn.Module):
    def __init__(self, hidden_dim=128, out_dim=len(department_id_map)):
        super().__init__()
        self.conv = HeteroConv({
            ('symptom', 'has_symptom', 'disease'): GATConv((-1, -1), hidden_dim, add_self_loops=False),
            ('disease', 'rev_has_symptom', 'symptom'): GATConv((-1, -1), hidden_dim, add_self_loops=False),
            ('disease', 'belongs_to', 'department'): GATConv((-1, -1), hidden_dim, add_self_loops=False),
            ('department','rev_belongs_to','disease'): GATConv((-1, -1), hidden_dim, add_self_loops=False),
        }, aggr='sum')
        self.lin = Linear(hidden_dim, out_dim)

    def forward(self, x_dict, edge_index_dict):
        x_dict = self.conv(x_dict, edge_index_dict)    # 异构消息传播
        x_dict = {k: F.relu(v) for k, v in x_dict.items()}
        return self.lin(x_dict['symptom'])             # 仅输出症状节点 logits


In [13]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
data = data.to(device)
labels = labels.to(device)            # size = 5998
train_idx = train_idx.to(device)
test_idx  = test_idx.to(device)

model = SymptomToDeptGNN(hidden_dim=128).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=5e-3)
loss_fn = nn.CrossEntropyLoss()

print("开始训练 ...")
for epoch in range(1, 101):
    model.train()
    logits = model(data.x_dict, data.edge_index_dict)     # [num_symptom, 54]
    loss   = loss_fn(logits[train_idx], labels[train_idx])

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if epoch % 10 == 0 or epoch == 1:
        model.eval()
        with torch.no_grad():
            pred = logits[test_idx].argmax(dim=1)
            acc  = (pred == labels[test_idx]).float().mean().item()
        print(f"Epoch {epoch:03d} | Loss: {loss.item():.4f} | Test Acc: {acc:.4f}")


开始训练 ...
Epoch 001 | Loss: 3.9007 | Test Acc: 0.0283
Epoch 010 | Loss: 2.4937 | Test Acc: 0.3225
Epoch 020 | Loss: 1.7720 | Test Acc: 0.4783
Epoch 030 | Loss: 1.3902 | Test Acc: 0.5650
Epoch 040 | Loss: 1.1775 | Test Acc: 0.5933
Epoch 050 | Loss: 1.0255 | Test Acc: 0.6183
Epoch 060 | Loss: 0.9062 | Test Acc: 0.6167
Epoch 070 | Loss: 0.8177 | Test Acc: 0.6283
Epoch 080 | Loss: 0.7301 | Test Acc: 0.6333
Epoch 090 | Loss: 0.6662 | Test Acc: 0.6317
Epoch 100 | Loss: 0.6098 | Test Acc: 0.6433


In [14]:
# 保存模型
torch.save(model.state_dict(), "gnn_model.pt")

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.model_selection import train_test_split

class SymptomMLP(nn.Module):
    def __init__(self, in_dim=768, hidden=128, out_dim=54):
        super().__init__()
        self.fc = nn.Sequential(
            nn.Linear(in_dim, hidden),
            nn.LayerNorm(hidden),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(hidden, out_dim)
        )


    def forward(self, x):
        return self.fc(x)

# 加载数据
symptom_feat = torch.load("symptom_feat.pt")  # shape: [N, 768]
symptom_label = torch.load("symptom_label.pt")  # shape: [N]
mask = torch.load("mask.pt")  # shape: [N]

x = symptom_feat[mask]
y = symptom_label[mask]

train_idx, test_idx = train_test_split(range(len(x)), test_size=0.2, random_state=42)

# 模型、优化器
mlp_model = SymptomMLP().to(device)
optimizer = torch.optim.Adam(mlp_model.parameters(), lr=1e-3)
loss_fn = nn.CrossEntropyLoss()

x, y = x.to(device), y.to(device)

print("🚀 开始训练 MLP ...")
for epoch in range(1, 101):
    mlp_model.train()
    logits = mlp_model(x[train_idx])
    loss = loss_fn(logits, y[train_idx])

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if epoch % 10 == 0 or epoch == 1:
        mlp_model.eval()
        with torch.no_grad():
            pred = mlp_model(x[test_idx]).argmax(dim=1)
            acc = (pred == y[test_idx]).float().mean().item()
        print(f"Epoch {epoch:03d} | Loss: {loss.item():.4f} | Test Acc: {acc:.4f}")


In [56]:
torch.save(mlp_model.state_dict(), "mlp_model.pt")


In [None]:
import pickle

with open("symptom_name2idx.pkl", "rb") as f:
    symptom_name2idx = pickle.load(f)
with open("department_idx2name.pkl", "rb") as f:
    dept_idx2name = pickle.load(f)

symptom_feat = torch.load("symptom_feat.pt")
mlp_model = SymptomMLP().to(device)
mlp_model.load_state_dict(torch.load("mlp_model.pt", map_location=device))
mlp_model.eval()

@torch.no_grad()
def get_symptom_vector(name):
    """从缓存中查找症状向量"""
    if name in symptom_name2idx:
        idx = symptom_name2idx[name]
        return symptom_feat[idx].unsqueeze(0)
    else:
        print(f"❗症状 `{name}` 未命中缓存，忽略")
        return None

@torch.no_grad()
def predict_with_mlp(symptom_names, k=5):
    vecs = [get_symptom_vector(name) for name in symptom_names]
    vecs = [v for v in vecs if v is not None]

    if not vecs:
        print("❗无有效症状输入，无法预测")
        return []

    input_tensor = torch.mean(torch.cat(vecs, dim=0), dim=0, keepdim=True).to(device)
    logits = mlp_model(input_tensor)
    probs = F.softmax(logits, dim=-1)
    topk = torch.topk(probs, k=k)

    return [(dept_idx2name[int(idx)], float(score)) for idx, score in zip(topk.indices[0], topk.values[0])]

input_symptoms = ["发烧", "咳嗽", "乏力", "头痛", "恶心"]
results = predict_with_mlp(input_symptoms)

print(f"综合推荐科室（根据症状：{', '.join(input_symptoms)}）：")
for i, (dept, score) in enumerate(results):
    print(f"  Top-{i+1}: {dept} ({score:.4f})")


  symptom_feat = torch.load("symptom_feat.pt")
  mlp_model.load_state_dict(torch.load("mlp_model.pt", map_location=device))
