In [None]:
from transformers import AutoTokenizer, AutoModel
import torch

# Local directory that holds the PubMedBERT tokenizer and model files
model_path = "/mnt/c/Users/94903/Desktop/cellannotation/PubMedbert"

# Load the tokenizer and the encoder
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModel.from_pretrained(model_path)

# Smoke-test inference on a single term.
# The forward pass runs under no_grad because this is inference only: no autograd
# graph is needed, which saves memory and keeps the printed tensors free of grad_fn.
text1 = "Primary cultured cells"
inputs = tokenizer(text1, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)
print(outputs)
# Custom pooling (kept consistent with the original implementation)
def mean_pooling(output, mask):
    """Average the token embeddings of each sequence, counting only unmasked tokens.

    Args:
        output: Object exposing ``last_hidden_state``, the per-token embeddings.
        mask: Attention mask; it is unsqueezed on the last axis and expanded to the
            size of ``last_hidden_state``.

    Returns:
        The mask-weighted sum over axis 1 divided by the mask count, where the
        count is clamped to at least 1e-9 to avoid division by zero.
    """
    token_vecs = output.last_hidden_state
    weights = mask.unsqueeze(-1).expand(token_vecs.size()).float()
    weighted_sum = torch.sum(token_vecs * weights, 1)
    token_count = torch.clamp(weights.sum(1), min=1e-9)
    return weighted_sum / token_count

# Pool the per-token embeddings into one sentence vector using the attention mask
sentence_emb = mean_pooling(outputs, inputs['attention_mask'])
print(sentence_emb.shape)  # should print torch.Size([1, 768])

In [None]:
import torch

# Load the checkpoint file on CPU so that inspecting it needs no GPU memory.
# weights_only=True restricts unpickling to tensors and plain containers, which is
# all a state-dict checkpoint needs; it avoids executing arbitrary pickled code and
# silences the torch.load FutureWarning.
checkpoint_path = "/home/lxz/PubMedbert/finetune_bert.pth"
checkpoint = torch.load(checkpoint_path, map_location='cpu', weights_only=True)

# Print basic information about the checkpoint
print("="*50)
print(f"检查点文件: {checkpoint_path}")
print(f"文件包含的键: {list(checkpoint.keys())}")
print("="*50)

  checkpoint = torch.load(checkpoint_path, map_location='cpu')  # 使用CPU避免GPU内存问题


检查点文件: /home/lxz/PubMedbert/finetune_bert.pth
文件包含的键: ['bert_state', 'projection_state', 'rel_projection_state']


In [5]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

# Load the local model. Use the GPU when one is available and fall back to CPU
# otherwise, instead of hard-coding "cuda" (which raises on CPU-only machines).
model_path = "/home/lxz/PubMedbert"
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModel.from_pretrained(model_path).to(device)

# Mean-pooling helper
def mean_pooling(output, mask):
    """Return the masked mean of ``output.last_hidden_state`` along axis 1.

    Args:
        output: Object exposing ``last_hidden_state``, the per-token embeddings.
        mask: Attention mask, broadcast to the embedding size before weighting.

    Returns:
        Sum of the mask-weighted embeddings over axis 1 divided by the mask sum
        (clamped to a minimum of 1e-9 so a fully masked row does not divide by zero).
    """
    hidden = output.last_hidden_state
    keep = mask.unsqueeze(-1).expand(hidden.size()).float()
    numerator = torch.sum(hidden * keep, 1)
    denominator = torch.clamp(keep.sum(1), min=1e-9)
    return numerator / denominator

# Terms to compare
terms = ["Primary cultured cells", "primary cultured cell"]
# terms = ["Primary cultured cells", "neural crest derived fibroblast"]
# Encode the texts.
# max_length is passed explicitly: without it truncation=True is ignored (the
# tokenizer warns and defaults to no truncation). The batch is moved to whatever
# device the model lives on rather than a hard-coded "cuda".
inputs = tokenizer(
    terms, padding=True, truncation=True, max_length=512, return_tensors="pt"
).to(model.device)
with torch.no_grad():
    outputs = model(**inputs)

# Sentence vectors via mask-aware mean pooling
embeddings = mean_pooling(outputs, inputs['attention_mask'])

# Cosine similarity between the two sentence vectors
sim = F.cosine_similarity(embeddings[0], embeddings[1], dim=0).item()
print(f"语义相似度: {sim:.4f}")  # range is [-1, 1]; closer to 1 means more similar

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


语义相似度: 0.9858


In [None]:
import re
from collections import defaultdict

def extract_triplets_and_cell_types(file_path):
    """
    Extract relation triplets and every cell type from a text file.

    Each non-empty line is matched against ``<head> <relation> <tail>``, where
    ``<relation>`` is one of the keywords listed in the pattern below. Lines that
    do not match are skipped silently.

    Args:
        file_path: Path of the UTF-8 text file to read.

    Returns:
        - triplets: [(head, relation, tail), ...] in file order
        - cell_types: set() of every distinct head and tail string
    """
    # The head group is lazy, so the first relation keyword on the line is the split point.
    pattern = re.compile(r"(.+?)\s+(is_a|disjoint_from|exact_synonyms|broad_synonyms|"
                         r"related_synonyms|develops from|develops into|synapsed to)\s+(.+)")

    triplets = []
    cell_types = set()

    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            # An empty (stripped) line never matches, so no separate blank-line check is needed.
            match = pattern.match(line.strip())
            if match is None:
                continue

            head, relation, tail = match.groups()
            triplets.append((head, relation, tail))
            cell_types.update([head, tail])

    return triplets, cell_types

# Example usage
file_path = "/mnt/c/Users/94903/Desktop/cellannotation/PubMedbert/triples.txt"
triplets, cell_types = extract_triplets_and_cell_types(file_path)

# Integer id for each relation keyword, numbered in this fixed order
relation_mapping = {
    name: idx
    for idx, name in enumerate((
        "is_a",
        "disjoint_from",
        "exact_synonyms",
        "broad_synonyms",
        "related_synonyms",
        "develops from",
        "develops into",
        "synapsed to",
    ))
}

# Convert every triplet into a {"head", "tail", "relation"} record
data = []
for h, r, t in triplets:
    data.append(dict(head=h, tail=t, relation=relation_mapping[r]))

# # 结果验证
# print("\n前3个三元组示例:")
# for item in data[:3]:
#     print(item)

# print("\n细胞类型示例:", list(cell_types)[:5])

提取完成！共 7310 个三元组，5626 种细胞类型
关系统计: {'is_a': 3818, 'exact_synonyms': 2270, 'related_synonyms': 527, 'broad_synonyms': 262, 'disjoint_from': 32, 'develops from': 372, 'develops into': 14, 'synapsed to': 15}

前3个三元组示例:
{'head': 'primary cultured cell', 'tail': 'cultured cell', 'relation': 0}
{'head': 'neural crest derived fibroblast', 'tail': 'fibroblast', 'relation': 0}
{'head': 'neuronal receptor cell', 'tail': 'sensory neuron', 'relation': 0}

细胞类型示例: ['serous cell of epithelium of terminal bronchiole', 'somatocrinin secreting cell', 'non-nucleated secondary lens fibre', 'S2b fibroblast', 'cuboidal GC']


In [8]:
import re

def extract_triplets_and_types(file_path):
    """
    Read a triples file and collect its triplets and unique cell types.

    Every non-empty line is matched against ``<head> <relation> <tail>`` using
    named capture groups; lines that do not match are ignored.

    Returns:
        triplets: [(head, relation, tail), ...] in file order
        cell_types: set() of all distinct head and tail strings
    """
    # One alternative per predefined relation keyword
    relation_keywords = (
        r"is_a|disjoint_from|exact_synonyms|broad_synonyms|"
        r"related_synonyms|develops from|develops into|synapsed to"
    )
    pattern = re.compile(
        r"(?P<head>.+?)\s+"
        r"(?P<relation>" + relation_keywords + r")\s+"
        r"(?P<tail>.+)"
    )

    triplets = []
    cell_types = set()

    with open(file_path, 'r', encoding='utf-8') as f:
        for raw_line in f:
            stripped = raw_line.strip()
            if not stripped:  # skip blank lines
                continue
            found = pattern.match(stripped)
            if not found:
                continue
            head, relation, tail = found.group('head', 'relation', 'tail')
            triplets.append((head, relation, tail))
            cell_types.add(head)
            cell_types.add(tail)

    return triplets, cell_types

# Example usage
file_path = "/mnt/c/Users/94903/Desktop/cellannotation/PubMedbert/triples.txt"
triplets, cell_types = extract_triplets_and_types(file_path)

# Relation keyword -> integer id, numbered in this fixed order
relation_mapping = dict(zip(
    (
        "is_a",
        "disjoint_from",
        "exact_synonyms",
        "broad_synonyms",
        "related_synonyms",
        "develops from",
        "develops into",
        "synapsed to",
    ),
    range(8),
))

# Convert to the target record format (if needed)
data = [
    dict(head=h, tail=t, relation=relation_mapping[r])
    for h, r, t in triplets
]

# Preview the results
print(f"提取到 {len(triplets)} 个三元组")
print(f"发现 {len(cell_types)} 种唯一细胞类型")
print("\n前3个三元组示例:")
for h, r, t in triplets[:3]:
    print(h, r, t)

print("\n部分细胞类型示例:", sorted(cell_types)[:5])

提取到 7310 个三元组
发现 5626 种唯一细胞类型

前3个三元组示例:
primary cultured cell is_a cultured cell
neural crest derived fibroblast is_a fibroblast
neuronal receptor cell is_a sensory neuron

部分细胞类型示例: ['360 nm-cone', '5-HT neuron', '5-HT secreting cell', '5-Hydroxytryptamine secreting cell', '5-hydroxytryptamine neuron']


In [12]:
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics.pairwise import cosine_similarity

# Setup
device = "cuda" if torch.cuda.is_available() else "cpu"
text = "primary cultured cell"

# Load the tokenizer and two copies of the encoder: one left at the pretrained
# weights, one overwritten with the fine-tuned BERT weights from the checkpoint.
tokenizer = AutoTokenizer.from_pretrained("/home/lxz/PubMedbert")
base_model = AutoModel.from_pretrained("/home/lxz/PubMedbert").to(device)
finetuned_model = AutoModel.from_pretrained("/home/lxz/PubMedbert").to(device)
# weights_only=True limits unpickling to tensors/plain containers (enough for a
# state-dict checkpoint), avoiding arbitrary code execution and the FutureWarning.
finetuned_model.load_state_dict(
    torch.load(
        "/home/lxz/PubMedbert/finetune_bert.pth",
        map_location=device,
        weights_only=True,
    )["bert_state"]
)

# Embed the same text with both models (plain mean over the token axis)
inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
with torch.no_grad():
    base_embedding = base_model(**inputs).last_hidden_state.mean(dim=1).cpu().numpy()
    finetuned_embedding = finetuned_model(**inputs).last_hidden_state.mean(dim=1).cpu().numpy()

# Cosine similarity between the two embeddings
similarity = cosine_similarity(base_embedding, finetuned_embedding)[0][0]

print(f"Embedding相似度: {similarity:.4f}")

  finetuned_model.load_state_dict(torch.load("/home/lxz/PubMedbert/finetune_bert.pth", map_location=device)["bert_state"])


Embedding相似度: 0.9130
是否一致: 否


In [None]:
import torch
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer
from sklearn.metrics.pairwise import cosine_similarity

class ProjectedBERT(nn.Module):
    """Encoder followed by a projection head.

    The wrapped model's ``last_hidden_state`` is averaged over axis 1 and passed
    through Linear(768 -> 512), ReLU, LayerNorm(512) and Dropout(0.1).

    NOTE(review): the average is taken over all positions without consulting the
    attention mask, so padded positions would contribute in a padded batch.
    """

    def __init__(self, bert_model):
        """Store the encoder as ``self.bert`` and build ``self.projection``."""
        super().__init__()
        self.bert = bert_model
        # Layer order fixes the state-dict keys (0.*, 2.*) of the projection head
        head_layers = [
            nn.Linear(768, 512),
            nn.ReLU(),
            nn.LayerNorm(512),
            nn.Dropout(0.1),
        ]
        self.projection = nn.Sequential(*head_layers)

    def forward(self, **inputs):
        """Run the encoder on ``inputs``, mean-pool over axis 1, then project."""
        token_states = self.bert(**inputs).last_hidden_state
        sentence_vec = token_states.mean(dim=1)  # average pooling
        return self.projection(sentence_vec)

# Setup
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained("/home/lxz/PubMedbert")

# 1. Load the base BERT model (no projection head)
base_model = AutoModel.from_pretrained("/home/lxz/PubMedbert").to(device)

# 2. Build the fine-tuned model (encoder plus the full projection head)
finetuned_model = ProjectedBERT(
    AutoModel.from_pretrained("/home/lxz/PubMedbert")
).to(device)

# Load the checkpoint parameters.
# weights_only=True limits unpickling to tensors/plain containers, which is all a
# state-dict checkpoint needs; it avoids executing arbitrary pickled code and
# silences the torch.load FutureWarning.
checkpoint = torch.load(
    "/home/lxz/PubMedbert/finetune_bert.pth",
    map_location=device,
    weights_only=True,
)

# Load each sub-module from its own entry in the checkpoint
finetuned_model.bert.load_state_dict(checkpoint['bert_state'])
finetuned_model.projection.load_state_dict(checkpoint['projection_state'])

# Embedding helper (dropout is switched off for inference)
def get_embedding(model, text):
    """Embed ``text`` with ``model`` and return the result as a NumPy array.

    The model is put into eval mode first. A ``ProjectedBERT`` already returns
    its projected tensor; any other model has its ``last_hidden_state`` averaged
    over axis 1 here. Uses the notebook-level ``tokenizer`` and ``device``.
    """
    model.eval()  # make sure dropout is disabled
    encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
    with torch.no_grad():
        if not isinstance(model, ProjectedBERT):
            # plain encoder: pool manually
            vector = model(**encoded).last_hidden_state.mean(dim=1)
        else:
            # projected model: forward() already pools and projects
            vector = model(**encoded)
    return vector.cpu().numpy()

# Similarity helper
def compare_similarity(model, text_a, text_b):
    """Return the cosine similarity between the embeddings of two texts.

    Both texts are embedded with ``get_embedding`` using the same ``model``;
    the single entry of the resulting 1x1 similarity matrix is returned.
    """
    vec_a, vec_b = (get_embedding(model, passage) for passage in (text_a, text_b))
    return cosine_similarity(vec_a, vec_b)[0][0]

# Example usage: term pairs to score with both models
text_pairs = [
    ("common myeloid progenitor", "megakaryocyte-erythroid progenitor cell"),
    ("common myeloid progenitor", "common lymphoid progenitor"),
    ("neuronal receptor cell", "sensory neuron")
]

for text1, text2 in text_pairs:
    base_sim = compare_similarity(base_model, text1, text2)
    tuned_sim = compare_similarity(finetuned_model, text1, text2)

    # One report per pair: base score, projected score, and the signed change
    print(
        f"\n文本对: '{text1}' vs '{text2}'",
        f"原始BERT相似度: {base_sim:.4f}",
        f"投影后相似度: {tuned_sim:.4f}",
        f"差异变化: {tuned_sim - base_sim:+.4f}",
        sep="\n",
    )

  checkpoint = torch.load("/home/lxz/PubMedbert/finetune_bert.pth", map_location=device)



文本对: 'common myeloid progenitor' vs 'megakaryocyte-erythroid progenitor cell'
原始BERT相似度: 0.8692
投影后相似度: 0.9289
差异变化: +0.0597

文本对: 'common myeloid progenitor' vs 'common lymphoid progenitor'
原始BERT相似度: 0.7871
投影后相似度: 0.8739
差异变化: +0.0867

文本对: 'neuronal receptor cell' vs 'sensory neuron'
原始BERT相似度: 0.8858
投影后相似度: 0.9239
差异变化: +0.0381


In [3]:
import re
from collections import defaultdict

def extract_and_organize_triplets(file_path):
    """
    Extract triplets from a text file and group them by head entity.

    Each non-empty line is matched against ``<head> <relation> <tail>``. The
    multi-word relations are accepted with either a space or an underscore
    ("develops from" / "develops_from", etc.), so files written in either
    spelling are parsed. The relation is stored exactly as it appears in the
    file. Lines that do not match are skipped.

    Args:
        file_path: Path of the UTF-8 text file containing the triplets.

    Returns:
        tuple: (triplet_dict, all_cell_types)
            - triplet_dict: dict {head_entity: [(relation, tail_entity), ...]}
            - all_cell_types: set of every distinct head and tail string
    """
    # Regular expression for one triplet line. "[ _]" accepts both spellings of
    # the multi-word relations; matching only the underscore form silently drops
    # every "develops from" / "develops into" / "synapsed to" line.
    pattern = re.compile(
        r"(?P<head>.+?)\s+"
        r"(?P<relation>is_a|disjoint_from|exact_synonyms|broad_synonyms|"
        r"related_synonyms|develops[ _]from|develops[ _]into|synapsed[ _]to)\s+"
        r"(?P<tail>.+)"
    )

    # defaultdict creates the per-head list on first use
    triplet_dict = defaultdict(list)
    all_cell_types = set()

    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            # A blank line strips to "" and never matches, so it is skipped here too.
            match = pattern.match(line.strip())
            if match is None:
                continue

            head = match.group('head')
            relation = match.group('relation')
            tail = match.group('tail')

            triplet_dict[head].append((relation, tail))
            all_cell_types.update([head, tail])

    # Return a plain dict so lookups of unknown heads do not create entries
    return dict(triplet_dict), all_cell_types

# Example usage: parse the triples file into a head-indexed dict and the set of cell types
file_path = "/home/lxz/PubMedbert/triples.txt"
triplet_dict, cell_types = extract_and_organize_triplets(file_path)

# --- 查询功能示例 ---
def get_relations_by_head(head_entity, relation_type=None):
    """
    Look up the triplets stored for a head entity in the notebook-level ``triplet_dict``.

    Args:
        head_entity: Name of the head entity to look up.
        relation_type: Optional relation keyword (e.g. "is_a"); when given, only
            pairs with that relation are returned.

    Returns:
        list: matching [(relation, tail), ...]; an empty list when the head is
        unknown. Without a filter, the list stored in ``triplet_dict`` is
        returned as-is (not a copy).
    """
    stored = triplet_dict.get(head_entity)
    if stored is None:
        return []

    if relation_type is None:
        return stored

    return [pair for pair in stored if pair[0] == relation_type]

# Example query 1: every triplet whose head entity is "T cell"
print("T_cell的所有关系:")
for rel, tail in get_relations_by_head("T cell"):
    print(f"  {rel} {tail}")

# Example query 2: only the "is_a" relations of "T cell"
print("\nT_cell的is_a关系:")
for rel, tail in get_relations_by_head("T cell", "is_a"):
    print(f"  {rel} {tail}")

NameError: name 'triplets' is not defined