# 调用ccpoem

In [5]:
import os
import logging

import torch
import torch.nn as nn
import numpy as np
from transformers import BertModel, BertTokenizer

logging.basicConfig(
    format='%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s', level=logging.INFO)

gpu_list = None


class Bert(nn.Module):
    """Pretrained BERT encoder that returns sentence vectors as Python lists.

    ``forward`` is inference-only: its results are converted to plain lists,
    so no gradient can flow back through it.
    """

    def __init__(self, BERT_PATH='static/BERT_CCPoem_v1'):
        """Load the pretrained ``BertModel`` stored at ``BERT_PATH``."""
        super(Bert, self).__init__()

        self.bert = BertModel.from_pretrained(BERT_PATH)

    def init_multi_gpu(self, device):
        """Wrap the encoder in ``nn.DataParallel`` over the ids in ``device``."""
        self.bert = nn.DataParallel(self.bert, device_ids=device)

    def forward(self, data, cls=False):
        """Encode a batch and return one vector (list of floats) per row.

        Args:
            data: mapping with the tensors ``'input_ids'``,
                ``'attention_mask'`` and ``'token_type_ids'``.
            cls: when true, return the encoder output at position 0 of each
                row; otherwise return the mean over positions
                ``1 .. sum(attention_mask) - 2`` of each row.

        Returns:
            A list with one list of floats per input row.
        """
        # The outputs leave this method as Python lists, so building an
        # autograd graph would only cost memory.
        with torch.no_grad():
            hidden = self.bert(data['input_ids'],
                               attention_mask=data['attention_mask'],
                               token_type_ids=data['token_type_ids'])[0]

        if cls:
            return hidden[:, 0, :].view(hidden.size(0), -1).cpu().tolist()

        hidden = hidden.cpu()
        # Row lengths are fetched once as Python ints instead of using a
        # (possibly CUDA) tensor as a slice bound for every row.
        lengths = data['attention_mask'].sum(dim=1).tolist()
        # Drops the first and the last attended position of each row
        # (presumably [CLS] and [SEP], assuming right-padding - TODO confirm).
        # A row with no position left in between yields NaN values.
        return [hidden[row, 1:length - 1, :].mean(0).tolist()
                for row, length in enumerate(lengths)]


class BertFormatter():
    """Turn raw text into the tensor inputs expected by ``Bert.forward``."""

    def __init__(self, BERT_PATH='static/BERT_CCPoem_v1'):
        """Load the ``BertTokenizer`` stored at ``BERT_PATH``."""
        self.tokenizer = BertTokenizer.from_pretrained(BERT_PATH)

    def process(self, data):
        """Tokenize ``data`` as one padded batch.

        Returns:
            A dict holding ``'input_ids'``, ``'attention_mask'`` and
            ``'token_type_ids'``, each converted to a ``torch.LongTensor``.
        """
        encoded = self.tokenizer.batch_encode_plus(
            data, pad_to_max_length=True)

        tensor_keys = ('input_ids', 'attention_mask', 'token_type_ids')
        return {key: torch.LongTensor(encoded[key]) for key in tensor_keys}


def init(BERT_PATH="static/BERT_CCPoem_v1"):
    """Build the model and formatter and record which GPUs to use.

    The module-level ``gpu_list`` is reset to ``0 .. n-1`` where ``n`` is the
    number of comma-separated entries in ``CUDA_VISIBLE_DEVICES`` (empty when
    the variable is unset or starts with an empty entry).

    Args:
        BERT_PATH: location passed to both ``Bert`` and ``BertFormatter``.

    Returns:
        A ``(model, formatter)`` tuple.

    Raises:
        NotImplementedError: GPUs were requested but CUDA is not available.
    """
    global gpu_list

    visible = os.environ.get("CUDA_VISIBLE_DEVICES", "").split(",")
    if visible[0] == "":
        visible = []
    # Only the count matters: the list holds positions, not the listed ids.
    gpu_list = list(range(len(visible)))

    cuda = torch.cuda.is_available()
    logging.info("CUDA available: %s", cuda)
    if gpu_list and not cuda:
        logging.error("CUDA is not available but specific gpu id")
        raise NotImplementedError

    model = Bert(BERT_PATH)
    formatter = BertFormatter(BERT_PATH)
    if gpu_list:
        model = model.cuda()
        if len(gpu_list) > 1:
            # Multi-GPU wrapping is best effort; on failure keep the
            # single-GPU model that was already moved to CUDA.
            try:
                model.init_multi_gpu(gpu_list)
            except Exception as e:
                logging.warning(
                    "No init_multi_gpu implemented in the model, use single gpu instead. {}".format(str(e)))
    return model, formatter


def predict_vec_rep(data, model, formatter):
    """Return the model's vector representation for each item in ``data``.

    Args:
        data: raw input handed to ``formatter.process``.
        model: callable model; it is switched to eval mode before use.
        formatter: object whose ``process`` method builds the model input.

    Returns:
        Whatever ``model`` returns for the formatted batch.
    """
    data = formatter.process(data)
    model.eval()

    # Move tensor inputs to CUDA when GPUs were selected (see ``gpu_list``).
    for key in data:
        value = data[key]
        if isinstance(value, torch.Tensor) and len(gpu_list) > 0:
            data[key] = value.cuda()

    return model(data)


def cos_sim(vector_a, vector_b, sim=True):
    """Return the cosine similarity of two vectors.

    Args:
        vector_a: first vector (any array-like; it is flattened).
        vector_b: second vector, with the same number of elements.
        sim: when true, rescale the cosine from ``[-1, 1]`` to ``[0, 1]``
            via ``0.5 + 0.5 * cos``; when false, return the raw cosine.

    Returns:
        The (optionally rescaled) cosine as a float. A zero-length vector
        gives NaN because the norm product is zero.

    Raises:
        ValueError: the two vectors have different numbers of elements.
    """
    # Plain ndarrays instead of np.mat: np.mat was removed in NumPy 2.0 and
    # float() on a 1x1 matrix is deprecated.
    a = np.asarray(vector_a, dtype=float).ravel()
    b = np.asarray(vector_b, dtype=float).ravel()

    cos = np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
    if not sim:
        return cos
    return 0.5 + 0.5 * cos


if __name__ == '__main__':
    # Smoke test: embed a single verse and print its vector.
    model, formatter = init()
    sample_line = "一行白鹭上青天"
    result = predict_vec_rep([sample_line], model, formatter)[0]
    print(result)



  from .autonotebook import tqdm as notebook_tqdm
2023-09-03 19:53:13,288 - 1229874609.py[line:71] - INFO: CUDA available: False
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


[0.37986961007118225, -0.8445234298706055, 0.926456868648529, -1.732959508895874, 0.4790334105491638, -0.30134692788124084, 0.9458165168762207, -0.45697417855262756, -1.1309224367141724, -0.44595903158187866, -1.1679950952529907, -0.335366815328598, 1.076470971107483, 0.4351254403591156, -1.435951828956604, -0.10431935638189316, 1.5164940357208252, -0.6068114638328552, -0.5779765248298645, 0.05628534033894539, 0.17600564658641815, -0.7748263478279114, 0.4632478654384613, -0.9195488691329956, 0.29605832695961, -0.5998052954673767, 0.47118112444877625, 0.17195908725261688, -0.6707261800765991, -0.4623022675514221, -0.034068960696458817, 0.217732235789299, -0.7499228715896606, -1.0339088439941406, 0.41781359910964966, 0.015698501840233803, -1.0734797716140747, 0.33579131960868835, 0.47090885043144226, -0.11875031143426895, 0.35522153973579407, -0.2716936767101288, 0.27442288398742676, -1.1179730892181396, 0.3024638295173645, 0.6947304010391235, 0.21860523521900177, -0.35350894927978516, -



In [None]:
# Embed three verses one at a time, then compare the first with the others.
model, formatter = init()
result_0, result_1, result_2 = (
    predict_vec_rep([verse], model, formatter)[0]
    for verse in ("一行白鹭上青天", '白鹭一行登碧霄', "飞却青天白鹭鸶")
)

for other in (result_1, result_2):
    print(cos_sim(result_0, other))

# 查找全唐诗中与给定句子语义相似度高的诗句

In [None]:
import json
# Load the poem corpus and show one record as a sanity check.
with open('output/quantangshi.json', 'r', encoding='utf-8') as corpus_file:
    poetdata = json.loads(corpus_file.read())
print(poetdata[100])

In [None]:
cyword = "藹藹"
line = "肅肅宵征，夙夜在公"
# The query line is embedded once; every corpus paragraph is compared to it.
result1 = predict_vec_rep([line], model, formatter)[0]
for poet in poetdata:
    for para in poet['paragraphs']:
        sim = cos_sim(result1, predict_vec_rep([para], model, formatter)[0])
        # Report only paragraphs whose rescaled similarity exceeds 0.8.
        if sim > 0.8:
            print(line, para, sim)



# 尋找語料庫中包含該重言詞的句子

In [1]:
import pickle,json

# Corpus identifiers; each names a pickled collection at output/set_<name>.pkl.
namelist = ['ms', 'chuci', 'wx', 'yfsj', 'ytxy', 'qts']

# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load pickles produced by this project.
with open('output/chongyanlist.pkl', 'rb') as file:
    chongyanlist = list(pickle.load(file))

# One record per word, with an (initially empty) sentence list per corpus.
# The corpus keys come from namelist so the two cannot drift apart.
totaljson = [
    {"word": word, **{name: [] for name in namelist}}
    for word in chongyanlist
]
print(totaljson)

# Position of the first occurrence of each word, so the loops below do not
# repeat a linear chongyanlist.index() search for every match.
first_index = {}
for position, word in enumerate(chongyanlist):
    first_index.setdefault(word, position)

for name in namelist:
    with open(f'output/set_{name}.pkl', 'rb') as file:
        loaded_set = pickle.load(file)

    # Attach every sentence to each word it contains.
    for sentence in loaded_set:
        for word in chongyanlist:
            if word in sentence:
                totaljson[first_index[word]][name].append(sentence)

with open('output/timeline.json', 'w', encoding='utf-8') as f:
    json.dump(totaljson, f)

[{'word': '肅肅', 'ms': [], 'chuci': [], 'wx': [], 'yfsj': [], 'ytxy': [], 'qts': []}, {'word': '咽咽', 'ms': [], 'chuci': [], 'wx': [], 'yfsj': [], 'ytxy': [], 'qts': []}, {'word': '摵摵', 'ms': [], 'chuci': [], 'wx': [], 'yfsj': [], 'ytxy': [], 'qts': []}, {'word': '亭亭', 'ms': [], 'chuci': [], 'wx': [], 'yfsj': [], 'ytxy': [], 'qts': []}, {'word': '長長', 'ms': [], 'chuci': [], 'wx': [], 'yfsj': [], 'ytxy': [], 'qts': []}, {'word': '毿毿', 'ms': [], 'chuci': [], 'wx': [], 'yfsj': [], 'ytxy': [], 'qts': []}, {'word': '奕奕', 'ms': [], 'chuci': [], 'wx': [], 'yfsj': [], 'ytxy': [], 'qts': []}, {'word': '凜凜', 'ms': [], 'chuci': [], 'wx': [], 'yfsj': [], 'ytxy': [], 'qts': []}, {'word': '鱗鱗', 'ms': [], 'chuci': [], 'wx': [], 'yfsj': [], 'ytxy': [], 'qts': []}, {'word': '密密', 'ms': [], 'chuci': [], 'wx': [], 'yfsj': [], 'ytxy': [], 'qts': []}, {'word': '兀兀', 'ms': [], 'chuci': [], 'wx': [], 'yfsj': [], 'ytxy': [], 'qts': []}, {'word': '藹藹', 'ms': [], 'chuci': [], 'wx': [], 'yfsj': [], 'ytxy': [], 'qt

In [8]:
# Look up one word's record and print how many sentences each corpus has.
word = "丁丁"
ind = chongyanlist.index(word)
i = totaljson[ind]
print(*(len(i[corpus]) for corpus in ('ms', 'chuci', 'wx', 'yfsj', 'ytxy', 'qts')))
print(i)


2 0 0 5 0 38
{'word': '丁丁', 'ms': ['椓之丁丁', '伐木丁丁'], 'chuci': [], 'wx': [], 'yfsj': ['丁丁漏水夜何長', '宮漏丁丁夜向晨', '寒雁丁丁渡遼水', '秋天丁丁復凍凍', '丁丁暖漏滴花影'], 'ytxy': [], 'qts': ['丁丁向晚急還稀', '初驚宵漏丁丁促', '伐木丁丁', '丁丁入波心', '宮漏夜丁丁', '丁丁漏向盡', '撫毛千萬喚丁丁', '月落漏丁丁', '收拾可丁丁', '尚聞丁丁聲', '別夜漏丁丁', '持斧自丁丁', '好鳥響丁丁', '香風下天漏丁丁', '伐木丁丁一樵叟', '丁丁幽鐘遠', '丁丁窗雨繁', '雙璫丁丁聯尺素', '丁丁玉漏發深宮', '擊霜寒玉亂丁丁', '丁丁在前澗', '丁丁寒漏滴聲稀', '丁丁玉漏殘', '丁丁啄門疑啄木', '伐木丁丁山更幽', '丁丁海女弄金環', '丁丁漏水夜何長', '宮漏丁丁夜向晨', '漏移寒箭丁丁急', '飄飄颻颻寒丁丁', '紫槽紅撥夜丁丁', '丁丁玉漏咽銅壺', '杵臼聲丁丁', '影剎遙丁丁', '秋天丁丁復凍凍', '禁漏丁丁', '殘漏自丁丁', '丁丁暖漏滴花影']}


In [7]:
simdic = {}

# Embed every reference sentence (all corpora except 'qts') once up front.
# The vectors do not depend on the 'qts' line being compared, so recomputing
# them inside the outer loop only repeated the same model calls.
reference_vectors = [
    (k, j, predict_vec_rep([j], model, formatter)[0])
    for k, v in i.items()
    if k not in ('word', 'qts')
    for j in v
]

for line in i['qts']:
    result1 = predict_vec_rep([line], model, formatter)[0]
    for k, j, result2 in reference_vectors:
        sim = cos_sim(result1, result2)
        print(f'{line},{k}-{j},{sim}')
        simdic[f'{line},{k}-{j}'] = sim

# Most similar pairs first.
smi_sorted = sorted(simdic.items(), key=lambda e: e[1], reverse=True)
print(smi_sorted)

肅肅儀仗裏,ms-肅肅鴇行,0.7811968564272176
肅肅儀仗裏,ms-肅肅其羽,0.6684157723755959
肅肅儀仗裏,ms-肅肅王命,0.6998899685376653
肅肅儀仗裏,ms-肅肅鴇翼,0.7098173787146647
肅肅儀仗裏,ms-至止肅肅,0.6549092974065667
肅肅儀仗裏,ms-肅肅在廟,0.7294736603865197
肅肅儀仗裏,ms-肅肅謝功,0.7085238781472449
肅肅儀仗裏,ms-肅肅兔罝,0.6485563737949835
肅肅儀仗裏,ms-肅肅鴇羽,0.787505037110233
肅肅儀仗裏,ms-肅肅宵征,0.7273219430571197
肅肅儀仗裏,wx-墓門兮肅肅,0.6584146609960182
肅肅儀仗裏,wx-肅肅凄風,0.6557246929880391
肅肅儀仗裏,wx-肅肅戒徂兩,0.7454030040677123
肅肅儀仗裏,wx-竦肅肅以靜謐,0.7420312718384947
肅肅儀仗裏,wx-肅肅之儀盡,0.7439764671763102
肅肅儀仗裏,wx-肅肅廣殿陰,0.8629234647404017
肅肅儀仗裏,wx-安得肅肅羽,0.7028362885920209
肅肅儀仗裏,wx-唯羨肅肅翰,0.7325194718718044
肅肅儀仗裏,wx-肅肅階䦳,0.8032630330248323
肅肅儀仗裏,wx-肅肅習習,0.8032630330248323
肅肅儀仗裏,wx-肅肅宵駕動,0.7818120904367649
肅肅儀仗裏,wx-肅肅荊王,0.7236874234192762
肅肅儀仗裏,wx-肅肅高桐枝,0.6451232240759092
肅肅儀仗裏,wx-肅肅宵征,0.7273219430571197
肅肅儀仗裏,wx-出紫宮之肅肅兮,0.6812673739240459
肅肅儀仗裏,wx-肅肅君子,0.6433079413449805
肅肅儀仗裏,wx-肅肅我祖,0.621114640056846
肅肅儀仗裏,wx-肅肅先生,0.6801023280187977
肅肅儀仗裏,wx-肅肅莎雞羽,0.738065238867793
肅肅儀仗裏,yfsj-四面肅肅稽首,0.694658300506