# 调用ccpoem

In [1]:
import os
import logging

import torch
import torch.nn as nn
import numpy as np
from transformers import BertModel, BertTokenizer

# Configure root logging: timestamp, source file and line number, level, message.
logging.basicConfig(
    format='%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s', level=logging.INFO)

# GPU indices to use; stays None until init() replaces it with a list.
gpu_list = None


class Bert(nn.Module):
    """Pretrained BERT encoder that turns token batches into sentence vectors."""

    def __init__(self, BERT_PATH='static/BERT_CCPoem_v1'):
        """Load the pretrained encoder stored at ``BERT_PATH``."""
        super(Bert, self).__init__()

        self.bert = BertModel.from_pretrained(BERT_PATH)

    def init_multi_gpu(self, device):
        """Wrap the encoder in ``nn.DataParallel`` over the device ids in ``device``."""
        self.bert = nn.DataParallel(self.bert, device_ids=device)

    def forward(self, data, cls=False):
        """Encode a batch and return one vector per sequence.

        Args:
            data: Mapping holding the 'input_ids', 'attention_mask' and
                'token_type_ids' tensors of the batch.
            cls: If true, use the hidden state at position 0 of every
                sequence. Otherwise average the hidden states at positions
                1 .. n-2, where n is the sum of that row's attention mask
                (i.e. the first and last attended positions are left out).

        Returns:
            A list with one list of floats per sequence in the batch.
        """
        hidden = self.bert(data['input_ids'],
                           attention_mask=data['attention_mask'],
                           token_type_ids=data['token_type_ids'])[0]

        if cls:
            return hidden[:, 0, :].view(hidden.size(0), -1).cpu().tolist()

        hidden = hidden.cpu()
        # Reduce the mask once for the whole batch instead of summing (and
        # reading back) one row at a time inside the loop.
        lengths = data['attention_mask'].sum(dim=1).tolist()
        return [hidden[row][1:length - 1, :].mean(0).tolist()
                for row, length in enumerate(lengths)]


class BertFormatter():
    """Tokenize raw text into the tensor inputs expected by the encoder."""

    def __init__(self, BERT_PATH='static/BERT_CCPoem_v1'):
        """Load the tokenizer stored at ``BERT_PATH``."""
        self.tokenizer = BertTokenizer.from_pretrained(BERT_PATH)

    def process(self, data):
        """Encode ``data`` as one padded batch.

        Returns a dict with the keys 'input_ids', 'attention_mask' and
        'token_type_ids', each mapped to a ``torch.LongTensor``.
        """
        encoded = self.tokenizer.batch_encode_plus(data, pad_to_max_length=True)
        return {
            field: torch.LongTensor(encoded[field])
            for field in ('input_ids', 'attention_mask', 'token_type_ids')
        }


def init(BERT_PATH="static/BERT_CCPoem_v1"):
    """Build the model and formatter and record which GPUs to use.

    The number of comma-separated entries in CUDA_VISIBLE_DEVICES decides how
    many GPU indices (0, 1, ...) are stored in the module-level ``gpu_list``;
    an unset or empty variable means CPU only.

    Args:
        BERT_PATH: Location of the pretrained model and tokenizer.

    Returns:
        A ``(model, formatter)`` tuple.

    Raises:
        NotImplementedError: GPUs were requested but CUDA is unavailable.
    """
    global gpu_list

    visible = os.environ.get("CUDA_VISIBLE_DEVICES", "").split(",")
    device_count = 0 if visible[0] == "" else len(visible)
    gpu_list = list(range(device_count))

    has_cuda = torch.cuda.is_available()
    logging.info("CUDA available: %s" % str(has_cuda))
    if gpu_list and not has_cuda:
        logging.error("CUDA is not available but specific gpu id")
        raise NotImplementedError

    model = Bert(BERT_PATH)
    formatter = BertFormatter(BERT_PATH)

    if gpu_list:
        model = model.cuda()
    if len(gpu_list) > 1:
        # Best effort: fall back to the single-GPU model if wrapping fails.
        try:
            model.init_multi_gpu(gpu_list)
        except Exception as e:
            logging.warning(
                "No init_multi_gpu implemented in the model, use single gpu instead. {}".format(str(e)))

    return model, formatter


def predict_vec_rep(data, model, formatter):
    """Return the model's vector representation for every text in ``data``.

    Args:
        data: Batch of texts handed to ``formatter.process``.
        model: Callable model; it is switched to eval mode before the call.
        formatter: Object whose ``process`` method turns ``data`` into a
            mapping of model inputs.

    Returns:
        Whatever ``model`` returns for the formatted batch.
    """
    batch = formatter.process(data)
    model.eval()

    # gpu_list is None until init() has run; treat that the same as "no GPU"
    # instead of failing on len(None).
    if gpu_list:
        for key, value in batch.items():
            if isinstance(value, torch.Tensor):
                batch[key] = value.cuda()

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        return model(batch)


def cos_sim(vector_a, vector_b, sim=True):
    """Return the cosine similarity of two vectors.

    Args:
        vector_a: First vector (any array-like of numbers).
        vector_b: Second vector, with the same number of elements.
        sim: If true, rescale the cosine from [-1, 1] to [0, 1] with
            ``0.5 + 0.5 * cos``; if false, return the raw cosine.

    Returns:
        The raw or rescaled cosine, depending on ``sim``.
    """
    # Plain arrays instead of np.mat, which NumPy 2.0 no longer provides.
    a = np.asarray(vector_a, dtype=float).ravel()
    b = np.asarray(vector_b, dtype=float).ravel()

    cos = float(np.dot(a, b)) / (np.linalg.norm(a) * np.linalg.norm(b))
    if not sim:
        return cos
    return 0.5 + 0.5 * cos


if __name__ == '__main__':
    # Smoke test: build the model and print the sentence vector of one line.
    # `model` and `formatter` are bound at module level here.
    model, formatter = init()
    result = predict_vec_rep(["一行白鹭上青天"], model, formatter)[0]
    print(result)



  from .autonotebook import tqdm as notebook_tqdm
2023-09-05 20:33:10,428 - 1229874609.py[line:71] - INFO: CUDA available: False
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


[0.37986961007118225, -0.8445234298706055, 0.926456868648529, -1.732959508895874, 0.4790334105491638, -0.30134692788124084, 0.9458165168762207, -0.45697417855262756, -1.1309224367141724, -0.44595903158187866, -1.1679950952529907, -0.335366815328598, 1.076470971107483, 0.4351254403591156, -1.435951828956604, -0.10431935638189316, 1.5164940357208252, -0.6068114638328552, -0.5779765248298645, 0.05628534033894539, 0.17600564658641815, -0.7748263478279114, 0.4632478654384613, -0.9195488691329956, 0.29605832695961, -0.5998052954673767, 0.47118112444877625, 0.17195908725261688, -0.6707261800765991, -0.4623022675514221, -0.034068960696458817, 0.217732235789299, -0.7499228715896606, -1.0339088439941406, 0.41781359910964966, 0.015698501840233803, -1.0734797716140747, 0.33579131960868835, 0.47090885043144226, -0.11875031143426895, 0.35522153973579407, -0.2716936767101288, 0.27442288398742676, -1.1179730892181396, 0.3024638295173645, 0.6947304010391235, 0.21860523521900177, -0.35350894927978516, -



In [None]:
# Rebuild the model, embed three lines and compare the first one with the
# other two; each predict_vec_rep call gets a one-element batch, hence the [0].
model, formatter = init()
result_0 = predict_vec_rep(["一行白鹭上青天"], model, formatter)[0]
result_1 = predict_vec_rep(['白鹭一行登碧霄'], model, formatter)[0]
result_2 = predict_vec_rep(["飞却青天白鹭鸶"], model, formatter)[0]

# cos_sim with its default sim=True prints the rescaled 0.5 + 0.5 * cos value.
print(cos_sim(result_0, result_1))
print(cos_sim(result_0, result_2))

# 查找全唐诗中与给定句子语义相似度最高的诗句

In [None]:
import json
# Load the corpus JSON; the cell that scans `poetdata` reads each entry's
# 'paragraphs' field.
with open('output/quantangshi.json','r',encoding='utf-8') as f:
    poetdata = json.load(f)
# Print the entry at index 100 (presumably to inspect the record layout).
print(poetdata[100])

In [None]:
# Compare one query line against every paragraph in `poetdata` and print the
# pairs whose rescaled cosine similarity exceeds 0.8.
# Relies on `model` / `formatter` from init() and on `poetdata` loaded above.
cyword ="藹藹"  # not used anywhere else in this cell
line = "肅肅宵征，夙夜在公"
result1 = predict_vec_rep([line], model, formatter)[0]
for poet in poetdata:
    paras = poet['paragraphs']
    for para in paras:
        # One model call per paragraph.
        result2 = predict_vec_rep([para], model, formatter)[0]
        sim = cos_sim(result1, result2)
        if sim > 0.8:
            print(line,para,sim)



# 尋找語料庫中包含該重言詞的句子

In [2]:
import json
import pickle

# Corpus keys; the lines of each corpus are read from output/set_<key>.pkl.
namelist = ['ms', 'chuci', 'wx', 'yfsj', 'ytxy', 'qts']

# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load pickles that were produced by this project.
with open('output/chongyanlist.pkl', 'rb') as file:
    chongyanlist = list(pickle.load(file))

# One record per word, with an initially empty list of lines for each corpus.
# The corpus keys come from namelist so the two cannot drift apart.
totaljson = []
for x in chongyanlist:
    dic = {"word": x, **{name: [] for name in namelist}}
    totaljson.append(dic)
print(totaljson)

for i in namelist:
    file_name = f'output/set_{i}.pkl'
    with open(file_name, 'rb') as file:
        loaded_set = pickle.load(file)

    for j in loaded_set:
        # totaljson is parallel to chongyanlist, so enumerate gives the record
        # index directly instead of a list.index() scan for every match.
        for ind, word in enumerate(chongyanlist):
            if word in j:
                totaljson[ind][i].append(j)

with open('output/timeline.json', 'w', encoding='utf-8') as f:
    json.dump(totaljson, f)

[{'word': '陶陶', 'ms': [], 'chuci': [], 'wx': [], 'yfsj': [], 'ytxy': [], 'qts': []}, {'word': '玄玄', 'ms': [], 'chuci': [], 'wx': [], 'yfsj': [], 'ytxy': [], 'qts': []}, {'word': '濛濛', 'ms': [], 'chuci': [], 'wx': [], 'yfsj': [], 'ytxy': [], 'qts': []}, {'word': '颼颼', 'ms': [], 'chuci': [], 'wx': [], 'yfsj': [], 'ytxy': [], 'qts': []}, {'word': '忡忡', 'ms': [], 'chuci': [], 'wx': [], 'yfsj': [], 'ytxy': [], 'qts': []}, {'word': '田田', 'ms': [], 'chuci': [], 'wx': [], 'yfsj': [], 'ytxy': [], 'qts': []}, {'word': '簇簇', 'ms': [], 'chuci': [], 'wx': [], 'yfsj': [], 'ytxy': [], 'qts': []}, {'word': '長長', 'ms': [], 'chuci': [], 'wx': [], 'yfsj': [], 'ytxy': [], 'qts': []}, {'word': '淼淼', 'ms': [], 'chuci': [], 'wx': [], 'yfsj': [], 'ytxy': [], 'qts': []}, {'word': '層層', 'ms': [], 'chuci': [], 'wx': [], 'yfsj': [], 'ytxy': [], 'qts': []}, {'word': '翳翳', 'ms': [], 'chuci': [], 'wx': [], 'yfsj': [], 'ytxy': [], 'qts': []}, {'word': '栖栖', 'ms': [], 'chuci': [], 'wx': [], 'yfsj': [], 'ytxy': [], 'qt

In [3]:
# Look up the record of one word and print how many lines each corpus holds.
word = "蒼蒼"
# totaljson was built in the same order as chongyanlist, so the index matches.
ind = chongyanlist.index(word)
# NOTE: `i` is read again by the similarity-ranking cell below.
i = totaljson[ind]
print(len(i['ms']),len(i['chuci']),len(i['wx']),len(i['yfsj']),len(i['ytxy']),len(i['qts']))
print(i)


1 0 6 30 0 365
{'word': '蒼蒼', 'ms': ['蒹葭蒼蒼'], 'chuci': [], 'wx': ['珍木鬱蒼蒼', '山樹鬱蒼蒼', '太谷晦蒼蒼', '寒渚夜蒼蒼', '蒼蒼中山桂', '無以測其淺深；仰蒼蒼之色者'], 'yfsj': ['露濕月蒼蒼', '仰天依舊蒼蒼色', '蒼蒼上古原', '天蒼蒼兮上無緣', '蒼蒼之天', '風瑟瑟兮野蒼蒼', '泣血仰頭兮訴蒼蒼', '暗哭蒼蒼天', '兩鬢蒼蒼十指黑', '繐帳空蒼蒼', '蒼蒼圓蓋', '雲霧四起月蒼蒼', '蒼蒼茂陵樹', '去時只覺天蒼蒼', '高天蒼蒼高不極', '蒼蒼川上月', '蒼蒼萬里道', '蒼蒼林薄遠', '巫峽蒼蒼煙雨時', '蒼蒼水氣雜遙天', '蒼蒼雲松', '誰諭蒼蒼造物意', '蒼蒼聳極天', '臺館曉蒼蒼', '鄴城蒼蒼白露微', '蒼蒼蟲網遍', '蒼蒼茫茫在何處', '其色蒼蒼', '白日祭蒼蒼', '蒼蒼別路迷'], 'ytxy': [], 'qts': ['嘉陵江水色蒼蒼', '煙露日蒼蒼', '風雨正蒼蒼', '鬱鬱復蒼蒼', '中道許蒼蒼', '蒼蒼之天', '歲晏空蒼蒼', '終古柏蒼蒼', '蒼蒼珉井一百丈', '蒼蒼樹裏聞', '遠樹蒼蒼妙喜寺', '仰面問蒼蒼', '兩鬢蒼蒼十指黑', '水墨蒼蒼半壁陰', '應自負蒼蒼', '蓮闕逈蒼蒼', '野雲低迷煙蒼蒼', '柯葉寒蒼蒼', '蒼蒼中條山', '蒼蒼曉欲臨', '曉禁蒼蒼換直還', '雲木蒼蒼但閉關', '蒼蒼三徑微', '疊嶂凌蒼蒼', '誰喻蒼蒼造物意', '殿前松柏晦蒼蒼', '暮色忽蒼蒼', '雲木蒼蒼數萬株', '窮陰蒼蒼雪雰雰', '手栽松樹蒼蒼老', '與招魂魄上蒼蒼', '磨圍山月正蒼蒼', '花柳蒼蒼月欲來', '逼人色蒼蒼', '露濕月蒼蒼', '天但蒼蒼色', '逈立向蒼蒼', '仰天依舊蒼蒼色', '蒼蒼上古原', '蒼蒼露草青蒿氣', '蒼蒼何處是伊川', '蒼蒼殊未收', '風波隱隱石蒼蒼', '蒼蒼前洲日', '煙樹蒼蒼故郢城', '百里獨蒼蒼', '蒼蒼丁零塞', '聳幹蒼蒼', '蒼蒼五老霧中壇', '禁城春色曉蒼蒼', '蒼蒼烟月滿川亭', '紫姑神下月蒼蒼', '若爲問得蒼蒼意', '宮樹蒼蒼靜掩扃', '九泉煙冷樹蒼蒼',

In [5]:
# Score every 'qts' line of the selected record against the lines of all other
# corpora, print the pairs scoring above 0.7 and finally print all pairs
# sorted by descending similarity.
# Relies on `i` (the record chosen in an earlier cell) and on `model` /
# `formatter` returned by init().

# Embed each reference line once up front: its vector does not depend on the
# 'qts' line, so encoding it inside the loop only repeats the same model call.
refs = []
for k, v in i.items():
    if k == 'word' or k == 'qts':
        continue
    for j in v:
        refs.append((k, j, predict_vec_rep([j], model, formatter)[0]))

simdic = {}
for line in i['qts']:
    result1 = predict_vec_rep([line], model, formatter)[0]
    for k, j, result2 in refs:
        sim = cos_sim(result1, result2)
        if sim > 0.7:
            print(f'{line},{k}-{j},{sim}')
        simdic[f'{line},{k}-{j}'] = sim

smi_sorted = sorted(simdic.items(), key=lambda e: e[1], reverse=True)
print(smi_sorted)



嘉陵江水色蒼蒼,wx-山樹鬱蒼蒼,0.700508571624155
嘉陵江水色蒼蒼,wx-寒渚夜蒼蒼,0.7082463794362764
嘉陵江水色蒼蒼,yfsj-仰天依舊蒼蒼色,0.7174923798723003
嘉陵江水色蒼蒼,yfsj-蒼蒼茂陵樹,0.7634038937046606
嘉陵江水色蒼蒼,yfsj-蒼蒼川上月,0.7326948256360595
嘉陵江水色蒼蒼,yfsj-蒼蒼水氣雜遙天,0.7243194485578783
嘉陵江水色蒼蒼,yfsj-鄴城蒼蒼白露微,0.7203793017838478
嘉陵江水色蒼蒼,yfsj-其色蒼蒼,0.7754156662738187
煙露日蒼蒼,ms-蒹葭蒼蒼,0.7046587798514102
煙露日蒼蒼,wx-珍木鬱蒼蒼,0.7529667417785246
煙露日蒼蒼,wx-山樹鬱蒼蒼,0.7400988789954628
煙露日蒼蒼,wx-太谷晦蒼蒼,0.7549531744645326
煙露日蒼蒼,wx-寒渚夜蒼蒼,0.7284119876955606
煙露日蒼蒼,wx-無以測其淺深；仰蒼蒼之色者,0.7294182778664747
煙露日蒼蒼,yfsj-露濕月蒼蒼,0.8987656356780176
煙露日蒼蒼,yfsj-仰天依舊蒼蒼色,0.7180710340525391
煙露日蒼蒼,yfsj-蒼蒼上古原,0.7094283727070293
煙露日蒼蒼,yfsj-天蒼蒼兮上無緣,0.7541041466097402
煙露日蒼蒼,yfsj-泣血仰頭兮訴蒼蒼,0.72189826360305
煙露日蒼蒼,yfsj-兩鬢蒼蒼十指黑,0.700317463526522
煙露日蒼蒼,yfsj-繐帳空蒼蒼,0.7237925970129393
煙露日蒼蒼,yfsj-蒼蒼圓蓋,0.7569586243631661
煙露日蒼蒼,yfsj-雲霧四起月蒼蒼,0.7402455350391974
煙露日蒼蒼,yfsj-蒼蒼川上月,0.7382411592875952
煙露日蒼蒼,yfsj-蒼蒼林薄遠,0.7536731293610246
煙露日蒼蒼,yfsj-蒼蒼水氣雜遙天,0.7395983489842961
煙露日蒼蒼,yfsj-蒼蒼聳極天,0.7528621910870212
煙露日蒼蒼,yf