In [2]:
import tqdm
import json
import torch
import random
import numpy as np
from sklearn.utils import shuffle

Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md



In [1]:
config = {}
config["train_corpus_path"] = "./wiki_train.txt"
config["test_corpus_path"] = "./wiki_test.txt"
config["word2idx_path"] = "./my_bert_word2idx.json"
config["output_path"] = "./output_wiki_bert"

config["batch_size"] = 1
config["max_seq_len"] = 200
config["vocab_size"] = 32162
config["lr"] = 2e-6
config["num_workers"] = 0

In [3]:
word2idx_path = config["word2idx_path"]
word2idx = {}

with open(word2idx_path, "r", encoding="utf-8") as f:
    word2idx = json.load(f)

In [4]:
word2idx

{'#PAD#': 0,
 '#UNK#': 1,
 '#SEP#': 3,
 '#CLS#': 2,
 '#MASK#': 4,
 '#NUM#': 5,
 '计': 20,
 '算': 21,
 '机': 22,
 '科': 23,
 '学': 24,
 '\n': 25,
 '（': 26,
 '，': 27,
 '有': 28,
 '时': 29,
 '缩': 30,
 '写': 31,
 '为': 32,
 '）': 33,
 '是': 34,
 '系': 35,
 '统': 36,
 '性': 37,
 '研': 38,
 '究': 39,
 '信': 40,
 '息': 41,
 '与': 42,
 '的': 43,
 '理': 44,
 '论': 45,
 '基': 46,
 '础': 47,
 '以': 48,
 '及': 49,
 '它': 50,
 '们': 51,
 '在': 52,
 '中': 53,
 '如': 54,
 '何': 55,
 '应': 56,
 '用': 57,
 '实': 58,
 '技': 59,
 '术': 60,
 '。': 61,
 ' ': 62,
 '通': 63,
 '常': 64,
 '被': 65,
 '形': 66,
 '容': 67,
 '对': 68,
 '那': 69,
 '些': 70,
 '创': 71,
 '造': 72,
 '、': 73,
 '描': 74,
 '述': 75,
 '转': 76,
 '换': 77,
 '法': 78,
 '处': 79,
 '包': 80,
 '含': 81,
 '很': 82,
 '多': 83,
 '分': 84,
 '支': 85,
 '领': 86,
 '域': 87,
 '；': 88,
 '强': 89,
 '调': 90,
 '特': 91,
 '定': 92,
 '结': 93,
 '果': 94,
 '比': 95,
 '图': 96,
 '而': 97,
 '探': 98,
 '讨': 99,
 '问': 100,
 '题': 101,
 '质': 102,
 '复': 103,
 '杂': 104,
 '还': 105,
 '一': 106,
 '专': 107,
 '注': 108,
 '于': 109,
 '怎': 110,

In [5]:
corpus_path = config["train_corpus_path"]
lines = []
corpus_lines = 0
with open(corpus_path, "r", encoding="utf-8") as f:
    # 将数据集全部加载到内存
    lines = [eval(line) for line in tqdm.tqdm(f, desc="Loading Dataset")]
    corpus_lines = len(lines)


Loading Dataset: 64it [00:00, 621.76it/s]


In [6]:
lines[1]

{'text1': '哲学',
 'text2': '哲学\n\n哲学（）是研究普遍的、根本的问题的学科，包括存在、知识、价值、理智、心灵、语言等领域。哲学与其他学科的不同是其批判的方式、通常是系统化的方法，并以理性论证为基础。在日常用语中，其也可被引申为个人或团体的最基本信仰、概念或态度。\n\n英语词语（）源于古希腊语中的，意思为「爱智慧」，有时也译为「智慧的朋友」，该词由（philos，爱）的派生词（Philein，去爱）和（Sophia，智慧）组合而成。一般认为，古希腊思想家毕达哥拉斯最先在著作中引入“哲学家”和“哲学”这两个术语。\n\n“哲”一词在中国起源很早，如“孔门十哲”，“古圣先哲”等词，“哲”或“哲人”，专指那些善于思辨，学问精深者，即西方近世“哲学家”，“思想家”之谓。在《易经》当中已经开始讨论哲学问题，形而上学的中文名称取自《易经·系辞上传》「形而上者谓之道，形而下者谓之器」一语。1874年，日本启蒙家西周，在《百一新论》中首先用汉文「哲学」来翻译"philosophy"一词。\n\n英国哲学家罗素对哲学的定义是：\n\n胡适在《中国哲学史大纲》中称「凡研究人生切要的问题，从根本上着想，要寻一个根本的解决：这种学问叫做哲学」。\n\n虽然哲学源自西方的传统，但许多文明在历史上都存在著一些相似的论题。东亚和南亚的哲学被称之为东方哲学，而北非和中东则因为其和欧洲密切的互动，因此常被视为是西方哲学的一部份。\n\n对哲学的主题亦存在许多看法。一些人认为哲学是对问题本身过程的审查；另外一些人则认为实质上存在著哲学必须去回答的哲学命题。\n\n\n古希腊哲学家透过问问题来进行哲学实践，他们所提的问题大概可以归类为三类，这三类问题分别形成了哲学的基础学科——分别是形而上学、伦理学、认识论（或知识论） 。\n\n有意思的是，现代哲学上蒙现出"不要求精确理由"的哲学论调，如"本质技巧"(认定本质不可知)，这种现象将不可知论(世界上终究有人不能理解的存在)的重要性提高了。\n\n哲学可以分为很多不同的分支，主要包括形而上学、知识论、伦理学、逻辑学和美学。\n\n\n很多人类社群思考过哲学问题并且互相学习建立了各种哲学流派。\n\n东方哲学是通过每个地区的历史时期来组织的。西方哲学一般可以分为三个或更多时期，最重要的是古典哲学、中世纪哲学和近代哲学。\n\n印度

In [7]:
corpus_lines

64

In [14]:
index = 34
t1, t2, is_next_label = lines[index]["text1"], lines[index]["text2"], 1

In [15]:
t1, t2, is_next_label

('中国计算机科学大事年表',
 '中国计算机科学大事年表\n\n中国计算机科学大事年表按照时间顺序罗列了中华人民共和国建国以来，在计算机科学领域发生的重大事件。\n\n',
 1)

In [16]:
pad_index = 0
unk_index = 1
cls_index = 2
sep_index = 3
mask_index = 4
num_index = 5

In [20]:

char_tokens_ = list(t1)
print("char_tokens_", char_tokens_)
char_tokens = [word2idx.get(char, unk_index) for char in char_tokens_]
print("char_tokens", char_tokens)

output_label = []
for i, token in enumerate(char_tokens):
    prob = random.random()
    if prob < 0.30:
        prob /= 0.30
        output_label.append(char_tokens[i])
        # 80% randomly change token to mask token
        if prob < 0.8:
            char_tokens[i] = mask_index
        # 10% randomly change token to random token
        elif prob < 0.9:
            char_tokens[i] = random.randrange(len(word2idx))
    else:
        output_label.append(0)

t1_random, t1_label = char_tokens, output_label
print("output_label", output_label)
print("char_tokens_after", char_tokens)

char_tokens_ ['中', '国', '计', '算', '机', '科', '学', '大', '事', '年', '表']
char_tokens [53, 257, 20, 21, 22, 23, 24, 211, 138, 249, 498]
output_label [0, 0, 0, 0, 22, 23, 24, 0, 138, 0, 0]
char_tokens_after [53, 257, 20, 21, 4, 4, 4, 211, 4, 249, 498]


In [21]:
char_tokens_ = list(t2)
print("char_tokens_", char_tokens_)
char_tokens = [word2idx.get(char, unk_index) for char in char_tokens_]
print("char_tokens", char_tokens)

output_label = []
for i, token in enumerate(char_tokens):
    prob = random.random()
    if prob < 0.30:
        prob /= 0.30
        output_label.append(char_tokens[i])
        # 80% randomly change token to mask token
        if prob < 0.8:
            char_tokens[i] = mask_index
        # 10% randomly change token to random token
        elif prob < 0.9:
            char_tokens[i] = random.randrange(len(word2idx))
    else:
        output_label.append(0)

t2_random, t2_label = char_tokens, output_label
print("output_label", output_label)
print("char_tokens_after", char_tokens)

char_tokens_ ['中', '国', '计', '算', '机', '科', '学', '大', '事', '年', '表', '\n', '\n', '中', '国', '计', '算', '机', '科', '学', '大', '事', '年', '表', '按', '照', '时', '间', '顺', '序', '罗', '列', '了', '中', '华', '人', '民', '共', '和', '国', '建', '国', '以', '来', '，', '在', '计', '算', '机', '科', '学', '领', '域', '发', '生', '的', '重', '大', '事', '件', '。', '\n', '\n']
char_tokens [53, 257, 20, 21, 22, 23, 24, 211, 138, 249, 498, 25, 25, 53, 257, 20, 21, 22, 23, 24, 211, 138, 249, 498, 527, 528, 29, 221, 1982, 165, 734, 517, 265, 53, 1016, 121, 817, 620, 126, 257, 227, 257, 48, 207, 27, 52, 20, 21, 22, 23, 24, 86, 87, 258, 660, 43, 209, 211, 138, 164, 61, 25, 25]
output_label [0, 257, 0, 21, 0, 0, 0, 0, 138, 249, 498, 0, 0, 0, 0, 0, 0, 0, 0, 0, 211, 0, 249, 498, 0, 0, 0, 0, 1982, 0, 0, 0, 265, 0, 0, 0, 817, 0, 126, 257, 0, 0, 0, 0, 0, 0, 0, 21, 0, 23, 0, 86, 0, 0, 660, 0, 209, 0, 0, 164, 0, 0, 25]
char_tokens_after [53, 4, 20, 3288, 22, 23, 24, 211, 138, 4, 4, 25, 25, 53, 257, 20, 21, 22, 23, 24, 4, 138, 4, 199, 527, 528, 2

In [23]:
t1 = [cls_index] + t1_random + [sep_index]
t2 = t2_random + [sep_index]
print(t1_random)
print(t1)
print(t2_random)
print(t2)

[53, 257, 20, 21, 4, 4, 4, 211, 4, 249, 498]
[2, 53, 257, 20, 21, 4, 4, 4, 211, 4, 249, 498, 3]
[53, 4, 20, 3288, 22, 23, 24, 211, 138, 4, 4, 25, 25, 53, 257, 20, 21, 22, 23, 24, 4, 138, 4, 199, 527, 528, 29, 221, 4, 165, 734, 517, 4, 53, 1016, 121, 4, 620, 126, 257, 227, 257, 48, 207, 27, 52, 20, 967, 22, 4, 24, 4, 87, 258, 4, 43, 4, 211, 138, 4, 61, 25, 4]
[53, 4, 20, 3288, 22, 23, 24, 211, 138, 4, 4, 25, 25, 53, 257, 20, 21, 22, 23, 24, 4, 138, 4, 199, 527, 528, 29, 221, 4, 165, 734, 517, 4, 53, 1016, 121, 4, 620, 126, 257, 227, 257, 48, 207, 27, 52, 20, 967, 22, 4, 24, 4, 87, 258, 4, 43, 4, 211, 138, 4, 61, 25, 4, 3]


In [24]:
print(t1_label)
t1_label = [pad_index] + t1_label + [pad_index]
print(t1_label)
print(t2_label)
t2_label = t2_label + [pad_index]
print(t2_label)

[0, 0, 0, 0, 22, 23, 24, 0, 138, 0, 0]
[0, 0, 0, 0, 0, 22, 23, 24, 0, 138, 0, 0, 0]
[0, 257, 0, 21, 0, 0, 0, 0, 138, 249, 498, 0, 0, 0, 0, 0, 0, 0, 0, 0, 211, 0, 249, 498, 0, 0, 0, 0, 1982, 0, 0, 0, 265, 0, 0, 0, 817, 0, 126, 257, 0, 0, 0, 0, 0, 0, 0, 21, 0, 23, 0, 86, 0, 0, 660, 0, 209, 0, 0, 164, 0, 0, 25]
[0, 257, 0, 21, 0, 0, 0, 0, 138, 249, 498, 0, 0, 0, 0, 0, 0, 0, 0, 0, 211, 0, 249, 498, 0, 0, 0, 0, 1982, 0, 0, 0, 265, 0, 0, 0, 817, 0, 126, 257, 0, 0, 0, 0, 0, 0, 0, 21, 0, 23, 0, 86, 0, 0, 660, 0, 209, 0, 0, 164, 0, 0, 25, 0]


In [26]:
seq_len= config["max_seq_len"]
segment_label = ([0 for _ in range(len(t1))] + [1 for _ in range(len(t2))])[:seq_len]
print(len(t1) + len(t2), seq_len, segment_label)

77 200 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [27]:
bert_input = (t1 + t2)[:seq_len]
bert_label = (t1_label + t2_label)[:seq_len]
print(bert_input)
print(bert_label)

[2, 53, 257, 20, 21, 4, 4, 4, 211, 4, 249, 498, 3, 53, 4, 20, 3288, 22, 23, 24, 211, 138, 4, 4, 25, 25, 53, 257, 20, 21, 22, 23, 24, 4, 138, 4, 199, 527, 528, 29, 221, 4, 165, 734, 517, 4, 53, 1016, 121, 4, 620, 126, 257, 227, 257, 48, 207, 27, 52, 20, 967, 22, 4, 24, 4, 87, 258, 4, 43, 4, 211, 138, 4, 61, 25, 4, 3]
[0, 0, 0, 0, 0, 22, 23, 24, 0, 138, 0, 0, 0, 0, 257, 0, 21, 0, 0, 0, 0, 138, 249, 498, 0, 0, 0, 0, 0, 0, 0, 0, 0, 211, 0, 249, 498, 0, 0, 0, 0, 1982, 0, 0, 0, 265, 0, 0, 0, 817, 0, 126, 257, 0, 0, 0, 0, 0, 0, 0, 21, 0, 23, 0, 86, 0, 0, 660, 0, 209, 0, 0, 164, 0, 0, 25, 0]


In [28]:
torch.tensor(bert_input)

tensor([   2,   53,  257,   20,   21,    4,    4,    4,  211,    4,  249,  498,
           3,   53,    4,   20, 3288,   22,   23,   24,  211,  138,    4,    4,
          25,   25,   53,  257,   20,   21,   22,   23,   24,    4,  138,    4,
         199,  527,  528,   29,  221,    4,  165,  734,  517,    4,   53, 1016,
         121,    4,  620,  126,  257,  227,  257,   48,  207,   27,   52,   20,
         967,   22,    4,   24,    4,   87,  258,    4,   43,    4,  211,  138,
           4,   61,   25,    4,    3])