# 代码目的

将所有数据按照字向量整理成 numpy 格式（.npy 文件），存储在 ./data 目录中

In [2]:
import numpy as np
import os

In [3]:
def pos_embed(x, maxlen):
    """
    Turn a signed offset (character index minus entity index) into a position id.

    Offsets inside [-maxlen, maxlen] are shifted by ``maxlen + 1`` so they land in
    1 .. 2 * maxlen + 1. Anything further left collapses to 0 and anything further
    right collapses to 2 * maxlen + 2. With ``maxlen = 60`` the in-range ids are
    1..121 and the two overflow ids are 0 and 122.
    """
    lower, upper = -maxlen, maxlen
    # Out-of-range offsets share a single bucket on each side.
    if x < lower:
        return 0
    if x > upper:
        return 1 + maxlen + maxlen + 1
    if lower <= x <= upper:
        return x + maxlen + 1
    # Only reachable for values that compare false everywhere (e.g. NaN).
    return None


def find_index(x, y):
    """
    Return the index of the first element of ``y`` that equals ``x``.

    Returns -1 when no element matches (including when ``y`` is empty).
    """
    for position, candidate in enumerate(y):
        # Same comparison operator as a plain "skip while different" scan.
        if not (x != candidate):
            return position
    return -1

# 读取字向量

In [4]:
def readWordEmbedding(path):
    """
    Read a text embedding file and build the vocabulary and the embedding matrix.

    The first line of the file is treated as a header and skipped. Every other
    non-blank line is ``token v1 v2 ... vN`` separated by whitespace.

    Two extra rows are appended for the special tokens ``'UNK'`` and ``'BLANK'``;
    they are drawn from a normal distribution (mean 0, std 0.05), so they differ
    between runs unless the caller seeds ``np.random`` first.

    The matrix is also saved to ``./data/vec.npy`` (the directory is created if
    it does not exist yet).

    Args:
        path: path of the UTF-8 text embedding file.

    Returns:
        (vec, dim, word2id): ``vec`` is a float32 array with one row per token,
        ``dim`` is the vector length of the last row read, and ``word2id`` maps
        each token to its row index in ``vec``.

    Raises:
        ValueError: if the file contains no embedding rows.
    """
    print('reading word embedding data')
    vec = []  # one list of floats per token, in file order
    word2id = {}  # token -> row index in vec
    dim = 0
    with open(path, encoding='utf-8') as f:
        f.readline()  # skip the header line
        for line in f:
            fields = line.split()
            if not fields:
                # Blank lines (e.g. a trailing empty line) carry no embedding.
                continue
            # Use the row index as the id so ids stay aligned with vec even if
            # a token appears more than once in the file.
            word2id[fields[0]] = len(vec)
            values = [float(v) for v in fields[1:]]
            dim = len(values)
            vec.append(values)

    if not vec:
        raise ValueError('no embedding rows found in ' + str(path))

    # Register the "unknown" and "padding" tokens after the real vocabulary.
    word2id['UNK'] = len(vec)
    vec.append(np.random.normal(size=dim, loc=0, scale=0.05))
    word2id['BLANK'] = len(vec)
    vec.append(np.random.normal(size=dim, loc=0, scale=0.05))
    vec = np.array(vec, dtype=np.float32)

    # This is the first cell that writes into ./data, so make sure it exists.
    os.makedirs('./data', exist_ok=True)
    np.save('./data/vec.npy', vec)
    return vec, dim, word2id

In [5]:
# Load the character embeddings; readWordEmbedding also saves the matrix to ./data/vec.npy.
vec_path = './origin_data/vec.txt'
vector, dimen, word2id_dict = readWordEmbedding(vec_path)

reading word embedding data


# 读取关系

In [6]:
def readRealationToId(path):
    """
    Read the relation file and build a ``relation name -> integer id`` dict.

    Each non-blank line holds a relation name and its id separated by
    whitespace. Blank lines are ignored.

    Args:
        path: path of the UTF-8 relation file.

    Returns:
        dict mapping relation name (str) to id (int).
    """
    print('reading relation to id')
    relation2id = {}
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            pair = line.split()
            if not pair:
                # A line holding only a newline is truthy as a string, so the
                # emptiness check has to happen after splitting.
                continue
            relation2id[pair[0]] = int(pair[1])
    return relation2id

In [7]:
# Load the relation-name -> id mapping.
re_path = './origin_data/relation2id.txt'
relation2id_dict = readRealationToId(re_path)  # length is 12

reading relation to id


# 读取训练数据

In [7]:
def readTrainData(path, word2id, relation2id, fixlen=70, maxlen=60):
    """
    Read the training file and group encoded sentences by entity pair and label.

    Each non-blank line is split on whitespace into
    ``entity1 entity2 relation sentence``; blank lines are skipped.

    Every sentence is encoded as ``fixlen`` triples ``[char_id, pos1, pos2]``:
    ``char_id`` is the id of the character (``word2id['UNK']`` for characters
    missing from ``word2id``, ``word2id['BLANK']`` for padding past the end of
    the sentence), and ``pos1`` / ``pos2`` are ``pos_embed`` of the offset to
    the first occurrence of entity1 / entity2 (offset from index 0 when the
    entity is not found in the sentence). Sentences longer than ``fixlen`` are
    truncated.

    Args:
        path: path of the UTF-8 training file.
        word2id: dict mapping characters to ids; must contain 'UNK' and 'BLANK'.
        relation2id: dict mapping relation names to ids; relations missing from
            it fall back to ``relation2id['NA']``.
        fixlen: number of positions each sentence is padded / truncated to.
        maxlen: clipping distance passed to ``pos_embed``.

    Returns:
        (train_sen, train_ans):
        ``train_sen[(en1, en2)]`` is a list with one entry per distinct label of
        that pair, each entry being the list of encoded sentences carrying that
        label. ``train_ans[(en1, en2)]`` is the parallel list of one-hot label
        vectors (length ``len(relation2id)``).
    """
    train_sen = {}  # {entity pair: [[sentence, ...] for each distinct label]}
    train_ans = {}  # {entity pair: [one-hot label, ...]}

    print('reading train data')
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            content = line.split()
            if not content:
                # Skip blank lines instead of failing on content[0].
                continue

            en1, en2 = content[0], content[1]
            if content[2] in relation2id:
                relation = relation2id[content[2]]
            else:
                relation = relation2id['NA']
            sentence = content[3]

            # One-hot label for this line.
            label = [0] * len(relation2id)
            label[relation] = 1

            # Sentences of the same entity pair are bucketed by label;
            # label_tag is the bucket index inside that pair.
            tup = (en1, en2)
            if tup not in train_sen:
                train_sen[tup] = [[]]
                train_ans[tup] = [label]
                label_tag = 0
            else:
                label_tag = find_index(label, train_ans[tup])
                if label_tag == -1:
                    # First time this label is seen for the pair: open a new bucket.
                    train_ans[tup].append(label)
                    train_sen[tup].append([])
                    label_tag = len(train_ans[tup]) - 1

            # Entity positions: str.find returns -1 when absent, which maps to 0.
            en1pos = max(sentence.find(en1), 0)
            en2pos = max(sentence.find(en2), 0)

            output = []
            for i in range(fixlen):
                if i < len(sentence):
                    char = sentence[i]
                    word = word2id[char] if char in word2id else word2id['UNK']
                else:
                    word = word2id['BLANK']  # padding past the end of the sentence
                output.append([word,
                               pos_embed(i - en1pos, maxlen),
                               pos_embed(i - en2pos, maxlen)])

            train_sen[tup][label_tag].append(output)
    return train_sen, train_ans

In [8]:
# Parse the training file into per-entity-pair sentence buckets and one-hot labels.
train_path = './origin_data/train.txt'
train_sentences, train_labels = readTrainData(train_path, word2id_dict, relation2id_dict)

reading train data


In [9]:
# Display the label dict: entity pair -> list of one-hot label vectors.
train_labels

{('朱时茂', '陈佩斯'): [[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]],
 ('女', '卢润森'): [[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
 ('侯佩岑', '黄伯俊'): [[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
 ('李敖', '王尚勤'): [[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
 ('傅家俊', '丁俊晖'): [[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]],
 ('梁左', '梁天'): [[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]],
 ('司马懿', '诸葛亮'): [[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
 ('徐宗汉', '张竹君'): [[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
 ('慕容德', '慕容暐'): [[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
 ('李菁', '郭德纲'): [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]],
 ('唐杰忠', '刘伟'): [[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
 ('钱钟书', '辛笛'): [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]],
 ('元武', '元华'): [[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
 ('郭全宝', '郭启儒'): [[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]],
 ('傅全香', '吴小楼'): [[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
 ('巩俐', '黄和祥'): [[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
 ('郑家钧', '夏明翰'): [[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
 ('李嘉昭', '李云经'): [[1, 0, 0, 0, 0, 0, 0, 0,

# 读取测试数据

In [10]:
def readTestData(path, word2id, relation2id, fixlen=70, maxlen=60):
    """
    Read the test file and group encoded sentences by entity pair.

    Each non-blank line is split on whitespace into
    ``entity1 entity2 relation sentence``; blank lines are skipped.

    Sentences are encoded as ``fixlen`` triples ``[char_id, pos1, pos2]``:
    ``char_id`` falls back to ``word2id['UNK']`` for unknown characters and to
    ``word2id['BLANK']`` for padding, and ``pos1`` / ``pos2`` are ``pos_embed``
    of the offset to the first occurrence of each entity (index 0 when the
    entity is not found). Sentences longer than ``fixlen`` are truncated.

    Args:
        path: path of the UTF-8 test file.
        word2id: dict mapping characters to ids; must contain 'UNK' and 'BLANK'.
        relation2id: dict mapping relation names to ids; relations missing from
            it fall back to ``relation2id['NA']``.
        fixlen: number of positions each sentence is padded / truncated to.
        maxlen: clipping distance passed to ``pos_embed``.

    Returns:
        (test_sen, test_ans):
        ``test_sen[(en1, en2)]`` is the flat list of encoded sentences of that
        pair. ``test_ans[(en1, en2)]`` is a single N-hot vector (length
        ``len(relation2id)``) with a 1 for every relation seen for the pair.
    """
    print('reading test data')

    test_sen = {}  # {entity pair: [sentence 1, sentence 2, ...]}
    test_ans = {}  # {entity pair: N-hot label vector}

    # Read from the caller-supplied path rather than a fixed file name.
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            content = line.split()
            if not content:
                # Skip blank lines instead of failing on content[0].
                continue

            en1, en2 = content[0], content[1]
            if content[2] in relation2id:
                relation = relation2id[content[2]]
            else:
                relation = relation2id['NA']
            sentence = content[3]

            tup = (en1, en2)
            if tup not in test_sen:
                test_sen[tup] = []
                label = [0] * len(relation2id)
                label[relation] = 1
                test_ans[tup] = label
            else:
                # Same pair, possibly another relation: accumulate into the N-hot vector.
                test_ans[tup][relation] = 1

            # Entity positions: str.find returns -1 when absent, which maps to 0.
            en1pos = max(sentence.find(en1), 0)
            en2pos = max(sentence.find(en2), 0)

            output = []
            for i in range(fixlen):
                if i < len(sentence):
                    char = sentence[i]
                    word = word2id[char] if char in word2id else word2id['UNK']
                else:
                    word = word2id['BLANK']  # padding past the end of the sentence
                output.append([word,
                               pos_embed(i - en1pos, maxlen),
                               pos_embed(i - en2pos, maxlen)])

            test_sen[tup].append(output)
    return test_sen, test_ans

In [11]:
# Parse the test file into per-entity-pair sentence lists and N-hot labels.
test_path = './origin_data/test.txt'
test_sentences, test_labels = readTestData(test_path, word2id_dict, relation2id_dict)

reading test data


## 注意（可能引起的问题）：

train_sentences比test_sentences每一个value中要多一层列表括号；

train_labels比test_labels每一个value中要多一层列表括号

In [12]:
# Create the directory that holds the generated train / test files.
# NOTE(review): readWordEmbedding, called in an earlier cell, also saves into ./data,
# so that cell relies on this directory as well.
if not os.path.exists("data"):
    os.makedirs("data")

# 生成训练数据

In [13]:
def generateTrainData(train_sen, train_ans):
    """
    Flatten the per-entity-pair training dicts into arrays and save them.

    One sample is produced per (entity pair, label) bucket: the sample is the
    list of encoded sentences in that bucket and its target is the matching
    one-hot label. A line ``index<TAB>entity1<TAB>entity2<TAB>label_index`` is
    written to ``./data/train_q&a.txt`` for every sample, and the arrays are
    saved to ``./data/train_x.npy`` and ``./data/train_y.npy``.

    Args:
        train_sen: dict ``{(en1, en2): [[sentence, ...], ...]}``.
        train_ans: dict ``{(en1, en2): [one-hot label, ...]}`` parallel to ``train_sen``.

    Returns:
        (train_x, train_y) as numpy arrays. ``train_x`` is a 1-D object array of
        Python lists when the buckets hold different numbers of sentences.
    """
    def _as_array(items):
        # Buckets usually hold different numbers of sentences. NumPy >= 1.24
        # raises ValueError for such ragged input, so fall back to an explicit
        # 1-D object array with one Python list per sample.
        try:
            return np.array(items)
        except ValueError:
            boxed = np.empty(len(items), dtype=object)
            for idx, item in enumerate(items):
                boxed[idx] = item
            return boxed

    print('organizing train data')
    train_x = []
    train_y = []
    with open('./data/train_q&a.txt', 'w', encoding='utf-8') as f:
        temp = 0  # running sample index
        for tup in train_sen:
            labels = train_ans[tup]
            buckets = train_sen[tup]
            if len(labels) != len(buckets):  # labels and sentence buckets must pair up
                print('ERROR')
            for j in range(len(labels)):
                train_x.append(buckets[j])
                train_y.append(labels[j])
                # index, entity 1, entity 2, position of the 1 in the one-hot label
                f.write(str(temp) + '\t' + tup[0] + '\t' + tup[1] + '\t' + str(np.argmax(labels[j])) + '\n')
                temp += 1
    train_x = _as_array(train_x)
    train_y = np.array(train_y)
    np.save('./data/train_x.npy', train_x)
    np.save('./data/train_y.npy', train_y)
    return train_x, train_y

In [14]:
# Flatten the training dicts into arrays and save them under ./data.
train_data, train_label = generateTrainData(train_sentences, train_labels)

organizing train data




# 生成测试数据

In [15]:
def generateTestData(test_sen, test_ans):
    """
    Flatten the per-entity-pair test dicts into arrays and save them.

    One sample is produced per entity pair: the sample is the list of encoded
    sentences of that pair and its target is the pair's N-hot label vector.
    A line ``index<TAB>entity1<TAB>entity2<TAB>`` followed by every non-zero
    label index (each followed by a TAB) is written to ``./data/test_q&a.txt``,
    and the arrays are saved to ``./data/testall_x.npy`` and
    ``./data/testall_y.npy``.

    Args:
        test_sen: dict ``{(en1, en2): [sentence, ...]}``.
        test_ans: dict ``{(en1, en2): N-hot label vector}``.

    Returns:
        (test_x, test_y) as numpy arrays. ``test_x`` is a 1-D object array of
        Python lists when pairs hold different numbers of sentences.
    """
    def _as_array(items):
        # Pairs usually hold different numbers of sentences. NumPy >= 1.24
        # raises ValueError for such ragged input, so fall back to an explicit
        # 1-D object array with one Python list per sample.
        try:
            return np.array(items)
        except ValueError:
            boxed = np.empty(len(items), dtype=object)
            for idx, item in enumerate(items):
                boxed[idx] = item
            return boxed

    print('organizing test data')
    test_x = []
    test_y = []
    with open('./data/test_q&a.txt', 'w', encoding='utf-8') as f:
        temp = 0  # running sample index
        for tup in test_sen:
            label = test_ans[tup]
            test_x.append(test_sen[tup])
            test_y.append(label)
            # Indices of the active labels, each followed by a tab.
            tempstr = ''.join(str(j) + '\t' for j in range(len(label)) if label[j] != 0)
            # index, entity 1, entity 2, active label indices
            f.write(str(temp) + '\t' + tup[0] + '\t' + tup[1] + '\t' + tempstr + '\n')
            temp += 1
    test_x = _as_array(test_x)
    test_y = np.array(test_y)
    np.save('./data/testall_x.npy', test_x)
    np.save('./data/testall_y.npy', test_y)
    return test_x, test_y

In [16]:
# Flatten the test dicts into arrays and save them under ./data.
test_data, test_label = generateTestData(test_sentences, test_labels)

organizing test data


In [17]:
# Display the second training sample: a list of sentences, each a list of [char_id, pos1, pos2].
train_data[1]

[[[1053, 32, 46],
  [4159, 33, 47],
  [613, 34, 48],
  [11, 35, 49],
  [108, 36, 50],
  [99, 37, 51],
  [311, 38, 52],
  [242, 39, 53],
  [90, 40, 54],
  [229, 41, 55],
  [151, 42, 56],
  [529, 43, 57],
  [33, 44, 58],
  [51, 45, 59],
  [713, 46, 60],
  [1053, 47, 61],
  [1803, 48, 62],
  [734, 49, 63],
  [1, 50, 64],
  [723, 51, 65],
  [236, 52, 66],
  [16115, 53, 67],
  [322, 54, 68],
  [10, 55, 69],
  [90, 56, 70],
  [229, 57, 71],
  [33, 58, 72],
  [324, 59, 73],
  [92, 60, 74],
  [265, 61, 75],
  [1, 62, 76],
  [1053, 63, 77],
  [4159, 64, 78],
  [613, 65, 79],
  [11, 66, 80],
  [61, 67, 81],
  [313, 68, 82],
  [45, 69, 83],
  [24, 70, 84],
  [16115, 71, 85],
  [322, 72, 86],
  [64, 73, 87],
  [996, 74, 88],
  [1532, 75, 89],
  [16115, 76, 90],
  [445, 77, 91],
  [82, 78, 92],
  [205, 79, 93],
  [16116, 80, 94],
  [16116, 81, 95],
  [16116, 82, 96],
  [16116, 83, 97],
  [16116, 84, 98],
  [16116, 85, 99],
  [16116, 86, 100],
  [16116, 87, 101],
  [16116, 88, 102],
  [16116, 89, 10

# 将字id，实体距离1，实体距离2数据分离

In [18]:
def seperate():
    """
    Split the saved ``[char_id, pos1, pos2]`` triples into three parallel arrays.

    Loads ``./data/train_x.npy`` and ``./data/testall_x.npy`` and, for each of
    them, writes three files that keep the sample / sentence / token nesting
    but hold a single channel each:

    * ``./data/train_word.npy``, ``./data/train_pos1.npy``, ``./data/train_pos2.npy``
    * ``./data/testall_word.npy``, ``./data/testall_pos1.npy``, ``./data/testall_pos2.npy``

    Takes no arguments and returns nothing.
    """
    def _as_array(items):
        # Samples usually hold different numbers of sentences. NumPy >= 1.24
        # raises ValueError for such ragged input, so fall back to an explicit
        # 1-D object array with one Python list per sample.
        try:
            return np.array(items)
        except ValueError:
            boxed = np.empty(len(items), dtype=object)
            for idx, item in enumerate(items):
                boxed[idx] = item
            return boxed

    def _split_channels(samples):
        # Pull element 0 / 1 / 2 out of every token triple, keeping the nesting.
        words, pos1, pos2 = [], [], []
        for sample in samples:
            words.append([[token[0] for token in sentence] for sentence in sample])
            pos1.append([[token[1] for token in sentence] for sentence in sample])
            pos2.append([[token[2] for token in sentence] for sentence in sample])
        return _as_array(words), _as_array(pos1), _as_array(pos2)

    print('reading training data')
    # NOTE: allow_pickle=True unpickles arbitrary objects; only load files
    # produced by this notebook.
    x_train = np.load('./data/train_x.npy', allow_pickle=True)

    print('seprating train data')
    train_word, train_pos1, train_pos2 = _split_channels(x_train)
    np.save('./data/train_word.npy', train_word)
    np.save('./data/train_pos1.npy', train_pos1)
    np.save('./data/train_pos2.npy', train_pos2)

    print('seperating test all data')
    x_test = np.load('./data/testall_x.npy', allow_pickle=True)
    test_word, test_pos1, test_pos2 = _split_channels(x_test)
    np.save('./data/testall_word.npy', test_word)
    np.save('./data/testall_pos1.npy', test_pos1)
    np.save('./data/testall_pos2.npy', test_pos2)

In [19]:
# Split the saved train / test arrays into separate word, pos1 and pos2 files.
seperate()

reading training data
seprating train data
seperating test all data




# 保存vec中的字

In [20]:
def get_metadata():
    """
    Write the tokens of ``./origin_data/vec.txt`` to ``./data/metadata.tsv``, one per line.

    The first line of the embedding file is skipped. Reading stops at the first
    line that is empty after stripping (or at end of file); for every line before
    that, only the first whitespace-separated field is written.
    """
    with open('./data/metadata.tsv', 'w', encoding='utf-8') as fwrite, \
            open('./origin_data/vec.txt', encoding='utf-8') as f:
        f.readline()  # header line
        for raw_line in f:
            stripped = raw_line.strip()
            if stripped == '':
                break
            token = stripped.split()[0]
            fwrite.write(token + '\n')

In [21]:
# Write the vocabulary tokens of vec.txt to ./data/metadata.tsv.
get_metadata()