In [1]:
import pickle
import spacy
import math
import copy
import xgboost as xgb
import numpy as np
import time

In [2]:
### Read the Training Data
train_file = './Data/train.pickle'
train_mentions = pickle.load(open(train_file, 'rb'))

### Read the Training Labels...
train_label_file = './Data/train_labels.pickle'
train_labels = pickle.load(open(train_label_file, 'rb'))

### Read the Dev Data... (For Final Evaluation, we will replace it with the Test Data)
dev_file = './Data/dev.pickle'
dev_mentions = pickle.load(open(dev_file, 'rb'))

### Read the Parsed Entity Candidate Pages...
fname = './Data/parsed_candidate_entities.pickle'
parsed_entity_pages = pickle.load(open(fname, 'rb'))

### Read the Mention docs...
mens_docs_file = "./Data/men_docs.pickle"
men_docs = pickle.load(open(mens_docs_file, 'rb'))

dev_label_file = './Data/dev_labels.pickle'
dev_labels = pickle.load(open(dev_label_file, 'rb'))

In [3]:
def initiation(documents):
    global tf_entities, tf_tokens, idf_entities, idf_tokens

    tf_entities = {}
    tf_tokens = {}
    idf_tokens = {}
    idf_entities = {}
    ents_counter = {} # 用于和token中的entity抵消
    for i in documents:
        doc = nlp(documents[i])
        for n in doc.ents:
            if len(n) == 1:
                for tmp in n:
                    e = n.lemma_
            else:
                e = n.orth_
            if e not in tf_entities:
                tf_entities[e] = {}
                tf_entities[e][i] = 1
                if len(n) == 1:
                    ents_counter[e] = {}
                    ents_counter[e][i] = 1
            elif i not in tf_entities[e]:
                tf_entities[e][i] = 1
                if len(n) == 1:
                    ents_counter[e][i] = 1
            else:
                tf_entities[e][i] += 1
                if len(n) == 1:
                    ents_counter[e][i] += 1

        for text in doc:
            if not text.is_punct and not text.is_space and not text.is_stop:
                word = text.lemma_
                if word not in ents_counter or i not in ents_counter[word]:
                    if word not in tf_tokens:
                        tf_tokens[word] = {}
                        tf_tokens[word][i] = 1
                    elif i not in tf_tokens[word]:
                        tf_tokens[word][i] = 1
                    else:
                        tf_tokens[word][i] += 1
                else:
                    if ents_counter[word][i] > 1:
                        ents_counter[word][i] -= 1
                    else:
                        del ents_counter[word][i]

    idf_tokens = copy.deepcopy(tf_tokens)
    idf_entities = copy.deepcopy(tf_entities)

    # update the score
    for key in idf_tokens:
        for i in idf_tokens[key]:
            idf_tokens[key][i] = (1+math.log(1+math.log(idf_tokens[key][i]))) * (1+math.log(len(documents)/(1+len(idf_tokens[key]))))
    for key in idf_entities:
        for i in idf_entities[key]:
            idf_entities[key][i] = (1+math.log(idf_entities[key][i])) * (1+math.log(len(documents)/(1+len(idf_entities[key]))))
    
    return tf_entities, tf_tokens, idf_entities, idf_tokens

def tf_idf(candidate, doc_id):
    tokens_score = entities_score = 0
    candidate = nlp(candidate.replace('_', ' '))
    for ent in candidate.ents:
        if ent.lemma_ in idf_entities and doc_id in idf_entities[ent.lemma_]:
            entities_score += idf_entities[ent.lemma_][doc_id]

    for token in candidate:
        if token.lemma_ in idf_tokens and doc_id in idf_tokens[token.lemma_]:
            tokens_score += idf_tokens[token.lemma_][doc_id]

    combined_score = (entities_score + tokens_score * 0.4)/len(candidate)

    return combined_score

def minDistance(word1, word2):
    if not word1:
        return len(word2 or '') or 0

    if not word2:
        return len(word1 or '') or 0

    size1 = len(word1)
    size2 = len(word2)

    last = 0
    tmp = list(range(size2 + 1))
    value = None

    for i in list(range(size1)):
        tmp[0] = i + 1
        last = i
        for j in range(size2):
            if word1[i] == word2[j]:
                value = last
            else:
                value = 1 + min(last, tmp[j], tmp[j + 1])
            last = tmp[j+1]
            tmp[j+1] = value
    return value

def cosine(str1, str2):
    str2 = str2.replace('_', ' ')
    list_word1 = str1.split()
#     print(list_word1)
    list_word2 = str2.split()
#     print(list_word2)
    key_word = list(set(list_word1 + list_word2))
    word_vector1 = np.zeros(len(key_word))
    word_vector2 = np.zeros(len(key_word))
    count = 0
    flag = True
    for i in range(len(key_word)):
        for j in range(len(list_word1)):
            if key_word[i] == list_word1[j]:
                word_vector1[i] += 1
     
        for k in range(len(list_word2)):
            if key_word[i] == list_word2[k]:
                word_vector2[i] += 1
            if list_word2[k] in list_word1 and flag:
                count += 1  
        flag = False

    dist1=float(np.dot(word_vector1,word_vector2)/(np.linalg.norm(word_vector1)*np.linalg.norm(word_vector2)))
    count = count/len(list_word1)
    return dist1,count

def label_value(data, label):
    lis = []
    for i in data:
        for word in data[i]['candidate_entities']:
            if word == label[i]['label']:
                lis.append(1)
            else:
                lis.append(0)
    return np.array(lis)

def extract_features(mentions):
    distance_list = []
    tfidf_list = []
    same_counter = []
    cosine_list = []
    for i in mentions:
        for candidate in mentions[i]['candidate_entities']:
            distance_list.append(minDistance(mentions[i]['mention'], candidate))
            tfidf_list.append(tf_idf(candidate,mentions[i]['doc_title']))
            cosine_value, same_count = cosine(mentions[i]['mention'], candidate)
            cosine_list.append(cosine_value)
            same_counter.append(same_count)
    return np.array([tfidf_list, distance_list, cosine_list, same_counter]).T 

def data_group(mentions):
    data_group = []
    for i in mentions:
        data_group.append(len(mentions[i]['candidate_entities']))
    data_groups = np.array(data_group)
    return data_groups

def transform_data(features, groups, labels=None):
    xgb_data = xgb.DMatrix(data=features, label=labels)
    xgb_data.set_group(groups)
    return xgb_data

def compute_accuracy(result, data_labels):
    assert set(list(result.keys())) - set(list(data_labels.keys())) == set()
    TP = 0.0
    for id_ in result.keys():
        if result[id_] == data_labels[id_]['label']:
            TP +=1
    assert len(result) == len(data_labels)
    return TP/len(result)

In [4]:
global nlp
nlp = spacy.load('en_core_web_sm')
start = time.time()
initiation(men_docs)
print("initiation cost time: {}".format(time.time()-start))

start = time.time()
train_data = extract_features(train_mentions)
train_groups = data_group(train_mentions)
train_label = label_value(train_mentions,train_labels)
xgboost_train = transform_data(train_data, train_groups, train_label)
print("train data set cost time: {}".format(time.time()-start))

# Test Data
start = time.time()
test_data = extract_features(dev_mentions)
test_groups = data_group(dev_mentions)
xgboost_test = transform_data(test_data, test_groups)
print("test data set cost time: {}".format(time.time()-start))

initiation cost time: 4.897442817687988
train data set cost time: 72.26122903823853
test data set cost time: 21.699875831604004


In [23]:
start = time.time()
param = {'max_depth': 7, 'eta': 0.05, 'silent': 1, 'objective': 'rank:pairwise', 'min_child_weight': 0.01, 'lambda':100} # 290-0.54658
classifier = xgb.train(param, xgboost_train, num_boost_round=2000)
##  Predict test data...
# 4 1650 0.7922535211267606
# 8 4900 0.8133802816901409
# 8 1690 0.8133802816901409
# 7 1690 0.8169014084507042
# 7 2000 0.8169014084507042
# 7 2100 0.8204225352112676
# 7 2300 0.8204225352112676
# 7 2500 0.8204225352112676
# 7 2590 0.8204225352112676
# 7 4900 0.8133802816901409
# 6 1690 0.7992957746478874
# 6 4900 0.8169014084507042

preds = classifier.predict(xgboost_test)
print("predict cost time: {}".format(time.time()-start))

start = time.time()
result = []
counter = 0
preds = list(preds)
test_group = list(test_groups)

for i in range(len(test_group)):
    tmp = preds[counter:(test_group[i]+counter)]
    result.append(tmp.index(max(tmp)))
    counter += test_group[i]

result_dict = {}
for i in range(len(result)):
    result_dict[i+1] = dev_mentions[i+1]['candidate_entities'][result[i]]
print("build result cost time: {}".format(time.time()-start))
compute_accuracy(result_dict, dev_labels)

predict cost time: 11.29089903831482
build result cost time: 0.0013039112091064453


0.8169014084507042

In [None]:
acc_dict

In [None]:
{0.5403726708074534: {'depth': 8, 'num': 650},
 0.5527950310559007: {'depth': 6, 'num': 700},
 0.5590062111801242: {'depth': 6, 'num': 650},
 0.5652173913043478: {'depth': 1, 'num': 3100},
 0.5900621118012422: {'depth': 1, 'num': 400},
 0.5838509316770186: {'depth': 1, 'num': 550},
 0.577639751552795: {'depth': 1, 'num': 2900},
 0.5714285714285714: {'depth': 1, 'num': 3050},
 0.546583850931677: {'depth': 7, 'num': 450},
 0.5341614906832298: {'depth': 8, 'num': 700}}

In [5]:
min_acc = 0
acc_dict = {}
for depth in range(1,10):
    jishu = 0
    flag = False
    num = 100
    while num < 5000:
        param = {'max_depth': depth, 'eta': 0.05, 'silent': 1, 'objective': 'rank:pairwise', 'min_child_weight': 0.01, 'lambda':100} # 290-0.54658
        classifier = xgb.train(param, xgboost_train, num_boost_round=num)
        ##  Predict test data...
        preds = classifier.predict(xgboost_test)

        result = []
        counter = 0
        preds = list(preds)
        test_group = list(test_groups)

        for i in range(len(test_group)):
            tmp = preds[counter:(test_group[i]+counter)]
            result.append(tmp.index(max(tmp)))
            counter += test_group[i]

        result_dict = {}
        for i in range(len(result)):
            result_dict[i+1] = dev_mentions[i+1]['candidate_entities'][result[i]]
        accuracy = compute_accuracy(result_dict, dev_labels)
        
        if len(acc_dict) >= 10:
            min_acc = min(acc_dict.keys())
            if accuracy > min_acc:
                flag = False
                del acc_dict[min_acc]
                acc_dict[accuracy] = {}
                acc_dict[accuracy]['depth'] = depth
                acc_dict[accuracy]['num'] = num
                num += 50
            else:
                flag = True
                if flag:
                    jishu += 1
                if jishu > 5 and jishu <= 10:
                    num += 300
                elif jishu > 10:
                    break
                else:
                    num += 50
        else:
            acc_dict[accuracy] = {}
            acc_dict[accuracy]['depth'] = depth
            acc_dict[accuracy]['num'] = num
            num += 50
            
        print(depth,num,flag,jishu)

1 150 False 0
1 200 False 0
1 250 False 0
1 300 False 0
1 350 False 0
1 400 False 0
1 450 False 0
1 500 False 0
1 550 False 0
1 600 False 0
1 650 False 0
1 700 False 0
1 750 False 0
1 800 False 0
1 850 False 0
1 900 False 0
1 950 False 0
1 1000 False 0
1 1050 False 0
1 1100 False 0
1 1150 False 0
1 1200 False 0
1 1250 False 0
1 1300 False 0
1 1350 False 0
1 1400 False 0
1 1450 False 0
1 1500 False 0
1 1550 False 0
1 1600 False 0
1 1650 False 0
1 1700 False 0
1 1750 False 0
1 1800 False 0
1 1850 False 0
1 1900 False 0
1 1950 False 0
1 2000 False 0
1 2050 False 0
1 2100 False 0
1 2150 False 0
1 2200 False 0
1 2250 False 0
1 2300 False 0
1 2350 False 0
1 2400 False 0
1 2450 False 0
1 2500 False 0
1 2550 False 0
1 2600 False 0
1 2650 False 0
1 2700 False 0
1 2750 False 0
1 2800 False 0
1 2850 False 0
1 2900 False 0
1 2950 False 0
1 3000 False 0
1 3050 False 0
1 3100 False 0
1 3150 False 0
1 3200 False 0
1 3250 False 0
1 3300 False 0
1 3350 False 0
1 3400 False 0
1 3450 False 0
1 3500 False