In [1]:
import os
# 使用第三张GPU卡
os.environ["CUDA_VISIBLE_DEVICES"] = "3"

In [2]:
import json
import random
from pyjarowinkler import distance
import numpy as np
train=True
# 训练集中的作者论文信息
with open("train/train_author.json", "r") as f2:
    author_data = json.load(f2)

# 训练集的论文元信息
with open("train/train_pub.json", "r") as f2:
    pubs_dict = json.load(f2)

print(len(author_data))


name_train = set()

# 筛选训练集，只取同名作者数大于等于5个的名字作为训练集。
for name in author_data:
    persons = author_data[name]
    if(len(persons) > 5):
        name_train.add((name))

print(len(name_train))

# 采样500个训练例子，一个训练例子包含paper和正例作者以及5个负例作者（正负例比=1：5）

# 记录paper所属作者和名字
paper2aid2name = {}

for author_name in name_train:
    persons = author_data[author_name]
    for person in persons:
        paper_list = persons[person]
        for paper_id in paper_list:
            paper2aid2name[paper_id] = (author_name, person)

print(len(paper2aid2name))
# print(paper2aid2name)

total_paper_list = list(paper2aid2name.keys())

# 采样10000篇paper作为训练集
print(len(total_paper_list))
# train_paper_list = random.sample(total_paper_list, 40000)  # comment it to train all
train_paper_list = total_paper_list

# train_paper_list = []
    
# 把采样的500篇paper转变成对应的训练例子，一个训练例子包含paper和正例作者以及5个负例作者（正负例比=1：5）
train_instances = []
for paper_id in train_paper_list:
    
    # 保存对应的正负例
    pos_ins = set()
    neg_ins = set()
    
    paper_author_name = paper2aid2name[paper_id][0]
    paper_author_id = paper2aid2name[paper_id][1]
    
    pos_ins.add((paper_id, paper_author_id))
    
    # 获取同名的所有作者(除了本身)作为负例的candidate
    persons = list(author_data[paper_author_name].keys())
    persons.remove(paper_author_id)
    assert len(persons) == (len(list(author_data[paper_author_name].keys())) - 1)
    
    # 每个正例采样5个负例
    neg_author_list = random.sample(persons, 5)
    for i in neg_author_list:
        neg_ins.add((paper_id, i))
        
    train_instances.append((pos_ins, neg_ins))
    
print(len(train_instances))

221
196
198607
198607
198607


In [3]:
from pyjarowinkler import distance


# 对author_name 进行清洗
def clean_name(name):
    if name is None:
        return ""
    x = [k.strip() for k in name.lower().strip().replace(".", "").replace("-", " ").replace("_", ' ').split()]
    # x = [k.strip() for k in name.lower().strip().replace("-", "").replace("_", ' ').split()]
    full_name = ' '.join(x)
    name_part = full_name.split()
    if(len(name_part) >= 1):
        return full_name
    else:
        return None

# 找出paper中author_name所对应的位置
def delete_main_name(author_list, name):
    score_list = []
    name = clean_name(name)
    author_list_lower = []
    for author in author_list:
        author_list_lower.append(author.lower())
    name_split = name.split()
    for author in author_list_lower:
        # lower_name = author.lower()
        score = distance.get_jaro_distance(name, author, winkler=True, scaling=0.1)
        author_split = author.split()
        inter = set(name_split) & set(author_split)
        alls = set(name_split) | set(author_split)
        score += round(len(inter)/len(alls), 6)
        score_list.append(score)

    rank = np.argsort(-np.array(score_list))
    return_list = [author_list_lower[i] for i in rank[1:]]

    return return_list, rank[0]

# 训练集特征生成函数
def process_feature_coauthor(pos_ins, paper_coauthors):
    
    feature_list = []

    paper = pos_ins[0] 
    author = pos_ins[1]


    paper_name = paper2aid2name[paper][0]
    
    # 从作者的论文列表中把该篇论文去掉，防止训练出现bias
    doc_list = []
    for doc in author_data[paper_name][author]:
        if(doc != paper):
            doc_list.append(doc)
    for doc in doc_list:
        if doc == paper:
            print("error!")
            exit()
    
    # 保存作者的所有paper的coauthors以及各自出现的次数(作者所拥有论文的coauthors)
    candidate_authors_int = defaultdict(int)

    total_author_count = 0
    for doc in doc_list:
        
        doc_dict = pubs_dict[doc]
        author_list = []

        paper_authors = doc_dict['authors']
        paper_authors_len = len(paper_authors)
#         paper_authors = random.sample(paper_authors, min(50, paper_authors_len))
    
        for author in paper_authors:                
            clean_author = clean_name(author['name'])
            if(clean_author != None):
                author_list.append(clean_author)
        if(len(author_list) > 0):
            # 获取paper中main author_name所对应的位置
            _, author_index = delete_main_name(author_list, paper_name)

            # 获取除了main author_name外的coauthor
            for index in range(len(author_list)):
                if(index == author_index):
                    continue
                else:
                    candidate_authors_int[author_list[index]] += 1
                    total_author_count += 1

    # author 的所有不同coauthor name
    author_keys = list(candidate_authors_int.keys())

    if ((len(author_keys) == 0) or (len(paper_coauthors) == 0)):
        feature_list.extend([0.] * 5)
    else:
        co_coauthors = set(paper_coauthors) & set(author_keys)
        coauthor_len = len(co_coauthors)
        
        
        co_coauthors_ratio_for_paper = round(coauthor_len / len(paper_coauthors), 6)
        co_coauthors_ratio_for_author = round(coauthor_len / len(author_keys), 6)
        
        coauthor_count = 0
        for coauthor_name in co_coauthors:
            coauthor_count += candidate_authors_int[coauthor_name]
            
        
        
        co_coauthors_ratio_for_author_count = round(coauthor_count / total_author_count, 6)

        # 计算了5维paper与author所有的paper的coauthor相关的特征：
        #    1. 不重复的coauthor个数
        #    2. 不重复的coauthor个数 / paper的所有coauthor的个数
        #    3. 不重复的coauthor个数 / author的所有paper不重复coauthor的个数
        #    4. coauthor个数（含重复）
        #    4. coauthor个数（含重复）/ author的所有paper的coauthor的个数（含重复）
        feature_list.extend([coauthor_len, co_coauthors_ratio_for_paper, co_coauthors_ratio_for_author, coauthor_count, co_coauthors_ratio_for_author_count])
        
#         print(feature_list)
    return feature_list

In [4]:
def process_feature_keywords(pos_ins, topic_list):
    
    feature_list = []

    paper = pos_ins[0] 
    author = pos_ins[1]


    paper_name = paper2aid2name[paper][0]
    
    # 从作者的论文列表中把该篇论文去掉，防止训练出现bias
    doc_list = []
    for doc in author_data[paper_name][author]:
        if(doc != paper):
            doc_list.append(doc)
    for doc in doc_list:
        if doc == paper:
            print("error!")
            exit()
    
    # 保存作者的所有paper的coauthors以及各自出现的次数(作者所拥有论文的coauthors)
    candidate_keywords_int = defaultdict(int)

    total_keyword_count = 0
    for doc in doc_list:
        
        doc_dict = pubs_dict[doc]
        keyword_list = []
        if ('keywords' not in doc_dict.keys()):
            feature_list.extend([0.] * 5)
            return feature_list
        paper_keywords = doc_dict['keywords']
        paper_keywords_len = len(paper_keywords)
        paper_keywords = random.sample(paper_keywords, min(10, paper_keywords_len))
        
        for keywords in paper_keywords:
            clean_keywords=clean_name(keywords)
            if clean_keywords!=None:
                keyword_list.append(clean_keywords)
        if(len(keyword_list) > 0):
            # 获取除了main author_name外的coauthor
            for index in range(len(keyword_list)):
                candidate_keywords_int[keyword_list[index]] += 1
                total_keyword_count += 1

    # author 的所有不同coauthor name
    author_keys = list(candidate_keywords_int.keys())

    if ((len(author_keys) == 0) or (len(paper_keywords) == 0)):
        feature_list.extend([0.] * 5)
    else:
        co_keywords = set(paper_keywords) & set(author_keys)
        keyword_len = len(co_keywords)
#         same=[x for x in paper_keywords if  x in author_keys]
        co_keywords_ratio_for_paper = round(keyword_len / len(paper_keywords), 6)
        co_keywords_ratio_for_author = round(keyword_len / len(author_keys), 6)
        
        keywords_count = 0
        for keywords_name in co_keywords:
            keywords_count += candidate_keywords_int[keywords_name]
            
        
        
        co_keywords_ratio_for_author_count = round(keywords_count / total_keyword_count, 6)

        
        feature_list.extend([keyword_len, co_keywords_ratio_for_paper, co_keywords_ratio_for_paper, keywords_count, co_keywords_ratio_for_author_count])
        
#         print(feature_list)
    return feature_list

In [6]:
import tqdm
from collections import defaultdict

pos_features_author = []
neg_features_author = []
pos_features_keywords = []
neg_features_keywords = []
pos_features_org = []
neg_features_org = []
print(len(train_instances))

for ins in tqdm.tqdm(train_instances):
    
    pos_set = ins[0]
    neg_set = ins[1]
    paper_id = list(pos_set)[0][0]
    paper_name = paper2aid2name[paper_id][0]
    author_name=None
    
    author_list = []
    # 获取paper的coauthors
    paper_coauthors = []
    feature_coauthor=[]
    
    paper_authors = pubs_dict[paper_id]['authors']
    paper_authors_len = len(paper_authors)
    # 只取前50个author以保证效率
#     paper_authors = random.sample(paper_authors, min(50, paper_authors_len))

#     for author in paper_authors:                
#         clean_author = clean_name(author['name'])
#         if(clean_author != None):
#             author_list.append(clean_author)
    author_list.extend([x 
                        for x in [clean_name(xx['name']) 
                                  for xx in paper_authors] 
                        if x is not None])  # simplify code
    
    if(len(author_list) > 0):
        # 获取paper中main author_name所对应的位置
        _, author_index = delete_main_name(author_list, paper_name)
        
        # 获取除了main author_name外的coauthor
        for index in range(len(author_list)):
            if(index == author_index):
                author_name=author_list[author_index]
                continue
            else:
                paper_coauthors.append(author_list[index])
        
    
#         for pos_ins in pos_set:
#             pos_features_author.append(process_feature_coauthor(pos_ins, paper_coauthors))
#         for neg_ins in neg_set:
#             neg_features_author.append(process_feature_coauthor(neg_ins, paper_coauthors))
        pos_features_author.extend([process_feature_coauthor(xx, paper_coauthors)
                                    for xx in pos_set])
        neg_features_author.extend([process_feature_coauthor(xx, paper_coauthors)
                                    for xx in neg_set])
    else:
#         for pos_ins in pos_set:
#             pos_features_author.append([0.] * 5)
#         for neg_ins in neg_set:
#             neg_features_author.append([0.] * 5)
        pos_features_author.extend([[0.] * 5] * len(pos_set))
        neg_features_author.extend([[0.] * 5] * len(neg_set))
        
#         处理话题
    topic_list=[]
    if 'keywords' in pubs_dict[paper_id].keys():
        paper_topics=pubs_dict[paper_id]['keywords']
    
    paper_topics_len=len(paper_topics)
    paper_topics=random.sample(paper_topics,min(10,paper_topics_len))
    
#     for topic in paper_topics:
#         clean_topic=clean_name(topic)
#         if clean_topic!=None:
#             topic_list.append(clean_topic)
    topic_list.extend([x
                       for x in [clean_name(xx)
                                 for xx in paper_topics]
                       if x is not None])
    if len(topic_list)>0:
#         for pos_ins in pos_set:
#             pos_features_keywords.append(process_feature_keywords(pos_ins,topic_list))
#         for neg_ins in neg_set:
#             neg_features_keywords.append(process_feature_keywords(neg_ins,topic_list))
        pos_features_keywords.extend([process_feature_keywords(xx, topic_list)
                                      for xx in pos_set])
        neg_features_keywords.extend([process_feature_keywords(xx, topic_list)
                                      for xx in neg_set])
    else:
#         for pos_ins in pos_set:
#             pos_features_keywords.append([0.] * 5)
#         for neg_ins in neg_set:
#             neg_features_keywords.append([0.] * 5)
        pos_features_keywords.extend([[0.] * 5] * len(pos_set))
        neg_features_keywords.extend([[0.] * 5] * len(neg_set))
            
            
#     #         处理org
    
    
#     name = clean_name(paper_name)
#     author_list_lower = []
#     for author in author_list:
#         author_list_lower.append(author.lower())
#     name_split = name.split()
#     org=None
    
# #     print(pubs_dict[paper_id].keys())
#     paper_authors=pubs_dict[paper_id]['authors']
#     for author in paper_authors:
#         aname=clean_name(author['name'])
#         if aname==author_name:
# #             print('in')
#             if 'org' in author.keys():
#                 org=author['org']
#             break
    
#     if org!=None and org!='':
# #         print(1,org)
#         for pos_ins in pos_set:
#             pos_features_org.append(process_feature_org(pos_ins, org, author_name))
#         for neg_ins in neg_set:
#             neg_features_org.append(process_feature_org(neg_ins, org, author_name))
#     else:
#         for pos_ins in pos_set:
#             pos_features_org.append([0.])
#         for neg_ins in neg_set:
#             neg_features_org.append([0.] )

# pos_features=[]
# neg_features=[]
# for i in range(len(pos_features_author)):
#     pos_features.append(pos_features_author[i]+pos_features_keywords[i])
# for i in range(len(neg_features_author)):
#     neg_features.append(neg_features_author[i]+neg_features_keywords[i])
pos_features = [pos_features_author[i] + pos_features_keywords[i]
                     for i in range(len(pos_features_author))]
neg_features = [neg_features_author[i] + neg_features_keywords[i]
                     for i in range(len(neg_features_author))]


  0%|          | 0/198607 [00:00<?, ?it/s][A
  0%|          | 3/198607 [00:00<2:19:26, 23.74it/s][A


198607


  0%|          | 7/198607 [00:00<2:06:41, 26.12it/s][A
  0%|          | 11/198607 [00:00<2:00:53, 27.38it/s][A
  0%|          | 15/198607 [00:00<1:56:53, 28.32it/s][A
  0%|          | 19/198607 [00:00<1:49:15, 30.29it/s][A
  0%|          | 22/198607 [00:00<1:52:19, 29.47it/s][A
  0%|          | 26/198607 [00:00<1:49:57, 30.10it/s][A
  0%|          | 30/198607 [00:00<1:45:19, 31.42it/s][A
  0%|          | 34/198607 [00:01<1:44:44, 31.60it/s][A
  0%|          | 38/198607 [00:01<1:46:28, 31.08it/s][A
  0%|          | 42/198607 [00:01<1:45:27, 31.38it/s][A
  0%|          | 46/198607 [00:01<1:46:47, 30.99it/s][A
  0%|          | 50/198607 [00:01<1:41:36, 32.57it/s][A
  0%|          | 54/198607 [00:01<1:41:31, 32.60it/s][A
  0%|          | 58/198607 [00:01<1:43:43, 31.90it/s][A
  0%|          | 62/198607 [00:01<1:44:36, 31.63it/s][A
  0%|          | 66/198607 [00:02<1:46:10, 31.17it/s][A
  0%|          | 70/198607 [00:02<1:43:40, 31.91it/s][A
  0%|          | 74/198607 [00:0

  0%|          | 396/198607 [00:19<3:54:57, 14.06it/s][A
  0%|          | 398/198607 [00:20<3:52:37, 14.20it/s][A
  0%|          | 400/198607 [00:20<3:52:58, 14.18it/s][A
  0%|          | 402/198607 [00:20<3:50:51, 14.31it/s][A
  0%|          | 404/198607 [00:20<3:50:28, 14.33it/s][A
  0%|          | 406/198607 [00:20<3:53:42, 14.13it/s][A
  0%|          | 408/198607 [00:20<3:55:08, 14.05it/s][A
  0%|          | 410/198607 [00:20<4:02:01, 13.65it/s][A
  0%|          | 412/198607 [00:21<4:01:32, 13.68it/s][A
  0%|          | 414/198607 [00:21<3:57:39, 13.90it/s][A
  0%|          | 416/198607 [00:21<3:57:37, 13.90it/s][A
  0%|          | 418/198607 [00:21<3:54:38, 14.08it/s][A
  0%|          | 420/198607 [00:21<3:53:21, 14.15it/s][A
  0%|          | 422/198607 [00:21<3:54:56, 14.06it/s][A
  0%|          | 424/198607 [00:21<3:54:52, 14.06it/s][A
  0%|          | 427/198607 [00:22<3:43:09, 14.80it/s][A
  0%|          | 429/198607 [00:22<3:44:38, 14.70it/s][A
  0%|         

  0%|          | 834/198607 [00:39<42:33, 77.46it/s][A
  0%|          | 843/198607 [00:39<42:09, 78.19it/s][A
  0%|          | 858/198607 [00:40<36:06, 91.27it/s][A
  0%|          | 889/198607 [00:40<28:41, 114.87it/s][A
  0%|          | 913/198607 [00:40<24:14, 135.92it/s][A
  0%|          | 943/198607 [00:40<20:38, 159.63it/s][A
  0%|          | 973/198607 [00:40<17:45, 185.47it/s][A
  1%|          | 997/198607 [00:40<18:40, 176.32it/s][A
  1%|          | 1023/198607 [00:40<16:54, 194.76it/s][A
  1%|          | 1054/198607 [00:40<15:04, 218.31it/s][A
  1%|          | 1089/198607 [00:40<13:24, 245.48it/s][A
  1%|          | 1132/198607 [00:41<11:43, 280.53it/s][A
  1%|          | 1166/198607 [00:41<11:10, 294.49it/s][A
  1%|          | 1199/198607 [00:41<11:06, 296.28it/s][A
  1%|          | 1231/198607 [00:45<2:25:20, 22.63it/s][A
  1%|          | 1254/198607 [00:50<5:09:26, 10.63it/s][A
  1%|          | 1270/198607 [00:54<7:11:58,  7.61it/s][A
  1%|          | 1282/

  1%|          | 1428/198607 [01:27<11:37:54,  4.71it/s][A
  1%|          | 1429/198607 [01:28<11:30:09,  4.76it/s][A
  1%|          | 1430/198607 [01:28<11:34:31,  4.73it/s][A
  1%|          | 1431/198607 [01:28<11:36:37,  4.72it/s][A
  1%|          | 1432/198607 [01:28<11:25:54,  4.79it/s][A
  1%|          | 1433/198607 [01:28<11:35:09,  4.73it/s][A
  1%|          | 1434/198607 [01:29<11:28:20,  4.77it/s][A
  1%|          | 1435/198607 [01:29<11:24:46,  4.80it/s][A
  1%|          | 1436/198607 [01:29<11:30:52,  4.76it/s][A
  1%|          | 1437/198607 [01:29<11:30:42,  4.76it/s][A
  1%|          | 1438/198607 [01:29<11:35:57,  4.72it/s][A
  1%|          | 1439/198607 [01:30<11:41:41,  4.68it/s][A
  1%|          | 1440/198607 [01:30<11:40:59,  4.69it/s][A
  1%|          | 1441/198607 [01:30<11:40:41,  4.69it/s][A
  1%|          | 1442/198607 [01:30<11:29:04,  4.77it/s][A
  1%|          | 1443/198607 [01:30<11:26:02,  4.79it/s][A
  1%|          | 1444/198607 [01:31<11:3

KeyboardInterrupt: 

In [None]:
# np.savetxt("pos_features.txt",pos_features,fmt='%f',delimiter=',' )
# np.savetxt("neg_features.txt",neg_features,fmt='%f',delimiter=',')

In [7]:
# pos_features= np.loadtxt('pos_features.txt',delimiter=',')
# neg_features= np.loadtxt('neg_features.txt',delimiter=',')

In [None]:
print(neg_features)

In [None]:
print(pos_features)

In [10]:
# from sklearn.svm import SVC
# from sklearn.externals import joblib
# from sklearn import tree
# # 构建svm正负例
# svm_train_ins = []
# for ins in pos_features:
#     svm_train_ins.append((ins, 1))

# for ins in neg_features:
#     svm_train_ins.append((ins, 0))

# print(np.array(svm_train_ins).shape)

# random.shuffle(svm_train_ins)

# x_train= []
# y_train = []
# for ins in svm_train_ins:
#     x_train.append(ins[0])
#     y_train.append(ins[1])

# clf = tree.DecisionTreeClassifier()
# clf = clf.fit(x_train, y_train)

In [None]:
# from sklearn.svm import SVC
# from sklearn.externals import joblib

# # 构建svm正负例
# svm_train_ins = []
# for ins in pos_features:
#     svm_train_ins.append((ins, 1))

# for ins in neg_features:
#     svm_train_ins.append((ins, 0))

# print(np.array(svm_train_ins).shape)

# random.shuffle(svm_train_ins)

# x_train= []
# y_train = []
# for ins in svm_train_ins:
#     x_train.append(ins[0])
#     y_train.append(ins[1])

# clf = SVC(probability=True)
# clf.fit(x_train, y_train)

(1191642, 2)




In [None]:
# train_ins = []
# for ins in pos_features:
#     train_ins.append((ins, 1))

# for ins in neg_features:
#     train_ins.append((ins, 0))

# print(np.array(train_ins).shape)

# random.shuffle(train_ins)

# x_train= []
# y_train = []
# for ins in train_ins:
#     x_train.append(ins[0])
#     y_train.append(ins[1])

In [None]:
# from sklearn.preprocessing import OneHotEncoder
# encoder=OneHotEncoder(sparse=False)
# yy=[[0],[1]]
# encoder.fit(yy)
# y_train_reshape=np.array(y_train).reshape(-1,1)
# y_train_onehot=encoder.transform(y_train_reshape)
# print(y_train_onehot[1:10])

# # y_test_reshape=np.array(y_test).reshape(-1,1)
# # y_test_onehot=encoder.transform(y_test_reshape)
# # print(y_test_onehot[1:10]

In [None]:
# x_train=np.array(x_train)

In [8]:
model=None
train_ins = []
for ins in pos_features:
    train_ins.append((ins, 1))

for ins in neg_features:
    train_ins.append((ins, 0))

print(np.array(train_ins).shape)

random.shuffle(train_ins)

x_train= []
y_train = []
for ins in train_ins:
    x_train.append(ins[0])
    y_train.append(ins[1])
from sklearn.preprocessing import OneHotEncoder
encoder=OneHotEncoder(sparse=False)
yy=[[0],[1]]
encoder.fit(yy)
y_train_reshape=np.array(y_train).reshape(-1,1)
y_train_onehot=encoder.transform(y_train_reshape)
print(y_train_onehot[1:10])

# y_test_reshape=np.array(y_test).reshape(-1,1)
# y_test_onehot=encoder.transform(y_test_reshape)
# print(y_test_onehot[1:10]
x_train=np.array(x_train)
import tensorflow as tf
import pickle as p
import os
logdir="logs"
checkpoint_path='./checkpoint/author.{epoch:02d}-{val_loss:.2f}.ckpt'
callbacks=[
    tf.keras.callbacks.TensorBoard(log_dir=logdir,
                                   histogram_freq=2),  # 生成tb需要的日志
    tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                      save_weights_only=True,
                                      verbose=1,
                                      period=5),  # 用于在epoch间保存要模型
    tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', 
                                         factor=0.1, 
                                         patience=3, 
                                         verbose=0, 
                                         mode='auto', 
                                         epsilon=0.0001, 
                                         cooldown=0, 
                                         min_lr=0),  # 当指标变化小时，减少学习率
]
checkpoint_dir=os.path.dirname(checkpoint_path)
latest=tf.train.latest_checkpoint(checkpoint_dir)
#建立Keras序列模型
model=tf.keras.models.Sequential()
model.add(tf.keras.layers.BatchNormalization())
model.add(tf.keras.layers.Dense(units=512,
                               input_dim=10,
                               use_bias=True,
                               kernel_initializer='uniform',
                               bias_initializer='zeros',
                               activation='relu'))
model.add(tf.keras.layers.BatchNormalization())
model.add(tf.keras.layers.Dense(units=256,
                               activation='relu'))
model.add(tf.keras.layers.BatchNormalization())
model.add(tf.keras.layers.Dense(units=128,
                               activation='relu'))
model.add(tf.keras.layers.BatchNormalization())
model.add(tf.keras.layers.Dense(units=64,
                               activation='relu'))
model.add(tf.keras.layers.BatchNormalization())
model.add(tf.keras.layers.Dense(units=32,
                               activation='relu'))
model.add(tf.keras.layers.BatchNormalization())
model.add(tf.keras.layers.Dense(units=2,
                                activation='softmax'))
model.compile(optimizer=tf.keras.optimizers.Adam(0.1),    
             loss='binary_crossentropy',
             metrics=['accuracy'])
#if latest:
#    model.load_weights(latest)
if train:    
    train_history=model.fit(x=x_train,
                            shuffle=True,
                            y=y_train_onehot,
                            validation_split=0.01,  # 0.2用作验证集
                            epochs=100,
                            batch_size=8192,
                            callbacks=callbacks,
                            verbose=1) 

(1191642, 2)


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


[[1. 0.]
 [1. 0.]
 [1. 0.]
 [1. 0.]
 [1. 0.]
 [0. 1.]
 [0. 1.]
 [1. 0.]
 [1. 0.]]
Train on 1179725 samples, validate on 11917 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 00005: saving model to ./checkpoint/author.05-0.42.ckpt
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 00010: saving model to ./checkpoint/author.10-0.12.ckpt
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 00015: saving model to ./checkpoint/author.15-0.12.ckpt
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 00020: saving model to ./checkpoint/author.20-0.12.ckpt
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 00025: saving model to ./checkpoint/author.25-0.12.ckpt
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 00030: saving model to ./checkpoint/author.30-0.12.ckpt
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 00035: saving model to ./checkpoint/author

KeyboardInterrupt: 

In [None]:
# 训练集中的作者论文信息
with open("cna_data/whole_author_profile.json", "r") as f2:
    test_author_data = json.load(f2)

# 训练集的论文元信息
with open("cna_data/whole_author_profile_pub.json", "r") as f2:
    test_pubs_dict = json.load(f2)

# 待分配论文集
with open("cna_test_data/cna_test_unass_competition.json", "r") as f2:
    unass_papers = json.load(f2)

with open("cna_test_data/cna_test_pub.json", "r") as f2:
    unass_papers_dict = json.load(f2)
name_list=[]
# with open("cna_data/new_test_author_data.json", 'r') as files:
#     new_test_author_data = json.load(files)
# 简单处理whole_author_profile，将同名的作者合并：
# 为了效率，预处理new_test_author_data中的paper，将其全部处理成paper_id + '-' + author_index的形式。
new_test_author_data = {}
for author_id, author_info in tqdm.tqdm(test_author_data.items()):
    author_name = author_info['name']
    author_papers = author_info['papers']
    newly_papers = []

    for paper_id in author_papers:

        paper_authors = test_pubs_dict[paper_id]['authors']
        paper_authors_len = len(paper_authors)
        
        # 只利用author数小于50的paper，以保证效率
        if(paper_authors_len > 50):
            continue
#         paper_authors = random.sample(paper_authors, min(50, paper_authors_len))
        author_list = []
        for author in paper_authors:                
            clean_author = clean_name(author['name'])
            if(clean_author != None):
                author_list.append(clean_author)
        if(len(author_list) > 0):
            # 获取paper中main author_name所对应的位置
            _, author_index = delete_main_name(author_list, author_name)
#             print(paper_name)
            new_paper_id = str(paper_id) + '-' + str(author_index)
            newly_papers.append(new_paper_id)
            
       
        
        
    if(new_test_author_data.get(author_name) != None):
        new_test_author_data[author_name][author_id] = newly_papers
    else:
        tmp = {}
        tmp[author_id] = newly_papers
        new_test_author_data[author_name] = tmp
        name_list.append(author_name)
print(len(new_test_author_data))

In [None]:
# test集的特征生成函数，与train类似
def process_test_feature_coauthor(pair, new_test_author_data, test_pubs_dict, paper_coauthors):
    
    feature_list = []

    paper = pair[0] 
    author = pair[1]
    paper_name = pair[2]
    
    doc_list = new_test_author_data[paper_name][author]

    
    # 保存作者的所有coauthors以及各自出现的次数(作者所拥有论文的coauthors)
    candidate_authors_int = defaultdict(int)

    total_author_count = 0
    for doc in doc_list:
        doc_id = doc.split('-')[0]
        author_index = doc.split('-')[1]
        doc_dict = test_pubs_dict[doc_id]
        author_list = []

        paper_authors = doc_dict['authors']
        paper_authors_len = len(paper_authors)
        paper_authors = random.sample(paper_authors, min(50, paper_authors_len))
    
        for author in paper_authors:                
            clean_author = clean_name(author['name'])
            if(clean_author != None):
                author_list.append(clean_author)
        if(len(author_list) > 0):

            # 获取除了main author_name外的coauthor
            for index in range(len(author_list)):
                if(index == author_index):
                    continue
                else:
                    candidate_authors_int[author_list[index]] += 1
                    total_author_count += 1

    author_keys = list(candidate_authors_int.keys())

    if ((len(author_keys) == 0) or (len(paper_coauthors) == 0)):
        feature_list.extend([0.] * 5)
    else:
        co_coauthors = set(paper_coauthors) & set(author_keys)
        coauthor_len = len(co_coauthors)
        
        co_coauthors_ratio_for_paper = round(coauthor_len / len(paper_coauthors), 6)
        co_coauthors_ratio_for_author = round(coauthor_len / len(author_keys), 6)
        
        coauthor_count = 0
        for coauthor_name in co_coauthors:
            coauthor_count += candidate_authors_int[coauthor_name]
            
        
        
        co_coauthors_ratio_for_author_count = round(coauthor_count / total_author_count, 6)

        # 计算了5维paper与author所有的paper的coauthor相关的特征：
        #    1. 不重复的coauthor个数
        #    2. 不重复的coauthor个数 / paper的所有coauthor的个数
        #    3. 不重复的coauthor个数 / author的所有paper不重复coauthor的个数
        #    4. coauthor个数（含重复）
        #    4. coauthor个数（含重复）/ author的所有paper的coauthor的个数（含重复）
        feature_list.extend([coauthor_len, co_coauthors_ratio_for_paper, co_coauthors_ratio_for_author, coauthor_count, co_coauthors_ratio_for_author_count])
        
#         print(feature_list)
    return feature_list

In [None]:
# test集的特征生成函数，与train类似
def process_test_feature_keywords(pair, new_test_author_data, test_pubs_dict, paper_coauthors):
    
    feature_list = []

    paper = pair[0] 
    author = pair[1]
    paper_name = pair[2]
    
    doc_list = new_test_author_data[paper_name][author]

    
    # 保存作者的所有coauthors以及各自出现的次数(作者所拥有论文的coauthors)
    candidate_authors_int = defaultdict(int)

    total_author_count = 0
    for doc in doc_list:
        doc_id = doc.split('-')[0]
        author_index = doc.split('-')[1]
        doc_dict = test_pubs_dict[doc_id]
        author_list = []
        
        if 'keywords' not in doc_dict.keys():
            continue
        paper_authors = doc_dict['keywords']
        paper_authors_len = len(paper_authors)
        paper_authors = random.sample(paper_authors, min(10, paper_authors_len))
    
        for author in paper_authors:                
            author_list.append(author)
        if(len(author_list) > 0):

            # 获取除了main author_name外的coauthor
            for index in range(len(author_list)):
                if(index == author_index):
                    continue
                else:
                    candidate_authors_int[author_list[index]] += 1
                    total_author_count += 1
    author_keys = list(candidate_authors_int.keys())

    if ((len(author_keys) == 0) or (len(paper_coauthors) == 0)):
        feature_list.extend([0.] * 5)
    else:
        co_coauthors = set(paper_coauthors) & set(author_keys)
        coauthor_len = len(co_coauthors)
#         print(coauthor_len)
        co_coauthors_ratio_for_paper = round(coauthor_len / len(paper_coauthors), 6)
        co_coauthors_ratio_for_author = round(coauthor_len / len(author_keys), 6)
        
        coauthor_count = 0
        for coauthor_name in co_coauthors:
            coauthor_count += candidate_authors_int[coauthor_name]
            
        
        
        co_coauthors_ratio_for_author_count = round(coauthor_count / total_author_count, 6)

        # 计算了5维paper与author所有的paper的coauthor相关的特征：
        #    1. 不重复的coauthor个数
        #    2. 不重复的coauthor个数 / paper的所有coauthor的个数
        #    3. 不重复的coauthor个数 / author的所有paper不重复coauthor的个数
        #    4. coauthor个数（含重复）
        #    4. coauthor个数（含重复）/ author的所有paper的coauthor的个数（含重复）
        feature_list.extend([coauthor_len, co_coauthors_ratio_for_paper, co_coauthors_ratio_for_author, coauthor_count, co_coauthors_ratio_for_author_count])
        
#         print(feature_list)
    return feature_list

In [None]:
print(len(unass_papers))


count = 0

# 存储paper的所有candidate author id
paper2candidates = defaultdict(list)
# 存储对应的paper与candidate author的生成特征
paper2features = defaultdict(list)

for u_p in tqdm.tqdm(unass_papers):
    paper_id = u_p.split('-')[0]
    author_index = int(u_p.split('-')[1])
    author_list = []
    
    # 获取paper的coauthors
    paper_coauthors = []
    keywords_list=[]
    paper_name = ''
    paper_authors = unass_papers_dict[paper_id]['authors']
    if 'keywords' in unass_papers_dict[paper_id].keys():
        paper_keywords=unass_papers_dict[paper_id]['keywords']
    else:
        paper_keywords=''
    author_name=''
    org=''
#     paper_authors_len = len(paper_authors)
#     paper_authors = random.sample(paper_authors, min(50, paper_authors_len))

    for author in paper_authors:                
        clean_author = clean_name(author['name'])
        if(clean_author != None):
            author_list.append(clean_author)
    for key in paper_keywords:
        clean_key=clean_name(key)
        if clean_key!=None:
            keywords_list.append(clean_key)
    if(len(author_list) > 0):
        
        # 获取除了main author_name外的coauthor
        for index in range(len(author_list)):
            if(index == author_index):
                author_name=author_list[author_index]
                continue
            else:
                paper_coauthors.append(author_list[index])
    for author in paper_authors:
#         print(author.keys())
        clean_author = clean_name(author['name'])
        
        if author_name==clean_author and 'org' in author.keys():
            org=author['org']
#             print('in')
            
# 简单使用精确匹配找出candidate_author_list
   

    if paper_authors[author_index]['name'] == None or paper_authors[author_index]['name']==' ':
        paper_name=' '
    else:
        paper_name = '_'.join(clean_name(paper_authors[author_index]['name']).split())
    if(new_test_author_data.get(paper_name) != None):
        candidate_author_list = new_test_author_data[paper_name]
        for candidate_author in candidate_author_list:
            pair = (paper_id, candidate_author, paper_name)
            paper2candidates[paper_id].append(candidate_author)
            paper2features[paper_id].append(process_test_feature_coauthor(pair, new_test_author_data, test_pubs_dict, paper_coauthors)+
                                            process_test_feature_keywords(pair, new_test_author_data, test_pubs_dict, paper_keywords))
        count += 1
    else:
        score=0.0
        name_index=' '
        for name in (name_list):
            name_split=name.split('_')
            temp = distance.get_jaro_distance(paper_name, name, winkler=True, scaling=0.1)
            pname_split = paper_name.split('_')
            inter = set(name_split) & set(pname_split)
            alls = set(name_split) | set(pname_split)
            temp += round(len(inter)/len(alls), 6)
            if score<temp:
                name_index=name
                score=temp
        if(new_test_author_data.get(name_index) != None):
            candidate_author_list = new_test_author_data[name_index]
            for candidate_author in candidate_author_list:
                pair = (paper_id, candidate_author, name_index)
                paper2candidates[paper_id].append(candidate_author)
                paper2features[paper_id].append(process_test_feature_coauthor(pair, new_test_author_data, test_pubs_dict, paper_coauthors)+
                                            process_test_feature_keywords(pair, new_test_author_data, test_pubs_dict, paper_keywords))
                
            count+=1
            
            
print(count)
assert len(paper2candidates) == len(paper2features)
print(len(paper2candidates))

In [None]:
result_dict = defaultdict(list)
for paper_id, ins_feature_list in tqdm.tqdm(paper2features.items()): 
    score_list = []
    prob_pred = model.predict(np.array(ins_feature_list))[:, 1]
    score_list.extend(prob_pred)
#     for ins in ins_feature_list:
#         # 利用svm对一篇paper的所有candidate author去打分，利用分数进行排序，取top-1 author作为预测的author
# #         print(ins)
#         prob_pred = model.predict(np.array([ins]))[:, 1]
#         score_list.append(prob_pred[0])
    rank = np.argsort(-np.array(score_list))
    #取top-1 author作为预测的author
    predict_author = paper2candidates[paper_id][rank[0]]
    result_dict[predict_author].append(paper_id)

with open("cna_data/result_dnn.json", 'w') as files:
    json.dump(result_dict, files, indent = 4)

In [None]:
with open("cna_data/paper2features.json", 'w') as files:
    json.dump(result_dict, files, indent = 4)

In [None]:
result_dict = defaultdict(list)
for paper_id, ins_feature_list in tqdm.tqdm(paper2features.items()): 
    score_list = []
    for ins in ins_feature_list:
        # 利用svm对一篇paper的所有candidate author去打分，利用分数进行排序，取top-1 author作为预测的author
#         print(ins)
#         print([ins])
        prob_pred = clf.predict_proba([ins])[:, 1]
        score_list.append(prob_pred[0])
    rank = np.argsort(-np.array(score_list))
    #取top-1 author作为预测的author
    predict_author = paper2candidates[paper_id][rank[0]]
    result_dict[predict_author].append(paper_id)

with open("cna_data/result_svm.json", 'w') as files:
    json.dump(result_dict, files, indent = 4)