In [1]:
import numpy as np
from sklearn.cluster import MiniBatchKMeans
from sklearn.cluster import KMeans
import gensim
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score



In [3]:
data_path = './dataset/'
dataset = 'net_aminer_homo'

# Author name -> integer label, parsed from "<name> <label>" lines.
name_label_dict = {}
with open('8area_label.txt') as f:
    for line in f:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            # instead of failing on the tuple unpack below.
            continue
        # Split on the LAST space only, so the label is always the final field.
        name, label = line.rsplit(' ', 1)
        name_label_dict[name] = int(label)

# Author id (kept as a string) -> author name, parsed from "<id> <name>" lines.
id_name_dict = {}
with open(data_path + dataset + '/id_author.txt') as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        # Split on the FIRST space only, so the id is always the leading field.
        a_id, a_name = line.split(' ', 1)
        id_name_dict[a_id] = a_name

In [4]:
# Load each trained Word2Vec model and extract its embedding matrix with rows
# in sorted-key order, so row i of X_* is the vector for indices_*[i].
homo_model = gensim.models.Word2Vec.load('net_aminer_homo_model')
indices_homo = sorted(homo_model.wv.vocab)
# Index through `.wv`: indexing the Word2Vec object itself is deprecated.
X_homo = homo_model.wv[indices_homo]

part_model = gensim.models.Word2Vec.load('net_aminer_part_model')
indices_part = sorted(part_model.wv.vocab)
X_part = part_model.wv[indices_part]

  This is separate from the ipykernel package so we can avoid doing imports until
  import sys


In [5]:
# Report the shape of each embedding matrix (homo first, then part).
for embedding_matrix in (X_homo, X_part):
    print(embedding_matrix.shape)

(42157, 128)
(42157, 128)


In [68]:
def cal_acc(indices, X, n_clusters=2, random_state=0):
    """Cluster embeddings with k-means and score them by majority-vote accuracy.

    Each cluster is assigned the ground-truth label that occurs most often
    among its members (ties go to the smallest label value); that label is the
    prediction for every member, and predictions are compared with the truth.

    Parameters
    ----------
    indices : sequence
        Author ids. Each must be a key of the module-level ``id_name_dict``,
        and the mapped name a key of ``name_label_dict``.
    X : array-like
        Embedding matrix passed to ``KMeans.fit``; row ``i`` is paired with
        ``indices[i]`` when the result frame is built.
    n_clusters : int, default 2
        Number of k-means clusters.
    random_state : int, default 0
        Seed forwarded to ``KMeans``.

    Returns
    -------
    pandas.DataFrame
        Columns ``author_id``, ``cluster``, ``label`` and ``pred``.

    Raises
    ------
    KeyError
        If an id or its name is missing from the lookup dictionaries.
    """
    idx_labels = [name_label_dict[id_name_dict[a]] for a in indices]
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state).fit(X)

    clus_dict = {'author_id': indices, 'cluster': kmeans.labels_, 'label': idx_labels}
    clus_df = pd.DataFrame(data=clus_dict)

    # Majority vote per cluster. groupby only yields clusters that actually
    # have members, so np.argmax never sees an empty array.
    cluster_to_label = {}
    for cluster_id, members in clus_df.groupby('cluster'):
        votes, counts = np.unique(members['label'].values, return_counts=True)
        cluster_to_label[cluster_id] = votes[np.argmax(counts)]

    clus_df['pred'] = clus_df['cluster'].map(cluster_to_label)

    acc = accuracy_score(clus_df['label'].values, clus_df['pred'].values)
    print("accracy is: %.3f"%acc)
    return clus_df

In [69]:
# Score both embedding spaces with the same clustering procedure, printing a
# banner before each run and a blank line between runs.
runs = (
    ("result for homo:", indices_homo, X_homo),
    ("result for part:", indices_part, X_part),
)
frames = []
for banner, author_ids, vectors in runs:
    if frames:
        print()
    print(banner)
    frames.append(cal_acc(author_ids, vectors))
homo_df, part_df = frames

result for homo:
accracy is: 0.672

result for part:
accracy is: 0.671


### Top-100 most similar authors in the same area

In [70]:
# Space-separated file without a header row; name the two columns explicitly.
paper_author_file = data_path + dataset + '/paper_author.txt'
paper_author = pd.read_csv(paper_author_file, sep=' ', header=None)
paper_author.columns = ['paper_id', 'author_id']

In [119]:
# After count(), `paper_id` holds the number of non-null paper rows per author.
papers_per_author = paper_author.groupby('author_id', as_index=False)
author_count = papers_per_author.count()

In [120]:
# Order authors by paper count, largest first.
author_count = author_count.sort_values(by='paper_id', ascending=False)

In [133]:
# Renumber the rows in their current order; the previous index is kept as an
# `index` column. The new row number is what similar_domain reports as rank.
# NOTE(review): this cell reassigns author_count from itself, so it is not
# idempotent when re-run.
author_count = author_count.reset_index(drop=False)
author_count.head(n=5)

Unnamed: 0,index,author_id,paper_id
0,39497,39497,1046
1,23781,23781,836
2,21326,21326,796
3,33435,33435,796
4,39067,39067,766


In [157]:
def similar_domain(model,indices,target_authors,topn = 100):
    """Print how many of each target author's nearest neighbours share its label.

    For every target author, the ``topn`` most similar vocabulary entries are
    fetched from ``model.wv`` and the ones whose label equals the target's
    label are counted. Results are printed; nothing is returned.

    Parameters
    ----------
    model : object exposing ``wv.most_similar`` (a loaded Word2Vec model here).
    indices : iterable of str
        Author ids to consider. Each must be a key of the module-level
        ``id_name_dict``, and the mapped name a key of ``name_label_dict``.
    target_authors : iterable of int
        Author ids to examine. Each is looked up in the module-level
        ``author_count`` frame to report its row position, then converted to
        ``str`` for the model lookup.
    topn : int, default 100
        Number of neighbours to request per target.

    Raises
    ------
    KeyError
        If an id in ``indices`` (or its name) is missing from the lookup
        dictionaries.
    IndexError
        If a target author does not appear in ``author_count``.
    """
    # One dictionary lookup per author instead of repeated DataFrame scans.
    label_by_author = {a: name_label_dict[id_name_dict[a]] for a in indices}

    for target_author in target_authors:
        # Row position in author_count, i.e. the rank by paper count once the
        # frame has been sorted and re-indexed.
        rank = author_count.index[author_count['author_id'] == target_author].tolist()[0]
        print("Examining for the %d-th author %d"%(rank,target_author))
        target_author = str(target_author)
        if target_author not in label_by_author:
            print("*** target author not found ***")
            print()
            continue
        target_label = label_by_author[target_author]
        # Unpack the (id, similarity) tuples directly. Going through np.array
        # and slicing [:, 0] fails when the neighbour list is empty.
        neighbours = {author for author, _ in
                      model.wv.most_similar(positive=[target_author], topn=topn)}
        # Neighbours outside `indices` have no label and are not counted.
        same_domain = sum(1 for author in neighbours
                          if label_by_author.get(author) == target_label)
        print('%d/%d authors are in the same domain'%(same_domain,topn))
        print()

In [158]:
print("examing homo model")
# The first five rows of author_count are the authors with the most papers.
top5_author_ids = author_count['author_id'].values[:5]
similar_domain(homo_model, indices_homo, top5_author_ids)

examing homo model
Examining for the 0-th author 39497
98/100 authors are in the same domain

Examining for the 1-th author 23781
76/100 authors are in the same domain

Examining for the 2-th author 21326
85/100 authors are in the same domain

Examining for the 3-th author 33435
86/100 authors are in the same domain

Examining for the 4-th author 39067
96/100 authors are in the same domain



In [160]:
# Same five most-published authors, this time against the part model.
top5_author_ids = author_count['author_id'].values[:5]
similar_domain(part_model, indices_part, top5_author_ids)

Examining for the 0-th author 39497
100/100 authors are in the same domain

Examining for the 1-th author 23781
77/100 authors are in the same domain

Examining for the 2-th author 21326
78/100 authors are in the same domain

Examining for the 3-th author 33435
84/100 authors are in the same domain

Examining for the 4-th author 39067
87/100 authors are in the same domain



In [159]:
# The last five rows of author_count are the authors with the fewest papers.
bottom5_author_ids = author_count['author_id'].values[-5:]
similar_domain(homo_model, indices_homo, bottom5_author_ids)

Examining for the 43682-th author 30241
16/100 authors are in the same domain

Examining for the 43683-th author 30242
79/100 authors are in the same domain

Examining for the 43684-th author 4009
*** target author not found ***

Examining for the 43685-th author 18270
78/100 authors are in the same domain

Examining for the 43686-th author 34068
96/100 authors are in the same domain

