In [4]:
import json
import numpy as np
import pandas as pd
import math

In [5]:
# Load the crawled paper records. The encoding is pinned so the read does not
# depend on the platform's default text encoding.
with open("CrawledPapers.json", "r", encoding="utf-8") as f:
    data = json.load(f)
len(data)

2000

In [6]:
# Sanity check: report every pair of records that share an id but differ in content.
# Records are grouped by id once, so each record is only compared against the
# records carrying the same id instead of against the whole dataset.
records_by_id = {}
for record in data:
    records_by_id.setdefault(record["id"], []).append(record)

for x in data:
    for j in records_by_id[x["id"]]:
        # Dict inequality: identical copies of a record are not reported.
        if j != x:
            print(x)
            print()
            print(j)
            print("\n\n")

In [7]:
# Position of each paper inside `data`, keyed by paper id.
# If an id occurs more than once, the last position wins.
paper_to_index = {paper["id"]: position for position, paper in enumerate(data)}

In [8]:
# Citation adjacency matrix: entry [row, col] is 1 when paper `row` lists paper
# `col` among its references. References to ids absent from `data` are skipped.
n_papers = len(data)
adj_matrice = np.zeros((n_papers, n_papers))
for row, paper in enumerate(data):
    for ref_id in paper["references"]:
        col = paper_to_index.get(ref_id)
        if col is not None:
            adj_matrice[row, col] = 1

In [9]:
def cal_pageRank(adj_matrice, alpha, index_map=None, iterations=10):
    """Approximate PageRank scores by power iteration.

    Parameters
    ----------
    adj_matrice : array-like, shape (n, n)
        Entry [i, j] is non-zero when node i links to node j. Not modified.
    alpha : float
        Teleport probability mixed into every row that has outgoing links.
    index_map : dict, optional
        Maps each key of the returned dict to its row/column in `adj_matrice`.
        Defaults to the notebook-level `paper_to_index`.
    iterations : int, optional
        Number of extra multiplications applied after the first step
        (default 10, i.e. 11 multiplications in total).

    Returns
    -------
    dict
        Maps every key of `index_map` to its score. Scores sum to 1 when
        `index_map` covers every row.
    """
    if index_map is None:
        index_map = paper_to_index

    P = np.asarray(adj_matrice, dtype=float).copy()
    n = P.shape[0]
    if n == 0:
        return {}

    V = np.full(n, 1.0 / n)
    row_sums = P.sum(axis=1)
    dangling = row_sums == 0
    linked = ~dangling

    # Each row is divided by its out-degree before mixing in the teleport term,
    # so every row of P sums to 1 and the scores stay a probability distribution.
    P[linked] = (1 - alpha) * (P[linked] / row_sums[linked][:, None]) + alpha * V
    # Rows with no outgoing links jump uniformly to any node.
    P[dangling] = V

    res = V.dot(P)
    for _ in range(iterations):
        res = res.dot(P)

    return {key: res[idx] for key, idx in index_map.items()}

In [10]:
# Score every paper from the citation matrix; 0.5 is passed as cal_pageRank's alpha.
P = cal_pageRank(adj_matrice, alpha=0.5)

In [11]:
# Preview a few records rather than echoing the whole list into the cell output.
data[:3]

[{'id': '2981549002',
  'title': 'Tensor Programs I: Wide Feedforward or Recurrent Neural Networks of Any Architecture are Gaussian Processes.',
  'reference_count': '55',
  'citation_count': '27',
  'abstract': 'Wide neural networks with random weights and biases are Gaussian processes, as originally observed by Neal (1995) and more recently by Lee et al. (2018) and Matthews et al. (2018) for deep fully-connected networks, as well as by Novak et al. (2019) and Garriga-Alonso et al. (2019) for deep convolutional networks. We show that this Neural Network-Gaussian Process correspondence surprisingly extends to all modern feedforward or recurrent neural networks composed of multilayer perceptron, RNNs (e.g. LSTMs, GRUs), (nD or graph) convolution, pooling, skip connection, attention, batch normalization, and/or layer normalization. More generally, we introduce a language for expressing neural network computations, and our result encompasses all such expressible neural networks. This work

In [12]:
# Give every distinct author string a dense integer id, in order of first appearance.
authors_to_index = {}
for record in data:
    for name in record["authors"]:
        # setdefault only inserts when the name is new, so existing ids are kept.
        authors_to_index.setdefault(name, len(authors_to_index))
authors_to_index

{'Greg Yang': 0,
 'Linda Wang': 1,
 'Zhong Qiu Lin': 2,
 'Alexander Wong': 3,
 'Han Zhang 1, Ian Goodfellow 1, Dimitris Metaxas 2, Augustus Odena 1': 4,
 'Alexey Dosovitskiy 1, Lucas Beyer 1, Alexander Kolesnikov 1, Dirk Weissenborn 2, Xiaohua Zhai 1, Thomas Unterthiner 1, Mostafa Dehghani 1, Matthias Minderer 1, Georg Heigold 2, Sylvain Gelly 1, Jakob Uszkoreit 1, Neil Houlsby 3': 5,
 'Volodymyr Mnih': 6,
 'Koray Kavukcuoglu': 7,
 'David Silver': 8,
 'Andrei A. Rusu': 9,
 'Joel Veness': 10,
 'Marc G. Bellemare': 11,
 'Alex Graves': 12,
 'Martin Riedmiller': 13,
 'Andreas K. Fidjeland': 14,
 'Georg Ostrovski': 15,
 'Stig Petersen': 16,
 'Charles Beattie': 17,
 'Amir Sadik': 18,
 'Ioannis Antonoglou': 19,
 'Helen King': 20,
 'Dharshan Kumaran': 21,
 'Daan Wierstra': 22,
 'Shane Legg': 23,
 'Demis Hassabis': 24,
 'Tomas Mikolov': 25,
 'Ilya Sutskever': 26,
 'Kai Chen': 27,
 'Greg S Corrado': 28,
 'Jeff Dean': 29,
 'Kaiming He': 30,
 'Xiangyu Zhang': 31,
 'Shaoqing Ren': 32,
 'Jian Sun': 

In [13]:
# Author-level citation matrix: entry [a, b] is 1 when some paper by author a
# references a different record in `data` that lists author b.
#
# References are resolved through an id -> positions index, so each reference is
# a dictionary lookup rather than a scan over every other paper.
positions_by_id = {}
for position, record in enumerate(data):
    positions_by_id.setdefault(record["id"], []).append(position)

n_authors = len(authors_to_index)
authors_ref_matrix = np.zeros((n_authors, n_authors))
for citing_pos, citing_paper in enumerate(data):
    citing_rows = [authors_to_index[name] for name in citing_paper["authors"]]
    for ref_id in citing_paper["references"]:
        for cited_pos in positions_by_id.get(ref_id, ()):
            # A record that lists its own id as a reference is ignored.
            if cited_pos == citing_pos:
                continue
            for cited_name in data[cited_pos]["authors"]:
                cited_col = authors_to_index[cited_name]
                for citing_row in citing_rows:
                    authors_ref_matrix[citing_row, cited_col] = 1

In [14]:
# Hub (h) and authority (a) scores over the author citation matrix, where
# authors_ref_matrix[i, j] == 1 means author i cites author j.
#
# Each update is one matrix-vector product. Assigning per-element products of
# shape (1, 1) / (1,) into scalar slots relies on an implicit array-to-scalar
# conversion that newer NumPy releases deprecate, so that form is avoided.
#
# NOTE(review): both updates are additive (score += contribution), not the
# replace-style update of textbook HITS - confirm this is intended.
n_authors = len(authors_ref_matrix)
h = np.ones(n_authors)
a = np.ones(n_authors)
for iteration in range(5):
    # Hub step: add the authority score of every author this author cites.
    h = h + authors_ref_matrix.dot(a)
    # Authority step: add the freshly updated hub score of every citing author.
    a = a + authors_ref_matrix.T.dot(h)
    # Rescale both vectors to sum to 1 so the values stay bounded.
    a = a / a.sum()
    h = h / h.sum()
    

In [15]:
# Attach each author's name to their authority score, then list highest first.
authorities = {name: a[idx] for name, idx in authors_to_index.items()}
sorted(authorities.items(), key=lambda entry: entry[1], reverse=True)

[('Suxiang Tong', 0.008042227831284632),
 ('Michelle L Holshue 1, Chas DeBolt 2, Scott Lindquist',
  0.007436545497314237),
 ('Kathy H Lofy', 0.007436545497314237),
 ('John Wiesman', 0.007436545497314237),
 ('Hollianne Bruce', 0.007436545497314237),
 ('Christopher Spitters', 0.007436545497314237),
 ('Keith Ericson', 0.007436545497314237),
 ('Sara Wilkerson', 0.007436545497314237),
 ('Ahmet Tural', 0.007436545497314237),
 ('George Diaz', 0.007436545497314237),
 ('Amanda Cohn', 0.007436545497314237),
 ('LeAnne Fox', 0.007436545497314237),
 ('Anita Patel', 0.007436545497314237),
 ('Susan I Gerber', 0.007436545497314237),
 ('Lindsay Kim', 0.007436545497314237),
 ('Xiaoyan Lu', 0.007436545497314237),
 ('Steve Lindstrom', 0.007436545497314237),
 ('Mark A Pallansch', 0.007436545497314237),
 ('William C Weldon', 0.007436545497314237),
 ('Holly M Biggs', 0.007436545497314237),
 ('Timothy M Uyeki', 0.007436545497314237),
 ('Satish K Pillai', 0.007436545497314237),
 ('Chaolin Huang 1, Yeming Wang

In [16]:
# Read the user profile table and replace missing entries with 0.
user_profiles = pd.read_csv("data.csv").fillna(0)

In [17]:
# Show the column labels of user_profiles; they are later used as the topic list.
user_profiles.columns

Index(['algorithms', 'AI', 'computational science', 'computer architecture',
       'computer engineering', 'computer graphics', 'computer hardware',
       'computer network', 'computer security', 'computer vision',
       'data mining', 'data science', 'database', 'distributed computing',
       'embedded systems', 'human-computer interaction',
       'information retrieval', 'internet privacy', 'knowledge management',
       'library science', 'machine learning', 'multimedia',
       'natural language processing', 'operating systems',
       'parallel computing', 'pattern recognition', 'programming languages',
       'real-time computing', 'simulation', 'software engineering',
       'speech recognition', 'telecommunications',
       'theoretical computer science', 'world wide web'],
      dtype='object')

In [18]:
def cal_papers_profiles(papers_data, topics):
    """Build a binary topic-indicator vector for every paper.

    Parameters
    ----------
    papers_data : iterable of dict
        Paper records; each must provide the keys "id" and "related_topics".
    topics : sequence of str
        Topic names; position i defines component i of every vector.

    Returns
    -------
    dict
        Maps paper id to a float array of length len(topics). Component i is 1
        when topics[i] equals one of the paper's related topics, compared
        case-insensitively, and 0 otherwise.
    """
    # Lower-case the topic names once instead of once per paper and per comparison.
    lowered_topics = [topic.lower() for topic in topics]

    papers_profiles = {}
    for paper in papers_data:
        # Set membership replaces the inner scan over the paper's related topics.
        paper_topics = {topic.lower() for topic in paper["related_topics"]}
        vec = np.zeros(len(lowered_topics))
        for i, topic in enumerate(lowered_topics):
            if topic in paper_topics:
                vec[i] = 1
        papers_profiles[paper["id"]] = vec
    return papers_profiles

In [19]:
# The user-profile column labels are the topic list; build one vector per paper from it.
topics = user_profiles.columns.tolist()
papers_profiles = cal_papers_profiles(papers_data=data, topics=topics)
papers_profiles

{'2981549002': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 '3105081694': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 '2950893734': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.]),
 '3119786062': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 '2145339207': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 '2153579005': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.

In [21]:
# Display the paper-id -> topic-vector mapping.
# NOTE(review): this repeats the display in the cell that builds papers_profiles,
# and execution count 20 is absent - confirm no removed cell changed the mapping.
papers_profiles

{'2981549002': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 '3105081694': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 '2950893734': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.]),
 '3119786062': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 '2145339207': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 '2153579005': array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.

In [22]:
def recommend_paper(user_profile, papers_profiles, top_n=10):
    """Rank papers by cosine similarity between their profile and a user profile.

    Parameters
    ----------
    user_profile : array-like
        The user's topic vector.
    papers_profiles : dict
        Maps paper id to a topic vector of the same length as `user_profile`.
    top_n : int, optional
        Number of highest-scoring papers to return (default 10).

    Returns
    -------
    list of (paper_id, score)
        Up to `top_n` pairs, highest score first. Papers with equal scores keep
        the order in which they appear in `papers_profiles`. A paper scores 0
        when either vector is all zeros.
    """
    user_vec = np.asarray(user_profile, dtype=float).ravel()
    # The user norm does not depend on the paper, so it is computed once.
    user_norm = np.linalg.norm(user_vec, 2)

    scores = {}
    for paper_id, profile in papers_profiles.items():
        paper_vec = np.asarray(profile, dtype=float).ravel()
        paper_norm = np.linalg.norm(paper_vec, 2)
        # Zero vectors are detected by norm, not by element sum: a vector whose
        # entries cancel out is still non-zero and has a defined similarity.
        if user_norm == 0 or paper_norm == 0:
            scores[paper_id] = 0.0
            continue
        scores[paper_id] = user_vec.dot(paper_vec) / (user_norm * paper_norm)

    return sorted(scores.items(), key=lambda item: item[1], reverse=True)[:top_n]

In [23]:
# Recommend papers for the user stored under row label 1 of the profile table.
selected_user_vector = user_profiles.loc[1].to_numpy()
recommend_paper(selected_user_vector, papers_profiles)

[('2141599568', 0.7519334528499158),
 ('1753482797', 0.7519334528499158),
 ('1486649854', 0.7519334528499158),
 ('2163568299', 0.7519334528499158),
 ('1970689298', 0.7519334528499158),
 ('2056590938', 0.7519334528499158),
 ('2525778437', 0.7519334528499158),
 ('2251222643', 0.7519334528499158),
 ('1905522558', 0.7519334528499158),
 ('1508165687', 0.7519334528499158)]