In [6]:
%matplotlib inline
import pickle
import urllib
import time
import feedparser
import itertools
from copy import deepcopy
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
from tqdm import tqdm

In [7]:
category = 'astro-ph'
entries = pickle.load(open(category + '_entries.pkl', 'rb'))
author_ind = pickle.load( open(category + '_author_ind.pkl', 'rb'))
train_adj_list = pickle.load(open(category + '_train_adj_list.pkl', 'rb'))
test_adj_list = pickle.load(open(category + '_test_adj_list.pkl', 'rb'))
num_authors = len(author_ind)
authors = range(num_authors)
pos_edges = set([(min(a1, a2), max(a1, a2)) for (a1, a2) in \
                 itertools.combinations(authors, 2)]) - set(train_adj_list)
pred_edges = set(test_adj_list) - set(train_adj_list)

In [11]:
def gen_neighbors(train_adj_list, author_ind):
    neighbors = {}
    for a in author_ind.iterkeys():
        neighbors[author_ind[a]] = frozenset()

    for e in train_adj_list:
        neighbors[e[0]] = neighbors[e[0]].union(set([e[1]]))
        neighbors[e[1]] = neighbors[e[1]].union(set([e[0]]))
    return neighbors

def common_neighbors(neighbors, authors, k):
    ranking = []
    for a1, a2 in pos_edges:
        neighbor_count = len(neighbors[a1].intersection(neighbors[a2]))
        ranking.append((a1, a2, neighbor_count))
    ranking.sort(key=lambda p: -p[2])
    ranking = [(r[:2]) for r in ranking]
    ranked_edges = set(ranking[:k])
    return ranked_edges

def random_edges(pos_edges, k):
    ind = np.random.choice(range(num_authors), k)
    pos_edges_list = list(pos_edges)
    pred_edges = set([pos_edges_list[i] for i in ind])
    return pred_edges

# Shortest distances
def floyd_warshall(train_adj_list, num_authors):
    dist = np.zeros((num_authors, num_authors)) + 1e10
    for e in train_adj_list:
        dist[e[0]][e[1]] = 1
        dist[e[1]][e[0]] = 1
    # Because symmetrical, only want to calculate dist[i][j] where j > i
    # This means that j > k, j > i, k > i.
    for k in tqdm(range(num_authors)):
        for i in range(k):
            for j in range(k + 1, num_authors):
                if dist[i][j] > dist[i][k] + dist[k][j]:
                    dist[i][j] = dist[i][k] + dist[k][j]
    return dist

def build_matrix(train_adj_list, author_ind):
    P = np.zeros((len(author_ind), len(author_ind))) + 1e-50
    for e in train_adj_list:
        P[e[0]][e[1]] = 1
        P[e[1]][e[0]] = 1        
    return P

def pred_acc(pred_edges, ranked_edges):
    tot_edges = float(len(pred_edges))
    cor_edges = pred_edges.intersection(ranked_edges)
    return len(cor_edges) / tot_edges

In [1]:
# Rooted pagerank
def rooted_pr(P, alpha, e, threshold=1e-40):
    n = P.shape[0]
    P_norm = np.divide(P.T, (P.sum(axis=1)))
    
    # Start at x
    s_o = np.array([0] * n)
    s_o[e[0]] = 1
    s_i = None
    
    # Random prob alpha to jump to x
    E = np.array([0] * n)
    E[e[0]] = alpha
        
    while (np.sum(np.abs(s_i - s_o)) > threshold):
        s_o = s_i
        s_i = np.dot(((1 - alpha) * P_norm + E), s_o)
    
    return s_i[e[1]]

In [12]:
neighbors = gen_neighbors(train_adj_list, author_ind)
t = build_matrix(train_adj_list, author_ind)
cn_edges = common_neighbors(neighbors, authors, len(pred_edges))
rand_edges = random_edges(pos_edges, len(pred_edges))

# Common neighbors
pred_acc(cn_edges, pred_edges)
pred_acc(rand_edges, pred_edges)

0.002871205906480722

In [13]:
# Takes around 30 minutes to run
dist = floyd_warshall(train_adj_list, num_authors)
dist = dist.astype(int)


100%|██████████| 2559/2559 [39:36<00:00,  1.08it/s]


In [18]:
pickle.dump(dist, open(category + '_dist.pkl', 'wb'))

In [None]:
# Look at all distances of 2 
# Look at all distances 3
# Choose top k from these

In [14]:
dist_2 = []
for i in tqdm(range(num_authors)):
    for j in range(i + 1, num_authors):
        if dist[i][j] == 2:
            dist_2.append((i, j))

100%|██████████| 2559/2559 [00:01<00:00, 1861.27it/s]


In [17]:
len(dist_2)

43375