In [18]:
import pandas as pd
import numpy as np
import scipy.sparse as sp
from scipy.sparse import csr_matrix

In [19]:
def normalize(mx):
    """Row-normalise a matrix: divide every row by the sum of its entries.

    Rows whose sum is zero are divided by 1 instead, so they come back
    unchanged rather than producing inf/NaN.

    Parameters
    ----------
    mx : matrix supporting ``.sum(1)`` and left-multiplication by a sparse
        diagonal matrix (the notebook passes a scipy CSR matrix).

    Returns
    -------
    The product ``D @ mx`` where ``D`` is the diagonal matrix of reciprocal
    row sums. The input is not modified.
    """
    row_totals = np.array(mx.sum(1))
    # Swap zero totals for 1 so the reciprocal below stays finite.
    row_totals[row_totals == 0] = 1
    scale = sp.diags(np.power(row_totals.astype(float), -1).flatten())
    return scale @ mx

In [20]:
def pagerank(edges, beta=0.85, max_iter=1000, tol=1e-12):
    """Compute PageRank scores by power iteration.

    Each step applies ``w <- beta * (edges @ w) + (1 - beta) / n`` starting
    from the uniform vector, where ``n = edges.shape[0]``.

    Parameters
    ----------
    edges : (n, n) matrix (dense or scipy sparse) that is multiplied on the
        right by the current weight column vector. The notebook passes the
        transpose of a row-normalised adjacency matrix. Columns that sum to
        zero contribute nothing, so the scores then sum to less than 1.
    beta : float, default 0.85
        Damping factor, i.e. the weight given to following links.
    max_iter : int, default 1000
        Upper bound on the number of iterations.
    tol : float, default 1e-12
        Absolute tolerance passed to ``np.allclose`` as ``atol``. The
        default relative tolerance of ``np.allclose`` still applies.

    Returns
    -------
    numpy.ndarray of shape (n, 1) holding the most recent iterate.
    An empty graph (``n == 0``) yields an empty ``(0, 1)`` array.
    """
    n_pages = edges.shape[0]
    if n_pages == 0:
        # Nothing to rank; also avoids dividing by zero below.
        return np.ones((0, 1))

    page_weights = np.ones((n_pages, 1)) / n_pages
    # Loop-invariant teleportation term, computed once.
    teleport = (1 - beta) / n_pages

    for _ in range(max_iter):
        # Scale the matrix-vector product (n values), not the matrix itself:
        # `beta * edges @ w` would rebuild a scaled copy of `edges` each pass.
        new_page_weights = beta * (edges @ page_weights) + teleport
        converged = np.allclose(new_page_weights, page_weights, atol=tol)
        # Keep the freshest iterate before deciding whether to stop, so the
        # value returned on convergence is the latest one.
        page_weights = new_page_weights
        if converged:
            break
    return page_weights

In [21]:
# Load the directed link list; every row is one edge FromNodeId -> ToNodeId.
raw_data = pd.read_csv('web_links.csv')
# Size the matrix from the largest id in EITHER column. Without an explicit
# shape, csr_matrix infers (max(From)+1, max(To)+1), which is non-square
# whenever the highest-numbered node appears in only one column and then
# breaks the matrix-vector product inside pagerank().
n_nodes = int(max(raw_data['FromNodeId'].max(), raw_data['ToNodeId'].max())) + 1
edges = csr_matrix(
    (np.ones(len(raw_data)), (raw_data['FromNodeId'], raw_data['ToNodeId'])),
    shape=(n_nodes, n_nodes),
)
del raw_data
# Row-normalise the out-links, then transpose so that `edges @ weights`
# sums the contributions arriving at each node.
edges = normalize(edges).T

In [23]:
# Run PageRank on the link matrix and flatten the (n, 1) column returned by
# pagerank() into a 1-D array indexed by node id.
page_weights = pagerank(edges).flatten()

In [24]:
# Node ids sorted from highest to lowest score, truncated to the first 1000.
top1k = np.flip(np.argsort(page_weights))[:1000]
# Two-column table: node id and its score, in rank order.
result = pd.DataFrame({'NodeId': top1k, 'PageRank_Value': page_weights[top1k]})
result.to_csv('test_prediction.csv', index=False)

In [25]:
# Re-read the link list from disk; the earlier `raw_data` frame was deleted
# right after the adjacency matrix was built.
raw_data = pd.read_csv('web_links.csv')