# PageRank algorithm

In [18]:
import numpy as np
from scipy.sparse import dok_matrix
import scipy.linalg
from numba import njit, prange

### Problem 1

In [19]:
def adjacency_matrix(filename, N):
    """Build the sparse adjacency matrix of a directed graph from an edge list.

    Every line of the file is expected to contain two whitespace-separated
    integers ``i j``; such a line sets ``A[i, j] = 1``. Lines that do not
    consist of exactly two integers (blank lines, headers, malformed rows)
    and edges with an endpoint outside ``range(N)`` are ignored.

    Parameters
    ----------
    filename : str
        Path of the edge-list text file.
    N : int
        Number of nodes; the result has shape ``(N, N)``.

    Returns
    -------
    scipy.sparse.dok_matrix
        ``N x N`` float matrix with ones at the listed edges.
    """
    # Fill the sparse matrix directly: one pass over the edges instead of
    # testing all N*N index pairs against the edge list.
    A = dok_matrix((N, N), dtype=np.float64)
    with open(filename, 'r') as f:
        for line in f:
            fields = line.split()
            if len(fields) != 2:
                continue
            try:
                i, j = int(fields[0]), int(fields[1])
            except ValueError:
                # Non-numeric row (e.g. a header or comment): skip it.
                continue
            if 0 <= i < N and 0 <= j < N:
                A[i, j] = 1.0
    return A

In [20]:
# Load the 8-node example graph from the edge list in matrix.txt.
A = adjacency_matrix('matrix.txt', 8)

In [21]:
# Display the sparse adjacency matrix as a dense array.
A.toarray()

array([[0., 0., 0., 0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 1., 0., 0., 0., 1., 0.],
       [1., 0., 0., 0., 0., 1., 1., 0.],
       [1., 0., 0., 0., 0., 0., 1., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0.]])

### Problem 2

In [22]:
@njit
def calculate_K(A, N):
    """Return the column-normalised transition matrix of adjacency matrix ``A``.

    Rows of ``A`` that sum to zero (nodes without outgoing edges) are
    replaced by rows of ones before normalising, so every node gets a
    non-zero out-degree. The result satisfies ``K[i, j] = A[j, i] / D[j]``
    with ``D`` the row sums of the patched matrix.

    Parameters
    ----------
    A : 2-D array
        Square adjacency matrix; it is not modified.
    N : int
        Number of nodes; must equal the number of columns of ``A``.

    Returns
    -------
    2-D array
        ``N x N`` matrix ``A.T / D``.
    """
    # Patch a private copy: the original version overwrote the caller's
    # array when filling the zero rows.
    adj = A.copy()
    adj[adj.sum(axis=1) == 0, :] = np.ones(N)
    D = adj.sum(axis=1)
    # Broadcasting divides column j of adj.T by D[j].
    return (adj.T / D)

In [23]:
# Transition matrix of the 8-node example, computed from a dense copy of A.
calculate_K(A.toarray(), 8)

array([[0.        , 1.        , 0.125     , 0.33333333, 0.33333333,
        0.5       , 1.        , 1.        ],
       [0.        , 0.        , 0.125     , 0.        , 0.        ,
        0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.125     , 0.33333333, 0.        ,
        0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.125     , 0.        , 0.        ,
        0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.125     , 0.        , 0.        ,
        0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.125     , 0.        , 0.33333333,
        0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.125     , 0.33333333, 0.33333333,
        0.5       , 0.        , 0.        ],
       [1.        , 0.        , 0.125     , 0.        , 0.        ,
        0.        , 0.        , 0.        ]])

### Problem 3

In [24]:
@njit
def pagerank(A, N=None, d=0.85, tol=1e-5, max_iter=500):
    """Compute PageRank values by fixed-point iteration.

    Iterates ``p <- d * K p + (1 - d) / N`` starting from the uniform
    vector, where ``K = calculate_K(A, N)``, until the Euclidean distance
    between successive iterates is at most ``tol`` or ``max_iter``
    iterations have been performed.

    Parameters
    ----------
    A : 2-D array
        Square adjacency matrix; it is not modified.
    N : int, optional
        Use only the first ``N`` nodes (top-left ``N x N`` block of ``A``).
        Defaults to all nodes.
    d : float
        Damping factor.
    tol : float
        Convergence tolerance on the 2-norm of the update.
    max_iter : int
        Maximum number of iterations.

    Returns
    -------
    1-D array
        PageRank vector of length ``N``.
    """
    if N is None:
        N = A.shape[0]
    # The slice end is exclusive, so [:N] keeps exactly N nodes (the former
    # [:N+1] kept one too many whenever N was smaller than the matrix).
    # The copy guarantees the caller's array is never modified downstream.
    sub = A[:N, :N].copy()

    K = calculate_K(sub, N)

    p = np.ones(N) / N
    # Constant teleportation term, hoisted out of the loop.
    teleport = np.full(N, (1 - d) / N)

    diff = 1e3
    i = 0

    while diff > tol and i < max_iter:
        # Parenthesised so the matrix-vector product is scaled, not the
        # whole matrix on every iteration.
        p_new = d * (K @ p) + teleport
        diff = np.linalg.norm(p - p_new)
        p = p_new
        i += 1

    return p

In [25]:
# Iterative PageRank of the example graph with the default parameters.
pagerank(A.toarray())

array([0.43868966, 0.02171029, 0.02786154, 0.02171029, 0.02171029,
       0.02786154, 0.04585394, 0.39460246])

### Problem 4

In [32]:
def pagerank_eigen(A, N=None, d=0.85, tol=1e-5, max_iter=500):
    """Compute PageRank values from an eigendecomposition.

    Builds ``B = d * K + (1 - d) / N`` with ``K = calculate_K(A, N)`` and
    returns the eigenvector of ``B`` belonging to the eigenvalue with the
    largest real part, scaled so that its entries sum to one.

    Parameters
    ----------
    A : 2-D array
        Square adjacency matrix; it is not modified.
    N : int, optional
        Use only the first ``N`` nodes (top-left ``N x N`` block of ``A``).
        Defaults to all nodes.
    d : float
        Damping factor.
    tol, max_iter
        Unused; accepted so the signature matches ``pagerank``.

    Returns
    -------
    1-D array
        Real PageRank vector of length ``N``.
    """
    if N is None:
        N = A.shape[0]
    # The slice end is exclusive, so [:N] keeps exactly N nodes (the former
    # [:N+1] kept one too many whenever N was smaller than the matrix).
    # np.array copies, so the caller's matrix is never modified downstream.
    sub = np.array(A[:N, :N], dtype=float)

    K = calculate_K(sub, N)

    # Adding the scalar broadcasts it to every entry of the matrix.
    B = d * K + (1 - d) / N

    eigs, eigvecs = scipy.linalg.eig(B)
    # eig reports eigenvalues as complex numbers; compare their real parts.
    lead = np.argmax(eigs.real)
    v = eigvecs[:, lead]

    # After normalising by the sum, drop any imaginary part left over
    # from a complex-typed eigenvector array.
    return np.real(v / v.sum())

In [33]:
# Eigenvector-based PageRank of the example graph, for comparison with the iterative result.
pagerank_eigen(A.toarray())

array([0.43869288, 0.02171029, 0.02786154, 0.02171029, 0.02171029,
       0.02786154, 0.04585394, 0.39459924])

### Problem 5

In [11]:
# Read the game results: discard the first line (presumably a header), then
# split every remaining line into its comma-separated fields.
with open('ncaa2013.csv', 'r') as fh:
    next(fh, None)
    win_lose = np.array([row.strip().split(',') for row in fh])

# Give each distinct team name an integer id, following np.unique's sorted order.
teams = np.unique(win_lose.ravel())
N = len(teams)
team_id = {name: idx for idx, name in enumerate(teams)}

# Same games, expressed as [winner id, loser id] pairs.
win_lose_id = np.array([[team_id[win], team_id[lose]] for win, lose in win_lose])

In [12]:
# One row and one column per team; a 1 at [loser, winner] records that
# the row's team lost a game to the column's team.
A = np.zeros((N, N))

for winner, loser in win_lose_id:
    A[loser, winner] = 1

In [13]:
# Rank the teams with damping factor 0.7 and keep the five highest scores,
# best first.
p = pagerank(A, d=0.7)
rank_id = np.argsort(p)[::-1][:5]  # Top 5 teams

list(teams[rank_id])

['Duke', 'Butler', 'Louisville', 'Illinois', 'Indiana']