## PageRank

## Exercise 1

In [113]:
import numpy as np
from scipy.sparse import dok_matrix
from scipy import linalg as la
import pandas as pd

In [76]:
def readmatfromtext(filename):
    '''
    Accepts a .txt file with nodes listed in each line and
    creates an adjacency matrix from them.
    '''
    nodes = []
    N = 0
    with open(filename, 'r') as myfile:
        for line in myfile:
            try:
                line = list(map(int, line.strip().split()))
                nodes.append(line)
            except:
                pass
    #Grab highest node
    N = np.amax(nodes) + 1
    A = np.zeros((N,N))
    for i in range(N):
        for j in range(N):
            if[i, j] in nodes:
                A[i, j] = 1
    return dok_matrix(A)

readmatfromtext("Data/matrix.txt").toarray()

array([[0., 0., 0., 0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 1., 0., 0., 0., 1., 0.],
       [1., 0., 0., 0., 0., 1., 1., 0.],
       [1., 0., 0., 0., 0., 0., 1., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0.]])

## Exercise 2

In [77]:
def Kmat(A):
    '''
    Accepts an adjacency matrix and returns the K matrix
    '''
    #Compute modified adjacency matrix
    N = A.shape[0]
    A[A.sum(axis=1) == 0, :] = np.ones(N)
    # Get diagonals
    D = A.sum(axis = 1)
    K = A.T / D
    return(K)

Data = readmatfromtext("Data/matrix.txt").toarray()

Kmat(Data)

array([[0.        , 1.        , 0.125     , 0.33333333, 0.33333333,
        0.5       , 1.        , 1.        ],
       [0.        , 0.        , 0.125     , 0.        , 0.        ,
        0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.125     , 0.33333333, 0.        ,
        0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.125     , 0.        , 0.        ,
        0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.125     , 0.        , 0.        ,
        0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.125     , 0.        , 0.33333333,
        0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.125     , 0.33333333, 0.33333333,
        0.5       , 0.        , 0.        ],
       [1.        , 0.        , 0.125     , 0.        , 0.        ,
        0.        , 0.        , 0.        ]])

## Exercise 3

In [138]:
def steadystate(A, N = 'none', d =.85, tol = 1e-5, maxiter=1000):
    '''
    Input A is the adjacency matrix - we can find it by using
    for example the function above. Output will be a probability
    density vector for each of the nodes of the graph.
    '''
    # Restrict to relevant section of A
    if N != 'none':
        A = A[:N,:N]
    n = A.shape[0]
    #Initialize random vector for p_t
    p_t = np.random.rand(n)
    #normalize
    p_t = p_t / np.sum(p_t)
    #Initialize distance and iter counters
    pdist = 5
    iter = 0
    while (pdist > tol) & (iter < maxiter):
        pnext = d * A @ p_t + ((1 - d)/ n)
        pdist = la.norm(pnext - p_t)
        p_t = pnext
        print("At iter:", iter, "Distance was", pdist)
        iter += 1
    return(p_t)

Data = readmatfromtext("Data/matrix.txt").toarray()
A = Kmat(Data)
steadystate(A)
    

At iter: 0 Distance was 0.7260867639566944
At iter: 1 Distance was 0.6957982673082554
At iter: 2 Distance was 0.5627622767509463
At iter: 3 Distance was 0.488143843010561
At iter: 4 Distance was 0.4120713564186548
At iter: 5 Distance was 0.35094742516819244
At iter: 6 Distance was 0.2981471014898515
At iter: 7 Distance was 0.25346256132340234
At iter: 8 Distance was 0.21543442984399577
At iter: 9 Distance was 0.18312132459361782
At iter: 10 Distance was 0.15565264379289695
At iter: 11 Distance was 0.13230486044031453
At iter: 12 Distance was 0.11245910483150458
At iter: 13 Distance was 0.09559024533523429
At iter: 14 Distance was 0.08125170707412818
At iter: 15 Distance was 0.06906395135572369
At iter: 16 Distance was 0.05870435857197492
At iter: 17 Distance was 0.04989870480503728
At iter: 18 Distance was 0.042413899079857896
At iter: 19 Distance was 0.03605181421891696
At iter: 20 Distance was 0.03064404208583597
At iter: 21 Distance was 0.026047435773017696
At iter: 22 Distance was 

array([0.43869583, 0.02171029, 0.02786154, 0.02171029, 0.02171029,
       0.02786154, 0.04585394, 0.3945963 ])

## Problem 4 Incomplete

In [111]:
def sseigen(A, N='none', d = .85):
    n = len(A)
    #Define B matrix
    K = Kmat(A)
    # Everything good except K - fix!!!
    E = np.ones((n, n))
    B = (d * K + ((1 - d) / n) * E)
    #Get eigenvalues
    eigs, eigvecs = la.eig(B)
    p_ss = eigvecs[:, 0]/ np.sum(eigvecs[:,0])
    return p_ss

Data = readmatfromtext("Data/matrix.txt").toarray()
A = Kmat(Data)
#print(steadystate(A))
print(sseigen(A))

[[0.         0.         0.         0.         0.         0.
  0.         0.88888889]
 [0.23300971 0.         0.         0.         0.         0.
  0.         0.        ]
 [0.02912621 1.         0.27272727 1.         1.         0.27272727
  0.09677419 0.11111111]
 [0.0776699  0.         0.72727273 0.         0.         0.
  0.25806452 0.        ]
 [0.0776699  0.         0.         0.         0.         0.72727273
  0.25806452 0.        ]
 [0.11650485 0.         0.         0.         0.         0.
  0.38709677 0.        ]
 [0.23300971 0.         0.         0.         0.         0.
  0.         0.        ]
 [0.23300971 0.         0.         0.         0.         0.
  0.         0.        ]]
[0.03870928 0.02641669 0.48031139 0.32401996 0.04643401 0.03127529
 0.02641669 0.02641669]


## Exercise 5

In [148]:
raw = pd.read_csv('Data/ncaa2013.csv')
winners = raw['WINNING_TEAM']
# NOTE THAT THERE IS A SPACE BEFORE LOSING TEAM!!!
losers = raw[' LOSING_TEAM']
names = np.unique(np.concatenate((winners, losers)))
n = len(names)
# Create adjacency matrix (Code from link predictor)
adj = np.zeros((n, n))
for i in raw.index:
    name1, name2 = winners[i], losers[i]
    name1loc = np.where(name1 == names)
    name2loc = np.where(name2 == names)
    '''
    Note - this is where I deviate from the earlier method.
    Now, I have a directed graph, so I only want one of the
    connections to be switched to 1.
    '''
    adj[name2loc, name1loc] = 1
#Grab true adjacency matrix
A = Kmat(adj)
#Get Steady State of Adjacency matrix.
ss = steadystate(A, d=.7)
# Pull rank indices
namerank = np.argsort(ss)
best = names[namerank]
print(f"The 5 best teams are: {best[-5:][::-1]}")
print("You can tell this is a bad algorithm because Notre Dame isn't ranked number 1!")
#np.where('Cleveland St'==names), np.where('Grambling'==names)


At iter: 0 Distance was 0.03578377358007139
At iter: 1 Distance was 0.010247763970980997
At iter: 2 Distance was 0.004032532425221514
At iter: 3 Distance was 0.0019028916578856445
At iter: 4 Distance was 0.0009349131718913085
At iter: 5 Distance was 0.0004668395399876965
At iter: 6 Distance was 0.00023642918406946914
At iter: 7 Distance was 0.00012045238527362104
At iter: 8 Distance was 6.16930124081624e-05
At iter: 9 Distance was 3.171013429757476e-05
At iter: 10 Distance was 1.6341773071825042e-05
At iter: 11 Distance was 8.438238337842991e-06
The 5 best teams are: ['Duke' 'Butler' 'Louisville' 'Illinois' 'Indiana']
You can tell this is a bad algorithm because Notre Dame isn't ranked number 1!
