<img src="q6.png" alt="Drawing" style="width: 700px;"/>

In [2]:
import sys; sys.path.append("../../")
from graphs.directedgraph import DirectedGraph

import numpy as np

#### The example dataset, with 5 researchers who published 15 papers in total.
Assume that no researcher cites papers that they wrote.
<img src="q6_data.jpeg" alt="Drawing" style="width: 600px;"/>

In [9]:
# Author of each paper: position in the sequence is the paper id (0-14),
# the value stored there is the id of the researcher (0-4) who published it.
who_published = dict(enumerate(
    (0, 3, 4, 1, 0, 4, 4, 0, 1, 0, 2, 1, 2, 2, 0)
))

In [19]:
# Paper-level citation graph: one vertex per paper and one unit-weight edge
# "citing paper -> cited paper" for every citation in the dataset.
papers_graph = DirectedGraph(15)

# Citing paper id -> ids of the papers it cites. Papers that cite nothing
# (0, 7, 9 and 11) have no entry.
cited_by_paper = {
    1: (6, 14),
    2: (1, 3, 4),
    3: (1, 2, 4, 13),
    4: (2, 3, 5, 6),
    5: (1,),
    6: (3, 4, 7, 9, 10),
    8: (0, 13),
    10: (1, 5, 7, 9, 11),
    12: (1, 5, 7, 8, 9, 14),
    13: (1, 3, 6),
    14: (13,),
}

for citing_paper, cited_papers in cited_by_paper.items():
    for cited_paper in cited_papers:
        papers_graph.insertEdge(citing_paper, cited_paper, 1)

# Sanity check on the number of citations entered.
print(papers_graph.getNumEdges())

36


In [20]:
# Collapse the paper-level graph into the researcher-level citation matrix C,
# where C[a][b] counts the citations made by papers of researcher a to papers
# of researcher b.
citation_matrix = np.zeros((5, 5))

for citing_paper in range(15):
    citing_author = who_published[citing_paper]
    for cited_paper in range(15):
        if not papers_graph.edgeExists(citing_paper, cited_paper):
            continue
        cited_author = who_published[cited_paper]
        citation_matrix[citing_author][cited_author] += 1

print(citation_matrix)

[[0. 1. 1. 0. 3.]
 [2. 0. 2. 1. 1.]
 [5. 3. 0. 3. 3.]
 [1. 0. 0. 0. 1.]
 [4. 2. 1. 2. 0.]]


In [62]:
# Turn the citation matrix into the researchers graph: every non-zero entry
# C[a][b] becomes an edge a -> b whose weight is the citation count.
researchers_graph = DirectedGraph(5)
for citing_researcher, citation_row in enumerate(citation_matrix):
    for cited_researcher, num_cites in enumerate(citation_row):
        if num_cites == 0:
            continue
        researchers_graph.insertEdge(citing_researcher, cited_researcher, num_cites)

researchers_graph.printGraph()

Graph has 5 vertices and 17 edges

Edges: 
edge from 0-1 with weight 1.0
edge from 0-2 with weight 1.0
edge from 0-4 with weight 3.0
edge from 1-0 with weight 2.0
edge from 1-2 with weight 2.0
edge from 1-3 with weight 1.0
edge from 1-4 with weight 1.0
edge from 2-0 with weight 5.0
edge from 2-1 with weight 3.0
edge from 2-3 with weight 3.0
edge from 2-4 with weight 3.0
edge from 3-0 with weight 1.0
edge from 3-4 with weight 1.0
edge from 4-0 with weight 4.0
edge from 4-1 with weight 2.0
edge from 4-2 with weight 1.0
edge from 4-3 with weight 2.0


In [37]:
# Build the Google matrix from a directed graph
def google_matrix(W, alpha):
    """Return the Google matrix of the weighted directed graph ``W``.

    Parameters
    ----------
    W : graph object
        Must provide ``getNumVertices()``, ``edgeExists(i, j)`` and
        ``getEdgeWeight(i, j)``.
    alpha : float
        Damping factor applied to the link-following part of the matrix.

    Returns
    -------
    numpy.ndarray
        Square array with one row/column per vertex, computed as
        ``alpha * (G0 + d e^T / M) + (1 - alpha) / M * e e^T`` where ``G0`` is
        the row-normalised weight matrix, ``d`` flags the rows of ``G0`` that
        sum to zero and ``e`` is the all-ones column vector.
    """
    n = W.getNumVertices()
    transition = np.zeros((n, n))

    for src in range(n):
        row = transition[src]  # view: writes go straight into `transition`
        for dst in range(n):
            if W.edgeExists(src, dst):
                row[dst] = W.getEdgeWeight(src, dst)
        # Scale the row to sum to one; rows with a zero total are left as-is.
        total = np.sum(row)
        if total != 0:
            row /= total

    # Column vector with a 1 for every row whose entries sum to zero.
    dangling = np.array(
        [int(np.sum(transition[r]) == 0) for r in range(n)]
    ).reshape((n, 1))
    ones = np.full((n, 1), 1)

    # Zero-sum rows are replaced by a uniform row, then every row is blended
    # with a uniform jump weighted by (1 - alpha).
    dangling_jump = 1/n * (dangling @ ones.T)
    uniform_jump = (1 - alpha)/n * (ones @ ones.T)
    return alpha * (transition + dangling_jump) + uniform_jump

In [34]:
# PageRank iterative algorithm
def PageRank(W, max_iter=100, damping_factor=0.85, tol=1e-12):
    """Estimate the PageRank vector of graph ``W`` by power iteration.

    Parameters
    ----------
    W : graph object
        Passed to ``google_matrix`` (defined in this notebook); must also
        provide ``getNumVertices()``.
    max_iter : int
        Upper bound on the number of multiplications by the Google matrix.
    damping_factor : float
        The ``alpha`` handed to ``google_matrix``.
    tol : float
        Stop early once the L1 distance between two successive iterates drops
        below this value. ``tol=0`` disables early stopping, so exactly
        ``max_iter`` multiplications are performed.

    Returns
    -------
    numpy.ndarray
        Row vector of shape ``(1, M)`` holding the rank of each vertex.

    Notes
    -----
    The start vector is drawn from NumPy's global random state, so results
    can differ between runs by roughly the remaining convergence error. Call
    ``np.random.seed`` beforehand for reproducible output.
    """
    # Get the Google matrix for W
    M = W.getNumVertices()
    G = google_matrix(W, damping_factor)

    # Generate a random vector q with all entries between 0 and 1 and sum to 1
    # (probability distribution)
    q = np.random.rand(1, M)
    q /= np.sum(q)

    # Iteratively multiply q by G until convergence or max iterations reached
    for _ in range(max_iter):
        q_prev = q
        q = q_prev @ G
        if np.abs(q - q_prev).sum() < tol:
            break

    return q

In [35]:
# Part (a)

In [64]:
# Rank the individual papers; PageRank returns a single-row 2-D array, so
# take row 0 to get one score per paper.
paper_ranks = PageRank(papers_graph)[0]

# A researcher's score is the sum of the ranks of the papers they published.
ranks_a = np.zeros(5)
for paper, author in who_published.items():
    ranks_a[author] += paper_ranks[paper]

print(ranks_a)

[0.29028023 0.155624   0.17942394 0.14068812 0.23398371]


In [65]:
# Part (b)

In [66]:
# Run PageRank directly on the weighted researcher-to-researcher graph.
researcher_rank_rows = PageRank(researchers_graph)
ranks_b = researcher_rank_rows[0]  # the result has a single row

print(ranks_b)

[0.28166154 0.1577802  0.14896891 0.13224993 0.27933942]


Here's the dataset again. Did both methods rank the researchers in the same order?
<img src="q6_data.jpeg" alt="Drawing" style="width: 600px;"/>

In [71]:
# argsort orders researcher ids from lowest to highest score, so the arrays
# are walked back to front to list the best-ranked researcher first.
order_a = np.argsort(ranks_a)
order_b = np.argsort(ranks_b)

print("Ranks:  (a)   (b)")
for place, (researcher_a, researcher_b) in enumerate(
    zip(order_a[::-1], order_b[::-1]), start=1
):
    print(f"{place}        {researcher_a}     {researcher_b}")

Ranks:  (a)   (b)
1        0     0
2        4     4
3        2     1
4        1     2
5        3     3


In [None]:
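# The two rankings are almost the same: researchers 0, 4 and 3 hold places
# 1, 2 and 5 under both methods. Only researchers 1 and 2 swap 3rd and 4th
# place, and their scores are close to each other in both (a) and (b).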