# A simple PageRank implementation

In [1]:
# Standard library
import json

# Third-party
import numpy as np
from google.colab import drive

# Mount Google Drive at /content/drive; the example graph read further down
# lives under that mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
def read_graph(filename):
    """Load a graph description from a JSON file.

    Parameters
    ----------
    filename : path of the JSON file to read (opened in text read mode).

    Returns
    -------
    The decoded JSON content, returned unchanged (no post-processing).
    For the example file shown in this notebook this is a dict mapping each
    node name to the list of node names it links to.
    """
    # Read the whole file first, then decode it once the file is closed.
    with open(filename, 'r') as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

In [4]:
# Sanity check: load the example graph from the mounted Drive and display it.
read_graph(filename="/content/drive/MyDrive/Pagerank2024/data/example.json")

{'a': ['b', 'c', 'd'],
 'b': ['e', 'f'],
 'c': ['e'],
 'd': ['a'],
 'e': ['d'],
 'f': ['a']}

In [None]:
def compute_R(graph):
    """Build the row-stochastic link-following matrix R of a graph.

    Parameters
    ----------
    graph : dict mapping each node to the list of nodes it links to.
        Every destination must itself be a key of ``graph``.

    Returns
    -------
    numpy.ndarray of shape (n, n), where n is the number of keys in ``graph``.
    Rows and columns follow the iteration order of ``graph``. ``R[i, j]`` is
    the probability of moving from node i to node j when a link of node i is
    picked uniformly at random, so every row sums to 1.

    Raises
    ------
    KeyError
        If an adjacency list mentions a node that is not a key of ``graph``.
    """
    nodes = list(graph)
    n = len(nodes)

    # Map each node (key in the graph) to its row/column index in the matrix.
    key_to_pos = {node: pos for pos, node in enumerate(nodes)}

    R = np.zeros((n, n))

    for i, source in enumerate(nodes):
        successors = graph[source]
        # Out-degree of 'source' is the length of its adjacency list.
        out_deg = len(successors)

        if out_deg == 0:
            # Dangling node: without outgoing links the row would be all zeros
            # and rank mass would leak out of the chain at every iteration.
            # Treat it as linking to every node with equal probability.
            R[i, :] = 1 / n
            continue

        for dest in successors:
            if dest not in key_to_pos:
                raise KeyError(
                    f"node {dest!r} is linked from {source!r} but is not a key of the graph"
                )
            # Accumulate rather than assign: a destination listed k times must
            # receive k/out_deg, otherwise the row no longer sums to 1.
            R[i, key_to_pos[dest]] += 1 / out_deg

    return R

In [None]:
def PageRank_iteration(x, R, J, alpha):
    """Perform one power-iteration step of PageRank: ``x' = x P``.

    The transition matrix is ``P = alpha * 1 J + (1 - alpha) * R``, i.e. with
    probability ``alpha`` the surfer jumps according to ``J`` and with
    probability ``1 - alpha`` follows a link according to ``R``. Note that
    ``alpha`` is therefore the teleportation probability, not the damping
    factor (which would be ``1 - alpha``).

    Parameters
    ----------
    x : current rank vector, either a length-n 1-D array or a 1×n matrix
        (the type this function returns, so the result can be fed back in).
    R : n×n link-following matrix.
    J : length-n jump distribution (1-D array or 1×n).
    alpha : teleportation probability.

    Returns
    -------
    numpy.matrix of shape (1, n) holding the updated rank vector. The matrix
    type is kept so callers that index the result as ``x[0, i]`` keep working.
    """
    # Normalise x to an explicit 1×n row. len(x) must not be used for n here:
    # on a 1×n matrix it is 1, not the number of nodes.
    x_row = np.asarray(x).reshape(1, -1)
    n = x_row.shape[1]
    J_row = np.asarray(J).reshape(1, -1)

    # 1 * J: an n×n matrix whose rows are all equal to J (the random jump part).
    teleport = np.ones((n, 1)) @ J_row

    # Weighted combination of random jump and link following.
    P = alpha * teleport + (1 - alpha) * np.asarray(R)

    # np.asmatrix replaces np.mat, which was removed in NumPy 2.0.
    return np.asmatrix(x_row @ P)


In [None]:
def compute_PageRank(graph, alpha, epsilon, max_iter=10000):
    """Compute PageRank scores of a graph by power iteration.

    Starting from the uniform vector, PageRank_iteration is applied until the
    L1 distance between two consecutive rank vectors is at most ``epsilon``
    or ``max_iter`` iterations have been performed. The error of every
    iteration and the final score of every node are printed.

    Parameters
    ----------
    graph : dict mapping each node to the list of nodes it links to.
    alpha : teleportation (random jump) probability passed to
        PageRank_iteration.
    epsilon : convergence threshold on the sum of absolute differences
        between consecutive rank vectors.
    max_iter : upper bound on the number of iterations. Without it the loop
        never ends when the iteration does not converge (e.g. alpha = 0 on a
        periodic graph, where the rank vector oscillates forever).

    Returns
    -------
    The final rank vector as returned by PageRank_iteration (a 1×n matrix).
    If no iteration was performed, the initial 1-D uniform vector is returned.
    """
    import warnings  # local import keeps this cell self-contained

    # Number of nodes in the graph
    n = len(graph)

    # Link-following matrix of the graph (no teleportation)
    R = compute_R(graph)

    # Jump vector J: uniform distribution, each entry is 1/n
    J = np.ones(n) / n

    # Initial rank vector: uniform distribution as well.
    # (A random non-negative vector normalised to sum 1 is an alternative start.)
    x = np.ones(n) / n

    # Start with an infinite error so that the loop body runs at least once
    err = np.inf
    iterations = 0

    while err > epsilon:
        if iterations >= max_iter:
            warnings.warn(
                f"PageRank did not converge within {max_iter} iterations "
                f"(last error: {err})"
            )
            break

        x_new = PageRank_iteration(x, R, J, alpha)

        # Error measure: sum of absolute differences between new and old x
        err = (abs(x_new - x)).sum()

        # Progress trace: error of the current iteration
        print(err)

        x = x_new
        iterations += 1

    # Flatten before indexing: x is a 1×n matrix after an iteration but still
    # a 1-D array if the loop never ran.
    scores = np.asarray(x).ravel()

    print("PageRank scores:")
    for k, score in zip(graph.keys(), scores):
        print(f"{k}: {score}")

    return x

In [None]:
# Load the example graph from the mounted Drive, using the same absolute path
# as the earlier sanity-check cell. A bare relative "example.json" would depend
# on the kernel's current working directory and fail on a fresh Colab runtime.
G = read_graph("/content/drive/MyDrive/Pagerank2024/data/example.json")

# Display the adjacency lists that were loaded
G

{'a': ['b', 'c', 'd'],
 'b': ['e', 'f'],
 'c': ['e'],
 'd': ['a'],
 'e': ['d'],
 'f': ['a']}

In [None]:
# Compute the PageRank of the graph 'G'.
# alpha is the random-jump (teleportation) weight used by PageRank_iteration and is
# set to 0.000001 here; epsilon, the convergence threshold, is set to 0.01.
compute_PageRank(G, alpha=0.000001, epsilon=0.01)
# Author's note: with other values of alpha < 1 the scores differ, but the
# ranking is reported to remain the same (not verified in this notebook).

0.6111105
0.4999990000004998
0.3888877222233888
0.3333320000019999
0.3240724537069444
0.29629451852296296
0.24691185185703704
0.21141806173431477
0.19495709311401443
0.17746736111909725
0.1532047619254359
0.13237152538596558
0.11896850369583647
0.10732302878777282
0.09419630842800522
0.08213174486582284
0.07299560278659371
0.06533915564239529
0.05774120167501613
0.050679485857717026
0.04485485752901306
0.03993907056741703
0.035375518991823576
0.031176851305714848
0.027562663343478733
0.024465816245472734
0.021679098521830677
0.01914985562794138
0.016930004459697577
0.015002959386493757
0.013291196406280084
0.011754207029557984
0.010394872581580998
0.009204318105636766
PageRank scores:
a: 0.31462010452707145
b: 0.10570560457849008
c: 0.10570560457849008
d: 0.26443491531342317
e: 0.15715024491856097
f: 0.05238352608396478


matrix([[0.3146201 , 0.1057056 , 0.1057056 , 0.26443492, 0.15715024,
         0.05238353]])

In [None]:
# NOTE(review): stray numeric literal with no computation attached. It is close
# to, but not equal to, the score printed for node 'f' above, so it presumably
# comes from a different run. TODO: remove this cell or state what it records.
0.052384729231159846

0.052384729231159846