In [1]:
import numpy as np

In [3]:
class Node:
    """
    A single vertex of a directed graph.

    Every node keeps two lists: ``children`` (the nodes it points to) and
    ``parents`` (the nodes pointing to it). Both linking methods treat the
    node name as the identity: a node whose name already appears in the
    target list is not appended a second time.
    """

    def __init__(self, name):
        self.name = name
        self.children = []
        self.parents = []

    def __repr__(self):
        return "node " + self.name

    def link_child(self, new_child):
        """Append ``new_child`` to the children unless its name is already listed."""
        name_taken = any(
            existing.name == new_child.name for existing in self.children
        )
        if not name_taken:
            self.children.append(new_child)

    def link_parent(self, new_parent):
        """Append ``new_parent`` to the parents unless its name is already listed."""
        name_taken = any(
            existing.name == new_parent.name for existing in self.parents
        )
        if not name_taken:
            self.parents.append(new_parent)
        

In [4]:
class Graph:
    """
    Directed graph used as the input of the PageRank computation.

    The graph is essentially a flat list of nodes (``self.nodes``). The
    position of a node in that list is also its row/column index in the
    adjacency matrix. Nodes are identified by name; the methods below check
    whether a node is present, fetch it (creating it when missing), link two
    nodes and compute the adjacency matrix.
    """

    def __init__(self):
        self.nodes = []

    def __len__(self):
        return len(self.nodes)

    def contains(self, name):
        """Return True when a node called ``name`` is stored in the graph."""
        return any(node.name == name for node in self.nodes)

    def find(self, name):
        """Return the node called ``name``, creating and storing it if absent."""
        for candidate in self.nodes:
            if candidate.name == name:
                return candidate

        # No match: register a brand new node at the end of the list.
        created = Node(name)
        self.nodes.append(created)
        return created

    def add_edge(self, parent, child):
        """Add the directed edge ``parent -> child`` (both given as node names)."""
        source = self.find(parent)
        target = self.find(child)

        source.link_child(target)
        target.link_parent(source)

    def adjacency_matrix(self):
        """
        Return the adjacency matrix as a square float array.

        Entry ``[i, j]`` is 1 when the j-th node is listed among the children
        of the i-th node, and 0 otherwise.
        """
        size = len(self.nodes)
        all_names = [node.name for node in self.nodes]
        A = np.zeros((size, size))
        for i, node in enumerate(self.nodes):
            linked_names = [child.name for child in node.children]
            A[i] = [name in linked_names for name in all_names]
        return A

In [5]:
def build_graph(file_name):
    """
    Creates and returns a graph object from the edge list contained in the given path.

    The file is expected to hold one directed edge per line, written as
    ``parent,child``. Blank lines are skipped and whitespace surrounding each
    identifier is removed, so ``"1, 2"`` and ``"1,2"`` describe the same edge.

    Parameters
    ----------
    file_name : string
        Path of the file containing the graph definition. It is passed to
        ``open`` unchanged, so a relative path is resolved against the current
        working directory.

    Returns
    -------
    graph : Graph
        The graph built from file_name.

    Raises
    ------
    ValueError
        If a non-blank line does not contain exactly two comma-separated fields.

    """
    graph = Graph()

    with open(file_name) as file:
        # Stream the file line by line rather than loading it all in memory.
        for line_number, line in enumerate(file, start=1):
            record = line.strip()
            if not record:
                # Empty lines (typically a trailing newline at the end of the
                # file) carry no edge and must not abort the whole load.
                continue

            fields = [field.strip() for field in record.split(',')]
            if len(fields) != 2:
                raise ValueError(
                    "{}, line {}: expected 'parent,child', got {!r}".format(
                        file_name, line_number, record
                    )
                )

            parent, child = fields
            graph.add_edge(parent, child)

    return graph

In [9]:
def P_matrix(graph):
    """
    Build the P matrix used by the PageRank algorithm (see Langville and Gleich).

    Each column of the transposed adjacency matrix is divided by its own sum.
    Columns whose sum is zero (sink nodes) are left untouched, i.e. made of
    all 0s: no policy for sink nodes is embedded here, so P is in general a
    substochastic matrix (as in a pseudo-PageRank problem).

    Parameters
    ----------
    graph : Graph
        The graph which P refers to. Only its ``adjacency_matrix()`` method
        is used.

    Returns
    -------
    P : numpy array
        A 2 dimensional array equal to the transposed adjacency matrix
        multiplied by the Penrose pseudoinverse of the D matrix (see
        Langville and Gleich).

    """
    transposed = graph.adjacency_matrix().T
    column_totals = transposed.sum(axis=0)
    # Adding the boolean mask turns every zero total into 1 and leaves the
    # other totals unchanged, so all-zero columns are divided by 1 and stay 0.
    divisors = column_totals + (column_totals == 0)
    return transposed / divisors

def pageRank(graph, alpha=0.85, max_iterations=400, algo="iterative", rround="yes",
             tol=1e-16):
    """
    Returns the PageRank value for each of the nodes of the given graph, using the
    given parameters. The probability mass held by sink nodes is redistributed
    uniformly over all the nodes (the random surfer is teleported to a random node
    following a uniform distribution). The personalization vector is also uniform.

    Parameters
    ----------
    graph : Graph (own implementation)
        The graph containing the nodes to compute the PageRank for.
    alpha : float, optional
        The damping parameter of the algorithm. The default is 0.85.
    max_iterations: int, optional
        The maximum number of iterations to do in case an iterative procedure is chosen (see
        algo parameter): when the maximum value is reached, the execution of the function stops
        and the result obtained at that point is returned. The default is 400.
    algo : string, optional
        Used to distinguish between an iterative application of the algorithm and exact one.
        The value "exact" delegates to pageRank_exact; any other value runs the iterative
        version, which applies the update rule contained in Gleich's paper until
        convergence or maximum iterations reached, starting from a uniform distribution
        over all the nodes. The default value of the parameter is "iterative".
    rround : string, optional
        When equal to "yes", the pg values are rounded to the first 3 decimal digits;
        any other value returns them unrounded. The default is "yes".
    tol : float, optional
        Convergence threshold: the iteration stops as soon as the 1-norm of the difference
        between two consecutive iterates is below this value. The default is 1e-16, which
        is smaller than the float64 machine epsilon, so the loop may well run for all
        max_iterations; pass a larger value to stop earlier.

    Returns
    -------
    numpy array
        An array containing the pg value for each node. Each value refers to the node in graph
        which holds the same position in the graph's node list. An empty array is returned
        for a graph without nodes.
    """
    N = len(graph)
    if N == 0:
        # Without this guard the uniform vector below divides by zero.
        return np.zeros(0)

    if algo == "exact":
        return pageRank_exact(graph, alpha, rround)

    P = P_matrix(graph)
    sinks = np.sum(P, axis=0) == 0   # True where the column of P is all zeros
    v = np.repeat(1.0/N, N)  # teleportation vector
    x = v.copy()   # initialization vector
    dangling = v   # sink nodes policy vector (same uniform values, never modified)

    for _ in range(max_iterations):
        x_old = x
        # np.inner(sinks, x_old) is the total mass currently sitting on sink nodes.
        x = alpha*(P @ x_old + (np.inner(sinks, x_old) * dangling)) + (1-alpha) * v
        if np.linalg.norm(x - x_old, ord=1) < tol:  # 1-norm as convergence measure
            break

    if rround == "yes":
        return np.round(x, 3)
    return x

def pageRank_exact(graph, alpha=0.85, rround="yes"):
    """
    Exact resolution for the PageRank problem. It is done by solving the linear system
    (I - alpha*P) x = (1 - alpha) v associated to the problem, as shown in Gleich's paper.
    Sink-node columns of P are replaced by a uniform distribution and v is uniform too.

    Parameters
    ----------
    graph : Graph (own implementation)
        The graph containing the nodes to compute the PageRank for.
    alpha : float, optional
        The damping parameter of the algorithm. The default is 0.85.
    rround : string, optional
        When equal to "yes", the pg values are rounded to the first 3 decimal digits;
        any other value returns them unrounded. The default is "yes".

    Returns
    -------
    numpy array
        An array containing the pg value for each node. Each value refers to the node in graph
        which holds the same position in the graph's node list. An empty array is returned
        for a graph without nodes.

    Raises
    ------
    numpy.linalg.LinAlgError
        Propagated from numpy.linalg.solve when the system matrix is singular.

    """
    N = len(graph)
    if N == 0:
        # Without this guard the uniform vector below divides by zero.
        return np.zeros(0)

    P = P_matrix(graph)
    v = np.repeat(1.0/N, N)

    # this time we need to complete P as to make it a stochastic matrix:
    # every all-zero (sink) column is filled with the uniform vector, which
    # has the same values as v.
    sinks = np.sum(P, axis=0) == 0
    P = P + np.outer(v, sinks)

    x = np.linalg.solve(np.eye(N) - alpha*P, (1-alpha)*v)

    if rround == "yes":
        return np.round(x, 3)
    return x

def pageRank_pretty_print(graph, pg_values):
    """
    Prints one line per node, pairing the node's 1-based position and its ID
    with the matching PageRank value, framed by a short header and blank lines.

    Parameters
    ----------
    graph : Graph
        The graph the PR values refer to.
    pg_values : numpy array
        The values computed over the given graph; entry i belongs to the
        i-th node of graph.nodes.

    Returns
    -------
    None.

    """
    print()
    print("Page rank values:")
    print("-----------------------")

    for position, node in enumerate(graph.nodes):
        node_label = " [id: " + node.name + "]:"
        print("Node", position + 1, node_label, pg_values[position])
    print()

In [15]:
# Load the example edge list; the path is resolved from the current working directory.
G = build_graph(
    file_name="OneDrive/Documents/Github/page-rank-implementation/dataset/graph_7.txt"
)

In [18]:
# Iterative PageRank on the loaded graph, keeping the unrounded values,
# then print them next to the node ids.
pr_values = pageRank(graph=G, rround="no")
pageRank_pretty_print(graph=G, pg_values=pr_values)


Page rank values:
-----------------------
Node 1  [id: 1]: 0.001603487771750135
Node 2  [id: 8]: 0.001798197001176937
Node 3  [id: 11]: 0.001798197001176937
Node 4  [id: 168]: 0.0021038904913770165
Node 5  [id: 227]: 0.002075338846050591
Node 6  [id: 253]: 0.002439687729781583
Node 7  [id: 264]: 0.0036708708015263866
Node 8  [id: 307]: 0.002783328086106484
Node 9  [id: 2]: 0.001603487771750135
Node 10  [id: 9]: 0.0019442289232470386
Node 11  [id: 10]: 0.0019442289232470386
Node 12  [id: 13]: 0.0019442289232470386
Node 13  [id: 14]: 0.0019442289232470386
Node 14  [id: 3]: 0.001603487771750135
Node 15  [id: 6]: 0.0018306485394147375
Node 16  [id: 219]: 0.002302227339185401
Node 17  [id: 223]: 0.00205294157634367
Node 18  [id: 235]: 0.002302227339185401
Node 19  [id: 296]: 0.002302227339185401
Node 20  [id: 336]: 0.002302227339185401
Node 21  [id: 4]: 0.001603487771750135
Node 22  [id: 40]: 0.0042046618732677205
Node 23  [id: 136]: 0.002234510751089171
Node 24  [id: 217]: 0.0020416849502