In [2]:
import os
import urllib
import shutil
import numpy as np


In [3]:
# The file is located here:
url = "https://snap.stanford.edu/data/web-Google.txt.gz"

In [4]:
# Download and copy it here using the code below:
f= 'web-Google.txt.gz'

if not os.path.exists(f):
    r = urllib.request.urlopen(url)
    with open(f, 'wb') as fo:
        shutil.copyfileobj(r, fo)

In [5]:
import gzip

In [6]:
# Load the graph data from the file
def load_Gdata(filename):
    # Initialize an empty dictionary to represent the graph
    graph = {}

    # Open the gzip-compressed file for reading
    with gzip.open(filename, 'rt') as file:
        # Iterate through each line in the file
        for line in file:
            # Skip lines starting with "#"
            if line.startswith("#"):
                continue
            else:
                # Split the line into parts and convert them to integers
                parts = line.strip().split()
                source, target = int(parts[0]), int(parts[1])

                # If the source is not in the graph, add it with an empty list
                if source not in graph:
                    graph[source] = []

                # Append the target to the source's list in the graph
                graph[source].append(target)

    # Return the constructed graph
    return graph


In [7]:
graph = load_Gdata(f)

In [8]:
# Initialize PageRank values for all vertices
num_vertices = len(graph)
pagerank = np.ones(num_vertices) / num_vertices

In [9]:
# Implement the power iteration method to compute PageRank
def power_iteration(graph, damping_factor=0.85, num_iterations=100):
    # Get the number of vertices in the graph
    num_vertices = len(graph)

    # Initialize the pagerank vector with equal probabilities for each vertex
    initial_pagerank = np.ones(num_vertices) / num_vertices

    # Perform power iteration for a specified number of iterations
    for _ in range(num_iterations):
        # Initialize a new pagerank vector
        new_pagerank = np.zeros(num_vertices)

        # Update pageranks based on the graph structure
        for vertex in graph:
            num_neighbors = len(graph[vertex])
            if num_neighbors == 0:
                new_pagerank += damping_factor * initial_pagerank[vertex] / num_vertices
            else:
                for neighbor in graph[vertex]:
                    new_pagerank[neighbor] += damping_factor * initial_pagerank[vertex] / num_neighbors

        # Apply damping factor and add the teleportation probability
        new_pagerank += (1 - damping_factor) / num_vertices

        # Update the initial pagerank vector for the next iteration
        initial_pagerank = new_pagerank

    # Return the final computed pagerank vector
    return initial_pagerank


In [10]:
# Initialize PageRank values for all vertices
num_vertices = len(graph)
pagerank = np.ones(num_vertices) / num_vertices


In [11]:
pagerank


array([1.35234917e-06, 1.35234917e-06, 1.35234917e-06, ...,
       1.35234917e-06, 1.35234917e-06, 1.35234917e-06])