# Downloading Java From Command Line (Optional)

In [None]:
# !sudo apt update
# !sudo apt install default-jre
# !sudo apt install default-jdk

# Imports

In [None]:
import importlib
import json
import subprocess
import sys
import time

import pip  # no longer used for installing; kept so any cell that references `pip` still works


def _pip_install(package):
    """Install `package` into the environment of the running kernel.

    pip does not offer a supported in-process API (`pip.main` was removed in
    pip 10), so pip is launched as a subprocess of the current interpreter.
    Raises subprocess.CalledProcessError if the installation fails.
    """
    subprocess.check_call([sys.executable, "-m", "pip", "install", package])
    # Make sure the import system notices the freshly installed package.
    importlib.invalidate_caches()


# Only a missing package should trigger an install; any other failure while
# importing is a real error and must surface instead of being swallowed.
try:
    import psycopg2
except ImportError:
    _pip_install("psycopg2-binary")
    import psycopg2

try:
    import pyspark
    from pyspark.sql import SparkSession
    from pyspark.context import SparkContext
except ImportError:
    _pip_install("pyspark")
    import pyspark
    from pyspark.sql import SparkSession
    from pyspark.context import SparkContext

# Load AWS Credentials

In [None]:
# Read the database connection settings from a JSON file next to the notebook.
# The rest of the notebook reads the keys "ENDPOINT", "PORT", "DBNAME", "USR"
# and "password" from `aws_creds`.
# The `with` block guarantees the file handle is closed even if parsing fails.
with open("./awscredentials.json") as credentials_file:
    aws_creds = json.load(credentials_file)

# Reset PageRank Results

In [None]:
# Drop the results table left over from a previous run so the final write of
# this notebook starts from a clean slate.
# This step is best-effort: any failure is reported and the notebook goes on.
conn = None  # stays None when connect() itself fails, so `finally` can tell
try:
    conn = psycopg2.connect(host=aws_creds["ENDPOINT"], port=aws_creds["PORT"],
                            database=aws_creds["DBNAME"], user=aws_creds["USR"],
                            password=aws_creds["password"])
    # The cursor context manager closes the cursor even if execute() raises.
    with conn.cursor() as cur:
        cur.execute("DROP TABLE IF EXISTS pagerank_results")
    conn.commit()
except Exception as e:
    print("Database connection failed due to {}".format(e))
finally:
    # Always release the connection; previously it leaked whenever the
    # DROP or the commit raised.
    if conn is not None:
        conn.close()

# Define PageRank Queries

In [None]:
# Queries

# Edge list used by PageRank, built from the `domains` table.
# This string is handed to the JDBC reader's "query" option, so it is executed
# by the database rather than by Spark SQL.
#   sinks      - every `dest` that never appears as a `source` (no outgoing edge)
#   back_edges - a reversed copy of each edge that points into a sink
# The result is the UNION (duplicates removed) of the back edges and the
# original edges, so every sink gains edges back to the nodes that link to it.
cleaned_links_domain_query = '''
    WITH sinks AS (
        SELECT DISTINCT d1.dest AS sink
        FROM domains d1
        WHERE d1.dest NOT IN ( 
            SELECT DISTINCT d2.source
            FROM domains d2
        )
    ), back_edges AS (
        SELECT d.dest AS source, d.source AS dest
        FROM domains d JOIN sinks s ON d.dest = s.sink
    )
    SELECT b.source AS source, b.dest AS dest
    FROM back_edges b
        UNION
    SELECT d.source AS source, d.dest AS dest
    FROM domains d'''

# One row per `source` in the `cleaned_links_domain` view: the weight is
# 1.0 divided by the number of rows (outgoing edges) that source has.
out_weights_query = '''
    SELECT c.source AS source, 1.0 / COUNT(*) AS out_weight
    FROM cleaned_links_domain c
    GROUP BY c.source'''

# Starting ranks: every distinct `source` in `cleaned_links_domain` becomes a
# node with rank 1. Only the `source` column is used to enumerate nodes.
initialize_ranks_query = '''
    SELECT DISTINCT(source) AS node, 1 AS rank
    FROM cleaned_links_domain'''

# One PageRank step. For each `dest`, sum (out_weight of the source * previous
# rank of the source) over its incoming edges, then apply
# rank = 0.85 * sum + 0.15.
# The inner joins and GROUP BY l.dest mean a node only gets a row if it has at
# least one incoming edge whose source is present in `prev_iter_ranks`.
# NOTE(review): nodes with no incoming edge therefore drop out of the ranks
# after the first step - confirm this is intended.
curr_iter_ranks_query = '''
    SELECT l.dest AS node, SUM(o.out_weight * p.rank) * 0.85 + 0.15 AS rank
    FROM cleaned_links_domain l JOIN out_weights o ON l.source = o.source 
            JOIN prev_iter_ranks p ON l.source = p.node
    GROUP BY l.dest'''

# Largest absolute rank change between the previous and the current iteration.
# Only nodes present in both views are compared (inner join on node); the
# aggregate yields a single row with column `max_diff`.
max_diff_query = '''
    SELECT MAX(ABS(p.rank - c.rank)) AS max_diff
    FROM prev_iter_ranks p JOIN curr_iter_ranks c ON p.node = c.node'''

# Final rescaling of the ranks in `curr_iter_ranks`, in two stages:
#   stand  - standardize: (rank - average) / standard deviation
#   result - min-max rescale of the standardized values:
#            (rank - min) / (max - min)
# Output columns are `domain` (the node) and the rescaled `rank`.
# NOTE(review): if every rank is identical, both denominators are zero -
# confirm how that case should be handled.
id_normalization_query = '''
    WITH stats1 AS (
        SELECT AVG(rank) AS avgR, STD(rank) AS stdR
        FROM curr_iter_ranks
    ), stand AS (
        SELECT node AS domain, (rank - (SELECT avgR FROM stats1)) / (SELECT stdR FROM stats1) AS rank
        FROM curr_iter_ranks
    ), stats2 AS (
        SELECT MIN(rank) AS minR, MAX(rank) AS maxR
        FROM stand
    )
    SELECT domain, (rank - (SELECT minR FROM stats2)) / (SELECT maxR - minR FROM stats2) AS rank
    FROM stand'''

# Initialize Spark

In [None]:
from urllib.parse import quote  # local import: only this cell needs it

# JDBC connection URL shared by the Spark reader and writer in later cells.
# User name and password are percent-encoded because they travel as URL
# parameters: a raw '&', '#', '%', '+' or space in either value would otherwise
# be misread by the driver. `safe=""` makes quote() escape '/' as well.
# str() around the port lets the JSON file store it as a number or a string.
jdbcUrl = "jdbc:postgresql://" + aws_creds["ENDPOINT"] + ":" + str(aws_creds["PORT"]) + "/" + \
          aws_creds["DBNAME"] + "?user=" + quote(aws_creds["USR"], safe="") + \
          "&password=" + quote(aws_creds["password"], safe="")

# Local Spark session using every available core ('local[*]').
# The PostgreSQL JDBC driver jar is referenced by a relative path, so it is
# presumably expected in the notebook's working directory - confirm.
spark = (
    SparkSession.builder
    .appName('PageRank')
    .master('local[*]')
    .config("spark.driver.extraClassPath", "postgresql-42.2.20.jar")
    .config('spark.executor.memory', '32G')
    .config('spark.driver.memory', '32G')
    .getOrCreate()
)

# Actual PageRank

In [None]:
# Fetch the cleaned edge list from the database over JDBC and expose it to
# Spark SQL under the name "cleaned_links_domain".
_links_reader = (
    spark.read
    .format("jdbc")
    .option("driver", "org.postgresql.Driver")
    .option("url", jdbcUrl)
    .option("query", cleaned_links_domain_query)
)
cleaned_links_domain_sdf = _links_reader.load()
cleaned_links_domain_sdf.createOrReplaceTempView("cleaned_links_domain")

# Per-source outgoing weights, derived from the view registered just above and
# published as the "out_weights" view for the iteration queries.
out_weights_sdf = spark.sql(out_weights_query)
out_weights_sdf.createOrReplaceTempView("out_weights")

In [None]:
# Actual PageRank
#
# Repeats the rank-update query until the largest per-node change falls below
# CONVERGENCE_THRESHOLD or the iteration index reaches MAX_ITERATIONS, then
# rescales the final ranks and writes them to the `pagerank_results` table.
CONVERGENCE_THRESHOLD = 0.1  # stop once no compared node moved by this much
MAX_ITERATIONS = 100         # hard cap on the iteration index

prev_iter_ranks_sdf = spark.sql(initialize_ranks_query)
prev_iter_ranks_sdf.createOrReplaceTempView("prev_iter_ranks")

convergence = False  # loop flag: becomes True when iterating stops
i = 0
while not convergence:
    # Temp views are lazy: without materializing, the view for iteration k
    # would contain the whole chain of queries from iterations 0..k-1 and
    # re-execute all of them. localCheckpoint() computes this iteration once
    # and cuts the lineage, so each step only does one step's worth of work.
    curr_iter_ranks_sdf = spark.sql(curr_iter_ranks_query).localCheckpoint()
    curr_iter_ranks_sdf.createOrReplaceTempView("curr_iter_ranks")

    max_diff_sdf = spark.sql(max_diff_query)
    max_diff = max_diff_sdf.collect()[0]['max_diff']

    print("Iteration:", i, "| Max Diff:", max_diff)

    # MAX() over an empty join is NULL, which arrives here as None. That means
    # the two iterations share no node (e.g. the edge list is empty), so there
    # is nothing to compare; fail with a clear message instead of a TypeError.
    if max_diff is None:
        raise RuntimeError(
            "PageRank iteration {} produced no ranks to compare; "
            "is cleaned_links_domain empty?".format(i))

    converged = max_diff < CONVERGENCE_THRESHOLD
    if converged or i >= MAX_ITERATIONS:
        convergence = True
        # Report honestly whether the ranks settled or the cap was hit.
        if converged:
            print("Convergence at iteration:", i)
        else:
            print("Stopped without convergence at iteration:", i)
        id_normalization_sdf = spark.sql(id_normalization_query)
        # "overwrite" replaces a leftover table instead of failing at the very
        # end of a long run if the earlier reset step did not succeed.
        id_normalization_sdf.write.format("jdbc") \
                .option("url", jdbcUrl) \
                .option("driver", "org.postgresql.Driver") \
                .option("dbtable", "pagerank_results") \
                .mode("overwrite") \
                .save()
    else:
        # The current ranks become the input of the next iteration.
        curr_iter_ranks_sdf.createOrReplaceTempView("prev_iter_ranks")
        spark.catalog.dropTempView("curr_iter_ranks")
        i += 1