# Downloading Java From Command Line (Optional)

In [None]:
!sudo apt update
!sudo apt install default-jre
!sudo apt install default-jdk

# Python Imports

In [1]:
import json
import subprocess
import sys

import pip  # kept from the original cell; installation no longer goes through pip.main


def _pip_install(package):
    """Install `package` into the environment of the running kernel.

    pip is invoked as a subprocess of the current interpreter because
    `pip.main` is not a supported API and is absent from pip >= 10.
    Raises subprocess.CalledProcessError if the installation fails.
    """
    subprocess.check_call([sys.executable, "-m", "pip", "install", package])


# Each third-party dependency is imported, and installed on demand only when
# the import itself is what failed (ImportError), so unrelated errors and
# KeyboardInterrupt are no longer swallowed.
try:
    import pandas as pd
except ImportError:
    _pip_install('pandas')
    import pandas as pd

try:
    import psycopg2
except ImportError:
    _pip_install('psycopg2-binary')
    import psycopg2

try:
    import pyspark
    from pyspark.sql import SparkSession
    from pyspark.context import SparkContext
except ImportError:
    _pip_install('pyspark')
    import pyspark
    from pyspark.sql import SparkSession
    from pyspark.context import SparkContext

# Load AWS Credentials

In [2]:
# Read the connection settings from a local JSON file. The `with` block closes
# the file handle as soon as the JSON has been parsed.
with open("./awscredentials.json") as credentials_file:
    aws_creds = json.load(credentials_file)

# Reset PageRank Results

In [3]:
# Drop any previous "pagerank_results" table.
# This stays best-effort (failures are printed, not raised), but the cursor and
# the connection are now released even when execute() or commit() raises.
conn = None
try:
    conn = psycopg2.connect(host=aws_creds["ENDPOINT"], port=aws_creds["PORT"],
                            database=aws_creds["DBNAME"], user=aws_creds["USR"],
                            password=aws_creds["password"])
    cur = conn.cursor()
    try:
        cur.execute("DROP TABLE IF EXISTS pagerank_results")
    finally:
        cur.close()
    conn.commit()
except Exception as e:
    print("Database connection failed due to {}".format(e))
finally:
    # conn is still None when connect() itself failed.
    if conn is not None:
        conn.close()

# Define PageRank Queries

In [4]:
# Queries

# Builds the edge list used for PageRank, expressed in url ids:
#   ids           - maps the source/dest URLs of each links_url row to urls.id values
#   no_self_loops - drops edges whose source and dest are the same id
#   sinks         - dest ids that never appear as a source
#   back_edges    - for every edge pointing into a sink, the reversed edge
# The result is the UNION (duplicates removed) of back_edges and no_self_loops.
# This text is sent to the database through the JDBC "query" option.
cleaned_links_url_query = '''
    WITH ids AS (
        SELECT u1.id AS source, u2.id AS dest
        FROM links_url l JOIN urls u1 ON l.source=u1.url JOIN urls u2 ON l.dest=u2.url
    ), no_self_loops AS (
        SELECT i.source, i.dest
        FROM ids i
        WHERE i.source <> i.dest
    ), sinks AS (
        SELECT n1.dest AS sink
        FROM no_self_loops n1
        WHERE n1.dest NOT IN ( 
            SELECT DISTINCT n2.source
            FROM no_self_loops n2
        )
    ), back_edges AS (
        SELECT n.dest AS source, n.source AS dest
        FROM no_self_loops n JOIN sinks s ON n.dest = s.sink
    )
    SELECT b.source AS source, b.dest AS dest
    FROM back_edges b
        UNION
    SELECT n.source AS source, n.dest AS dest
    FROM no_self_loops n'''

# For each source node: 1 / (number of rows in cleaned_links_url with that source).
out_weights_query = '''
    SELECT c.source AS source, 1.0 / COUNT(*) AS out_weight
    FROM cleaned_links_url c
    GROUP BY c.source'''

# Starting ranks: every distinct source node in cleaned_links_url gets rank 1.
initialize_ranks_query = '''
    SELECT DISTINCT(source) AS node, 1 AS rank
    FROM cleaned_links_url'''

# One rank-update step: for each dest node, sum (out_weight * previous rank)
# over the edges pointing at it, then compute 0.85 * sum + 0.15.
# Because of the inner joins and GROUP BY, a node that is never a dest of a
# joined edge produces no row.
curr_iter_ranks_query = '''
    SELECT l.dest AS node, SUM(o.out_weight * p.rank) * 0.85 + 0.15 AS rank
    FROM cleaned_links_url l JOIN out_weights o ON l.source = o.source 
            JOIN prev_iter_ranks p ON l.source = p.node
    GROUP BY l.dest'''

# Largest absolute rank change between prev_iter_ranks and curr_iter_ranks,
# taken over the nodes present in both (inner join on node).
max_diff_query = '''
    SELECT MAX(ABS(p.rank - c.rank)) AS max_diff
    FROM prev_iter_ranks p JOIN curr_iter_ranks c ON p.node = c.node'''

# Min-max scales the current ranks: (rank - min) / (max - min), returned as
# columns (id, rank).
# NOTE(review): when every rank in curr_iter_ranks is equal the denominator is
# 0 -- confirm how the SQL engine executing this handles that division.
id_normalization_query = '''
    WITH min_rank AS (
        SELECT MIN(rank) AS min_rank
        FROM curr_iter_ranks
    ), max_minus_min AS (
        SELECT MAX(rank) - (SELECT * FROM min_rank) AS max_min_diff
        FROM curr_iter_ranks
    ) 
    SELECT c.node AS id, (c.rank - (SELECT min_rank FROM min_rank)) / ((SELECT max_min_diff FROM max_minus_min)) AS rank
    FROM curr_iter_ranks c'''

# Initialize Spark

In [5]:
# JDBC connection string for the database described by aws_creds.
# - PORT goes through str() so the JSON file may store it as a string or a
#   number (plain concatenation raised TypeError for a number).
# - The database name, user and password are percent-encoded because the
#   PostgreSQL JDBC driver URL-decodes them; an unescaped '&', '+' or '%' in
#   a value would otherwise change what the driver receives.
from urllib.parse import quote_plus

jdbcUrl = "jdbc:postgresql://{host}:{port}/{db}?user={user}&password={password}".format(
    host=aws_creds["ENDPOINT"],
    port=str(aws_creds["PORT"]),
    db=quote_plus(aws_creds["DBNAME"]),
    user=quote_plus(aws_creds["USR"]),
    password=quote_plus(aws_creds["password"]),
)

# Create (or reuse) the Spark session for this notebook: local master
# ("local[*]") with the PostgreSQL JDBC jar on the driver classpath.
spark = (
    SparkSession.builder
    .appName('PageRank')
    .master('local[*]')
    .config("spark.driver.extraClassPath", "postgresql-42.2.20.jar")
    .getOrCreate()
)

# Load Data and Run PageRank

In [6]:
# Fetch the cleaned edge list from RDS over JDBC and register it as the
# "cleaned_links_url" view for Spark SQL.
cleaned_links_url_sdf = (
    spark.read.format("jdbc")
    .options(
        url=jdbcUrl,
        driver="org.postgresql.Driver",
        query=cleaned_links_url_query,
    )
    .load()
)
cleaned_links_url_sdf.createOrReplaceTempView("cleaned_links_url")

# Derive the per-source out-weights from that view and register them as
# "out_weights" for the rank-update query.
out_weights_sdf = spark.sql(out_weights_query)
out_weights_sdf.createOrReplaceTempView("out_weights")

In [7]:
# Actual PageRank
#
# Starting from the initial ranks, each pass recomputes every node's rank from
# the previous pass and measures the largest per-node change. The loop ends
# when that change drops below CONVERGENCE_THRESHOLD, when there is nothing to
# compare, or on pass number MAX_ITERATIONS; the normalized ranks are then
# written to the "pagerank_results" table over JDBC.
MAX_ITERATIONS = 10000
CONVERGENCE_THRESHOLD = 0.1

prev_iter_ranks_sdf = spark.sql(initialize_ranks_query)
prev_iter_ranks_sdf.createOrReplaceTempView("prev_iter_ranks")

convergence = False  # loop-exit flag; also set when the loop stops without converging
i = 0
while not convergence:
    curr_iter_ranks_sdf = spark.sql(curr_iter_ranks_query)
    curr_iter_ranks_sdf.createOrReplaceTempView("curr_iter_ranks")

    # Take the single aggregate value directly from the pandas frame rather
    # than serializing it to JSON and parsing it back.
    max_diff_df = spark.sql(max_diff_query).toPandas()
    max_diff = max_diff_df["max_diff"].iloc[0]

    # MAX() over an empty join is NULL, which arrives here as None/NaN. There
    # is then no rank change to measure, so stop instead of spinning until
    # the iteration cap.
    nothing_to_compare = pd.isna(max_diff)
    converged = (not nothing_to_compare) and float(max_diff) < CONVERGENCE_THRESHOLD
    out_of_iterations = i >= MAX_ITERATIONS - 1

    if converged or nothing_to_compare or out_of_iterations:
        convergence = True
        if converged:
            print("Convergence at iteration:", i)
        else:
            # Do not claim convergence when the loop merely gave up.
            print("Stopped without convergence at iteration:", i)
        id_normalization_sdf = spark.sql(id_normalization_query)
        id_normalization_sdf.write.format("jdbc") \
                .option("url", jdbcUrl) \
                .option("driver", "org.postgresql.Driver") \
                .option("dbtable", "pagerank_results") \
                .save()
    else:
        # The ranks just computed become the input of the next pass.
        curr_iter_ranks_sdf.createOrReplaceTempView("prev_iter_ranks")
        i += 1