# Downloading Java From Command Line (Optional)

In [1]:
# Optional setup: uncomment the lines below to install a Java runtime (JRE) and
# development kit (JDK) with apt. Leave them commented out if Java is already installed.
# !sudo apt update
# !sudo apt install default-jre
# !sudo apt install default-jdk

# Python Imports

In [2]:
import importlib
import json
import subprocess
import sys
from urllib.parse import urlparse  # standard library: always available, nothing to install

import pip  # kept from the original cell; installs below go through `python -m pip` instead


def _pip_install(package):
    """Install `package` with pip into the interpreter that runs this kernel.

    Raises:
        subprocess.CalledProcessError: if pip exits with a non-zero status.
    """
    # `pip.main` is not part of pip's public API (it moved to `pip._internal` in pip 10);
    # running pip as a subprocess of the current interpreter is the supported way.
    subprocess.check_call([sys.executable, "-m", "pip", "install", package])
    # Make sure the import system notices the freshly installed package.
    importlib.invalidate_caches()


# Install-on-demand: only a missing package triggers an install, so unrelated
# errors (and KeyboardInterrupt) are not swallowed the way a bare `except:` would.
try:
    import psycopg2
except ImportError:
    _pip_install('psycopg2-binary')
    import psycopg2

try:
    import pyspark
except ImportError:
    _pip_install('pyspark')
    import pyspark

from pyspark.sql import SparkSession
from pyspark.context import SparkContext
from pyspark.sql.types import *  # star import kept: later cells rely on names such as StringType
from pyspark.sql.functions import udf

# Load AWS Credentials

In [3]:
# Read the database connection settings (ENDPOINT, PORT, DBNAME, USR, password)
# from the JSON file next to the notebook. The `with` block closes the file
# handle as soon as the content has been parsed, instead of leaking it.
with open("./awscredentials.json") as credentials_file:
    aws_creds = json.load(credentials_file)

# Reset Domains Results

In [4]:
# Drop the `domains` table (if it exists) so that the final cell can recreate it.
# This stays best-effort: any failure is reported with a message and the
# notebook keeps running, exactly as before.
try:
    conn = psycopg2.connect(host=aws_creds["ENDPOINT"], port=aws_creds["PORT"],
                            database=aws_creds["DBNAME"], user=aws_creds["USR"],
                            password=aws_creds["password"])
    try:
        cur = conn.cursor()
        try:
            cur.execute("DROP TABLE IF EXISTS domains")
        finally:
            # Release the cursor even when the statement fails.
            cur.close()
        conn.commit()
    finally:
        # Always give the connection back, whether or not the drop succeeded;
        # previously an error in execute()/commit() left it open.
        conn.close()
except Exception as e:
    print("Database connection failed due to {}".format(e))

# Define PageRank Queries

In [5]:
# SQL text used by the cells below.

# Link pairs whose source and destination both appear in `urls` (inner joins),
# excluding links from a URL to itself. Sent to the database through the JDBC
# "query" option.
cleaned_links_url_query = (
    "\n"
    "    SELECT u1.url AS source, u2.url AS dest\n"
    "    FROM links_url l JOIN urls u1 ON l.source=u1.url JOIN urls u2 ON l.dest=u2.url\n"
    "    WHERE u1.url <> u2.url"
)

# Distinct (source, dest) pairs after applying GET_DOMAIN to both columns of the
# `cleaned_links_url` view. Run through spark.sql, where GET_DOMAIN is the
# registered UDF.
get_domains_query = (
    "\n"
    "    SELECT DISTINCT GET_DOMAIN(c.source) AS source, GET_DOMAIN(c.dest) AS dest\n"
    "    FROM cleaned_links_url c"
)

# Initialize Spark

In [6]:
from urllib.parse import quote  # local import: only this cell needs it

# JDBC connection URL shared by the read and write steps below.
#  - str() lets PORT be stored either as a JSON string or as a JSON number.
#  - user and password are percent-encoded so that characters such as '&', '=',
#    '%' or '+' cannot split or alter the URL's query string (the PostgreSQL
#    JDBC driver URL-decodes parameter values). Purely alphanumeric values are
#    left unchanged by the encoding.
jdbcUrl = (
    "jdbc:postgresql://" + aws_creds["ENDPOINT"] + ":" + str(aws_creds["PORT"]) + "/"
    + aws_creds["DBNAME"]
    + "?user=" + quote(aws_creds["USR"], safe="")
    + "&password=" + quote(aws_creds["password"], safe="")
)

# Local Spark session using all available cores ('local[*]').
spark = (
    SparkSession.builder
    .appName('CleanURLs')
    .master('local[*]')
    # PostgreSQL JDBC driver jar; the relative path presumably resolves against
    # the notebook's working directory -- TODO confirm.
    .config("spark.driver.extraClassPath", "postgresql-42.2.20.jar")
    # Very generous timeouts; the heartbeat interval is kept below the network timeout.
    .config("spark.network.timeout", "3700s")
    .config("spark.executor.heartbeatInterval", "3600s")
    # Memory settings sized for a large machine; adjust to the host running the notebook.
    .config('spark.executor.memory', '40G')
    .config('spark.driver.memory', '75G')
    .getOrCreate()
)

In [7]:
# Spark UDF that maps a URL to the `netloc` component reported by urlparse;
# the result column is typed as a string.
get_domain = udf(lambda url: urlparse(url).netloc, returnType=StringType())

# Register the UDF under the SQL name GET_DOMAIN so spark.sql queries can call it.
spark.udf.register(name="GET_DOMAIN", f=get_domain)

<function __main__.<lambda>(url)>

# Actual PageRank

In [None]:
# Step 1: pull the cleaned link pairs from the database over JDBC.
links_reader = (
    spark.read.format("jdbc")
    .option("url", jdbcUrl)
    .option("driver", "org.postgresql.Driver")
    .option("query", cleaned_links_url_query)
)
cleaned_links_url_sdf = links_reader.load()

# Step 2: expose the pairs to Spark SQL under the name the domain query expects.
cleaned_links_url_sdf.createOrReplaceTempView("cleaned_links_url")

# Step 3: reduce URL pairs to distinct domain pairs with the GET_DOMAIN UDF.
get_domains_sdf = spark.sql(get_domains_query)

# Step 4: write the result to the `domains` table. No save mode is set, so
# Spark's default mode applies.
domains_writer = (
    get_domains_sdf.write.format("jdbc")
    .option("url", jdbcUrl)
    .option("driver", "org.postgresql.Driver")
    .option("dbtable", "domains")
)
domains_writer.save()