# Downloading Java From Command Line (Optional)

In [None]:
!sudo apt update
!sudo apt install default-jre
!sudo apt install default-jdk

# Python Imports

In [1]:
import importlib
import json
import subprocess
import sys

import pip  # kept for backward compatibility; installation no longer goes through pip.main


def _ensure_package(module_name, pip_name=None):
    """Import ``module_name``, installing it with pip first if it is missing.

    Args:
        module_name: Importable module name (e.g. ``"psycopg2"``).
        pip_name: Distribution name to install when it differs from the module
            name (e.g. ``"psycopg2-binary"``). Defaults to ``module_name``.

    Raises:
        subprocess.CalledProcessError: If the pip installation fails.
        ImportError: If the module still cannot be imported after installing.
    """
    try:
        importlib.import_module(module_name)
    except ImportError:
        # pip.main() is not a supported API (removed from pip's public surface in
        # pip 10); running pip as a subprocess of the current interpreter is the
        # supported way and guarantees the kernel's environment is targeted.
        subprocess.check_call(
            [sys.executable, "-m", "pip", "install", pip_name or module_name]
        )
        importlib.invalidate_caches()
        importlib.import_module(module_name)


# Third-party dependencies; only ImportError triggers an install so that any
# other failure inside a package surfaces instead of being masked.
_ensure_package("pandas")
_ensure_package("psycopg2", "psycopg2-binary")
_ensure_package("pyspark")

import pandas as pd

# urllib is part of the standard library, so it never needs to be installed.
from urllib.parse import urlparse

import psycopg2

import pyspark
from pyspark.sql import SparkSession
from pyspark.context import SparkContext
from pyspark.sql.types import *  # later cells rely on names such as StringType
from pyspark.sql.functions import udf

# Load AWS Credentials

In [2]:
# Read the connection settings from the local JSON file. Other cells read the
# keys ENDPOINT, PORT, DBNAME, USR and password from it.
# The with-block closes the file handle as soon as parsing finishes.
with open("./awscredentials.json") as credentials_file:
    aws_creds = json.load(credentials_file)

# Reset Domains Results

In [3]:
# Drop the previous `domains` results table so that a fresh run starts clean.
conn = None
try:
    conn = psycopg2.connect(host=aws_creds["ENDPOINT"], port=aws_creds["PORT"],
                            database=aws_creds["DBNAME"], user=aws_creds["USR"],
                            password=aws_creds["password"])
    # The cursor context manager closes the cursor even if execute() raises.
    with conn.cursor() as cur:
        cur.execute("DROP TABLE IF EXISTS domains")
    conn.commit()
except Exception as e:
    # Best effort: report the failure and let the notebook continue.
    print("Database connection failed due to {}".format(e))
finally:
    # Always release the connection; previously it leaked whenever the
    # DROP or the commit failed.
    if conn is not None:
        conn.close()

# Define PageRank Queries

In [4]:
# SQL text used to build the link graph.

# Selects (source, dest) URL pairs from links_url, keeping only links whose two
# endpoints both have a matching row in urls. LIMIT 20 caps the result at 20 rows.
# NOTE(review): the WHERE clause compares u1.source with u2.dest, but u1 and u2
# alias the urls table, which is otherwise only referenced through its url
# column -- confirm these columns exist, or whether l.source <> l.dest
# (equivalently u1.url <> u2.url) was intended to exclude self-links.
cleaned_links_url_query = '''
    SELECT u1.url AS source, u2.url AS dest
    FROM links_url l JOIN urls u1 ON l.source=u1.url JOIN urls u2 ON l.dest=u2.url
    WHERE u1.source <> u2.dest
    LIMIT 20'''

# Reduces each link to a distinct (source domain, dest domain) pair by applying
# GET_DOMAIN to both columns of cleaned_links_url.
# NOTE(review): presumably meant to run through spark.sql against the
# cleaned_links_url temp view with the GET_DOMAIN UDF registered -- confirm.
get_domains_query = '''
    SELECT DISTINCT GET_DOMAIN(c.source) AS source, GET_DOMAIN(c.dest) AS dest
    FROM cleaned_links_url c'''

# Initialize Spark

In [5]:
from urllib.parse import quote

# Build the PostgreSQL JDBC URL from the loaded credentials.
# - str() lets PORT be stored as either a JSON string or a JSON number.
# - The user and password are percent-encoded because the PostgreSQL JDBC driver
#   URL-decodes parameter values; an unencoded '&', '%' or '+' in a password
#   would otherwise corrupt the URL. Plain alphanumeric values are unchanged.
jdbcUrl = "jdbc:postgresql://{host}:{port}/{db}?user={user}&password={password}".format(
    host=aws_creds["ENDPOINT"],
    port=str(aws_creds["PORT"]),
    db=aws_creds["DBNAME"],
    user=quote(str(aws_creds["USR"]), safe=""),
    password=quote(str(aws_creds["password"]), safe=""),
)

# Local Spark session using every available core. The PostgreSQL JDBC driver jar
# is put on the driver classpath so the "jdbc" data source can load it.
# The heartbeat interval is kept below the network timeout, as Spark requires.
spark = (
    SparkSession.builder
    .appName('CleanURLs')
    .master('local[*]')
    .config("spark.driver.extraClassPath", "postgresql-42.2.20.jar")
    .config("spark.network.timeout", "3700s")
    .config("spark.executor.heartbeatInterval", "3600s")
    .getOrCreate()
)

In [6]:
def _extract_netloc(url):
    """Return the network-location component of ``url`` (host, plus port if present)."""
    return urlparse(url).netloc


# Wrap the helper as a string-returning Spark UDF and expose it to Spark SQL
# under the name GET_DOMAIN.
get_domain = udf(_extract_netloc, StringType())
spark.udf.register("GET_DOMAIN", get_domain)

<function __main__.<lambda>(url)>

# Actual PageRank

In [14]:
# Read the cleaned link pairs from RDS through Spark's JDBC data source.
cleaned_links_url_sdf = (
    spark.read.format("jdbc")
    .option("url", jdbcUrl)
    .option("driver", "org.postgresql.Driver")
    .option("query", cleaned_links_url_query)
    .load()
)

# Make the result queryable from Spark SQL as `cleaned_links_url`.
cleaned_links_url_sdf.createOrReplaceTempView("cleaned_links_url")

# Bring the rows to the driver as a pandas frame and reduce each URL to its netloc.
df = cleaned_links_url_sdf.toPandas()
df = df.assign(
    source=df['source'].apply(lambda url: urlparse(url).netloc),
    dest=df['dest'].apply(lambda url: urlparse(url).netloc),
)

df

# get_domains_sdf = spark.sql(get_domains_query)
# get_domains_sdf.show()
# get_domains_sdf.write.format("jdbc") \
#         .option("url", jdbcUrl) \
#         .option("driver", "org.postgresql.Driver") \
#         .option("dbtable", "domains") \
#         .save()

Unnamed: 0,source,dest
0,gradadm.seas.upenn.edu:443,accessibility.web-resources.upenn.edu:443
1,gradadm.seas.upenn.edu:443,cets.seas.upenn.edu:443
2,gradadm.seas.upenn.edu:443,dar.seas.upenn.edu:443
3,gradadm.seas.upenn.edu:443,directory.apps.upenn.edu:443
4,gradadm.seas.upenn.edu:443,eos.seas.upenn.edu:443
5,gradadm.seas.upenn.edu:443,facultyaffairs.seas.upenn.edu:443
6,gradadm.seas.upenn.edu:443,giving.aws.cloud.upenn.edu:443
7,gradadm.seas.upenn.edu:443,gradadm.seas.upenn.edu:443
8,gradadm.seas.upenn.edu:443,gradadm.seas.upenn.edu:443
9,gradadm.seas.upenn.edu:443,gradadm.seas.upenn.edu:443


In [6]:
import pyspark
import os
from pyspark import SQLContext
from pyspark.sql.types import StructType, StructField, IntegerType, FloatType, StringType
from pyspark.sql.functions import udf
from pyspark.sql import Row

# Point PySpark at the local Hadoop and Java installations.
# Raw strings keep the Windows backslashes literal: sequences such as "\H",
# "\P" and "\J" are invalid escapes in a normal string literal and trigger a
# warning on current Python versions. The resulting values are unchanged.
# NOTE(review): these are machine-specific absolute paths -- confirm they match
# the environment this notebook is run on.
os.environ['HADOOP_HOME'] = r'C:\Hadoop'
os.environ['JAVA_HOME'] = r'C:\Program Files\Java\jdk1.8.0_191'
os.environ['PYSPARK_DRIVER_PYTHON'] = 'jupyter'
os.environ['PYSPARK_DRIVER_PYTHON_OPTS'] = 'notebook'
os.environ['PYSPARK_PYTHON'] = 'python'

conf = pyspark.SparkConf()

# Reuse the running SparkContext if there is one, otherwise start a new one.
sc = pyspark.SparkContext.getOrCreate(conf=conf)
# NOTE(review): this rebinds `spark`, replacing the SparkSession created in the
# "Initialize Spark" cell with a SQLContext -- confirm that is intended.
spark = SQLContext(sc)

# Schema of the one-row demo frame: a float, a string and an integer column.
schema = StructType([
    StructField("sales", FloatType(), True),
    StructField("employee", StringType(), True),
    StructField("ID", IntegerType(), True)
])

data = [[10.2, "Fred", 123]]

# NOTE(review): `df` is also the name of the pandas frame built in the PageRank
# cell; this assignment replaces it.
df = spark.createDataFrame(data, schema=schema)

# The lambda looks up toInt when the UDF is actually invoked, so toInt only has
# to be defined before the UDF is evaluated, not before this line.
colsInt = udf(lambda z: toInt(z), IntegerType())
spark.udf.register("colsInt", colsInt)

def toInt(s):
    """Encode a string as one integer built from its characters' code points.

    The decimal code point of every character (``ord``) is concatenated in
    order and the resulting digit string is parsed as an int, e.g.
    ``"Fred"`` -> ``"70" "114" "101" "100"`` -> ``70114101100``.

    Args:
        s: Value to encode.

    Returns:
        The encoded integer, or ``None`` when ``s`` is not a string or is the
        empty string (there are no digits to parse).
    """
    # Previously the non-string branch returned the undefined name `Null`
    # (NameError) and an empty string reached int(''), raising ValueError.
    if not isinstance(s, str) or not s:
        return None
    # NOTE(review): even short strings produce values beyond the 32-bit range
    # ("Fred" -> 70114101100) while the UDF wrapping this function is declared
    # with IntegerType -- confirm the intended Spark return type.
    return int(''.join(str(ord(ch)) for ch in s))


# Derive the `semployee` column by running the colsInt UDF over `employee`,
# then print the resulting frame.
df2 = df.withColumn(
    colName='semployee',
    col=colsInt('employee'),
)
df2.show()

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\edkim\\AppData\\Local\\Temp\\spark-0129005e-33ea-4905-bd18-4f221a491ba8\\pyspark-a21e0888-1b54-4606-b3d8-279cf9f7784f\\tmp78umuw37'