In [0]:
# Install Java, Spark, and Findspark
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www-us.apache.org/dist/spark/spark-2.4.5/spark-2.4.5-bin-hadoop2.7.tgz
!tar xf spark-2.4.5-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.5-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()

In [2]:
!wget https://jdbc.postgresql.org/download/postgresql-42.2.9.jar

--2020-04-10 00:32:35--  https://jdbc.postgresql.org/download/postgresql-42.2.9.jar
Resolving jdbc.postgresql.org (jdbc.postgresql.org)... 72.32.157.228, 2001:4800:3e1:1::228
Connecting to jdbc.postgresql.org (jdbc.postgresql.org)|72.32.157.228|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 914037 (893K) [application/java-archive]
Saving to: ‘postgresql-42.2.9.jar’


2020-04-10 00:32:36 (3.64 MB/s) - ‘postgresql-42.2.9.jar’ saved [914037/914037]



In [0]:
from pyspark.sql import SparkSession

# Create (or reuse) the "HospETL" session with the PostgreSQL JDBC jar added
# to the driver's classpath.
spark = (
    SparkSession.builder
    .appName("HospETL")
    .config("spark.driver.extraClassPath", "/content/postgresql-42.2.9.jar")
    .getOrCreate()
)

In [5]:
# Load the US hospital CSV from its S3 URL into a Spark DataFrame.
from pyspark import SparkFiles

url = "https://project-pandemix.s3.us-east-2.amazonaws.com/us_hospital_data.csv"

# Register the remote file with Spark, then read it via SparkFiles.get().
spark.sparkContext.addFile(url)
hosp_data_df = spark.read.csv(
    SparkFiles.get("us_hospital_data.csv"),
    sep=",",
    header=True,       # first row holds the column names
    inferSchema=True,  # let Spark infer column types from the data
)

# NOTE(review): in the preview below, `lat` holds values such as -81.7 and
# `long` values such as 26.1 for a Florida hospital, so the two columns look
# swapped in the source file. Confirm before using them as coordinates.

# Preview the DataFrame
hosp_data_df.show()

+------------+----------+--------------------+--------------------+----------------+--------------+----------------+----------------+------------+--------------+-------------+---------------+------------------------+--------------+
|         lat|      long|       hospital_name|       hospital_type|          county|         state|num_licensed_bds|num_staffed_beds|num_icu_beds|adult_icu_beds|pedi_icu_beds|bed_utilization|increase_in_bed_capacity|avg_vent_usage|
+------------+----------+--------------------+--------------------+----------------+--------------+----------------+----------------+------------+--------------+-------------+---------------+------------------------+--------------+
| -81.7362671|26.0961161|The Willough at N...|Psychiatric Hospital|         Collier|       Florida|              87|              87|           0|             0|            0|      0.9435995|                       0|             0|
|-115.2158712|36.1459741|Spring Mountain S...|Psychiatric Hospital|     

Postgres Setup

In [0]:
# Configure settings for RDS.
# Credentials are read from environment variables instead of being hardcoded
# in the notebook. Set them before running this cell, e.g.:
#     import getpass, os
#     os.environ["RDS_PASSWORD"] = getpass.getpass("RDS password: ")
import os
import warnings

mode = "overwrite"
jdbc_url = "jdbc:postgresql://hospitalutil.cuo7ivhfh3jn.us-east-1.rds.amazonaws.com:5432/hospitalutil"
config = {
    "user": os.environ.get("RDS_USER", "root"),
    "password": os.environ.get("RDS_PASSWORD", ""),
    "driver": "org.postgresql.Driver",
}

# Flag a missing password here rather than as an opaque JDBC error later.
if not config["password"]:
    warnings.warn(
        "RDS_PASSWORD is not set; the JDBC write will fail to authenticate."
    )


In [0]:
# Write hosp_data_df to the "hospitalutil" table in RDS over JDBC, using the
# write mode, JDBC URL and connection properties configured in the cell above.

hosp_data_df.write.jdbc(url=jdbc_url, table='hospitalutil', mode=mode, properties=config)