In [1]:
from pyspark.sql.functions import current_timestamp, col
from pyspark.sql.types import StructField, StructType, StringType, IntegerType, DoubleType
from pyspark.sql import SparkSession
from delta.tables import *
from delta import *
from delta import configure_spark_with_delta_pip

In [2]:
builder = SparkSession \
        .builder \
        .appName('healthcare_ingest')\
        .master('local')\
        .config("spark.driver.memory","2g")\
        .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")\
        .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
        
spark = configure_spark_with_delta_pip(builder).getOrCreate()


In [3]:
spark

In [7]:
landing_folder = "landing_zone"

In [8]:
patients = spark.read \
            .option("inferschema", "True") \
            .option("header", "True") \
            .csv(f"{landing_folder}\\health_records\\patients.csv")

In [9]:
patients.printSchema()

root
 |-- Id: string (nullable = true)
 |-- BIRTHDATE: date (nullable = true)
 |-- DEATHDATE: date (nullable = true)
 |-- PREFIX: string (nullable = true)
 |-- FIRST: string (nullable = true)
 |-- LAST: string (nullable = true)
 |-- SUFFIX: string (nullable = true)
 |-- MAIDEN: string (nullable = true)
 |-- MARITAL: string (nullable = true)
 |-- RACE: string (nullable = true)
 |-- ETHNICITY: string (nullable = true)
 |-- GENDER: string (nullable = true)
 |-- BIRTHPLACE: string (nullable = true)
 |-- ADDRESS: string (nullable = true)
 |-- CITY: string (nullable = true)
 |-- STATE: string (nullable = true)
 |-- COUNTY: string (nullable = true)
 |-- ZIP: integer (nullable = true)
 |-- LAT: double (nullable = true)
 |-- LON: double (nullable = true)



In [40]:
patients_renamed = patients.withColumnRenamed("Id", "patients_id")

In [42]:
procedure = spark.read \
            .option("inferschema", "True") \
            .option("header", "True") \
            .csv(f"{landing_folder}\\health_records\\procedures.csv")

In [43]:
procedure.printSchema()

root
 |-- START: timestamp (nullable = true)
 |-- STOP: timestamp (nullable = true)
 |-- PATIENT: string (nullable = true)
 |-- ENCOUNTER: string (nullable = true)
 |-- CODE: long (nullable = true)
 |-- DESCRIPTION: string (nullable = true)
 |-- BASE_COST: integer (nullable = true)
 |-- REASONCODE: long (nullable = true)
 |-- REASONDESCRIPTION: string (nullable = true)



In [44]:
procedure_renamed = procedure.withColumnRenamed("PATIENT", "PATIENT_ID") \
                             .withColumnRenamed("ENCOUNTER", "ENCOUNTER_ID")

In [47]:
encounters = spark.read \
            .option("inferschema", "True") \
            .option("header", "True") \
            .csv(f"{landing_folder}\\health_records\\encounters.csv")

In [50]:
encounters.printSchema()

root
 |-- Id: string (nullable = true)
 |-- START: timestamp (nullable = true)
 |-- STOP: timestamp (nullable = true)
 |-- PATIENT: string (nullable = true)
 |-- ORGANIZATION: string (nullable = true)
 |-- PAYER: string (nullable = true)
 |-- ENCOUNTERCLASS: string (nullable = true)
 |-- CODE: integer (nullable = true)
 |-- DESCRIPTION: string (nullable = true)
 |-- BASE_ENCOUNTER_COST: double (nullable = true)
 |-- TOTAL_CLAIM_COST: double (nullable = true)
 |-- PAYER_COVERAGE: double (nullable = true)
 |-- REASONCODE: long (nullable = true)
 |-- REASONDESCRIPTION: string (nullable = true)



In [51]:
encounters_renamed = encounters.withColumnRenamed("Id", "encounters_id") \
                               .withColumnRenamed("PATIENT", "PATIENT_ID")

In [52]:
payers = spark.read \
            .option("inferschema", "True") \
            .option("header", "True") \
            .csv(f"{landing_folder}\\health_records\\payers.csv")

In [53]:
payers.printSchema()

root
 |-- Id: string (nullable = true)
 |-- NAME: string (nullable = true)
 |-- ADDRESS: string (nullable = true)
 |-- CITY: string (nullable = true)
 |-- STATE_HEADQUARTERED: string (nullable = true)
 |-- ZIP: integer (nullable = true)
 |-- PHONE: string (nullable = true)



In [54]:
payers_renamed = payers.withColumnRenamed("Id", "payers_id") \
                       .withColumnRenamed("NAME", "PAYER_NAME")

In [57]:
organizations = spark.read \
            .option("inferschema", "True") \
            .option("header", "True") \
            .csv(f"{landing_folder}\\health_records\\organizations.csv")

In [58]:
organizations.printSchema()

root
 |-- Id: string (nullable = true)
 |-- NAME: string (nullable = true)
 |-- ADDRESS: string (nullable = true)
 |-- CITY: string (nullable = true)
 |-- STATE: string (nullable = true)
 |-- ZIP: integer (nullable = true)
 |-- LAT: double (nullable = true)
 |-- LON: double (nullable = true)



In [59]:
organizations_renamed = organizations.withColumnRenamed("Id", "organizations_id")

In [62]:
description = spark.read \
            .option("inferschema", "True") \
            .option("header", "True") \
            .csv(f"{landing_folder}\\health_records\\data_dictionary.csv")

In [63]:
description.printSchema()

root
 |-- Table: string (nullable = true)
 |-- Field: string (nullable = true)
 |-- Description: string (nullable = true)



In [75]:
try:
    patients_renamed.write.format("delta").mode("overwrite").save(f"{bronze_folder}\\healthcare_delta\\patients")
    print("table created")
except Exception as e:
    print("Table creation failed")
    print (e)


table created


In [69]:

procedure_renamed.write.format("delta").mode("overwrite").save(f"{bronze_folder}\\healthcare_delta\\procedure")

In [70]:
encounters_renamed.write.format("delta").mode("overwrite").save(f"{bronze_folder}\\healthcare_delta\\encounters")

In [71]:
payers_renamed.write.format("delta").mode("overwrite").save(f"{bronze_folder}\\healthcare_delta\\payers")

In [72]:
organizations_renamed.write.format("delta").mode("overwrite").save(f"{bronze_folder}\\healthcare_delta\\organizations")

In [73]:
description.write.format("delta").mode("overwrite").save(f"{bronze_folder}\\healthcare_delta\\description")

In [74]:
spark.stop

<bound method SparkSession.stop of <pyspark.sql.session.SparkSession object at 0x000002499A201750>>