In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import current_timestamp, col, count, countDistinct, sum, avg
from pyspark.sql.types import StructField, StructType, StringType, IntegerType, DoubleType
from delta.tables import *
from delta import *
from delta import configure_spark_with_delta_pip


In [2]:
builder = SparkSession \
        .builder \
        .appName('healthcare_Aggregations')\
        .master('local')\
        .config("spark.driver.memory","2g")\
        .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")\
        .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")\
        .config("spark.sql.execution.arrow.pyspark.enabled", "true")


spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [3]:
spark

In [4]:
source = spark.read \
                .format("delta").load("silver\\healthcare_trans")             

In [5]:
source.count()

26294

In [6]:
source.printSchema()

root
 |-- patients_id: string (nullable = true)
 |-- encounters_id: string (nullable = true)
 |-- FIRST: string (nullable = true)
 |-- LAST: string (nullable = true)
 |-- GENDER: string (nullable = true)
 |-- CITY: string (nullable = true)
 |-- BIRTHDATE: date (nullable = true)
 |-- DEATHDATE: date (nullable = true)
 |-- ENCOUNTERCLASS: string (nullable = true)
 |-- REASONCODE: long (nullable = true)
 |-- REASONDESCRIPTION: string (nullable = true)
 |-- BASE_COST: integer (nullable = true)
 |-- BASE_ENCOUNTER_COST: double (nullable = true)
 |-- TOTAL_CLAIM_COST: double (nullable = true)
 |-- PAYER: string (nullable = true)
 |-- PAYER_COVERAGE: double (nullable = true)
 |-- PAYER_NAME: string (nullable = true)
 |-- INGESTION_DATE: timestamp (nullable = true)
 |-- MODIFICATION_DATE: timestamp (nullable = true)
 |-- SOURCE: string (nullable = true)



In [7]:
source.createOrReplaceTempView("health")

In [13]:
#Insurance Analysis
pop_insurance = spark.sql("SELECT PAYER_NAME, \
                          SUM(TOTAL_CLAIM_COST) AS TOTALCLAIMCOST,\
                          AVG(PAYER_COVERAGE) AS AVERAGEPAYERCOVERAGE,\
                          COUNT(ENCOUNTERCLASS) AS TOTAL_CLAIMS\
                          FROM health \
                          GROUP BY PAYER_NAME \
                          ORDER BY AVERAGEPAYERCOVERAGE DESC")
pop_insurance.show()

+--------------------+--------------------+--------------------+------------+
|          PAYER_NAME|      TOTALCLAIMCOST|AVERAGEPAYERCOVERAGE|TOTAL_CLAIMS|
+--------------------+--------------------+--------------------+------------+
|            Medicaid|   8948728.260000002|   5921.291505981683|        1421|
|Blue Cross Blue S...|  2993246.8300000024|  2371.6637871853504|         874|
|            Medicare|2.4497492689999342E7|   1832.712737638881|       10436|
|       Dual Eligible|   1535708.230000004|  1641.1073539928357|         839|
|    UnitedHealthcare|   2554194.390000001|   4.188739693757364|         849|
|              Humana|  3535357.0400000038|  1.8823314065510612|        1038|
|               Aetna|  2580679.6500000027|  1.7999887892376691|         892|
|        Cigna Health|  2420435.5600000047|  1.2288705583756347|         788|
|        NO_INSURANCE|  4.91907004899982E7|                 0.0|        8474|
|              Anthem|  2978427.1900000004|                 0.0|

In [16]:
#Common Encounters
encount_cases = spark.sql("SELECT ENCOUNTERCLASS, \
                          COUNT(ENCOUNTERCLASS) AS OCCURENCES,\
                          SUM(BASE_ENCOUNTER_COST) AS TOTAL_BASE_ENCOUNTER_COST,\
                          SUM(BASE_COST) AS TOTAL_BASE_COST\
                          FROM health \
                          GROUP BY ENCOUNTERCLASS \
                          ORDER BY OCCURENCES DESC")
encount_cases.show()

+--------------+----------+-------------------------+---------------+
|ENCOUNTERCLASS|OCCURENCES|TOTAL_BASE_ENCOUNTER_COST|TOTAL_BASE_COST|
+--------------+----------+-------------------------+---------------+
|    ambulatory|     12405|       1313890.6500001978|       30655689|
|    outpatient|      6225|        652083.6300000027|       13271097|
|    urgentcare|      2337|       333209.45999999775|       19734386|
|     emergency|      2267|        329226.6699999877|        4434087|
|      wellness|      1931|        264160.7999999907|        2419221|
|     inpatient|      1129|       128142.65000000008|        6074229|
+--------------+----------+-------------------------+---------------+



In [9]:
spark.stop

<bound method SparkSession.stop of <pyspark.sql.session.SparkSession object at 0x00000148B24F04D0>>