In [24]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import current_timestamp, col, count, countDistinct, sum, avg, lit, to_timestamp, coalesce
from pyspark.sql.types import StructField, StructType, StringType, IntegerType, DoubleType
from delta.tables import *
from delta import *
from delta import configure_spark_with_delta_pip


In [3]:
# Configure a single-node ('local') Spark session with the Delta Lake SQL
# extension and the Delta catalog registered as the session catalog.
builder = (
    SparkSession.builder
    .appName('healthcare_transformation')
    .master('local')
    .config("spark.driver.memory", "2g")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

# The builder is passed through configure_spark_with_delta_pip before the
# session is created (or an already-running one is reused).
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [None]:
# Show the SparkSession object as the cell output (sanity check that it was created).
spark

In [None]:
# Root folder of the bronze layer; the Delta table paths loaded below are built from it.
bronze_folder = "bronze"

In [5]:
# Load the bronze "patients" Delta table.
# Forward slashes are used as separators so the path resolves on every OS;
# the former backslash separators only formed a valid directory path on Windows.
patients = (
    spark.read
    .format("delta")
    .load(f"{bronze_folder}/healthcare_delta/patients")
)

In [None]:
# Print the column names and types of the patients table.
patients.printSchema()

In [7]:
# Load the bronze "encounters" Delta table.
# Forward slashes are used as separators so the path resolves on every OS;
# the former backslash separators only formed a valid directory path on Windows.
# The two reason columns get an "_EN" suffix — presumably so they do not collide
# with same-named columns of another table in the later join; confirm.
encounters = (
    spark.read
    .format("delta")
    .load(f"{bronze_folder}/healthcare_delta/encounters")
    .withColumnRenamed("REASONCODE", "REASONCODE_EN")
    .withColumnRenamed("REASONDESCRIPTION", "REASONDESCRIPTION_EN")
)

In [None]:
# Print the column names and types of the encounters table (after the renames).
encounters.printSchema()

In [9]:
# Load the bronze "procedure" Delta table.
# Forward slashes are used as separators so the path resolves on every OS;
# the former backslash separators only formed a valid directory path on Windows.
procedure = (
    spark.read
    .format("delta")
    .load(f"{bronze_folder}/healthcare_delta/procedure")
)

In [None]:
# Print the column names and types of the procedure table.
procedure.printSchema()

In [11]:
# Load the bronze "payers" Delta table.
# Forward slashes are used as separators so the path resolves on every OS;
# the former backslash separators only formed a valid directory path on Windows.
# CITY is renamed to PAYER_CITY — presumably to keep it distinct from a CITY
# column of another table in the later join; confirm.
payers = (
    spark.read
    .format("delta")
    .load(f"{bronze_folder}/healthcare_delta/payers")
    .withColumnRenamed("CITY", "PAYER_CITY")
)

In [None]:
# Print the column names and types of the payers table (after the rename).
payers.printSchema()

In [13]:
# Combine the four bronze tables into one wide DataFrame using the default
# join type (no `how` argument is passed):
#   patients   <-> encounters : patients.patients_id == encounters.PATIENT_ID
#   encounters <-> procedure  : encounters.PATIENT_ID == procedure.PATIENT_ID
#   encounters <-> payers     : encounters.PAYER == payers.PAYER_ID
# NOTE(review): procedure is matched on PATIENT_ID only, so each encounter of a
# patient is paired with every procedure row of that patient — confirm that an
# encounter-level key is not what was intended.
combined_pe = (
    patients
    .join(encounters, patients.patients_id == encounters.PATIENT_ID)
    .join(procedure, encounters.PATIENT_ID == procedure.PATIENT_ID)
    .join(payers, encounters.PAYER == payers.PAYER_ID)
)
                      

In [None]:
# Print the column names and types of the joined DataFrame.
combined_pe.printSchema()

In [25]:
# Project the joined data down to the columns kept for the silver table, then
# append three audit columns: two timestamps taken from current_timestamp() and
# a constant SOURCE label.
final_df = (
    combined_pe
    .select(
        # identifiers
        "patients_id", "encounters_id",
        # patient attributes
        "FIRST", "LAST", "GENDER", "CITY", "BIRTHDATE", "DEATHDATE",
        # encounter / reason attributes
        "ENCOUNTERCLASS", "REASONCODE", "REASONDESCRIPTION",
        # cost figures
        "BASE_COST", "BASE_ENCOUNTER_COST", "TOTAL_CLAIM_COST",
        # payer attributes
        "PAYER", "PAYER_COVERAGE", "PAYER_NAME",
    )
    .withColumn("INGESTION_DATE", current_timestamp())
    .withColumn("MODIFICATION_DATE", current_timestamp())
    .withColumn("SOURCE", lit("Kaggle"))
)

In [None]:
# Print the column names and types of the projected DataFrame, including the audit columns.
final_df.printSchema()

In [27]:
# Keep a single row per encounters_id.
# NOTE(review): this call does not specify which of several rows sharing an
# encounters_id is kept — confirm that an arbitrary survivor is acceptable.
final_df_dropped = final_df.dropDuplicates(['encounters_id'])

In [None]:
# Number of rows left after deduplication (triggers a Spark job).
final_df_dropped.count()

In [None]:
# Preview the deduplicated rows using show()'s default settings.
final_df_dropped.show()

In [None]:
# Persist the deduplicated result as a Delta table in the silver layer,
# overwriting any existing data and schema at that location.
# The target path uses a forward slash so it resolves on every OS; the former
# backslash separator only formed a valid directory path on Windows.
try:
    (
        final_df_dropped.write
        .format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .save("silver/healthcare_trans")
    )
    print("table created")
except Exception as e:
    print("Table creation failed")
    print(e)
    # Re-raise so a failed write is not mistaken for success: the cell is marked
    # as errored and the full traceback is shown instead of only the message.
    raise

In [None]:
# Shut down the Spark session. The call parentheses are required: a bare
# `spark.stop` only references the bound method and never stops the session.
spark.stop()