In [0]:
# Load the appointment and patient CSV exports from S3.
# Both reads use the same parser settings: header row, comma delimiter,
# multi-line quoted fields, and a double quote as the escape character.
csv_read_options = {
    "header": True,
    "delimiter": ",",
    "multiline": True,
    "escape": "\"",
}

appointment_df = (
    spark.read
    .options(**csv_read_options)
    .csv("s3://cmg-databricks-lakehouse/mfd/appointments/", inferSchema=True)
)

patient_df = (
    spark.read
    .options(**csv_read_options)
    .csv("s3://cmg-databricks-lakehouse/mfd/patients/", inferSchema=True)
)

In [0]:
# Show the (column name, data type) pairs Spark inferred for the patient data.
patient_df.dtypes

In [0]:
# List the column names of the appointment data.
appointment_df.columns

In [0]:
# Print the inferred appointment schema as a tree.
appointment_df.printSchema()

In [0]:
# Render the patient data as an interactive table.
patient_df.display()

In [0]:
# take(n) returns the first n rows as a list of Row objects, so take(1) is a
# one-element list; fields of a Row can be read by column name.
appointment1 = appointment_df.take(1)
first_row = appointment1[0]
print(first_row['cmgID'], first_row['patientID'])


In [0]:
# Total number of appointment rows (triggers a full read of the source files).
appointment_df.count()

In [0]:
# Draw a roughly 0.1% random sample of the appointments.
# A fixed seed keeps the sample repeatable between runs, as long as the input
# files and their partitioning do not change; without it every run differs.
SAMPLE_SEED = 42
sample_appointment_df = appointment_df.sample(fraction=0.001, seed=SAMPLE_SEED)
print(sample_appointment_df.count())

In [0]:
# Render the appointment data as an interactive table.
appointment_df.display()

In [0]:
from pyspark.sql.functions import col, sum


# Group the appointments by cmgID and aggregate patientID with sum, showing 5 groups.
# NOTE(review): this adds up patientID values; if the goal is the number of
# appointments or patients per cmgID, a count is probably intended - confirm.
cmg_patient_pairs = appointment_df.select(col("cmgID"), col("patientID"))
(
    cmg_patient_pairs
    .groupBy("cmgID")
    .agg(sum("patientID"))
    .show(5)
)

In [0]:



# Count the nulls in every appointment column: isNull() cast to int gives 1 for
# a null and 0 otherwise, so summing it yields the per-column null count.
null_count_exprs = [
    sum(col(column_name).isNull().cast('int')).alias(column_name + '_nullCount')
    for column_name in appointment_df.columns
]
appointment_df.select(null_count_exprs).display()

In [0]:
from pyspark.sql.functions import when

# Replace null values in "arrived" with 0 and leave every other value untouched.
arrived_or_zero = when(col("arrived").isNull(), 0).otherwise(col("arrived"))
appointment_df = appointment_df.withColumn("arrived", arrived_or_zero)

In [0]:
# Show the appointment data in its current state.
appointment_df.display()

In [0]:
# Keep appointments whose start date falls within 2020-06-25 .. 2025-06-25 (inclusive).
start_date = col("startTime").cast("date")
in_date_window = (start_date >= "2020-06-25") & (start_date <= "2025-06-25")
appointment_df.filter(in_date_window).display()

In [0]:
# Add a "startDate" column holding startTime cast to a date.
start_date_expr = col("startTime").cast("date")
appointment_df = appointment_df.withColumn("startDate", start_date_expr)

In [0]:
# Preview the data with "notes" renamed to "hashNotes". The result is only
# displayed, not assigned, so appointment_df itself keeps the "notes" column.
renamed_preview_df = appointment_df.withColumnRenamed("notes", "hashNotes")
renamed_preview_df.display()

In [0]:
# Show the patient data.
patient_df.display()

In [0]:

from pyspark.sql.functions import concat_ws
# Build a full "name" column by joining title, first, middle and last name with
# a single space; concat_ws skips null parts instead of nulling the whole result.
patient_df = patient_df.withColumn(
    "name",
    concat_ws(" ", col("title"), col("firstName"), col("middleName"), col("lastName"))
)
# display() returns None, so it must not be chained onto the assignment:
# doing so would rebind patient_df to None and break every later cell.
patient_df.display()

In [0]:


# Inner-join patients to their appointments (join() defaults to an inner join)
# and keep every appointment column next to the patient's name.
joined_df = patient_df.join(
    appointment_df,
    patient_df["ID"] == appointment_df["patientID"]
).select(
    # patientName is taken from the concatenated "name" column rather than
    # "title", which holds only one of the parts that make up the name.
    patient_df["name"].alias("patientName"),
    appointment_df["*"]
)
display(joined_df)

In [0]:
# Left anti join: keep only the patients that have no appointment row matching
# both the ID condition and the 2020-06-25 date condition.
# NOTE(review): this assumes appointment_df has a "date" column; the notebook
# also derives "startDate" from startTime - confirm which column is intended.
joined_df = patient_df.join(
    appointment_df,
    (patient_df.ID == appointment_df.patientID) & (appointment_df.date == "2020-06-25"),
    how="left_anti"
)
# display() returns None, so it is called as a separate statement to keep
# joined_df bound to the DataFrame.
joined_df.display()




In [0]:
from pyspark.sql.functions import count, sum


# Tally how many patient rows share each title value.
title_counts_df = patient_df.groupBy("title").agg(count("*").alias("count"))
title_counts_df.display()

In [0]:
from pyspark.sql.window import Window
from pyspark.sql.functions import  row_number

# Number each patient's appointments from most recent to oldest. The ordering
# must be descending so that row number 1 is the latest appointment; an
# ascending order would select the earliest row instead.
# ID is a tie-breaker for equal start times (presumably unique - confirm).
w = Window.partitionBy("patientID").orderBy(
    col("startTime").desc(),
    col("ID").desc()
)

# Keep only the top-ranked row per patient, then drop the helper column.
latest_appointment_df = appointment_df.withColumn(
    "rank",
    row_number().over(w)
).filter(
    col("rank") == 1
).drop("rank")
latest_appointment_df.display()