# Let's create our own dataset to work with real dates

This is a dataset of patient visits from a medical office. It contains the patients first and last names, date of birth, and the dates of their first 3 visits. 

In [1]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, DateType
from pyspark.sql.functions import to_date, expr

# Tạo phiên Spark
spark = SparkSession.builder \
    .appName("Date Conversion") \
    .getOrCreate()

# Tạo DataFrame từ dữ liệu
md_office = [
    ('Mohammed', 'Alfasy', '1987-04-08', '2016-01-07', '2017-02-03', '2018-03-02'),
    ('Marcy', 'Wellmaker', '1986-04-08', '2015-01-07', '2017-01-03', '2018-01-02'),
    ('Ginny', 'Ginger', '1986-07-10', '2014-08-07', '2015-02-03', '2016-03-02'),
    ('Vijay', 'Doberson', '1988-05-02', '2016-01-07', '2018-02-03', '2018-03-02'),
    ('Orhan', 'Gelicek', '1987-05-11', '2016-05-07', '2017-01-03', '2018-09-02'),
    ('Sarah', 'Jones', '1956-07-06', '2016-04-07', '2017-08-03', '2018-10-02'),
    ('John', 'Johnson', '2017-10-12', '2018-01-02', '2018-10-03', '2018-03-02')
]

# Định nghĩa schema cho DataFrame
schema = StructType([
    StructField('first_name', StringType(), True),
    StructField('last_name', StringType(), True),
    StructField('dob', StringType(), True),
    StructField('visit1', StringType(), True),
    StructField('visit2', StringType(), True),
    StructField('visit3', StringType(), True)
])

# Tạo DataFrame từ dữ liệu và schema
df = spark.createDataFrame(md_office, schema)

# Chuyển đổi chuỗi ngày tháng thành ngày tháng với chế độ "LEGACY"
df = df.withColumn('dob', to_date(expr("lpad(dob, 10, '0')"), 'yyyy-MM-dd')) \
       .withColumn('visit1', to_date(expr("lpad(visit1, 10, '0')"), 'yyyy-MM-dd')) \
       .withColumn('visit2', to_date(expr("lpad(visit2, 10, '0')"), 'yyyy-MM-dd')) \
       .withColumn('visit3', to_date(expr("lpad(visit3, 10, '0')"), 'yyyy-MM-dd'))

# Hiển thị DataFrame và schema
df.show()
df.printSchema()


+----------+---------+----------+----------+----------+----------+
|first_name|last_name|       dob|    visit1|    visit2|    visit3|
+----------+---------+----------+----------+----------+----------+
|  Mohammed|   Alfasy|1987-04-08|2016-01-07|2017-02-03|2018-03-02|
|     Marcy|Wellmaker|1986-04-08|2015-01-07|2017-01-03|2018-01-02|
|     Ginny|   Ginger|1986-07-10|2014-08-07|2015-02-03|2016-03-02|
|     Vijay| Doberson|1988-05-02|2016-01-07|2018-02-03|2018-03-02|
|     Orhan|  Gelicek|1987-05-11|2016-05-07|2017-01-03|2018-09-02|
|     Sarah|    Jones|1956-07-06|2016-04-07|2017-08-03|2018-10-02|
|      John|  Johnson|2017-10-12|2018-01-02|2018-10-03|2018-03-02|
+----------+---------+----------+----------+----------+----------+

root
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- dob: date (nullable = true)
 |-- visit1: date (nullable = true)
 |-- visit2: date (nullable = true)
 |-- visit3: date (nullable = true)



## 7. Can you calculate a variable showing the length of time between patient visits?

Compare visit1 to visit2 and visit2 to visit3 for all patients and see what the average length of time is between visits. Create an alias for it as well. 

In [2]:
from pyspark.sql.functions import col, datediff, avg

# Calculate the difference between visit dates (in days)
df = df.withColumn("visit1_to_visit2", datediff(col("visit2"), col("visit1"))) \
       .withColumn("visit2_to_visit3", datediff(col("visit3"), col("visit2")))

# Calculate the average time between visits (in days)
avg_visit_interval = df.select(avg(col("visit1_to_visit2").alias("avg_visit_interval12")) + \
                              avg(col("visit2_to_visit3").alias("avg_visit_interval23")))

# Print the results
avg_visit_interval.show()


+-----------------------------------------------------------------------------------------------+
|(avg(visit1_to_visit2 AS avg_visit_interval12) + avg(visit2_to_visit3 AS avg_visit_interval23))|
+-----------------------------------------------------------------------------------------------+
|                                                                              721.2857142857142|
+-----------------------------------------------------------------------------------------------+



## 8. Can you calculate the age of each patient?

In [3]:

from pyspark.sql.functions import col, current_date
age = df.withColumn('age', (datediff(current_date(), col("dob"))/365.25)) # based on Gregorian calendar, we have 365 days and appox 4 hours per year
age.show()

+----------+---------+----------+----------+----------+----------+----------------+----------------+------------------+
|first_name|last_name|       dob|    visit1|    visit2|    visit3|visit1_to_visit2|visit2_to_visit3|               age|
+----------+---------+----------+----------+----------+----------+----------------+----------------+------------------+
|  Mohammed|   Alfasy|1987-04-08|2016-01-07|2017-02-03|2018-03-02|             393|             392| 36.90075290896646|
|     Marcy|Wellmaker|1986-04-08|2015-01-07|2017-01-03|2018-01-02|             727|             364| 37.90006844626968|
|     Ginny|   Ginger|1986-07-10|2014-08-07|2015-02-03|2016-03-02|             180|             393| 37.64544832306639|
|     Vijay| Doberson|1988-05-02|2016-01-07|2018-02-03|2018-03-02|             758|              27|35.832991101984945|
|     Orhan|  Gelicek|1987-05-11|2016-05-07|2017-01-03|2018-09-02|             241|             607|  36.8104038329911|
|     Sarah|    Jones|1956-07-06|2016-04

## 9. Can you extract the month from the first visit column and call it "Month"?

In [4]:
from pyspark.sql.functions import col, month

# Extract month from the "visit1" column and name it "Month"
df = df.withColumn("Month", month(col("visit1")).alias("Month"))

# Print the DataFrame with the new column
df.show()


+----------+---------+----------+----------+----------+----------+----------------+----------------+-----+
|first_name|last_name|       dob|    visit1|    visit2|    visit3|visit1_to_visit2|visit2_to_visit3|Month|
+----------+---------+----------+----------+----------+----------+----------------+----------------+-----+
|  Mohammed|   Alfasy|1987-04-08|2016-01-07|2017-02-03|2018-03-02|             393|             392|    1|
|     Marcy|Wellmaker|1986-04-08|2015-01-07|2017-01-03|2018-01-02|             727|             364|    1|
|     Ginny|   Ginger|1986-07-10|2014-08-07|2015-02-03|2016-03-02|             180|             393|    8|
|     Vijay| Doberson|1988-05-02|2016-01-07|2018-02-03|2018-03-02|             758|              27|    1|
|     Orhan|  Gelicek|1987-05-11|2016-05-07|2017-01-03|2018-09-02|             241|             607|    5|
|     Sarah|    Jones|1956-07-06|2016-04-07|2017-08-03|2018-10-02|             483|             425|    4|
|      John|  Johnson|2017-10-12|2018