### Exploring data frames

Exploring data frames, schemas, and data types concepts

Why use a custom schema? 
- makes the data types explicit and avoids data anomaly problems
- avoids the performance overhead of inferring the schema from the data (extra compute, which results in $)
- fixes incompatible column names when reading from csv files (no spaces or special characters allowed)


In [1]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

# Hand-typed sample rows: the first seven days of January 2024.
# Each tuple follows the field order of the schema declared below.
dim_calendar = [
    ("2024-01-01", 1, 2, "Mon", 1, "Jan"),
    ("2024-01-02", 2, 3, "Tue", 1, "Jan"),
    ("2024-01-03", 3, 4, "Wed", 1, "Jan"),
    ("2024-01-04", 4, 5, "Thu", 1, "Jan"),
    ("2024-01-05", 5, 6, "Fri", 1, "Jan"),
    ("2024-01-06", 6, 7, "Sat", 1, "Jan"),
    ("2024-01-07", 7, 1, "Sun", 1, "Jan"),
]

# Explicit schema expressed as (column name, Spark type) pairs.
# Every column is declared nullable.
column_types = [
    ("calendardate", StringType()),
    ("dayofyear", IntegerType()),
    ("dayofweek", IntegerType()),
    ("dayofweekname", StringType()),
    ("monthofyear", IntegerType()),
    ("monthname", StringType()),
]
schema = StructType(
    [StructField(name, dtype, True) for name, dtype in column_types]
)

# Build the DataFrame with the explicit schema, then display its
# structure and contents.
df = spark.createDataFrame(dim_calendar, schema)
df.printSchema()
df.show()

StatementMeta(, b9b63885-64f9-4c81-9458-a31780c2dcbf, 3, Finished, Available)

root
 |-- calendardate: string (nullable = true)
 |-- dayofyear: integer (nullable = true)
 |-- dayofweek: integer (nullable = true)
 |-- dayofweekname: string (nullable = true)
 |-- monthofyear: integer (nullable = true)
 |-- monthname: string (nullable = true)

+------------+---------+---------+-------------+-----------+---------+
|calendardate|dayofyear|dayofweek|dayofweekname|monthofyear|monthname|
+------------+---------+---------+-------------+-----------+---------+
|  2024-01-01|        1|        2|          Mon|          1|      Jan|
|  2024-01-02|        2|        3|          Tue|          1|      Jan|
|  2024-01-03|        3|        4|          Wed|          1|      Jan|
|  2024-01-04|        4|        5|          Thu|          1|      Jan|
|  2024-01-05|        5|        6|          Fri|          1|      Jan|
|  2024-01-06|        6|        7|          Sat|          1|      Jan|
|  2024-01-07|        7|        1|          Sun|          1|      Jan|
+------------+---------+--

### Spark Date Functions

In this example, I replace the manually created columns with columns computed by built-in functions for DataFrame operations. <br>
Information on PySpark SQL functions can be found at [spark.apache.org](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/functions.html) 

The `withColumn` method can be used to: 
- replace an existing column, e.g. one read from a csv file (to rename a column, use `withColumnRenamed`)
- add a new calculated column to a dataframe 

In [33]:
from pyspark.sql.functions import col, date_format, dayofyear, dayofweek, dayofmonth, year, month, quarter, trunc, last_day

# Two weeks of sample dates as yyyy-MM-dd strings.
dim_calendar = [
    "2024-01-01", "2024-01-02", "2024-01-03", "2024-01-04", "2024-01-05", "2024-01-06", "2024-01-07",
    "2024-01-08", "2024-01-09", "2024-01-10", "2024-01-11", "2024-01-12", "2024-01-13", "2024-01-14",
]

# One string column named "calendardate"; print the input schema.
df = spark.createDataFrame(dim_calendar, "string").toDF("calendardate")
df.printSchema()

# Derive the calendar attributes with built-in date functions instead of
# typing them by hand. withColumn returns a NEW DataFrame and leaves `df`
# untouched, so the result is kept in its own variable; otherwise the
# schema printed at the end would just repeat the one-column input schema.
calendar_date = col("calendardate")
df_calendar = (
    df.withColumn("dayofyear", dayofyear(calendar_date))
    .withColumn("dayofweek", dayofweek(calendar_date))  # 1 = Sunday ... 7 = Saturday
    .withColumn("dayofweekname", date_format(calendar_date, "EEE"))
    .withColumn("monthofyear", month(calendar_date))
    .withColumn("monthname", date_format(calendar_date, "MMM"))
    .withColumn("year", year(calendar_date))
    .withColumn("firstdayofmonth", trunc(calendar_date, "MM"))
    .withColumn("lastdayofmonth", last_day(calendar_date))
)
df_calendar.show()

# Schema of the enriched DataFrame, including the derived columns.
df_calendar.printSchema()

StatementMeta(, 99b26b3d-29a4-49c7-9ab6-5a79801e600b, 35, Finished, Available)

root
 |-- calendardate: string (nullable = true)

+------------+---------+---------+-------------+-----------+---------+----+---------------+--------------+
|calendardate|dayofyear|dayofweek|dayofweekname|monthofyear|monthname|year|firstdayofmonth|lastdayofmonth|
+------------+---------+---------+-------------+-----------+---------+----+---------------+--------------+
|  2024-01-01|        1|        2|          Mon|          1|      Jan|2024|     2024-01-01|    2024-01-31|
|  2024-01-02|        2|        3|          Tue|          1|      Jan|2024|     2024-01-01|    2024-01-31|
|  2024-01-03|        3|        4|          Wed|          1|      Jan|2024|     2024-01-01|    2024-01-31|
|  2024-01-04|        4|        5|          Thu|          1|      Jan|2024|     2024-01-01|    2024-01-31|
|  2024-01-05|        5|        6|          Fri|          1|      Jan|2024|     2024-01-01|    2024-01-31|
|  2024-01-06|        6|        7|          Sat|          1|      Jan|2024|     2024-01-01|   

### Parameters - creating a sequence of dates
In these cells we add parameters and use the `explode` and `sequence` functions to generate one row per date.


In [24]:
# Parameters cell: these values can be overridden when the notebook is run from a pipeline.
# The stated intent for the date table is to add dates for the current month each time
# monthly data is loaded.
# NOTE(review): the Delta write cell uses mode("overwrite"), so the table is rebuilt for the
# whole range rather than appended to — confirm this matches the intent above.

# First and last date of the generated calendar, as 'yyyy-MM-dd' strings. Later cells
# interpolate them into to_date(...) as the start and stop of sequence(...).
beginDate = '2015-01-01'
endDate = '2024-03-31'

StatementMeta(, b9b63885-64f9-4c81-9458-a31780c2dcbf, 26, Finished, Available)

In [22]:
from pyspark.sql.functions import col, date_format, dayofyear, dayofweek, dayofmonth, year, weekofyear, month, quarter, trunc, last_day, explode, sequence, to_date

# Generate one row per day from beginDate to endDate (both set in the
# parameters cell): sequence() builds the array of dates and explode()
# turns it into rows of a single date column named calendarDate.
# NOTE(review): the dates are interpolated directly into the SQL text; if
# they arrive from pipeline parameters, validate them before running.
df = spark.sql(f"select explode(sequence(to_date('{beginDate}'), to_date('{endDate}'), interval 1 day)) as calendarDate")
df.printSchema()
df.show(3)

# Reference the column with the same casing as the SQL alias so the lookup
# also resolves when spark.sql.caseSensitive is enabled.
calendar_date = col("calendarDate")

# Derive the calendar attributes. Each column is added exactly once: the
# original chain added 'dayofyear' twice, and the second call only replaced
# the column with identical values.
df_calendar = (
    df.withColumn("dayofyear", dayofyear(calendar_date))
    .withColumn("dayofweek", dayofweek(calendar_date))
    .withColumn("dayofweekname", date_format(calendar_date, "EEE"))
    .withColumn("monthofyear", month(calendar_date))
    .withColumn("monthname", date_format(calendar_date, "MMM"))
    .withColumn("year", year(calendar_date))
    .withColumn("weekofyear", weekofyear(calendar_date))
    .withColumn("firstdayofmonth", trunc(calendar_date, "MM"))
    .withColumn("lastdayofmonth", last_day(calendar_date))
)
df_calendar.show()


StatementMeta(, b9b63885-64f9-4c81-9458-a31780c2dcbf, 24, Finished, Available)

root
 |-- calendarDate: date (nullable = false)

+------------+
|calendarDate|
+------------+
|  2024-01-01|
|  2024-01-02|
|  2024-01-03|
+------------+
only showing top 3 rows

+------------+---------+---------+-------------+-----------+---------+----+---------------+--------------+
|calendarDate|dayofyear|dayofweek|dayofweekname|monthofyear|monthname|year|firstdayofmonth|lastdayofmonth|
+------------+---------+---------+-------------+-----------+---------+----+---------------+--------------+
|  2024-01-01|        1|        2|          Mon|          1|      Jan|2024|     2024-01-01|    2024-01-31|
|  2024-01-02|        2|        3|          Tue|          1|      Jan|2024|     2024-01-01|    2024-01-31|
|  2024-01-03|        3|        4|          Wed|          1|      Jan|2024|     2024-01-01|    2024-01-31|
|  2024-01-04|        4|        5|          Thu|          1|      Jan|2024|     2024-01-01|    2024-01-31|
|  2024-01-05|        5|        6|          Fri|          1|      Jan|20

### Writing to a Delta table
In this cell, we write the data to a Delta table, overwriting any existing contents of the table.

In [25]:
from pyspark.sql.functions import col, date_format, dayofweek, year, dayofyear, weekofyear, month, quarter, trunc, last_day, explode, sequence, to_date

# Name of the managed Delta table that receives the calendar dimension.
deltaTableName = "dim_calendar"

# Generate one row per day from beginDate to endDate (both set in the
# parameters cell) as a single date column named calendarDate.
# NOTE(review): the dates are interpolated directly into the SQL text; if
# they arrive from pipeline parameters, validate them before running.
df = spark.sql(f"select explode(sequence(to_date('{beginDate}'), to_date('{endDate}'), interval 1 day)) as calendarDate")

# Reference the column with the same casing as the SQL alias so the lookup
# also resolves when spark.sql.caseSensitive is enabled.
calendar_date = col("calendarDate")

# Derive the calendar attributes. Each column is added exactly once: the
# original chain added 'dayofyear' twice, and the second call only replaced
# the column with identical values.
df_calendar = (
    df.withColumn("dayofyear", dayofyear(calendar_date))
    .withColumn("dayofweek", dayofweek(calendar_date))
    .withColumn("dayofweekname", date_format(calendar_date, "EEE"))
    .withColumn("monthofyear", month(calendar_date))
    .withColumn("monthname", date_format(calendar_date, "MMM"))
    .withColumn("year", year(calendar_date))
    .withColumn("weekofyear", weekofyear(calendar_date))
    .withColumn("quarter", quarter(calendar_date))
    .withColumn("firstdayofmonth", trunc(calendar_date, "MM"))
    .withColumn("lastdayofmonth", last_day(calendar_date))
    .withColumn("monthandyear", date_format(calendar_date, "yyyy-MM"))
)

# mode("overwrite") replaces the table contents on every run.
df_calendar.write.mode("overwrite").format("delta").saveAsTable(deltaTableName)


StatementMeta(, b9b63885-64f9-4c81-9458-a31780c2dcbf, 27, Finished, Available)