In this notebook, you will learn how to work with dates and strings in PySpark.

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import to_timestamp, lower, trim, length, size, split, date_format, hour, to_date, expr, unix_timestamp, date_trunc, col

# Start (or reuse) the Spark session that every cell in this notebook relies on
spark = (
    SparkSession.builder
    .appName("WorkingWithDatesAndStrings")
    .getOrCreate()
)

Task 1: Create Sample DataFrame with Strings and Timestamps
We will create a DataFrame containing string and timestamp values. The data represents users with their names, timestamps, and order statuses.

In [None]:
# Column names, in the same order as the fields of each sample row
columns = ["name", "timestamp", "status"]

# Sample rows: the names and statuses deliberately mix casing and padding
# so the string clean-up steps later on have something to fix.
data = [
    ("  Alice  ", "2024-05-01 10:30:00", "Order confirmed!"),
    ("bob", "2024-05-03 15:45:10", "Delivered successfully."),
    ("CHARLIE", "2024-05-04 07:05:33", "payment failed"),
    ("dEbRa", "2024-05-04 19:55:00", "  CANCELLED  "),
]

# Build the DataFrame; every column starts out as a string
df = spark.createDataFrame(data, schema=columns)

# Replace the string column with a real timestamp so date functions work on it
df = df.withColumn("timestamp", to_timestamp(col("timestamp")))
df.show(truncate=False)


+---------+-------------------+-----------------------+
|name     |timestamp          |status                 |
+---------+-------------------+-----------------------+
|  Alice  |2024-05-01 10:30:00|Order confirmed!       |
|bob      |2024-05-03 15:45:10|Delivered successfully.|
|CHARLIE  |2024-05-04 07:05:33|payment failed         |
|dEbRa    |2024-05-04 19:55:00|  CANCELLED            |
+---------+-------------------+-----------------------+



Task 2: Demonstrate String Operations
In this task, we’ll perform some common string operations such as:

Lowercasing and trimming the name and status columns.

Calculating the length of the status column.

Counting the number of words in the status column.

In [None]:
# String operations: normalise the text columns and derive simple metrics.
#   name_clean    - name, trimmed and lower-cased
#   status_clean  - status, trimmed and lower-cased
#   status_length - character count of the raw status (padding included)
#   word_count    - number of whitespace-separated words in the status
#
# word_count is computed from the trimmed value and splits on runs of
# whitespace. Splitting the raw value on a single " " counts the empty
# strings produced by leading/trailing/repeated spaces, which reported
# 5 "words" for "  CANCELLED  " instead of 1.
df = (
    df.withColumn("name_clean", lower(trim(col("name"))))
    .withColumn("status_clean", trim(lower(col("status"))))
    .withColumn("status_length", length(col("status")))
    .withColumn("word_count", size(split(col("status_clean"), r"\s+")))
)

# Show the results
df.select("name", "name_clean", "status", "status_clean", "status_length", "word_count").show(truncate=False)

+---------+----------+-----------------------+-----------------------+-------------+----------+
|name     |name_clean|status                 |status_clean           |status_length|word_count|
+---------+----------+-----------------------+-----------------------+-------------+----------+
|  Alice  |alice     |Order confirmed!       |order confirmed!       |16           |2         |
|bob      |bob       |Delivered successfully.|delivered successfully.|23           |2         |
|CHARLIE  |charlie   |payment failed         |payment failed         |14           |2         |
|dEbRa    |debra     |  CANCELLED            |cancelled              |13           |5         |
+---------+----------+-----------------------+-----------------------+-------------+----------+



Task 3: Demonstrate Timestamp Operations
In this task, we will work with timestamps to extract information such as:

Day of the week.

Hour of the timestamp.

Date-only portion.

Whether the day is a weekend.

The timestamp truncated (rounded down) to the start of its hour.

In [None]:
# Derive calendar features from the parsed timestamp column
ts = col("timestamp")
df = (
    df.withColumn("day_of_week", date_format(ts, "EEEE"))  # full day name
    .withColumn("hour", hour(ts))
    .withColumn("date", to_date(ts))
    # dayofweek() numbers days 1 (Sunday) through 7 (Saturday)
    .withColumn("is_weekend", expr("dayofweek(timestamp) IN (1, 7)"))
    # date_trunc drops minutes and seconds, i.e. it rounds *down* to the hour
    .withColumn("rounded_hour", date_trunc("hour", ts))
)

# Display only the timestamp and the columns derived from it
timestamp_view = ["timestamp", "day_of_week", "hour", "date", "is_weekend", "rounded_hour"]
df.select(*timestamp_view).show(truncate=False)

+-------------------+-----------+----+----------+----------+-------------------+
|timestamp          |day_of_week|hour|date      |is_weekend|rounded_hour       |
+-------------------+-----------+----+----------+----------+-------------------+
|2024-05-01 10:30:00|Wednesday  |10  |2024-05-01|false     |2024-05-01 10:00:00|
|2024-05-03 15:45:10|Friday     |15  |2024-05-03|false     |2024-05-03 15:00:00|
|2024-05-04 07:05:33|Saturday   |7   |2024-05-04|true      |2024-05-04 07:00:00|
|2024-05-04 19:55:00|Saturday   |19  |2024-05-04|true      |2024-05-04 19:00:00|
+-------------------+-----------+----+----------+----------+-------------------+



Task 4: Demonstrate Time Difference Calculations
In this task, we will calculate the time difference between two timestamps. We’ll assume that each order is delivered a fixed 2 hours 15 minutes after its timestamp, and we’ll compute the difference between the two in minutes.

In [None]:
# Simulate a delivery timestamp a fixed 2 h 15 min after the order timestamp
delivery_delay = expr("INTERVAL 2 HOURS 15 MINUTES")
df = df.withColumn("delivered_at", col("timestamp") + delivery_delay)

# Elapsed minutes: subtract the epoch-second values, then divide by 60
elapsed_seconds = unix_timestamp("delivered_at") - unix_timestamp("timestamp")
df = df.withColumn("duration_minutes", elapsed_seconds / 60)

# Display both timestamps next to the computed duration
df.select("timestamp", "delivered_at", "duration_minutes").show(truncate=False)

+-------------------+-------------------+----------------+
|timestamp          |delivered_at       |duration_minutes|
+-------------------+-------------------+----------------+
|2024-05-01 10:30:00|2024-05-01 12:45:00|135.0           |
|2024-05-03 15:45:10|2024-05-03 18:00:10|135.0           |
|2024-05-04 07:05:33|2024-05-04 09:20:33|135.0           |
|2024-05-04 19:55:00|2024-05-04 22:10:00|135.0           |
+-------------------+-------------------+----------------+

