In [3]:
from pyspark.sql import SparkSession
import os
import pyspark.sql.functions as F
# Start (or reuse) a Spark session that runs locally on every available core.
spark = SparkSession.builder.master("local[*]").getOrCreate()

# Load the parquet dataset and keep only the two columns used by the exercises.
parquet_path = os.path.join("/opt/workspace", "parquet")
df = spark.read.parquet(parquet_path).select("name", "premiered")

# Preview the first five rows.
df.show(5)

+------------------+----------+
|              name| premiered|
+------------------+----------+
|        The Affair|2014-10-12|
|   Jane the Virgin|2014-10-13|
|          Marry Me|2014-10-14|
|Two and a Half Men|2003-09-22|
|       About a Boy|2014-02-22|
+------------------+----------+


Use `.when()` and `.otherwise()` to create a new column called `era` that categorizes the `premiered` column into the following eras: 20s, teens, naughties, 20th century.

In [6]:
# Extract the premiere year once so each era rule reads as a simple range check.
premiere_year = F.year(F.col("premiered"))

df.select(
    F.col("name"),
    F.col("premiered"),
    # `between` is inclusive on both ends, so each range covers one full decade.
    F.when(premiere_year.between(2020, 2029), "20s")
     .when(premiere_year.between(2010, 2019), "teens")
     .when(premiere_year.between(2000, 2009), "naughties")
     .when(premiere_year < 2000, "20th century")
     # A missing `premiered` (or a year past 2029) matches no rule above; leave
     # `era` null instead of mislabelling the row as "20th century".
     .otherwise(F.lit(None).cast("string"))
     .alias("era"),
).show(5)

+------------------+----------+---------+
|              name| premiered|      era|
+------------------+----------+---------+
|        The Affair|2014-10-12|    teens|
|   Jane the Virgin|2014-10-13|    teens|
|          Marry Me|2014-10-14|    teens|
|Two and a Half Men|2003-09-22|naughties|
|       About a Boy|2014-02-22|    teens|
+------------------+----------+---------+


Use `expr()` and a SQL `CASE` expression to create a new column called `era` that categorizes the `premiered` column into the following eras: 20s, teens, naughties, 20th century.

In [7]:
# Same era bucketing as a SQL CASE expression. There is deliberately no ELSE
# branch: a row whose `premiered` is NULL (or whose year is past 2029) matches
# no WHEN clause and gets a NULL `era` instead of being mislabelled as
# '20th century'. BETWEEN is inclusive on both ends.
df.select(
    F.col("name"),
    F.col("premiered"),
    F.expr(
        """
        CASE
            WHEN year(premiered) BETWEEN 2020 AND 2029 THEN '20s'
            WHEN year(premiered) BETWEEN 2010 AND 2019 THEN 'teens'
            WHEN year(premiered) BETWEEN 2000 AND 2009 THEN 'naughties'
            WHEN year(premiered) < 2000 THEN '20th century'
        END AS era
        """
    )
).show(5)

+------------------+----------+---------+
|              name| premiered|      era|
+------------------+----------+---------+
|        The Affair|2014-10-12|    teens|
|   Jane the Virgin|2014-10-13|    teens|
|          Marry Me|2014-10-14|    teens|
|Two and a Half Men|2003-09-22|naughties|
|       About a Boy|2014-02-22|    teens|
+------------------+----------+---------+
