In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder.getOrCreate()

In [3]:
df = spark.read.format("csv")\
.option("header", "true")\
.option("inferSchema", "true")\
.load("2010-12-01.csv")
df.printSchema()
df.createOrReplaceTempView("invoice")

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: timestamp (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: double (nullable = true)
 |-- Country: string (nullable = true)



In [4]:
df.show()

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|2010-12-01 08:26:00|     2.55|   17850.0|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8|2010-12-01 08:26:00|     2.75|   17850.0|United Kingdom|
|   536365|   84029G|KNITTED UNION FLA...|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
|   536365|    22752|SET 7 BABUSHKA NE...|       2|2010-12-01 08:26:00|     7.65|   17850.0|United Kingdom|
|   536365|    21730|GLASS S

## Converting to Spark Types

In [5]:
from pyspark.sql.functions import lit
df.select(lit(5), lit("five"), lit(5.0))

DataFrame[5: int, five: string, 5.0: double]

## Working with Booleans

In [6]:
from pyspark.sql.functions import col
df.filter("InvoiceNo = 536365").select("InvoiceNo", "Description").show(5, False)

+---------+-----------------------------------+
|InvoiceNo|Description                        |
+---------+-----------------------------------+
|536365   |WHITE HANGING HEART T-LIGHT HOLDER |
|536365   |WHITE METAL LANTERN                |
|536365   |CREAM CUPID HEARTS COAT HANGER     |
|536365   |KNITTED UNION FLAG HOT WATER BOTTLE|
|536365   |RED WOOLLY HOTTIE WHITE HEART.     |
+---------+-----------------------------------+
only showing top 5 rows



In [7]:
df.filter("InvoiceNo != 536365").select("InvoiceNo", "Description").show(5, False)

+---------+-----------------------------+
|InvoiceNo|Description                  |
+---------+-----------------------------+
|536366   |HAND WARMER UNION JACK       |
|536366   |HAND WARMER RED POLKA DOT    |
|536367   |ASSORTED COLOUR BIRD ORNAMENT|
|536367   |POPPY'S PLAYHOUSE BEDROOM    |
|536367   |POPPY'S PLAYHOUSE KITCHEN    |
+---------+-----------------------------+
only showing top 5 rows



In [8]:
df.filter("InvoiceNo <> 536365").select("InvoiceNo", "Description").show(5, False)

+---------+-----------------------------+
|InvoiceNo|Description                  |
+---------+-----------------------------+
|536366   |HAND WARMER UNION JACK       |
|536366   |HAND WARMER RED POLKA DOT    |
|536367   |ASSORTED COLOUR BIRD ORNAMENT|
|536367   |POPPY'S PLAYHOUSE BEDROOM    |
|536367   |POPPY'S PLAYHOUSE KITCHEN    |
+---------+-----------------------------+
only showing top 5 rows



In [9]:
from pyspark.sql.functions import instr
priceFilter = col("UnitPrice") > 600
descripFilter = instr(df.Description, "POSTAGE") >= 1
df.filter(df.StockCode.isin("DOT")).where(priceFilter | descripFilter).show()

+---------+---------+--------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|   Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------+--------+-------------------+---------+----------+--------------+
|   536544|      DOT|DOTCOM POSTAGE|       1|2010-12-01 14:32:00|   569.77|      NULL|United Kingdom|
|   536592|      DOT|DOTCOM POSTAGE|       1|2010-12-01 17:06:00|   607.49|      NULL|United Kingdom|
+---------+---------+--------------+--------+-------------------+---------+----------+--------------+



In [11]:
# SQL
spark.sql('SELECT * FROM invoice WHERE StockCode in ("DOT") AND (UnitPrice > 600 OR instr(Description, "POSTAGE") >= 1)').show()

+---------+---------+--------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|   Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------+--------+-------------------+---------+----------+--------------+
|   536544|      DOT|DOTCOM POSTAGE|       1|2010-12-01 14:32:00|   569.77|      NULL|United Kingdom|
|   536592|      DOT|DOTCOM POSTAGE|       1|2010-12-01 17:06:00|   607.49|      NULL|United Kingdom|
+---------+---------+--------------+--------+-------------------+---------+----------+--------------+



In [12]:
DOTCodeFilter = col("StockCode") == "DOT"
priceFilter = col("UnitPrice") > 600
descripFilter = instr(col("Description"), "POSTAGE") >= 1
df.withColumn("isExpensive", DOTCodeFilter & (priceFilter | descripFilter))\
.filter("isExpensive")\
.select("StockCode", "unitPrice", "Description", "isExpensive").show(5)

+---------+---------+--------------+-----------+
|StockCode|unitPrice|   Description|isExpensive|
+---------+---------+--------------+-----------+
|      DOT|   569.77|DOTCOM POSTAGE|       true|
|      DOT|   607.49|DOTCOM POSTAGE|       true|
+---------+---------+--------------+-----------+



In [13]:
# SQL
spark.sql("""SELECT StockCode, UnitPrice, Description, 
(StockCode = 'DOT' AND (UnitPrice > 600 OR instr(Description, "POSTAGE") >= 1)) as isExpensive
FROM invoice
WHERE (StockCode = 'DOT' AND
(UnitPrice > 600 OR instr(Description, "POSTAGE") >= 1))""").show()

+---------+---------+--------------+-----------+
|StockCode|UnitPrice|   Description|isExpensive|
+---------+---------+--------------+-----------+
|      DOT|   569.77|DOTCOM POSTAGE|       true|
|      DOT|   607.49|DOTCOM POSTAGE|       true|
+---------+---------+--------------+-----------+



## Working with Numbers

In [14]:
df.printSchema()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: timestamp (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: double (nullable = true)
 |-- Country: string (nullable = true)



In [15]:
from pyspark.sql.functions import expr, pow
fabricatedQuantity = pow(col("Quantity") * col("UnitPrice"), 2) + 5
df.select(expr("CustomerId"), fabricatedQuantity.alias("realQuantity")).show(2)

+----------+------------------+
|CustomerId|      realQuantity|
+----------+------------------+
|   17850.0|239.08999999999997|
|   17850.0|          418.7156|
+----------+------------------+
only showing top 2 rows



In [16]:
df.selectExpr("CustomerId", "(POWER((Quantity * UnitPrice), 2.0) + 5) as realQuantity").show(2)

+----------+------------------+
|CustomerId|      realQuantity|
+----------+------------------+
|   17850.0|239.08999999999997|
|   17850.0|          418.7156|
+----------+------------------+
only showing top 2 rows



In [17]:
# SQL
spark.sql('SELECT customerId, (POWER((Quantity * UnitPrice), 2.0) + 5) as realQuantity FROM invoice').show(2)

+----------+------------------+
|customerId|      realQuantity|
+----------+------------------+
|   17850.0|239.08999999999997|
|   17850.0|          418.7156|
+----------+------------------+
only showing top 2 rows



### Rounding

In [18]:
from pyspark.sql.functions import lit, round, bround
df.select(round(lit("2.5")), bround(lit("2.5"))).show(2)

+-------------+--------------+
|round(2.5, 0)|bround(2.5, 0)|
+-------------+--------------+
|          3.0|           2.0|
|          3.0|           2.0|
+-------------+--------------+
only showing top 2 rows



In [19]:
# SQL
spark.sql('SELECT round(2.5), bround(2.5)').show()

+-------------+--------------+
|round(2.5, 0)|bround(2.5, 0)|
+-------------+--------------+
|            3|             2|
+-------------+--------------+



### Statistic

In [20]:
df.describe().show()

+-------+-----------------+------------------+--------------------+------------------+------------------+------------------+--------------+
|summary|        InvoiceNo|         StockCode|         Description|          Quantity|         UnitPrice|        CustomerID|       Country|
+-------+-----------------+------------------+--------------------+------------------+------------------+------------------+--------------+
|  count|             3108|              3108|                3098|              3108|              3108|              1968|          3108|
|   mean| 536516.684944841|27834.304044117645|                NULL| 8.627413127413128| 4.151946589446603|15661.388719512195|          NULL|
| stddev|72.89447869788873|17407.897548583845|                NULL|26.371821677029203|15.638659854603892|1854.4496996893627|          NULL|
|    min|           536365|             10002| 4 PURPLE FLOCK D...|               -24|               0.0|           12431.0|     Australia|
|    max|          C

In [21]:
df.stat.freqItems(["StockCode", "Quantity"]).show()

+--------------------+--------------------+
| StockCode_freqItems|  Quantity_freqItems|
+--------------------+--------------------+
|[22086, 21705, 72...|[200, 128, 23, 50...|
+--------------------+--------------------+



In [22]:
from pyspark.sql.functions import monotonically_increasing_id
df.select(monotonically_increasing_id()).show(2)

+-----------------------------+
|monotonically_increasing_id()|
+-----------------------------+
|                            0|
|                            1|
+-----------------------------+
only showing top 2 rows



## Working with Strings

In [28]:
from pyspark.sql.functions import lower, upper

In [29]:
df.select(col("Description"),
lower(col("Description")),
upper(lower(col("Description")))).show(2)

+--------------------+--------------------+-------------------------+
|         Description|  lower(Description)|upper(lower(Description))|
+--------------------+--------------------+-------------------------+
|WHITE HANGING HEA...|white hanging hea...|     WHITE HANGING HEA...|
| WHITE METAL LANTERN| white metal lantern|      WHITE METAL LANTERN|
+--------------------+--------------------+-------------------------+
only showing top 2 rows



In [32]:
# SQL
spark.sql('SELECT Description, lower(Description), Upper(lower(Description)) FROM invoice').show()

+--------------------+--------------------+-------------------------+
|         Description|  lower(Description)|upper(lower(Description))|
+--------------------+--------------------+-------------------------+
|WHITE HANGING HEA...|white hanging hea...|     WHITE HANGING HEA...|
| WHITE METAL LANTERN| white metal lantern|      WHITE METAL LANTERN|
|CREAM CUPID HEART...|cream cupid heart...|     CREAM CUPID HEART...|
|KNITTED UNION FLA...|knitted union fla...|     KNITTED UNION FLA...|
|RED WOOLLY HOTTIE...|red woolly hottie...|     RED WOOLLY HOTTIE...|
|SET 7 BABUSHKA NE...|set 7 babushka ne...|     SET 7 BABUSHKA NE...|
|GLASS STAR FROSTE...|glass star froste...|     GLASS STAR FROSTE...|
|HAND WARMER UNION...|hand warmer union...|     HAND WARMER UNION...|
|HAND WARMER RED P...|hand warmer red p...|     HAND WARMER RED P...|
|ASSORTED COLOUR B...|assorted colour b...|     ASSORTED COLOUR B...|
|POPPY'S PLAYHOUSE...|poppy's playhouse...|     POPPY'S PLAYHOUSE...|
|POPPY'S PLAYHOUSE..

In [33]:
from pyspark.sql.functions import lit, ltrim, rtrim, rpad, lpad, trim

In [55]:
df.select(
ltrim(lit(" HELLO ")).alias("ltrim"),
rtrim(lit(" HELLO ")).alias("rtrim"),
trim(lit(" HELLO ")).alias("trim"),
lpad(lit("HELLO"), 10, " ").alias("lp"),
rpad(lit("HELLO"), 10, " ").alias("rp")).show(2)

+------+------+-----+----------+----------+
| ltrim| rtrim| trim|        lp|        rp|
+------+------+-----+----------+----------+
|HELLO | HELLO|HELLO|     HELLO|HELLO     |
|HELLO | HELLO|HELLO|     HELLO|HELLO     |
+------+------+-----+----------+----------+
only showing top 2 rows



### Regular Expressions

In [56]:
from pyspark.sql.functions import regexp_replace

In [57]:
regex_string = "BLACK|WHITE|RED|GREEN|BLUE"

In [59]:
df.select('Description').show(3)

+--------------------+
|         Description|
+--------------------+
|WHITE HANGING HEA...|
| WHITE METAL LANTERN|
|CREAM CUPID HEART...|
+--------------------+
only showing top 3 rows



In [60]:
df.select(regexp_replace(col("Description"), regex_string, "COLOR").alias("color_clean"),col("Description")).show(2)

+--------------------+--------------------+
|         color_clean|         Description|
+--------------------+--------------------+
|COLOR HANGING HEA...|WHITE HANGING HEA...|
| COLOR METAL LANTERN| WHITE METAL LANTERN|
+--------------------+--------------------+
only showing top 2 rows



In [61]:
# SQL
spark.sql("""SELECT
regexp_replace(Description, 'BLACK|WHITE|RED|GREEN|BLUE', 'COLOR') as
color_clean, Description
FROM invoice""").show(3)

+--------------------+--------------------+
|         color_clean|         Description|
+--------------------+--------------------+
|COLOR HANGING HEA...|WHITE HANGING HEA...|
| COLOR METAL LANTERN| WHITE METAL LANTERN|
|CREAM CUPID HEART...|CREAM CUPID HEART...|
+--------------------+--------------------+
only showing top 3 rows



In [65]:
from pyspark.sql.functions import translate
df.select(translate(col("Description"), "LEET", "1337"),col("Description"))\
.show(2)

+----------------------------------+--------------------+
|translate(Description, LEET, 1337)|         Description|
+----------------------------------+--------------------+
|              WHI73 HANGING H3A...|WHITE HANGING HEA...|
|               WHI73 M37A1 1AN73RN| WHITE METAL LANTERN|
+----------------------------------+--------------------+
only showing top 2 rows



In [69]:
# SQL
spark.sql("SELECT translate(Description, 'LEET', '1337'), Description FROM invoice").show(3)

+----------------------------------+--------------------+
|translate(Description, LEET, 1337)|         Description|
+----------------------------------+--------------------+
|              WHI73 HANGING H3A...|WHITE HANGING HEA...|
|               WHI73 M37A1 1AN73RN| WHITE METAL LANTERN|
|              CR3AM CUPID H3AR7...|CREAM CUPID HEART...|
+----------------------------------+--------------------+
only showing top 3 rows



In [72]:
from pyspark.sql.functions import regexp_extract
extract_str = "(BLACK|WHITE|RED|GREEN|BLUE)"
df.select(regexp_extract(col("Description"), extract_str, 1).alias("color_clean"), col("Description")).show(2)

+-----------+--------------------+
|color_clean|         Description|
+-----------+--------------------+
|      WHITE|WHITE HANGING HEA...|
|      WHITE| WHITE METAL LANTERN|
+-----------+--------------------+
only showing top 2 rows



In [None]:
regexp_extract()

In [91]:
# SQL
spark.sql("""SELECT regexp_extract(Description, '(BLACK|WHITE|RED|GREEN|BLUE|)', 1) as color,
Description
FROM invoice""").show()

+-----+--------------------+
|color|         Description|
+-----+--------------------+
|WHITE|WHITE HANGING HEA...|
|WHITE| WHITE METAL LANTERN|
|     |CREAM CUPID HEART...|
|     |KNITTED UNION FLA...|
|  RED|RED WOOLLY HOTTIE...|
|     |SET 7 BABUSHKA NE...|
|     |GLASS STAR FROSTE...|
|     |HAND WARMER UNION...|
|     |HAND WARMER RED P...|
|     |ASSORTED COLOUR B...|
|     |POPPY'S PLAYHOUSE...|
|     |POPPY'S PLAYHOUSE...|
|     |FELTCRAFT PRINCES...|
|     |IVORY KNITTED MUG...|
|     |BOX OF 6 ASSORTED...|
|     |BOX OF VINTAGE JI...|
|     |BOX OF VINTAGE AL...|
|     |HOME BUILDING BLO...|
|     |LOVE BUILDING BLO...|
|     |RECIPE BOX WITH M...|
+-----+--------------------+
only showing top 20 rows



In [95]:
from pyspark.sql.functions import instr
containsBlack = instr(col("Description"), "BLACK") >= 1
containsWhite = instr(col("Description"), "WHITE") >= 1
df.withColumn("hasSimpleColor", containsBlack | containsWhite)\
.where("hasSimpleColor")\
.select("Description").show(3, False)

+----------------------------------+
|Description                       |
+----------------------------------+
|WHITE HANGING HEART T-LIGHT HOLDER|
|WHITE METAL LANTERN               |
|RED WOOLLY HOTTIE WHITE HEART.    |
+----------------------------------+
only showing top 3 rows



In [98]:
# SQL
spark.sql("SELECT Description FROM invoice WHERE instr(Description, 'BLACK') >= 1 OR instr(Description, 'WHITE') >= 1").show(3)

+--------------------+
|         Description|
+--------------------+
|WHITE HANGING HEA...|
| WHITE METAL LANTERN|
|RED WOOLLY HOTTIE...|
+--------------------+
only showing top 3 rows



In [103]:
from pyspark.sql.functions import expr, locate
simpleColors = ["black", "white", "red", "green", "blue"]
def color_locator(column, color_string):
    return locate(color_string.upper(), column)\
    .cast("boolean")\
    .alias("is_" + color_string)

In [104]:
selectedColumns = [color_locator(df.Description, c) for c in simpleColors]

In [117]:
df.select(selectedColumns + ['Description']).where(expr("is_white OR is_red")).show()

+--------+--------+------+--------+-------+--------------------+
|is_black|is_white|is_red|is_green|is_blue|         Description|
+--------+--------+------+--------+-------+--------------------+
|   false|    true| false|   false|  false|WHITE HANGING HEA...|
|   false|    true| false|   false|  false| WHITE METAL LANTERN|
|   false|    true|  true|   false|  false|RED WOOLLY HOTTIE...|
|   false|   false|  true|   false|  false|HAND WARMER RED P...|
|   false|   false|  true|   false|  false|RED COAT RACK PAR...|
|   false|   false|  true|   false|  false|ALARM CLOCK BAKEL...|
|   false|   false|  true|   false|  false|SET/2 RED RETROSP...|
|   false|   false|  true|   false|  false|RED TOADSTOOL LED...|
|   false|   false|  true|   false|  false|HAND WARMER RED P...|
|   false|    true| false|   false|  false|WHITE HANGING HEA...|
|   false|    true| false|   false|  false| WHITE METAL LANTERN|
|   false|   false|  true|   false|  false|EDWARDIAN PARASOL...|
|   false|    true| false

## Working with Dates and Timestamps

In [119]:
from pyspark.sql.functions import current_date, current_timestamp
dateDF = spark.range(10)\
.withColumn("today", current_date())\
.withColumn("now", current_timestamp())
dateDF.createOrReplaceTempView("dateTable")

In [120]:
dateDF.printSchema()

root
 |-- id: long (nullable = false)
 |-- today: date (nullable = false)
 |-- now: timestamp (nullable = false)



In [122]:
dateDF.show()

+---+----------+--------------------+
| id|     today|                 now|
+---+----------+--------------------+
|  0|2023-11-17|2023-11-17 19:02:...|
|  1|2023-11-17|2023-11-17 19:02:...|
|  2|2023-11-17|2023-11-17 19:02:...|
|  3|2023-11-17|2023-11-17 19:02:...|
|  4|2023-11-17|2023-11-17 19:02:...|
|  5|2023-11-17|2023-11-17 19:02:...|
|  6|2023-11-17|2023-11-17 19:02:...|
|  7|2023-11-17|2023-11-17 19:02:...|
|  8|2023-11-17|2023-11-17 19:02:...|
|  9|2023-11-17|2023-11-17 19:02:...|
+---+----------+--------------------+



In [123]:
from pyspark.sql.functions import date_add, date_sub

In [124]:
dateDF.select(date_sub(col("today"), 5), date_add(col("today"), 5)).show(1)

+------------------+------------------+
|date_sub(today, 5)|date_add(today, 5)|
+------------------+------------------+
|        2023-11-12|        2023-11-22|
+------------------+------------------+
only showing top 1 row



In [126]:
# SQL
spark.sql('SELECT date_sub(today, 5), date_add(today, 5) FROM dateTable').show()

+------------------+------------------+
|date_sub(today, 5)|date_add(today, 5)|
+------------------+------------------+
|        2023-11-12|        2023-11-22|
|        2023-11-12|        2023-11-22|
|        2023-11-12|        2023-11-22|
|        2023-11-12|        2023-11-22|
|        2023-11-12|        2023-11-22|
|        2023-11-12|        2023-11-22|
|        2023-11-12|        2023-11-22|
|        2023-11-12|        2023-11-22|
|        2023-11-12|        2023-11-22|
|        2023-11-12|        2023-11-22|
+------------------+------------------+



In [131]:
from pyspark.sql.functions import datediff, months_between, to_date
dateDF.withColumn("week_ago", date_sub(col("today"), 7)).show(2)

+---+----------+--------------------+----------+
| id|     today|                 now|  week_ago|
+---+----------+--------------------+----------+
|  0|2023-11-17|2023-11-17 19:05:...|2023-11-10|
|  1|2023-11-17|2023-11-17 19:05:...|2023-11-10|
+---+----------+--------------------+----------+
only showing top 2 rows



In [134]:
dateDF.withColumn("week_ago", date_sub(col("today"), 7)).select(datediff(col("week_ago"), col("today"))).show(2)

+-------------------------+
|datediff(week_ago, today)|
+-------------------------+
|                       -7|
|                       -7|
+-------------------------+
only showing top 2 rows



In [138]:
dateDF.select(
to_date(lit("2023-01-01")).alias("start"),
to_date(lit("2024-01-01")).alias("end"))\
.select(months_between(col("start"), col("end"))).show(1)

+--------------------------------+
|months_between(start, end, true)|
+--------------------------------+
|                           -12.0|
+--------------------------------+
only showing top 1 row



In [140]:
# SQL
spark.sql("""SELECT to_date('2023-01-01'), months_between('2023-01-01', '2024-01-01'),
datediff('2023-01-01', '2024-01-01')
FROM dateTable""").show()

+-------------------+--------------------------------------------+--------------------------------+
|to_date(2023-01-01)|months_between(2023-01-01, 2024-01-01, true)|datediff(2023-01-01, 2024-01-01)|
+-------------------+--------------------------------------------+--------------------------------+
|         2023-01-01|                                       -12.0|                            -365|
|         2023-01-01|                                       -12.0|                            -365|
|         2023-01-01|                                       -12.0|                            -365|
|         2023-01-01|                                       -12.0|                            -365|
|         2023-01-01|                                       -12.0|                            -365|
|         2023-01-01|                                       -12.0|                            -365|
|         2023-01-01|                                       -12.0|                            -365|


In [143]:
from pyspark.sql.functions import to_date, lit
spark.range(5).withColumn("date", lit("2023-01-01")).select(to_date(col("date"))).show(2)

+-------------+
|to_date(date)|
+-------------+
|   2023-01-01|
|   2023-01-01|
+-------------+
only showing top 2 rows



In [147]:
dateDF.select(to_date(lit("2023-32-12")),to_date(lit("2023-12-11"))).show(1)

+-------------------+-------------------+
|to_date(2023-32-12)|to_date(2023-12-11)|
+-------------------+-------------------+
|               NULL|         2023-12-11|
+-------------------+-------------------+
only showing top 1 row



In [151]:
dateFormat = "yyyy-dd-MM"
cleanDateDF = spark.range(1).select(
to_date(lit("2023-12-11"), dateFormat).alias("date"),
to_date(lit("2023-20-12"), dateFormat).alias("date2"))
cleanDateDF.createOrReplaceTempView("dateTable2")

In [152]:
cleanDateDF.show(2)

+----------+----------+
|      date|     date2|
+----------+----------+
|2023-11-12|2023-12-20|
+----------+----------+



In [161]:
from pyspark.sql.functions import to_timestamp
cleanDateDF.select(to_timestamp(col("date"), dateFormat)).show()

+------------------------------+
|to_timestamp(date, yyyy-dd-MM)|
+------------------------------+
|           2023-11-12 00:00:00|
+------------------------------+



In [163]:
cleanDateDF.select(to_timestamp(col("date"), 'yyyy-dd-MM')).show()

+------------------------------+
|to_timestamp(date, yyyy-dd-MM)|
+------------------------------+
|           2023-11-12 00:00:00|
+------------------------------+



In [166]:
# SQL
spark.sql("SELECT to_timestamp(date, 'yyyy-dd-MM'), to_timestamp(date2, 'yyyy-dd-MM') FROM dateTable2").show()

+------------------------------+-------------------------------+
|to_timestamp(date, yyyy-dd-MM)|to_timestamp(date2, yyyy-dd-MM)|
+------------------------------+-------------------------------+
|           2023-11-12 00:00:00|            2023-12-20 00:00:00|
+------------------------------+-------------------------------+



## Working with Nulls in Data

### drop

In [197]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
schema = StructType([
   StructField("name", StringType(), True),
   StructField("age", IntegerType(), True)])
df2 = spark.createDataFrame([('alice', 20), ('bob', None), ('john', 25), (None, None)], schema)

In [198]:
df2.show()

+-----+----+
| name| age|
+-----+----+
|alice|  20|
|  bob|NULL|
| john|  25|
| NULL|NULL|
+-----+----+



In [199]:
df2.na.drop().show()

+-----+---+
| name|age|
+-----+---+
|alice| 20|
| john| 25|
+-----+---+



In [201]:
df2.na.drop("all").show()

+-----+----+
| name| age|
+-----+----+
|alice|  20|
|  bob|NULL|
| john|  25|
+-----+----+



In [203]:
df2.na.drop("all", subset=["age"]).show()

+-----+---+
| name|age|
+-----+---+
|alice| 20|
| john| 25|
+-----+---+



### fill

In [206]:
df2.na.fill("python").show()

+------+----+
|  name| age|
+------+----+
| alice|  20|
|   bob|NULL|
|  john|  25|
|python|NULL|
+------+----+



In [210]:
df2.na.fill(99).show()

+-----+---+
| name|age|
+-----+---+
|alice| 20|
|  bob| 99|
| john| 25|
| NULL| 99|
+-----+---+



## Working with Complex Types

### Structs

In [217]:
from pyspark.sql.functions import struct
complexDF = df.select(struct("Description", "InvoiceNo").alias("complex"))
complexDF.createOrReplaceTempView("complexDF")

In [224]:
complexDF.select("complex.Description").show(2)

+--------------------+
|         Description|
+--------------------+
|WHITE HANGING HEA...|
| WHITE METAL LANTERN|
+--------------------+
only showing top 2 rows



In [225]:
complexDF.select(col("complex").getField("Description")).show(2)

+--------------------+
| complex.Description|
+--------------------+
|WHITE HANGING HEA...|
| WHITE METAL LANTERN|
+--------------------+
only showing top 2 rows



In [227]:
complexDF.select("complex.*").show(2)

+--------------------+---------+
|         Description|InvoiceNo|
+--------------------+---------+
|WHITE HANGING HEA...|   536365|
| WHITE METAL LANTERN|   536365|
+--------------------+---------+
only showing top 2 rows



### split

In [229]:
from pyspark.sql.functions import split
df.select(split(col("Description"), " ")).show(2)

+-------------------------+
|split(Description,  , -1)|
+-------------------------+
|     [WHITE, HANGING, ...|
|     [WHITE, METAL, LA...|
+-------------------------+
only showing top 2 rows



In [234]:
# SQL
spark.sql("SELECT split(Description, ' ') FROM invoice").show(3)

+-------------------------+
|split(Description,  , -1)|
+-------------------------+
|     [WHITE, HANGING, ...|
|     [WHITE, METAL, LA...|
|     [CREAM, CUPID, HE...|
+-------------------------+
only showing top 3 rows



In [235]:
df.select(split(col("Description"), " ").alias("array_col"))\
.selectExpr("array_col[0]").show(2)

+------------+
|array_col[0]|
+------------+
|       WHITE|
|       WHITE|
+------------+
only showing top 2 rows



In [238]:
# SQL
spark.sql("SELECT split(Description, ' ')[0] FROM invoice").show(2)

+----------------------------+
|split(Description,  , -1)[0]|
+----------------------------+
|                       WHITE|
|                       WHITE|
+----------------------------+
only showing top 2 rows



In [240]:
from pyspark.sql.functions import size
df.select("Description", size(split(col("Description"), " "))).show(2)

+--------------------+-------------------------------+
|         Description|size(split(Description,  , -1))|
+--------------------+-------------------------------+
|WHITE HANGING HEA...|                              5|
| WHITE METAL LANTERN|                              3|
+--------------------+-------------------------------+
only showing top 2 rows



In [243]:
from pyspark.sql.functions import array_contains
df.select(array_contains(split(col("Description"), " "), "WHITE")).show(2)

+------------------------------------------------+
|array_contains(split(Description,  , -1), WHITE)|
+------------------------------------------------+
|                                            true|
|                                            true|
+------------------------------------------------+
only showing top 2 rows



### explode

In [246]:
from pyspark.sql.functions import split, explode
df.withColumn("splitted", split(col("Description"), " "))\
.withColumn("exploded", explode(col("splitted")))\
.select("Description", "InvoiceNo", "splitted", "exploded").show()

+--------------------+---------+--------------------+--------+
|         Description|InvoiceNo|            splitted|exploded|
+--------------------+---------+--------------------+--------+
|WHITE HANGING HEA...|   536365|[WHITE, HANGING, ...|   WHITE|
|WHITE HANGING HEA...|   536365|[WHITE, HANGING, ...| HANGING|
|WHITE HANGING HEA...|   536365|[WHITE, HANGING, ...|   HEART|
|WHITE HANGING HEA...|   536365|[WHITE, HANGING, ...| T-LIGHT|
|WHITE HANGING HEA...|   536365|[WHITE, HANGING, ...|  HOLDER|
| WHITE METAL LANTERN|   536365|[WHITE, METAL, LA...|   WHITE|
| WHITE METAL LANTERN|   536365|[WHITE, METAL, LA...|   METAL|
| WHITE METAL LANTERN|   536365|[WHITE, METAL, LA...| LANTERN|
|CREAM CUPID HEART...|   536365|[CREAM, CUPID, HE...|   CREAM|
|CREAM CUPID HEART...|   536365|[CREAM, CUPID, HE...|   CUPID|
|CREAM CUPID HEART...|   536365|[CREAM, CUPID, HE...|  HEARTS|
|CREAM CUPID HEART...|   536365|[CREAM, CUPID, HE...|    COAT|
|CREAM CUPID HEART...|   536365|[CREAM, CUPID, HE...|  

### Maps

In [256]:
from pyspark.sql.functions import create_map
df.select(create_map(col("Description"), col("InvoiceNo")).alias("complex_map"))\
.show(2)

+--------------------+
|         complex_map|
+--------------------+
|{WHITE HANGING HE...|
|{WHITE METAL LANT...|
+--------------------+
only showing top 2 rows



In [269]:
df.select(create_map(col("Description"), col("InvoiceNo")).alias("complex_map")).selectExpr("complex_map['WHITE METAL LANTERN']").show(2)

+--------------------------------+
|complex_map[WHITE METAL LANTERN]|
+--------------------------------+
|                            NULL|
|                          536365|
+--------------------------------+
only showing top 2 rows

