## [API](https://spark.apache.org/docs/latest/api/python/pyspark.sql.html)

## SparkSession

In [1]:
from pyspark.sql import SparkSession

# Configure the session one step at a time, then create it (or reuse a running one).
session_builder = SparkSession.builder.appName('Apache Spark test')
session_builder = session_builder.config("spark.sql.shuffle.partitions", "500")
spark = session_builder.getOrCreate()

In [2]:
# Bare expression: lets the notebook render the session object as the cell output.
spark

## Data 
[download data](https://github.com/databricks/Spark-The-Definitive-Guide)

In [3]:
import os

# Root of the "Spark: The Definitive Guide" sample data. Set the
# SPARK_GUIDE_DATA_DIR environment variable to point at your own checkout;
# the fallback is the location this notebook was originally written against.
DATA_DIR = os.environ.get(
    "SPARK_GUIDE_DATA_DIR",
    "/Users/sg0218817/Downloads/Spark-The-Definitive-Guide-master/data",
)

# One day of retail data; the first row is a header and column types are inferred.
df = (
    spark.read.format('csv')
    .option("header", "true")
    .option("inferSchema", "true")
    .load(os.path.join(DATA_DIR, "retail-data/by-day/2010-12-01.csv"))
)

In [4]:
# Print the column names, types and nullability of the loaded DataFrame.
df.printSchema()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: timestamp (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: double (nullable = true)
 |-- Country: string (nullable = true)



In [5]:
# Summary statistics (count, mean, stddev, min, max) for a few columns.
summary_columns = ('Quantity', 'UnitPrice', 'Country')
df.describe(*summary_columns).show()

+-------+------------------+------------------+--------------+
|summary|          Quantity|         UnitPrice|       Country|
+-------+------------------+------------------+--------------+
|  count|              3108|              3108|          3108|
|   mean| 8.627413127413128| 4.151946589446603|          null|
| stddev|26.371821677029203|15.638659854603892|          null|
|    min|               -24|               0.0|     Australia|
|    max|               600|            607.49|United Kingdom|
+-------+------------------+------------------+--------------+



## Conditions

In [6]:
from pyspark.sql.functions import *

In [7]:
# Boolean column expressions can be stored in variables and combined with & and |.
price_filter = col('UnitPrice') > 600
description_filter = df.Description.contains('POSTAGE')
dot_code_filter = df.StockCode.isin("DOT")

price_or_postage = price_filter | description_filter

# The same predicate as `isExpensive`, written as a SQL expression string.
expensive_sql = 'StockCode in ("DOT") AND (UnitPrice > 600 OR instr(Description, "POSTAGE") >= 1)'

(
    df.where(dot_code_filter)
    .where(col('InvoiceDate') < lit('2017-01-01'))
    .where(price_or_postage)
    .withColumn('isExpensive', dot_code_filter & price_or_postage)
    .withColumn('isExpensiveSql', expr(expensive_sql))
    .select('StockCode', 'UnitPrice', 'Description', 'isExpensive', 'isExpensiveSql')
    .show(2)
)

+---------+---------+--------------+-----------+--------------+
|StockCode|UnitPrice|   Description|isExpensive|isExpensiveSql|
+---------+---------+--------------+-----------+--------------+
|      DOT|   569.77|DOTCOM POSTAGE|       true|          true|
|      DOT|   607.49|DOTCOM POSTAGE|       true|          true|
+---------+---------+--------------+-----------+--------------+



## Calculations

In [8]:
# NOTE: after the star import from pyspark.sql.functions, `pow` and `round`
# here are the Spark column functions, not the Python builtins.
line_total = col('Quantity') * col('UnitPrice')
quantity = pow(line_total, 2) + 5

real_quantity = round(quantity, 2).alias('realQuantity')
df.select('CustomerId', real_quantity).show(2)

+----------+------------+
|CustomerId|realQuantity|
+----------+------------+
|   17850.0|      239.09|
|   17850.0|      418.72|
+----------+------------+
only showing top 2 rows



In [9]:
# The same calculation expressed as a SQL string, rounded with BROUND to 4 decimals.
real_quantity_sql = 'BROUND(POWER(Quantity * UnitPrice, 2.0) + 5, 4) AS realQuantity'
df.selectExpr('CustomerId', real_quantity_sql).show(2)

+----------+------------+
|CustomerId|realQuantity|
+----------+------------+
|   17850.0|      239.09|
|   17850.0|    418.7156|
+----------+------------+
only showing top 2 rows



In [10]:
# Show the original description next to its lower-cased and capitalised forms.
description = col('Description')
df.select(description, lower(description), initcap(description)).show(2, False)

+----------------------------------+----------------------------------+----------------------------------+
|Description                       |lower(Description)                |initcap(Description)              |
+----------------------------------+----------------------------------+----------------------------------+
|WHITE HANGING HEART T-LIGHT HOLDER|white hanging heart t-light holder|White Hanging Heart T-light Holder|
|WHITE METAL LANTERN               |white metal lantern               |White Metal Lantern               |
+----------------------------------+----------------------------------+----------------------------------+
only showing top 2 rows



- lower, upper
- initcap
- ltrim, rtrim, trim
- lpad, rpad

In [11]:
# Replace any of the listed colour names in the description with the word COLOR.
regex_string = 'BLACK|WHITE|RED|GREEN|BLUE'
color_clean = regexp_replace(col('Description'), regex_string, 'COLOR').alias('Color Clean')
df.select('Description', color_clean).show(2, False)

+----------------------------------+----------------------------------+
|Description                       |Color Clean                       |
+----------------------------------+----------------------------------+
|WHITE HANGING HEART T-LIGHT HOLDER|COLOR HANGING HEART T-LIGHT HOLDER|
|WHITE METAL LANTERN               |COLOR METAL LANTERN               |
+----------------------------------+----------------------------------+
only showing top 2 rows



## Dates

In [12]:
# Date arithmetic on a one-row DataFrame: shift today by 5 days and measure
# the distance to a fixed date in days and in months.
today = col('today')
some_date = col('some_date')

(
    spark.range(1)
    .withColumn('today', current_date())
    .withColumn('now', current_timestamp())
    .withColumn('some_date', to_date(lit('2019-09-29')))
    .withColumn('today+5', date_add(today, 5))
    .withColumn('today-5', date_sub(today, 5))
    .withColumn('day_diff', datediff(today, some_date))
    .withColumn('month_diff', months_between(today, some_date))
    .show(10, False)
)

+---+----------+-----------------------+----------+----------+----------+--------+----------+
|id |today     |now                    |some_date |today+5   |today-5   |day_diff|month_diff|
+---+----------+-----------------------+----------+----------+----------+--------+----------+
|0  |2019-11-20|2019-11-20 20:51:36.503|2019-09-29|2019-11-25|2019-11-15|52      |1.70967742|
+---+----------+-----------------------+----------+----------+----------+--------+----------+



In [13]:
# Parse the same literal as a date (with and without an explicit format)
# and as a timestamp.
date_literal = lit('2019-09-29')

(
    spark.range(1)
    .withColumn('some_date', to_date(date_literal))
    .withColumn('another_date', to_date(date_literal, 'yyyy-MM-dd'))
    .withColumn('timestamp', to_timestamp(date_literal, 'yyyy-MM-dd'))
    .show(10, False)
)


+---+----------+------------+-------------------+
|id |some_date |another_date|timestamp          |
+---+----------+------------+-------------------+
|0  |2019-09-29|2019-09-29  |2019-09-29 00:00:00|
+---+----------+------------+-------------------+



## Nulls

In [14]:
# Drop rows using the 'any' policy, then preview two of the remaining columns.
rows_without_nulls = df.na.drop('any')
rows_without_nulls.select('InvoiceNo', 'StockCode').show(2, False)

+---------+---------+
|InvoiceNo|StockCode|
+---------+---------+
|536365   |85123A   |
|536365   |71053    |
+---------+---------+
only showing top 2 rows



In [15]:
# Fill nulls with a placeholder string, then preview two columns.
rows_with_placeholder = df.na.fill('replace null with this string')
rows_with_placeholder.select('InvoiceNo', 'StockCode').show(2, False)

+---------+---------+
|InvoiceNo|StockCode|
+---------+---------+
|536365   |85123A   |
|536365   |71053    |
+---------+---------+
only showing top 2 rows



## Arrays

In [16]:
# Split each description on single spaces into an array of words.
description_words = split(col('Description'), ' ')
df.select(description_words).show(2, False)

+----------------------------------------+
|split(Description,  )                   |
+----------------------------------------+
|[WHITE, HANGING, HEART, T-LIGHT, HOLDER]|
|[WHITE, METAL, LANTERN]                 |
+----------------------------------------+
only showing top 2 rows



In [17]:
# Arrays are 0-indexed in SQL expressions: array[3] is the fourth word.
# Descriptions with fewer than four words give null, which is then replaced.
words_df = df.select(split(col('Description'), ' ').alias('array'))
fourth_word_df = words_df.selectExpr('array[3]')
fourth_word_df.na.fill('NULL replacement').show(2, False)

+----------------+
|array[3]        |
+----------------+
|T-LIGHT         |
|NULL replacement|
+----------------+
only showing top 2 rows



In [18]:
# Flag descriptions whose word array contains the exact token HEART.
has_heart = array_contains(split(col('Description'), ' '), 'HEART').alias('has heart')
df.select(has_heart).show(2, False)

+---------+
|has heart|
+---------+
|true     |
|false    |
+---------+
only showing top 2 rows



In [19]:
# explode() emits one output row per array element, repeating the other columns.
(
    df.withColumn('splitted', split(col('Description'), ' '))
    .withColumn('exploded', explode(col('splitted')))
    .select('splitted', 'exploded')
    .show(2, False)
)

+----------------------------------------+--------+
|splitted                                |exploded|
+----------------------------------------+--------+
|[WHITE, HANGING, HEART, T-LIGHT, HOLDER]|WHITE   |
|[WHITE, HANGING, HEART, T-LIGHT, HOLDER]|HANGING |
+----------------------------------------+--------+
only showing top 2 rows



## Maps

In [20]:
# Build a single-entry map per row: Description -> InvoiceNo.
description_to_invoice = create_map(col('Description'), col('InvoiceNo')).alias('complex_map')
map_df = df.select('StockCode', description_to_invoice)
map_df.show(10, False)

+---------+-----------------------------------------------+
|StockCode|complex_map                                    |
+---------+-----------------------------------------------+
|85123A   |[WHITE HANGING HEART T-LIGHT HOLDER -> 536365] |
|71053    |[WHITE METAL LANTERN -> 536365]                |
|84406B   |[CREAM CUPID HEARTS COAT HANGER -> 536365]     |
|84029G   |[KNITTED UNION FLAG HOT WATER BOTTLE -> 536365]|
|84029E   |[RED WOOLLY HOTTIE WHITE HEART. -> 536365]     |
|22752    |[SET 7 BABUSHKA NESTING BOXES -> 536365]       |
|21730    |[GLASS STAR FROSTED T-LIGHT HOLDER -> 536365]  |
|22633    |[HAND WARMER UNION JACK -> 536366]             |
|22632    |[HAND WARMER RED POLKA DOT -> 536366]          |
|84879    |[ASSORTED COLOUR BIRD ORNAMENT -> 536367]      |
+---------+-----------------------------------------------+
only showing top 10 rows



In [21]:
# Look a key up in the map column; rows whose map lacks the key yield null.
lantern_lookup = map_df.selectExpr('complex_map["WHITE METAL LANTERN"]')
lantern_lookup.show(3)

+--------------------------------+
|complex_map[WHITE METAL LANTERN]|
+--------------------------------+
|                            null|
|                          536365|
|                            null|
+--------------------------------+
only showing top 3 rows



In [22]:
# Exploding a map column produces separate `key` and `value` columns.
map_entries = map_df.selectExpr('explode(complex_map)')
map_entries.show(3)

+--------------------+------+
|                 key| value|
+--------------------+------+
|WHITE HANGING HEA...|536365|
| WHITE METAL LANTERN|536365|
|CREAM CUPID HEART...|536365|
+--------------------+------+
only showing top 3 rows



## JSON

In [23]:
# Pack two columns into a struct, then serialise that struct to a JSON string.
struct_df = df.selectExpr('(InvoiceNo, Description) as my_struct')
json_df = struct_df.select(to_json(col('my_struct')).alias('my_json'))

json_df.show(5, False)

+--------------------------------------------------------------------------+
|my_json                                                                   |
+--------------------------------------------------------------------------+
|{"InvoiceNo":"536365","Description":"WHITE HANGING HEART T-LIGHT HOLDER"} |
|{"InvoiceNo":"536365","Description":"WHITE METAL LANTERN"}                |
|{"InvoiceNo":"536365","Description":"CREAM CUPID HEARTS COAT HANGER"}     |
|{"InvoiceNo":"536365","Description":"KNITTED UNION FLAG HOT WATER BOTTLE"}|
|{"InvoiceNo":"536365","Description":"RED WOOLLY HOTTIE WHITE HEART."}     |
+--------------------------------------------------------------------------+
only showing top 5 rows



In [24]:
# Two ways to pull a field out of a JSON string column:
# a JSON path with get_json_object, or a field name with json_tuple.
my_json = col('my_json')
json_df.select(
    get_json_object(my_json, '$.Description'),
    json_tuple(my_json, 'Description'),
).show(5, False)

+---------------------------------------+-----------------------------------+
|get_json_object(my_json, $.Description)|c0                                 |
+---------------------------------------+-----------------------------------+
|WHITE HANGING HEART T-LIGHT HOLDER     |WHITE HANGING HEART T-LIGHT HOLDER |
|WHITE METAL LANTERN                    |WHITE METAL LANTERN                |
|CREAM CUPID HEARTS COAT HANGER         |CREAM CUPID HEARTS COAT HANGER     |
|KNITTED UNION FLAG HOT WATER BOTTLE    |KNITTED UNION FLAG HOT WATER BOTTLE|
|RED WOOLLY HOTTIE WHITE HEART.         |RED WOOLLY HOTTIE WHITE HEART.     |
+---------------------------------------+-----------------------------------+
only showing top 5 rows



## UDF

In Spark SQL we can define our own functions with the `udf` function from the `pyspark.sql.functions` module. By default, a UDF returns a string. To return another type, we must declare it explicitly using one of the types from the `pyspark.sql.types` module.

In [25]:
def power3(value):
    """Return ``value`` raised to the third power.

    Spark hands SQL NULLs to Python UDFs as ``None``; those are passed
    through unchanged instead of raising ``TypeError`` on ``None ** 3``.

    Args:
        value: A number, or ``None`` for a null input.

    Returns:
        ``value ** 3``, or ``None`` when ``value`` is ``None``.
    """
    if value is None:
        return None
    return value ** 3

# Wrap the Python function as a Spark UDF. No return type is passed here,
# so the default (string) return type applies.
power3udf = udf(power3)

In [39]:
# Apply the UDF to the numbers 0..19 through the DataFrame API.
numbers = spark.range(20).toDF('num')
numbers.select('num', power3udf(col('num'))).show(10, False)

+---+-----------+
|num|power3(num)|
+---+-----------+
|0  |0          |
|1  |1          |
|2  |8          |
|3  |27         |
|4  |64         |
|5  |125        |
|6  |216        |
|7  |343        |
|8  |512        |
|9  |729        |
+---+-----------+
only showing top 10 rows



In [41]:
from pyspark.sql.types import LongType

# Register the function under a SQL-visible name with an explicit long return type.
spark.udf.register(name='power3py', f=power3, returnType=LongType())

<function __main__.power3(value)>

In [42]:
# Call the registered UDF by name from a SQL expression.
numbers = spark.range(20).toDF('num')
numbers.selectExpr('num', 'power3py(num)').show(10, False)

+---+-------------+
|num|power3py(num)|
+---+-------------+
|0  |0            |
|1  |1            |
|2  |8            |
|3  |27           |
|4  |64           |
|5  |125          |
|6  |216          |
|7  |343          |
|8  |512          |
|9  |729          |
+---+-------------+
only showing top 10 rows

