In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder.appName('c6').getOrCreate()

In [3]:
df = spark.read.format('csv')\
.option("header","true")\
.option("inferSchema","true")\
.load("retail-data/2010-12-01.csv")

In [4]:
df.printSchema()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: timestamp (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: double (nullable = true)
 |-- Country: string (nullable = true)



In [5]:
df.createOrReplaceTempView("dfTable")

In [5]:
# lit function
from pyspark.sql.functions import lit

In [7]:
df.select(lit(5),lit("five"),lit(5.0))

DataFrame[5: int, five: string, 5.0: double]

In [6]:
# booleans
from pyspark.sql.functions import col

In [9]:
df.where(col("InvoiceNo") != 536365)\
.select("InvoiceNo","Description")\
.show(5,False)

+---------+-----------------------------+
|InvoiceNo|Description                  |
+---------+-----------------------------+
|536366   |HAND WARMER UNION JACK       |
|536366   |HAND WARMER RED POLKA DOT    |
|536367   |ASSORTED COLOUR BIRD ORNAMENT|
|536367   |POPPY'S PLAYHOUSE BEDROOM    |
|536367   |POPPY'S PLAYHOUSE KITCHEN    |
+---------+-----------------------------+
only showing top 5 rows



In [10]:
df.where("InvoiceNo = 536365").show(5, False)

+---------+---------+-----------------------------------+--------+---------------------+---------+----------+--------------+
|InvoiceNo|StockCode|Description                        |Quantity|InvoiceDate          |UnitPrice|CustomerID|Country       |
+---------+---------+-----------------------------------+--------+---------------------+---------+----------+--------------+
|536365   |85123A   |WHITE HANGING HEART T-LIGHT HOLDER |6       |2010-12-01 08:26:00.0|2.55     |17850.0   |United Kingdom|
|536365   |71053    |WHITE METAL LANTERN                |6       |2010-12-01 08:26:00.0|3.39     |17850.0   |United Kingdom|
|536365   |84406B   |CREAM CUPID HEARTS COAT HANGER     |8       |2010-12-01 08:26:00.0|2.75     |17850.0   |United Kingdom|
|536365   |84029G   |KNITTED UNION FLAG HOT WATER BOTTLE|6       |2010-12-01 08:26:00.0|3.39     |17850.0   |United Kingdom|
|536365   |84029E   |RED WOOLLY HOTTIE WHITE HEART.     |6       |2010-12-01 08:26:00.0|3.39     |17850.0   |United Kingdom|


In [12]:
df.where("InvoiceNo <> 536365").show(5, False)

+---------+---------+-----------------------------+--------+---------------------+---------+----------+--------------+
|InvoiceNo|StockCode|Description                  |Quantity|InvoiceDate          |UnitPrice|CustomerID|Country       |
+---------+---------+-----------------------------+--------+---------------------+---------+----------+--------------+
|536366   |22633    |HAND WARMER UNION JACK       |6       |2010-12-01 08:28:00.0|1.85     |17850.0   |United Kingdom|
|536366   |22632    |HAND WARMER RED POLKA DOT    |6       |2010-12-01 08:28:00.0|1.85     |17850.0   |United Kingdom|
|536367   |84879    |ASSORTED COLOUR BIRD ORNAMENT|32      |2010-12-01 08:34:00.0|1.69     |13047.0   |United Kingdom|
|536367   |22745    |POPPY'S PLAYHOUSE BEDROOM    |6       |2010-12-01 08:34:00.0|2.1      |13047.0   |United Kingdom|
|536367   |22748    |POPPY'S PLAYHOUSE KITCHEN    |6       |2010-12-01 08:34:00.0|2.1      |13047.0   |United Kingdom|
+---------+---------+---------------------------

In [7]:
from pyspark.sql.functions import instr

In [14]:
priceFilter = col("UnitPrice") > 600

In [15]:
descripFilter = instr(df.Description,"POSTAGE") >= 1

In [16]:
df.where(df.StockCode.isin("DOT")).where(priceFilter | descripFilter).show()

+---------+---------+--------------+--------+--------------------+---------+----------+--------------+
|InvoiceNo|StockCode|   Description|Quantity|         InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------+--------+--------------------+---------+----------+--------------+
|   536544|      DOT|DOTCOM POSTAGE|       1|2010-12-01 14:32:...|   569.77|      null|United Kingdom|
|   536592|      DOT|DOTCOM POSTAGE|       1|2010-12-01 17:06:...|   607.49|      null|United Kingdom|
+---------+---------+--------------+--------+--------------------+---------+----------+--------------+



In [17]:
DOTCodeFilter = col("StockCode") == "DOT"

In [18]:
df.withColumn("isExpensive", DOTCodeFilter & (priceFilter | descripFilter))\
.where("isExpensive")\
.select("unitPrice","isExpensive").show(5)

+---------+-----------+
|unitPrice|isExpensive|
+---------+-----------+
|   569.77|       true|
|   607.49|       true|
+---------+-----------+



In [19]:
spark.sql("""
SELECT UnitPrice, (StockCode = 'DOT' AND
  (UnitPrice > 600 OR instr(Description,"POSTAGE") >= 1)) as isExpensive
FROM dfTable
WHERE (StockCode = 'DOT' AND
 (UnitPrice > 600 OR instr(Description,"POSTAGE") >= 1))
""").show()

+---------+-----------+
|UnitPrice|isExpensive|
+---------+-----------+
|   569.77|       true|
|   607.49|       true|
+---------+-----------+



In [8]:
from pyspark.sql.functions import expr

In [21]:
df.withColumn("isExpensive", expr("NOT UnitPrice <= 250"))\
.where("isExpensive")\
.select("Description","UnitPrice").show(5)

+--------------+---------+
|   Description|UnitPrice|
+--------------+---------+
|DOTCOM POSTAGE|   569.77|
|DOTCOM POSTAGE|   607.49|
+--------------+---------+



In [9]:
#working with numbers
from pyspark.sql.functions import expr, pow

In [26]:
fabricatedQuantity = pow(col("Quantity") * col("UnitPrice"), 2) + 5

In [27]:
df.select(expr("CustomerId"),fabricatedQuantity.alias("realQuantity")).show(2)

+----------+------------------+
|CustomerId|      realQuantity|
+----------+------------------+
|   17850.0|239.08999999999997|
|   17850.0|          418.7156|
+----------+------------------+
only showing top 2 rows



In [29]:
df.selectExpr(
"CustomerId",
"(POWER((Quantity * UnitPrice), 2.0) + 5) as realQuantity").show(2)

+----------+------------------+
|CustomerId|      realQuantity|
+----------+------------------+
|   17850.0|239.08999999999997|
|   17850.0|          418.7156|
+----------+------------------+
only showing top 2 rows



In [10]:
# round
from pyspark.sql.functions import lit, round, bround

In [31]:
df.select(round(lit("2.5")),bround(lit("2.5"))).show(2)

+-------------+--------------+
|round(2.5, 0)|bround(2.5, 0)|
+-------------+--------------+
|          3.0|           2.0|
|          3.0|           2.0|
+-------------+--------------+
only showing top 2 rows



In [11]:
# correlation
from pyspark.sql.functions import corr


In [33]:
df.stat.corr("Quantity","UnitPrice")

-0.04112314436835551

In [34]:
df.select(corr("Quantity","UnitPrice")).show()

+-------------------------+
|corr(Quantity, UnitPrice)|
+-------------------------+
|     -0.04112314436835551|
+-------------------------+



In [35]:
# describe
df.describe().show()

+-------+-----------------+------------------+--------------------+------------------+------------------+------------------+--------------+
|summary|        InvoiceNo|         StockCode|         Description|          Quantity|         UnitPrice|        CustomerID|       Country|
+-------+-----------------+------------------+--------------------+------------------+------------------+------------------+--------------+
|  count|             3108|              3108|                3098|              3108|              3108|              1968|          3108|
|   mean| 536516.684944841|27834.304044117645|                null| 8.627413127413128| 4.151946589446603|15661.388719512195|          null|
| stddev|72.89447869788873|17407.897548583845|                null|26.371821677029203|15.638659854603892|1854.4496996893627|          null|
|    min|           536365|             10002| 4 PURPLE FLOCK D...|               -24|               0.0|           12431.0|     Australia|
|    max|          C

In [12]:
# stat
from pyspark.sql.functions import count,mean,stddev_pop,min,max

In [37]:
#approxQuantile
colName = "UnitPrice"
quatileProbs = [0.5]
relError = 0.05
df.stat.approxQuantile("UnitPrice",quatileProbs,relError)

[2.51]

In [38]:
# cross tabulation
df.stat.crosstab("StockCode", "Quantity").show()

+------------------+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
|StockCode_Quantity| -1|-10|-12| -2|-24| -3| -4| -5| -6| -7|  1| 10|100| 11| 12|120|128| 13| 14|144| 15| 16| 17| 18| 19|192|  2| 20|200| 21|216| 22| 23| 24| 25|252| 27| 28|288|  3| 30| 32| 33| 34| 36|384|  4| 40|432| 47| 48|480|  5| 50| 56|  6| 60|600| 64|  7| 70| 72|  8| 80|  9| 96|
+------------------+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
|             22578|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0| 

In [39]:
df.stat.freqItems(["StockCode","Quantity"]).show()

+--------------------+--------------------+
| StockCode_freqItems|  Quantity_freqItems|
+--------------------+--------------------+
|[90214E, 20728, 2...|[200, 128, 23, 50...|
+--------------------+--------------------+



In [13]:
# unique id
from pyspark.sql.functions import monotonically_increasing_id

In [41]:
df.select(monotonically_increasing_id()).show(2)

+-----------------------------+
|monotonically_increasing_id()|
+-----------------------------+
|                            0|
|                            1|
+-----------------------------+
only showing top 2 rows



In [14]:
# working with strings
# initcap
from pyspark.sql.functions import initcap

In [44]:
df.select(initcap(col("Description"))).show(truncate=False)

+-----------------------------------+
|initcap(Description)               |
+-----------------------------------+
|White Hanging Heart T-light Holder |
|White Metal Lantern                |
|Cream Cupid Hearts Coat Hanger     |
|Knitted Union Flag Hot Water Bottle|
|Red Woolly Hottie White Heart.     |
|Set 7 Babushka Nesting Boxes       |
|Glass Star Frosted T-light Holder  |
|Hand Warmer Union Jack             |
|Hand Warmer Red Polka Dot          |
|Assorted Colour Bird Ornament      |
|Poppy's Playhouse Bedroom          |
|Poppy's Playhouse Kitchen          |
|Feltcraft Princess Charlotte Doll  |
|Ivory Knitted Mug Cosy             |
|Box Of 6 Assorted Colour Teaspoons |
|Box Of Vintage Jigsaw Blocks       |
|Box Of Vintage Alphabet Blocks     |
|Home Building Block Word           |
|Love Building Block Word           |
|Recipe Box With Metal Heart        |
+-----------------------------------+
only showing top 20 rows



In [15]:
# upper and lower
from pyspark.sql.functions import upper, lower

In [16]:
df.select(col("Description"), lower(col("Description")), upper(col("Description"))).show()

+--------------------+--------------------+--------------------+
|         Description|  lower(Description)|  upper(Description)|
+--------------------+--------------------+--------------------+
|WHITE HANGING HEA...|white hanging hea...|WHITE HANGING HEA...|
| WHITE METAL LANTERN| white metal lantern| WHITE METAL LANTERN|
|CREAM CUPID HEART...|cream cupid heart...|CREAM CUPID HEART...|
|KNITTED UNION FLA...|knitted union fla...|KNITTED UNION FLA...|
|RED WOOLLY HOTTIE...|red woolly hottie...|RED WOOLLY HOTTIE...|
|SET 7 BABUSHKA NE...|set 7 babushka ne...|SET 7 BABUSHKA NE...|
|GLASS STAR FROSTE...|glass star froste...|GLASS STAR FROSTE...|
|HAND WARMER UNION...|hand warmer union...|HAND WARMER UNION...|
|HAND WARMER RED P...|hand warmer red p...|HAND WARMER RED P...|
|ASSORTED COLOUR B...|assorted colour b...|ASSORTED COLOUR B...|
|POPPY'S PLAYHOUSE...|poppy's playhouse...|POPPY'S PLAYHOUSE...|
|POPPY'S PLAYHOUSE...|poppy's playhouse...|POPPY'S PLAYHOUSE...|
|FELTCRAFT PRINCES...|fel

In [17]:
# adding and removing spaces
from pyspark.sql.functions import lit,ltrim,rtrim,rpad,lpad,trim

In [18]:
df.select(
    ltrim(lit("     HELLO    ")).alias("ltrim"),
    rtrim(lit("     HELLO    ")).alias("rtrim"),
    trim(lit("     HELLO    ")).alias("trim"),
    lpad(lit("HELLO"), 3, " ").alias("lpad"),
    rpad(lit("HELLO"), 10," ").alias("rp")
).show(2)

+---------+----------+-----+----+----------+
|    ltrim|     rtrim| trim|lpad|        rp|
+---------+----------+-----+----+----------+
|HELLO    |     HELLO|HELLO| HEL|HELLO     |
|HELLO    |     HELLO|HELLO| HEL|HELLO     |
+---------+----------+-----+----+----------+
only showing top 2 rows



In [19]:
#regex
from pyspark.sql.functions import regexp_replace

In [20]:
regex_string = "BLACK|WHITE|RED|GREEN|BLUE"

In [21]:
df.select(
    regexp_replace(col("Description"), regex_string, "COLOR").alias("color_clean"),
    col("Description")
).show(2)

+--------------------+--------------------+
|         color_clean|         Description|
+--------------------+--------------------+
|COLOR HANGING HEA...|WHITE HANGING HEA...|
| COLOR METAL LANTERN| WHITE METAL LANTERN|
+--------------------+--------------------+
only showing top 2 rows



In [22]:
from pyspark.sql.functions import translate

In [23]:
df.select(translate(col("Description"), "LEET", "1337"), col("Description")).show(2)

+----------------------------------+--------------------+
|translate(Description, LEET, 1337)|         Description|
+----------------------------------+--------------------+
|              WHI73 HANGING H3A...|WHITE HANGING HEA...|
|               WHI73 M37A1 1AN73RN| WHITE METAL LANTERN|
+----------------------------------+--------------------+
only showing top 2 rows



In [24]:
from pyspark.sql.functions import regexp_extract

In [25]:
extract_str = "(BLACK|WHITE|RED|GREEN|BLUE)"

In [26]:
df.select(
    regexp_extract(col("Description"), extract_str,1).alias("color_clean"),
    col("Description")
).show(2)

+-----------+--------------------+
|color_clean|         Description|
+-----------+--------------------+
|      WHITE|WHITE HANGING HEA...|
|      WHITE| WHITE METAL LANTERN|
+-----------+--------------------+
only showing top 2 rows



In [27]:
# contains
from pyspark.sql.functions import instr

In [28]:
containsBlack = instr(col("Description"), "BLACK") >= 1
containsWhite = instr(col("Description"), "WHITE") >= 1

In [29]:
df.withColumn("hasSimpleColor", containsBlack | containsWhite)\
.where("hasSimpleColor")\
.select("Description").show(3,False)

+----------------------------------+
|Description                       |
+----------------------------------+
|WHITE HANGING HEART T-LIGHT HOLDER|
|WHITE METAL LANTERN               |
|RED WOOLLY HOTTIE WHITE HEART.    |
+----------------------------------+
only showing top 3 rows



In [30]:
from pyspark.sql.functions import expr,locate

In [31]:
simpleColors = ["black","white","red","green","blue"]

In [34]:
def color_locator(column, color_string):
    return locate(color_string.upper(),column).cast("boolean").alias("is_" + color_string)

In [35]:
selectedColumns = [color_locator(df.Description, c) for c in simpleColors]

In [36]:
selectedColumns.append(expr("*")) # has to be column type

In [38]:
df.select(*selectedColumns).where(expr("is_white OR is_red"))\
.select("Description").show(3,False)

+----------------------------------+
|Description                       |
+----------------------------------+
|WHITE HANGING HEART T-LIGHT HOLDER|
|WHITE METAL LANTERN               |
|RED WOOLLY HOTTIE WHITE HEART.    |
+----------------------------------+
only showing top 3 rows



In [39]:
# working on dates and timestamp

In [40]:
from pyspark.sql.functions import current_date,current_timestamp

In [41]:
dateDF = spark.range(10).withColumn("today", current_date()).withColumn("now", current_timestamp())

In [42]:
dateDF.createOrReplaceTempView("dateTable")

In [43]:
dateDF.printSchema()

root
 |-- id: long (nullable = false)
 |-- today: date (nullable = false)
 |-- now: timestamp (nullable = false)



In [44]:
from pyspark.sql.functions import date_add, date_sub

In [45]:
dateDF.select(date_sub(col("today"),5), date_add(col("today"),5)).show(1)

+------------------+------------------+
|date_sub(today, 5)|date_add(today, 5)|
+------------------+------------------+
|        2019-09-02|        2019-09-12|
+------------------+------------------+
only showing top 1 row



In [47]:
from pyspark.sql.functions import datediff, months_between, to_date

In [48]:
dateDF.withColumn("week_ago", date_sub(col("today"),7))\
.select(datediff(col("week_ago"), col("today"))).show(1)

+-------------------------+
|datediff(week_ago, today)|
+-------------------------+
|                       -7|
+-------------------------+
only showing top 1 row



In [50]:
dateDF.select(
    to_date(lit("2016-01-01")).alias("start"),
    to_date(lit("2017-05-22")).alias("end")
).select(months_between(col("start"),col("end"))).show(1)

+--------------------------+
|months_between(start, end)|
+--------------------------+
|              -16.67741935|
+--------------------------+
only showing top 1 row



In [51]:
from pyspark.sql.functions import to_date, lit

In [52]:
spark.range(5).withColumn("date", lit("2017-01-01")).select(to_date(col("date"))).show(1)

+-------------+
|to_date(date)|
+-------------+
|   2017-01-01|
+-------------+
only showing top 1 row



In [53]:
dateDF.select(to_date(lit("2016-20-12")),to_date(lit("2017-12-11"))).show(1)

+-------------------+-------------------+
|to_date(2016-20-12)|to_date(2017-12-11)|
+-------------------+-------------------+
|               null|         2017-12-11|
+-------------------+-------------------+
only showing top 1 row



In [60]:
# fixing datetime arrangments
from pyspark.sql.functions import date_format
dateFormat = "yyyy-dd-MM"

In [61]:
cleanDateDF = spark.range(1).select(
    date_format(to_date(lit("2017-12-11")), dateFormat).alias("date"),
    date_format(to_date(lit("2017-20-12")), dateFormat).alias("date2")
)

In [62]:
cleanDateDF.createOrReplaceTempView("dateTable2")

In [63]:
cleanDateDF.show()

+----------+-----+
|      date|date2|
+----------+-----+
|2017-11-12| null|
+----------+-----+



In [64]:
# working with null values

In [65]:
from pyspark.sql.functions import coalesce

In [66]:
df.select(coalesce(col("Description"), col("CustomerId"))).show()

+---------------------------------+
|coalesce(Description, CustomerId)|
+---------------------------------+
|             WHITE HANGING HEA...|
|              WHITE METAL LANTERN|
|             CREAM CUPID HEART...|
|             KNITTED UNION FLA...|
|             RED WOOLLY HOTTIE...|
|             SET 7 BABUSHKA NE...|
|             GLASS STAR FROSTE...|
|             HAND WARMER UNION...|
|             HAND WARMER RED P...|
|             ASSORTED COLOUR B...|
|             POPPY'S PLAYHOUSE...|
|             POPPY'S PLAYHOUSE...|
|             FELTCRAFT PRINCES...|
|             IVORY KNITTED MUG...|
|             BOX OF 6 ASSORTED...|
|             BOX OF VINTAGE JI...|
|             BOX OF VINTAGE AL...|
|             HOME BUILDING BLO...|
|             LOVE BUILDING BLO...|
|             RECIPE BOX WITH M...|
+---------------------------------+
only showing top 20 rows



In [67]:
df.na.drop("all",subset=["StockCode","InvoiceNo"])

DataFrame[InvoiceNo: string, StockCode: string, Description: string, Quantity: int, InvoiceDate: timestamp, UnitPrice: double, CustomerID: double, Country: string]

In [68]:
df.na.fill("All Null values become a string").show()

+---------+---------+--------------------+--------+--------------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|         InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+--------------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|2010-12-01 08:26:...|     2.55|   17850.0|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|2010-12-01 08:26:...|     3.39|   17850.0|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8|2010-12-01 08:26:...|     2.75|   17850.0|United Kingdom|
|   536365|   84029G|KNITTED UNION FLA...|       6|2010-12-01 08:26:...|     3.39|   17850.0|United Kingdom|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|2010-12-01 08:26:...|     3.39|   17850.0|United Kingdom|
|   536365|    22752|SET 7 BABUSHKA NE...|       2|2010-12-01 08:26:...|     7.65|   17850.0|United Kingdom|
|   536365|    2173

In [69]:
df.na.fill("all", subset=["StockCode","InvoiceNo"])

DataFrame[InvoiceNo: string, StockCode: string, Description: string, Quantity: int, InvoiceDate: timestamp, UnitPrice: double, CustomerID: double, Country: string]

In [72]:
fill_cols_vals = {"StockCode":5, "Description":"No Value"}

In [73]:
df.na.fill(fill_cols_vals)

DataFrame[InvoiceNo: string, StockCode: string, Description: string, Quantity: int, InvoiceDate: timestamp, UnitPrice: double, CustomerID: double, Country: string]

In [74]:
df.na.replace([""], ["UNKNOWN"], "Description")

DataFrame[InvoiceNo: string, StockCode: string, Description: string, Quantity: int, InvoiceDate: timestamp, UnitPrice: double, CustomerID: double, Country: string]

In [75]:
# working with complex types


In [76]:
from pyspark.sql.functions import struct


In [77]:
complexDF = df.select(struct("Description","InvoiceNo").alias("complex"))

In [78]:
complexDF.createOrReplaceTempView("complexDF")

In [79]:
complexDF.select("complex.Description")

DataFrame[Description: string]

In [80]:
complexDF.select(col("complex").getField("Description"))

DataFrame[complex.Description: string]

In [81]:
complexDF.select("complex.*")

DataFrame[Description: string, InvoiceNo: string]

In [82]:
from pyspark.sql.functions import split

In [83]:
df.select(split(col("Description"), " ")).show(2)

+---------------------+
|split(Description,  )|
+---------------------+
| [WHITE, HANGING, ...|
| [WHITE, METAL, LA...|
+---------------------+
only showing top 2 rows



In [84]:
df.select(split(col("Description"), " ").alias("array_col"))\
.selectExpr("array_col[0]").show(2)

+------------+
|array_col[0]|
+------------+
|       WHITE|
|       WHITE|
+------------+
only showing top 2 rows



In [85]:
from pyspark.sql.functions import size

In [86]:
df.select(size(split(col("Description"), " "))).show(2)

+---------------------------+
|size(split(Description,  ))|
+---------------------------+
|                          5|
|                          3|
+---------------------------+
only showing top 2 rows



In [87]:
from pyspark.sql.functions import array_contains

In [89]:
df.select(array_contains(split(col("Description"), " "), "WHITE")).show(2)

+--------------------------------------------+
|array_contains(split(Description,  ), WHITE)|
+--------------------------------------------+
|                                        true|
|                                        true|
+--------------------------------------------+
only showing top 2 rows



In [90]:
from pyspark.sql.functions import split, explode

In [91]:
df.withColumn("splitted", split(col("Description"), " "))\
.withColumn("exploded", explode(col("splitted")))\
.select("Description", "InvoiceNo", "exploded").show(2)

+--------------------+---------+--------+
|         Description|InvoiceNo|exploded|
+--------------------+---------+--------+
|WHITE HANGING HEA...|   536365|   WHITE|
|WHITE HANGING HEA...|   536365| HANGING|
+--------------------+---------+--------+
only showing top 2 rows



In [92]:
from pyspark.sql.functions import create_map

In [93]:
df.select(create_map(col("Description"), col("InvoiceNo")).alias("complex_map")).show(2)

+--------------------+
|         complex_map|
+--------------------+
|Map(WHITE HANGING...|
|Map(WHITE METAL L...|
+--------------------+
only showing top 2 rows



In [96]:
df.select(create_map(col("Description"), col("InvoiceNo")).alias("complex_map"))\
.selectExpr("complex_map['WHITE METAL LATERN']").show(2)

+-------------------------------+
|complex_map[WHITE METAL LATERN]|
+-------------------------------+
|                           null|
|                           null|
+-------------------------------+
only showing top 2 rows



In [97]:
# working with json

In [98]:
jsonDF = spark.range(1).selectExpr("""
'{"myJSONKey" : {"myJSONValue" : [1, 2, 3]}}' as jsonString
"""
)

In [99]:
from pyspark.sql.functions import get_json_object, json_tuple

In [101]:
jsonDF.select(
    get_json_object(col("jsonString"), "$.myJSONKey.myJSONValue[1]").alias("column"),
    json_tuple(col("jsonString"),"myJSONKey")
).show(2)

+------+--------------------+
|column|                  c0|
+------+--------------------+
|     2|{"myJSONValue":[1...|
+------+--------------------+



In [9]:
from pyspark.sql.functions import to_json

In [103]:
df.selectExpr("(InvoiceNo, Description) as myStruct").select(to_json(col("myStruct")))

DataFrame[structtojson(myStruct): string]

In [10]:
from pyspark.sql.functions import from_json,col

In [7]:
from pyspark.sql.types import *

In [8]:
parseSchema = StructType((
    StructField("InvoiceNo", StringType(), True),
    StructField("Description", StringType(), True)
))

In [11]:
df.selectExpr("(InvoiceNo, Description) as myStruct")\
.select(to_json(col("myStruct")).alias("newJSON"))\
.select(from_json(col("newJSON"), parseSchema), col("newJSON")).show(2)

+---------------------+--------------------+
|jsontostruct(newJSON)|             newJSON|
+---------------------+--------------------+
| [536365,WHITE HAN...|{"InvoiceNo":"536...|
| [536365,WHITE MET...|{"InvoiceNo":"536...|
+---------------------+--------------------+
only showing top 2 rows



In [12]:
# user define functions

In [13]:
udfExampleDF = spark.range(5).toDF("num")

In [14]:
def power3(double_value):
    return double_value ** 3

In [15]:
power3(2.0)

8.0

In [16]:
from pyspark.sql.functions import udf

In [17]:
power3udf = udf(power3)

In [18]:
udfExampleDF.select(power3udf(col("num"))).show(2)

+-----------+
|power3(num)|
+-----------+
|          0|
|          1|
+-----------+
only showing top 2 rows



In [20]:
spark.udf.register("power3", power3)

In [21]:
udfExampleDF.selectExpr("power3(num)").show(2)

+-----------+
|power3(num)|
+-----------+
|          0|
|          1|
+-----------+
only showing top 2 rows



In [22]:
from pyspark.sql.types import DoubleType, IntegerType

In [23]:
spark.udf.register("power3py",power3,DoubleType())

In [24]:
udfExampleDF.selectExpr("power3py(num)").show(2)

+-------------+
|power3py(num)|
+-------------+
|         null|
|         null|
+-------------+
only showing top 2 rows

