### Working with Different Types of Data

In [None]:
# Location of the retail CSV extract for 2010-12-01 (the same file the read
# cell below loads). Stored as a string constant so this cell is valid Python
# and survives "Restart & Run All" instead of raising a SyntaxError.
DATA_PATH = '../data/retail-data/by-day/2010-12-01.csv'

In [1]:
from pyspark import SparkContext, SQLContext
from pyspark.sql import SparkSession

# Entry points used by the rest of the notebook.
# getOrCreate() reuses a session that is already running in this kernel, so
# re-executing the cell does not fail with "Cannot run multiple SparkContexts
# at once" the way a bare SparkContext() call does. SparkSession replaces the
# legacy SQLContext wrapper and exposes the same `read` interface.
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

In [2]:
# Load the 2010-12-01 retail extract, taking column names from the header row.
# inferSchema is passed as None, so every column comes back as a string (see
# the printSchema output further down).
df = spark.read.csv(
    path='../data/retail-data/by-day/2010-12-01.csv',
    header=True,
    inferSchema=None,
)

In [5]:
# Expose the DataFrame to Spark SQL under the view name "dfTable"; a view
# already registered under that name is replaced.
df.createOrReplaceTempView(name="dfTable")

In [9]:
# Print each column's name, type and nullability as a tree.
df.printSchema()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: string (nullable = true)
 |-- InvoiceDate: string (nullable = true)
 |-- UnitPrice: string (nullable = true)
 |-- CustomerID: string (nullable = true)
 |-- Country: string (nullable = true)



In [15]:
from pyspark.sql.functions import *

# NOTE(review): the wildcard import can shadow Python builtins that share a
# name with a Spark function (e.g. sum/max/min) — consider explicit imports
# once every function used by the notebook is known.

# Wrap each Python value in a literal column. The bare expression is left as
# the last statement so the notebook displays the resulting DataFrame's
# column names and types.
df.select(*(lit(value) for value in (5, "five", 5.0)))

DataFrame[5: int, five: string, 5.0: double]

#### Working with Booleans

In [21]:
# Keep rows whose InvoiceNo differs from 536365, then show the first five
# invoice/description pairs without truncating long descriptions.
(
    df.filter(col("InvoiceNo") != 536365)
    .select("InvoiceNo", "Description")
    .show(n=5, truncate=False)
)

+---------+-----------------------------+
|InvoiceNo|Description                  |
+---------+-----------------------------+
|536366   |HAND WARMER UNION JACK       |
|536366   |HAND WARMER RED POLKA DOT    |
|536367   |ASSORTED COLOUR BIRD ORNAMENT|
|536367   |POPPY'S PLAYHOUSE BEDROOM    |
|536367   |POPPY'S PLAYHOUSE KITCHEN    |
+---------+-----------------------------+
only showing top 5 rows



In [25]:
# UnitPrice is loaded as a string column (the CSV is read without schema
# inference), so cast it explicitly before comparing with a number. Without
# the cast the comparison relies on Spark's implicit string-to-numeric
# coercion, whose result depends on the coercion rules in effect.
priceFilter = col("UnitPrice").cast("double") > 600

# True for rows whose description mentions POSTAGE.
descFilter = col("Description").contains("POSTAGE")

# Rows with stock code "DOT" that are expensive OR are postage items.
(
    df.where(df.StockCode.isin("DOT"))
    .where(priceFilter | descFilter)
    .show()
)

+---------+---------+--------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|   Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------+--------+-------------------+---------+----------+--------------+
|   536544|      DOT|DOTCOM POSTAGE|       1|2010-12-01 14:32:00|   569.77|      null|United Kingdom|
|   536592|      DOT|DOTCOM POSTAGE|       1|2010-12-01 17:06:00|   607.49|      null|United Kingdom|
+---------+---------+--------------+--------+-------------------+---------+----------+--------------+



In [26]:
# Rows with stock code "DOT" for which the price condition AND the
# description condition both hold.
(
    df.filter(df["StockCode"].isin("DOT"))
    .filter(priceFilter & descFilter)
    .show()
)

+---------+---------+--------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|   Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------+--------+-------------------+---------+----------+--------------+
|   536592|      DOT|DOTCOM POSTAGE|       1|2010-12-01 17:06:00|   607.49|      null|United Kingdom|
+---------+---------+--------------+--------+-------------------+---------+----------+--------------+



In [39]:
# instr() returns the 1-based position of the substring (0 when absent), so
# ">= 1" means "Description contains POSTAGE".
descFilter = instr(col("Description"), "POSTAGE") >= 1
DOTFilter = col("StockCode") == "DOT"

# Attach the combined condition as a boolean column, keep only the rows where
# it is true, and display the columns relevant to the flag.
(
    df.withColumn("isExpensive", DOTFilter & (priceFilter | descFilter))
    .filter("isExpensive")
    .select("InvoiceNo", "Description", "UnitPrice", "isExpensive")
    .show()
)

+---------+--------------+---------+-----------+
|InvoiceNo|   Description|UnitPrice|isExpensive|
+---------+--------------+---------+-----------+
|   536544|DOTCOM POSTAGE|   569.77|       true|
|   536592|DOTCOM POSTAGE|   607.49|       true|
+---------+--------------+---------+-----------+

