In [1]:
from pyspark.sql import SparkSession

# Build (or reuse) a SparkSession running against a local master.
# NOTE(review): "spark.some.config.option" looks like a template placeholder — confirm it is needed.
spark = (
    SparkSession.builder
    .master("local")
    .appName("XZ")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

In [2]:
# Load the CSV file(s) matching the glob, taking column names from the
# header row and letting Spark infer the column types.
df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("/FileStore/tables/retail-data/by-day/2010_12_01-*.csv")
)

df.printSchema()

In [3]:
# Preview the first five rows.
df.show(n=5)

In [4]:
# Filter to a single invoice using a Column-expression predicate and show
# the untruncated invoice number and description.
(
    df
    .where(df.InvoiceNo == "536365")
    .select("InvoiceNo", "Description")
    .show(n=5, truncate=False)
)

In [5]:
# Print the column names and types of df.
df.printSchema()

In [6]:
# Same kind of filter, this time with the predicate given as a SQL
# expression string instead of a Column expression.
(
    df
    .where("InvoiceNo == 536365")
    .select("InvoiceNo", "Description")
    .show(n=5, truncate=False)
)

In [7]:
from pyspark.sql.functions import instr, col

# Boolean Column expressions that are OR-ed together in the filter below.
priceFilter = col("UnitPrice") > 600
# Alternative formulation: instr(df.Description, "POSTAGE") >= 1
descripFilter = df.Description.contains("POSTAGE")

# Chained where() calls are AND-ed: the stock code must be one of the two
# listed values AND (price > 600 OR description contains "POSTAGE").
(
    df
    .where(df.StockCode.isin("DOT", "85123A"))
    .where(priceFilter | descripFilter)
    .show()
)

In [8]:
from pyspark.sql.functions import instr, col

# Individual Boolean Column expressions.
DOTCodeFilter = col("StockCode") == "DOT"
priceFilter = col("UnitPrice") > 600
descripFilter = instr(col("Description"), "POSTAGE") >= 1

# Materialise the combined predicate as a Boolean column, then filter on
# that column by name.
(
    df
    .withColumn("isExpensive", DOTCodeFilter & (priceFilter | descripFilter))
    .where("isExpensive")
    .select("Description", "unitPrice", "isExpensive")
    .show(n=5)
)


### Let’s imagine that we found out that we mis-recorded the quantity in our retail dataset, and that the true quantity is equal to (the current quantity * the unit price)^2 + 5.

In [10]:
from pyspark.sql.functions import pow, ceil

# A reusable Column expression (not a Python function):
# ceil((Quantity * UnitPrice)^2 + 5).
newQtyFn = ceil(pow(df.Quantity * df.UnitPrice, 2) + 5)

# The expression is used twice: once through withColumn ("NewQty") and once
# inline in select under a different alias ("NewerQty").
(
    df
    .withColumn("NewQty", newQtyFn)
    .select("Description", "UnitPrice", "Quantity", "NewQty", newQtyFn.alias("NewerQty"))
    .show(n=5, truncate=False)
)


In [11]:
# The power-plus-five computation written as a SQL expression string
# (this variant does not apply ceil).
(
    df
    .selectExpr(
        "CustomerId",
        "(POWER((Quantity * UnitPrice), 2.0) + 5) as realQuantity",
    )
    .show(n=2)
)

### Rounding

In [13]:
from pyspark.sql.functions import lit, round, bround, floor, ceil

# Apply the four rounding helpers to literal values; one output row is enough
# because the literals are the same on every row.
(
    df
    .select(
        round(lit(2.5253423423), 2),
        bround(lit(2.52342342342), 3),
        floor(lit(2.5)),
        ceil(lit(2.5)),
    )
    .show(n=1)
)

### Another numerical task is to compute the correlation of two columns

In [15]:
from pyspark.sql.functions import corr

# df.stat.corr returns a plain Python value rather than a DataFrame. A notebook
# cell only auto-displays its last expression, so print this one explicitly;
# otherwise the result is computed and silently discarded.
print(df.stat.corr("Quantity", "UnitPrice"))

# The same correlation computed as a column aggregate.
df.select(corr("Quantity", "UnitPrice")).show()

### Summary statistics

In [17]:
# Compute summary statistics with describe() and keep only the statistic
# label plus the two numeric columns of interest.
(
    df
    .describe()
    .select("Summary", "Quantity", "UnitPrice")
    .show()
)

In [18]:
# Parameters of the approximate quantile computation.
colName = "UnitPrice"    # column to summarise
quantileProbs = [0.5]    # 0.5 requests the median
relError = 0.05          # relative error passed to approxQuantile

# Use colName rather than repeating the literal, so changing the variable
# above actually changes the column being summarised.
df.stat.approxQuantile(colName, quantileProbs, relError) # 2.51

In [19]:
# Cross-tabulate StockCode against Quantity and show only the first row.
(
    df.stat
    .crosstab("StockCode", "Quantity")
    .show(n=1)
)

In [20]:
# Frequent items for the two columns.
(
    df.stat
    .freqItems(["StockCode", "Quantity"])
    .show()
)

In [21]:
from pyspark.sql.functions import monotonically_increasing_id

# Attach a generated, monotonically increasing id next to each description.
(
    df
    .select(monotonically_increasing_id(), "Description")
    .show(n=3, truncate=False)
)

### Working with Strings

In [23]:
from pyspark.sql.functions import lower, upper

# Show the description as-is, lower-cased and upper-cased, side by side.
(
    df
    .select(
        df.Description,
        lower(df.Description),
        upper(df.Description),
    )
    .show(n=2, truncate=False)
)

In [24]:
from pyspark.sql.functions import lit, ltrim, rtrim, rpad, lpad, trim

# Trimming and padding applied to string literals; each result is aliased
# with the name of the function that produced it.
(
    df
    .select(
        ltrim(lit(" HELLO ")).alias("ltrim"),
        rtrim(lit(" HELLO ")).alias("rtrim"),
        trim(lit(" HELLO ")).alias("trim"),
        lpad(lit("HELLO"), 10, " ").alias("lp"),
        rpad(lit("HELLO"), 10, " ").alias("rp"),
    )
    .show(n=1)
)

### Regular Expressions

In [26]:
from pyspark.sql.functions import regexp_replace

# Alternation pattern matching any of the listed (upper-case) colour names.
regex_string = "BLACK|WHITE|RED|GREEN|BLUE"

# Replace every match with "XZ" and show the result next to the original.
(
    df
    .select(
        regexp_replace(col("Description"), regex_string, "XZ").alias("color_clean"),
        col("Description"),
    )
    .show(n=5)
)

In [27]:
from pyspark.sql.functions import translate

# translate() works character by character: every occurrence of a character
# from the matching string is replaced by the character at the same index in
# the replacement string.
(
    df
    .select(
        translate(col("Description"), "LEET", "1337"),
        col("Description"),
    )
    .show(n=2)
)

### This simple feature can often help you programmatically generate columns or Boolean filters in a way that is simple to understand and extend.

In [29]:
from pyspark.sql.functions import expr, locate

# Colour names to look for; each one becomes an "is_<color>" flag column.
simpleColors = ["black", "white", "red", "green", "blue"]


def color_locator(column, color_string):
    """Build a Boolean flag column telling whether a colour name occurs in a column.

    Args:
        column: The column to search in; passed to ``locate`` unchanged.
        color_string: Colour name; it is upper-cased before searching and
            used as given in the output column name.

    Returns:
        The ``locate`` result cast to boolean and aliased ``"is_" + color_string``.
    """
    position = locate(color_string.upper(), column)
    is_present = position.cast("boolean")
    return is_present.alias("is_" + color_string)

# One Boolean flag column per colour, followed by every original column.
selectedColumns = [color_locator(df.Description, color) for color in simpleColors]
selectedColumns.append(expr("*"))  # has to be a Column type

# Keep rows whose description mentions white or red, then show the description
# together with the colour flags. The slice length is derived from
# simpleColors (instead of a hard-coded 5) so it stays correct if the colour
# list changes; the slice drops the trailing "*" entry.
(
    df
    .select(*selectedColumns)
    .where(expr("is_white OR is_red"))
    .select("Description", *selectedColumns[:len(simpleColors)])
    .show(n=5, truncate=False)
)

# The same helper used directly for a hand-picked set of colours, unfiltered.
(
    df
    .select(
        "Description",
        color_locator(df.Description, "black"),
        color_locator(df.Description, "white"),
        color_locator(df.Description, "red"),
    )
    .show(n=5, truncate=False)
)

### Working with Dates and Timestamps

In [31]:
from pyspark.sql.functions import current_date, current_timestamp

# Five-row DataFrame carrying the current date and the current timestamp.
dateDF = (
    spark.range(5)
    .withColumn("today", current_date())
    .withColumn("now", current_timestamp())
)

dateDF.show(n=5, truncate=False)

In [32]:
# Print the column names and types of dateDF.
dateDF.printSchema()

In [33]:
from pyspark.sql.functions import date_add, date_sub

# Five days before and five days after "today".
(
    dateDF
    .select(
        date_sub(col("today"), 5),
        date_add(col("today"), 5),
    )
    .show(n=1)
)

In [34]:
from pyspark.sql.functions import datediff, months_between, to_date

# Day difference between a date seven days back and today.
(
    dateDF
    .withColumn("week_ago", date_sub(col("today"), 7))
    .select(datediff(col("week_ago"), col("today")))
    .show(n=1)
)

# Month difference between two dates parsed from string literals.
(
    dateDF
    .select(
        to_date(lit("2016-01-01")).alias("start"),
        to_date(lit("2017-05-22")).alias("end"),
    )
    .select(months_between(col("start"), col("end")))
    .show(n=1)
)

In [35]:
from pyspark.sql.functions import to_date

# Explicit year-day-month pattern: the middle field is the day, the last the month.
dateFormat = "yyyy-dd-MM"

# Parse two string literals with that pattern into date columns.
cleanDateDF = (
    spark.range(1)
    .select(
        to_date(lit("2017-12-11"), dateFormat).alias("date"),
        to_date(lit("2017-20-12"), dateFormat).alias("date2"),
    )
)

# Register the result as a temporary view named "dateTable2".
cleanDateDF.createOrReplaceTempView("dateTable2")

cleanDateDF.show()

In [36]:
# Compare the date column against an explicit literal column.
(
    cleanDateDF
    .filter(col("date2") > lit("2017-12-12"))
    .show()
)

In [37]:
# Same comparison with a bare Python string on the right-hand side.
(
    cleanDateDF
    .filter(col("date2") > "2017-12-12")
    .show()
)

In [38]:
# Same comparison using attribute-style column access.
# The right-hand side is a Python string, so it must not carry SQL-style
# quotes of its own: with "'2017-12-12'" the quote characters become part of
# the value, which is then not a parseable date and makes the comparison
# depend on the Spark version's string/date coercion rules.
cleanDateDF.filter(cleanDateDF.date2 > "2017-12-12").show()

### Working with Nulls in Data

---
**WARNING**

To reiterate, when
you define a schema in which all columns are declared to not have null values, Spark will not enforce
that and will happily let null values into that column. The nullable signal is simply to help Spark SQL
optimize for handling that column. If you have null values in columns that should not have null values,
you can get an incorrect result or see strange exceptions that can be difficult to debug.

---

In [41]:
# Baseline row count.
print(f"Total rows = {df.count()}")

# how='any': a row is dropped when at least one of its values is null.
# Both spellings (DataFrame.dropna and DataFrame.na.drop) are shown.
print(f"Rows w/o ANY na's = {df.dropna(how='any').count()}")
print(f"Rows w/o ANY na's (v.2) = {df.na.drop(how='any').count()}")

# how='all': a row is dropped only when every value in it is null.
print(f"Rows w/o ALL na's = {df.dropna(how='all').count()}")


In [42]:
# A string fill value only applies to columns of string type.
(
    df.na
    .fill(value="All nulls become this string")
    .show(n=2)
)



In [43]:
# Rows of df that dropna() would remove, i.e. the rows that contain a null.
xz = df.exceptAll(df.dropna())

# Fill with a string, restricted to two named columns.
xz.na.fill(value="all", subset=["StockCode", "InvoiceNo"]).show(n=1)
# Fill with a number, restricted to one named column.
xz.na.fill(value=111, subset=["CustomerID"]).show(n=1)
# Fill with a number without restricting the columns.
xz.na.fill(value=0).show(n=1)

# Per-column fill values given as a {column name: value} dict.
cols_to_fill = {"CustomerID": 222}
xz.na.fill(cols_to_fill).show(n=1)


In [44]:
# Replace empty-string descriptions with "UNKNOWN".
(
    df.na
    .replace(to_replace=[""], value=["UNKNOWN"], subset="Description")
    .show(n=1)
)

# Fill nulls with 0 first, then replace 0 with 333 in CustomerID.
(
    xz.na.fill(0)
    .replace(to_replace=[0], value=[333], subset="CustomerID")
    .show(n=1)
)

### Complex types

In [46]:
# Structs
from pyspark.sql.functions import struct

# Pack two columns into a single struct column named "complex".
complexDF = df.select(
    struct("Description", "InvoiceNo").alias("complex")
)
complexDF.show(n=2, truncate=False)


In [47]:
# Pull individual fields back out of the struct with dot notation.
(
    complexDF
    .select("complex.Description", "complex.InvoiceNo")
    .show(n=2, truncate=False)
)

In [48]:
# Expand every field of the struct at once.
(
    complexDF
    .select("complex.*")
    .show(n=2, truncate=False)
)

In [49]:
# Arrays
from pyspark.sql.functions import split

# Split the description on single spaces into an array column.
(
    df
    .select(split(df.Description, " ").alias("arr_col"))
    .show(n=1, truncate=False)
)


In [50]:
# Index into the array column with SQL syntax to get its first element.
(
    df
    .select(split(df.Description, " ").alias("arr_col"))
    .selectExpr("arr_col[0]")
    .show(n=3, truncate=False)
)

In [51]:
from pyspark.sql.functions import split, explode

# Split the description into words, then explode the array so that each
# word gets its own output row.
(
    df
    .withColumn("splitted", split(col("Description"), " "))
    .withColumn("exploded", explode(col("splitted")))
    .select("Description", "splitted", "exploded")
    .show(n=20, truncate=False)
)
  


In [52]:
from pyspark.sql.functions import split, explode, lit

# Same split-then-explode pipeline applied to a fixed string literal
# instead of a column of df.
(
    df
    .withColumn("splitted", split(lit("xz df rt"), " "))
    .withColumn("exploded", explode(col("splitted")))
    .select("splitted", "exploded")
    .show(n=6, truncate=False)
)
  


In [53]:
# Maps

from pyspark.sql.functions import create_map
# Build a map column with Description as the key and InvoiceNo as the value.
(
    df
    .select(create_map(col("Description"), col("InvoiceNo")).alias("complex_map"))
    .show(n=2, truncate=False)
)


In [54]:
# Look up a single key in the map column using SQL bracket syntax.
(
    df
    .select(create_map(col("Description"), col("InvoiceNo")).alias("complex_map"))
    .selectExpr("complex_map['WHITE METAL LANTERN']")
    .show(n=2)
)