In [1]:
import pyspark

# Obtain the SparkSession that every later cell uses (getOrCreate reuses an
# existing session when there is one).
spark = pyspark.sql.SparkSession.builder.getOrCreate()

In [2]:
# Evaluate the session object so the notebook displays it.
spark

In [3]:
import pandas as pd
import pydataset

In [4]:
# Load the 'tips' sample dataset through pydataset and convert it to a Spark DataFrame.
tips = pydataset.data('tips')
df = spark.createDataFrame(tips)
# Evaluating the bare DataFrame displays only its schema, not its rows.
df

DataFrame[total_bill: double, tip: double, sex: string, smoker: string, day: string, time: string, size: bigint]

## DataFrame Basics

In [9]:
# Anti-pattern (shown on purpose): .show() prints the rows and returns None,
# so df2 ends up holding None rather than a DataFrame.
# Call .show() only to look at the contents; never assign its result.
df2 = df.show(10)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2|
+----------+----+------+------+---+------+----+
only showing top 10 rows



In [11]:
# Confirms that the value captured from .show() is None.
type(df2)

NoneType

In [15]:
# .head(n) returns the first n rows as a Python list of Row objects.
df.head(5)

[Row(total_bill=16.99, tip=1.01, sex='Female', smoker='No', day='Sun', time='Dinner', size=2),
 Row(total_bill=10.34, tip=1.66, sex='Male', smoker='No', day='Sun', time='Dinner', size=3),
 Row(total_bill=21.01, tip=3.5, sex='Male', smoker='No', day='Sun', time='Dinner', size=3),
 Row(total_bill=23.68, tip=3.31, sex='Male', smoker='No', day='Sun', time='Dinner', size=2),
 Row(total_bill=24.59, tip=3.61, sex='Female', smoker='No', day='Sun', time='Dinner', size=4)]

In [18]:
# Project a subset of columns by name; .show() with no argument prints the first 20 rows.
df.select('total_bill', 'tip', 'size', 'day').show()

+----------+----+----+---+
|total_bill| tip|size|day|
+----------+----+----+---+
|     16.99|1.01|   2|Sun|
|     10.34|1.66|   3|Sun|
|     21.01| 3.5|   3|Sun|
|     23.68|3.31|   2|Sun|
|     24.59|3.61|   4|Sun|
|     25.29|4.71|   4|Sun|
|      8.77| 2.0|   2|Sun|
|     26.88|3.12|   4|Sun|
|     15.04|1.96|   2|Sun|
|     14.78|3.23|   2|Sun|
|     10.27|1.71|   2|Sun|
|     35.26| 5.0|   4|Sun|
|     15.42|1.57|   2|Sun|
|     18.43| 3.0|   4|Sun|
|     14.83|3.02|   2|Sun|
|     21.58|3.92|   2|Sun|
|     10.33|1.67|   3|Sun|
|     16.29|3.71|   3|Sun|
|     16.97| 3.5|   3|Sun|
|     20.65|3.35|   3|Sat|
+----------+----+----+---+
only showing top 20 rows



In [19]:
# '*' selects every column; without .show() only the schema is displayed.
df.select('*')

DataFrame[total_bill: double, tip: double, sex: string, smoker: string, day: string, time: string, size: bigint]

In [25]:
# Arithmetic between two columns yields a new column; the operands are looked
# up with bracket syntax here.
df.select(df['tip'] / df['total_bill']).show(5)

+-------------------+
| (tip / total_bill)|
+-------------------+
|0.05944673337257211|
|0.16054158607350097|
|0.16658733936220846|
| 0.1397804054054054|
|0.14680764538430255|
+-------------------+
only showing top 5 rows



In [26]:
# Dividing two columns produces a Column expression object (see the repr in the
# output), not a set of values.
# NOTE(review): the name `col` is rebound further down by
# `from pyspark.sql.functions import asc, desc, col`; re-running this cell after
# that import replaces the imported function with this Column.
col = df.tip / df.total_bill
col

Column<b'(tip / total_bill)'>

In [29]:
# Keep every existing column and append the expression under the name tip_pct.
df.select('*', col.alias('tip_pct')).show(5)

+----------+----+------+------+---+------+----+-------------------+
|total_bill| tip|   sex|smoker|day|  time|size|            tip_pct|
+----------+----+------+------+---+------+----+-------------------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|0.05944673337257211|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|0.16054158607350097|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|0.16658733936220846|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2| 0.1397804054054054|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|0.14680764538430255|
+----------+----+------+------+---+------+----+-------------------+
only showing top 5 rows



In [31]:
# Store the widened frame under a new name; df itself is not reassigned here.
df_with_tip_pct = df.select('*', col.alias('tip_pct'))

In [33]:
# Preview the first 5 rows of the widened frame.
df_with_tip_pct.show(5)

+----------+----+------+------+---+------+----+-------------------+
|total_bill| tip|   sex|smoker|day|  time|size|            tip_pct|
+----------+----+------+------+---+------+----+-------------------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|0.05944673337257211|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|0.16054158607350097|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|0.16658733936220846|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2| 0.1397804054054054|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|0.14680764538430255|
+----------+----+------+------+---+------+----+-------------------+
only showing top 5 rows



In [52]:
from pyspark.sql.functions import sum, mean, concat, lit, regexp_extract, regexp_replace, when

In [35]:
# Whole-frame aggregates: average tip and the total of total_bill.
# `sum` and `mean` are the pyspark.sql.functions versions imported above,
# not the Python built-in sum.
df.select(mean('tip'), sum('total_bill')).show()

+------------------+-----------------+
|          avg(tip)|  sum(total_bill)|
+------------------+-----------------+
|2.9982786885245907|4827.769999999999|
+------------------+-----------------+



In [39]:
# Build one string column from day, a literal space, and time.
df.select(concat(df.day, lit(' '), df.time)).show(5)

+--------------------+
|concat(day,  , time)|
+--------------------+
|          Sun Dinner|
|          Sun Dinner|
|          Sun Dinner|
|          Sun Dinner|
|          Sun Dinner|
+--------------------+
only showing top 5 rows



In [45]:
# Casting a non-numeric string column to int yields null for every row
# (see the output) instead of raising an error.
df.select(df.time.cast('int')).show(5)

+----+
|time|
+----+
|null|
|null|
|null|
|null|
|null|
+----+
only showing top 5 rows



In [50]:
# Regex helpers applied to the 'time' column:
#   - regexp_extract returns capture group 1 (the first word character) as first_letter
#   - regexp_replace swaps every lowercase vowel for 'X'
df.select(
    'time',
    regexp_extract('time', r'(\w).*', 1).alias('first_letter'),
    regexp_replace('time', r'[aeiou]', 'X')
).show(5)

+------+------------+--------------------------------+
|  time|first_letter|regexp_replace(time, [aeiou], X)|
+------+------------+--------------------------------+
|Dinner|           D|                          DXnnXr|
|Dinner|           D|                          DXnnXr|
|Dinner|           D|                          DXnnXr|
|Dinner|           D|                          DXnnXr|
|Dinner|           D|                          DXnnXr|
+------+------------+--------------------------------+
only showing top 5 rows



In [53]:
# Add tip_pct (tip as a fraction of the bill) to df for the cells that follow.
# withColumn replaces an existing column of the same name, so re-running this
# cell does not leave df with two tip_pct columns the way
# select('*', ... .alias('tip_pct')) would.
df = df.withColumn('tip_pct', df.tip / df.total_bill)

In [57]:
# Label each row by whether the tip exceeded 20% of the bill.
(
    df
    .select(
        'tip_pct',
        when(df['tip_pct'] > 0.2, 'good tip').otherwise('not good tip').alias('tip_desc'),
    )
    .show(25)
)

+-------------------+------------+
|            tip_pct|    tip_desc|
+-------------------+------------+
|0.05944673337257211|not good tip|
|0.16054158607350097|not good tip|
|0.16658733936220846|not good tip|
| 0.1397804054054054|not good tip|
|0.14680764538430255|not good tip|
|0.18623962040332148|not good tip|
|0.22805017103762829|    good tip|
|0.11607142857142858|not good tip|
|0.13031914893617022|not good tip|
| 0.2185385656292287|    good tip|
| 0.1665043816942551|not good tip|
|0.14180374361883155|not good tip|
|0.10181582360570687|not good tip|
|0.16277807921866522|not good tip|
|0.20364126770060686|    good tip|
|0.18164967562557924|not good tip|
| 0.1616650532429816|not good tip|
|0.22774708410067526|    good tip|
|0.20624631703005306|    good tip|
|0.16222760290556903|not good tip|
|0.22767857142857142|    good tip|
|0.13553474618038444|not good tip|
|0.14140773620798985|not good tip|
|0.19228817858954844|not good tip|
|0.16044399596367306|not good tip|
+-------------------

## Transforming Rows

In [6]:
# Look at the frame again before sorting and filtering it.
df.show()

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2|
|     10.27|1.71|  Male|    No|Sun|Dinner|   2|
|     35.26| 5.0|Female|    No|Sun|Dinner|   4|
|     15.42|1.57|  Male|    No|Sun|Dinner|   2|
|     18.43| 3.0|  Male|    No|Sun|Dinner|   4|
|     14.83|3.02|Female|    No|Sun|Dinner|   2|
|     21.58|3.92|  Male|    No|Sun|Dinner|   2|
|     10.33|1.67|Female|    No|Sun|Dinner|   3|
|     16.29|3.71|  Male|    No|Sun|Dinne

In [9]:
# Ascending sort on total_bill, with the column referenced by name.
df.orderBy('total_bill').show()

+----------+----+------+------+----+------+----+
|total_bill| tip|   sex|smoker| day|  time|size|
+----------+----+------+------+----+------+----+
|      3.07| 1.0|Female|   Yes| Sat|Dinner|   1|
|      5.75| 1.0|Female|   Yes| Fri|Dinner|   2|
|      7.25|5.15|  Male|   Yes| Sun|Dinner|   2|
|      7.25| 1.0|Female|    No| Sat|Dinner|   1|
|      7.51| 2.0|  Male|    No|Thur| Lunch|   2|
|      7.56|1.44|  Male|    No|Thur| Lunch|   2|
|      7.74|1.44|  Male|   Yes| Sat|Dinner|   2|
|      8.35| 1.5|Female|    No|Thur| Lunch|   2|
|      8.51|1.25|Female|    No|Thur| Lunch|   2|
|      8.52|1.48|  Male|    No|Thur| Lunch|   2|
|      8.58|1.92|  Male|   Yes| Fri| Lunch|   1|
|      8.77| 2.0|  Male|    No| Sun|Dinner|   2|
|      9.55|1.45|  Male|    No| Sat|Dinner|   2|
|       9.6| 4.0|Female|   Yes| Sun|Dinner|   2|
|      9.68|1.32|  Male|    No| Sun|Dinner|   2|
|      9.78|1.73|  Male|    No|Thur| Lunch|   2|
|      9.94|1.56|  Male|    No| Sun|Dinner|   2|
|     10.07|1.83|Fem

In [11]:
# Sort by day, breaking ties by party size (both ascending).
df.sort('day', 'size').show()

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|      8.58|1.92|  Male|   Yes|Fri| Lunch|   1|
|     16.27| 2.5|Female|   Yes|Fri| Lunch|   2|
|     12.16| 2.2|  Male|   Yes|Fri| Lunch|   2|
|     22.49| 3.5|  Male|    No|Fri|Dinner|   2|
|     13.42|1.58|  Male|   Yes|Fri| Lunch|   2|
|      5.75| 1.0|Female|   Yes|Fri|Dinner|   2|
|     16.32| 4.3|Female|   Yes|Fri|Dinner|   2|
|     22.75|3.25|Female|    No|Fri|Dinner|   2|
|     13.42|3.48|Female|   Yes|Fri| Lunch|   2|
|     11.35| 2.5|Female|   Yes|Fri|Dinner|   2|
|     28.97| 3.0|  Male|   Yes|Fri|Dinner|   2|
|     15.38| 3.0|Female|   Yes|Fri|Dinner|   2|
|     27.28| 4.0|  Male|   Yes|Fri|Dinner|   2|
|     12.03| 1.5|  Male|   Yes|Fri|Dinner|   2|
|     21.01| 3.0|  Male|   Yes|Fri|Dinner|   2|
|     12.46| 1.5|  Male|    No|Fri|Dinner|   2|
|     10.09| 2.0|Female|   Yes|Fri| Lunch|   2|
|     15.98| 3.0|Female|    No|Fri| Lunc

In [19]:
from pyspark.sql.functions import asc, desc, col

In [17]:
# asc()/desc() set the direction per sort key: time ascending, size descending.
df.sort(df.day, asc('time'), desc('size')).show()

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     40.17|4.73|  Male|   Yes|Fri|Dinner|   4|
|     21.01| 3.0|  Male|   Yes|Fri|Dinner|   2|
|     28.97| 3.0|  Male|   Yes|Fri|Dinner|   2|
|     12.03| 1.5|  Male|   Yes|Fri|Dinner|   2|
|     11.35| 2.5|Female|   Yes|Fri|Dinner|   2|
|     12.46| 1.5|  Male|    No|Fri|Dinner|   2|
|     15.38| 3.0|Female|   Yes|Fri|Dinner|   2|
|      5.75| 1.0|Female|   Yes|Fri|Dinner|   2|
|     27.28| 4.0|  Male|   Yes|Fri|Dinner|   2|
|     22.75|3.25|Female|    No|Fri|Dinner|   2|
|     22.49| 3.5|  Male|    No|Fri|Dinner|   2|
|     16.32| 4.3|Female|   Yes|Fri|Dinner|   2|
|     15.98| 3.0|Female|    No|Fri| Lunch|   3|
|     10.09| 2.0|Female|   Yes|Fri| Lunch|   2|
|     13.42|1.58|  Male|   Yes|Fri| Lunch|   2|
|     16.27| 2.5|Female|   Yes|Fri| Lunch|   2|
|     13.42|3.48|Female|   Yes|Fri| Lunch|   2|
|     12.16| 2.2|  Male|   Yes|Fri| Lunc

In [23]:
# Column.asc() yields a sort expression; its repr (below) spells out the null ordering.
col('size').asc()

Column<b'size ASC NULLS FIRST'>

In [25]:
# Largest parties first; ties are ordered by time.
df.sort(col('size').desc(), col('time')).show()

+----------+----+------+------+----+------+----+
|total_bill| tip|   sex|smoker| day|  time|size|
+----------+----+------+------+----+------+----+
|     48.17| 5.0|  Male|    No| Sun|Dinner|   6|
|     27.05| 5.0|Female|    No|Thur| Lunch|   6|
|      34.3| 6.7|  Male|    No|Thur| Lunch|   6|
|      29.8| 4.2|Female|    No|Thur| Lunch|   6|
|     28.15| 3.0|  Male|   Yes| Sat|Dinner|   5|
|     20.69| 5.0|  Male|    No| Sun|Dinner|   5|
|     29.85|5.14|Female|    No| Sun|Dinner|   5|
|     30.46| 2.0|  Male|   Yes| Sun|Dinner|   5|
|     41.19| 5.0|  Male|    No|Thur| Lunch|   5|
|     25.89|5.16|  Male|   Yes| Sat|Dinner|   4|
|      21.5| 3.5|  Male|    No| Sun|Dinner|   4|
|     48.33| 9.0|  Male|    No| Sat|Dinner|   4|
|     24.55| 2.0|  Male|    No| Sun|Dinner|   4|
|     31.71| 4.5|  Male|    No| Sun|Dinner|   4|
|     24.01| 2.0|  Male|   Yes| Sat|Dinner|   4|
|      25.0|3.75|Female|    No| Sun|Dinner|   4|
|     18.29|3.76|  Male|   Yes| Sat|Dinner|   4|
|     16.49| 2.0|  M

In [26]:
# Keep only the rows whose tip is under 4.
df.filter(df['tip'] < 4).show()

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2|
|     10.27|1.71|  Male|    No|Sun|Dinner|   2|
|     15.42|1.57|  Male|    No|Sun|Dinner|   2|
|     18.43| 3.0|  Male|    No|Sun|Dinner|   4|
|     14.83|3.02|Female|    No|Sun|Dinner|   2|
|     21.58|3.92|  Male|    No|Sun|Dinner|   2|
|     10.33|1.67|Female|    No|Sun|Dinner|   3|
|     16.29|3.71|  Male|    No|Sun|Dinner|   3|
|     16.97| 3.5|Female|    No|Sun|Dinner|   3|
|     20.65|3.35|  Male|    No|Sat|Dinne

In [29]:
# A boolean Column can be stored in a variable and reused as the filter condition.
mask = df.tip < 4
df.where(mask).show()

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2|
|     10.27|1.71|  Male|    No|Sun|Dinner|   2|
|     15.42|1.57|  Male|    No|Sun|Dinner|   2|
|     18.43| 3.0|  Male|    No|Sun|Dinner|   4|
|     14.83|3.02|Female|    No|Sun|Dinner|   2|
|     21.58|3.92|  Male|    No|Sun|Dinner|   2|
|     10.33|1.67|Female|    No|Sun|Dinner|   3|
|     16.29|3.71|  Male|    No|Sun|Dinner|   3|
|     16.97| 3.5|Female|    No|Sun|Dinner|   3|
|     20.65|3.35|  Male|    No|Sat|Dinne

In [33]:
# Rows that are dinners OR have a tip of at most 2, ordered by tip.
dinner_or_low_tip = (df.time == "Dinner") | (df.tip <= 2)
df.filter(dinner_or_low_tip).sort('tip').show()

+----------+----+------+------+----+------+----+
|total_bill| tip|   sex|smoker| day|  time|size|
+----------+----+------+------+----+------+----+
|      7.25| 1.0|Female|    No| Sat|Dinner|   1|
|      12.6| 1.0|  Male|   Yes| Sat|Dinner|   2|
|      5.75| 1.0|Female|   Yes| Fri|Dinner|   2|
|      3.07| 1.0|Female|   Yes| Sat|Dinner|   1|
|     16.99|1.01|Female|    No| Sun|Dinner|   2|
|      12.9| 1.1|Female|   Yes| Sat|Dinner|   2|
|     32.83|1.17|  Male|   Yes| Sat|Dinner|   2|
|     10.07|1.25|  Male|    No| Sat|Dinner|   2|
|     10.51|1.25|  Male|    No| Sat|Dinner|   2|
|      8.51|1.25|Female|    No|Thur| Lunch|   2|
|      9.68|1.32|  Male|    No| Sun|Dinner|   2|
|     18.64|1.36|Female|    No|Thur| Lunch|   3|
|      7.74|1.44|  Male|   Yes| Sat|Dinner|   2|
|      7.56|1.44|  Male|    No|Thur| Lunch|   2|
|      9.55|1.45|  Male|    No| Sat|Dinner|   2|
|     10.77|1.47|  Male|    No| Sat|Dinner|   2|
|      8.52|1.48|  Male|    No|Thur| Lunch|   2|
|     12.03| 1.5|  M

In [34]:
# Saturday smokers: both conditions combined into a single predicate with &.
df.where((df.smoker == "Yes") & (df.day == "Sat")).show()

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     38.01| 3.0|  Male|   Yes|Sat|Dinner|   4|
|     11.24|1.76|  Male|   Yes|Sat|Dinner|   2|
|     20.29|3.21|  Male|   Yes|Sat|Dinner|   2|
|     13.81| 2.0|  Male|   Yes|Sat|Dinner|   2|
|     11.02|1.98|  Male|   Yes|Sat|Dinner|   2|
|     18.29|3.76|  Male|   Yes|Sat|Dinner|   4|
|      3.07| 1.0|Female|   Yes|Sat|Dinner|   1|
|     15.01|2.09|  Male|   Yes|Sat|Dinner|   2|
|     26.86|3.14|Female|   Yes|Sat|Dinner|   2|
|     25.28| 5.0|Female|   Yes|Sat|Dinner|   2|
|     17.92|3.08|  Male|   Yes|Sat|Dinner|   2|
|      44.3| 2.5|Female|   Yes|Sat|Dinner|   3|
|     22.42|3.48|Female|   Yes|Sat|Dinner|   2|
|     15.36|1.64|  Male|   Yes|Sat|Dinner|   2|
|     20.49|4.06|  Male|   Yes|Sat|Dinner|   2|
|     25.21|4.29|  Male|   Yes|Sat|Dinner|   2|
|     14.31| 4.0|Female|   Yes|Sat|Dinner|   2|
|     10.59|1.61|Female|   Yes|Sat|Dinne

## Aggregating

In [36]:
from pyspark.sql.functions import mean, min, max

In [35]:
# Quick reminder of the columns before aggregating.
df.show(5)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
+----------+----+------+------+---+------+----+
only showing top 5 rows



In [37]:
# Average tip for each value of time.
df.groupBy('time').agg(mean('tip')).show()

+------+------------------+
|  time|          avg(tip)|
+------+------------------+
| Lunch|2.7280882352941176|
|Dinner| 3.102670454545455|
+------+------------------+



In [39]:
# Several aggregates can be computed in a single .agg call.
df.groupBy('time').agg(min('tip'), mean('tip'), max('tip')).show()

+------+--------+------------------+--------+
|  time|min(tip)|          avg(tip)|max(tip)|
+------+--------+------------------+--------+
| Lunch|    1.25|2.7280882352941176|     6.7|
|Dinner|     1.0| 3.102670454545455|    10.0|
+------+--------+------------------+--------+



In [40]:
# .alias gives the aggregate column a friendlier name than the default avg(tip).
df.groupBy('time').agg(mean('tip').alias('avg_tip')).show()

+------+------------------+
|  time|           avg_tip|
+------+------------------+
| Lunch|2.7280882352941176|
|Dinner| 3.102670454545455|
+------+------------------+



In [41]:
# Grouping by two columns yields one row per (time, day) combination present in the data.
df.groupBy('time', 'day').agg(mean('total_bill')).show()

+------+----+------------------+
|  time| day|   avg(total_bill)|
+------+----+------------------+
| Lunch|Thur|17.664754098360653|
|Dinner|Thur|             18.78|
| Lunch| Fri|12.845714285714285|
|Dinner| Fri| 19.66333333333333|
|Dinner| Sun|21.409999999999997|
|Dinner| Sat|20.441379310344825|
+------+----+------------------+



In [42]:
# Row counts for each time/day pair, with the days spread across columns.
df.crosstab('time', 'day').show()

+--------+---+---+---+----+
|time_day|Fri|Sat|Sun|Thur|
+--------+---+---+---+----+
|   Lunch|  7|  0|  0|  61|
|  Dinner| 12| 87| 76|   1|
+--------+---+---+---+----+



In [44]:
# Same layout as the crosstab above, but each cell holds the mean total_bill;
# time/day combinations with no rows come back as null.
df.groupBy('time').pivot('day').agg(mean('total_bill')).show()

+------+------------------+------------------+------------------+------------------+
|  time|               Fri|               Sat|               Sun|              Thur|
+------+------------------+------------------+------------------+------------------+
| Lunch|12.845714285714285|              null|              null|17.664754098360653|
|Dinner| 19.66333333333333|20.441379310344825|21.409999999999997|             18.78|
+------+------------------+------------------+------------------+------------------+



`.crosstab` only produces counts. For any other way of summarizing groups, use `.groupBy` (optionally combined with `.pivot`) followed by `.agg`.

## Additional Features

In [45]:
# Register df under the name 'tips' so it can be queried through spark.sql.
df.createOrReplaceTempView('tips')

In [47]:
# Run plain SQL against the 'tips' view and print the result.
spark.sql('''
SELECT *
FROM tips
''').show()

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2|
|     10.27|1.71|  Male|    No|Sun|Dinner|   2|
|     35.26| 5.0|Female|    No|Sun|Dinner|   4|
|     15.42|1.57|  Male|    No|Sun|Dinner|   2|
|     18.43| 3.0|  Male|    No|Sun|Dinner|   4|
|     14.83|3.02|Female|    No|Sun|Dinner|   2|
|     21.58|3.92|  Male|    No|Sun|Dinner|   2|
|     10.33|1.67|Female|    No|Sun|Dinner|   3|
|     16.29|3.71|  Male|    No|Sun|Dinne

In [51]:
# Show tip, total_bill, and day for every row that falls on the single day
# with the largest summed total_bill (the subquery picks that day).
spark.sql('''
SELECT tip, total_bill, day
FROM tips
WHERE day = (
    SELECT day
    FROM tips
    GROUP BY day
    ORDER BY sum(total_bill) DESC
    LIMIT 1
)    
''').show()

+----+----------+---+
| tip|total_bill|day|
+----+----------+---+
|3.35|     20.65|Sat|
|4.08|     17.92|Sat|
|2.75|     20.29|Sat|
|2.23|     15.77|Sat|
|7.58|     39.42|Sat|
|3.18|     19.82|Sat|
|2.34|     17.81|Sat|
| 2.0|     13.37|Sat|
| 2.0|     12.69|Sat|
| 4.3|      21.7|Sat|
| 3.0|     19.65|Sat|
|1.45|      9.55|Sat|
| 2.5|     18.35|Sat|
| 3.0|     15.06|Sat|
|2.45|     20.69|Sat|
|3.27|     17.78|Sat|
| 3.6|     24.06|Sat|
| 2.0|     16.31|Sat|
|3.07|     16.93|Sat|
|2.31|     18.69|Sat|
+----+----------+---+
only showing top 20 rows



In [57]:
# Filter first, then add tip_pct, and print the physical plan. Compare with the
# next cell, which applies the same two steps in the opposite order.
# withColumn (rather than select('*', ...)) replaces any tip_pct column that df
# already carries instead of appending a second column with the same name.
df.where(
    df.time == 'Dinner'
).withColumn(
    'tip_pct',
    df.tip / df.total_bill,
).explain()

== Physical Plan ==
*(1) Project [total_bill#0, tip#1, sex#2, smoker#3, day#4, time#5, size#6L, (tip#1 / total_bill#0) AS tip_pct#1115]
+- *(1) Filter (isnotnull(time#5) AND (time#5 = Dinner))
   +- *(1) Scan ExistingRDD[total_bill#0,tip#1,sex#2,smoker#3,day#4,time#5,size#6L]




In [58]:
# Add tip_pct first, then filter, and print the physical plan. Compare with the
# previous cell, which applies the same two steps in the opposite order.
# withColumn (rather than select('*', ...)) replaces any tip_pct column that df
# already carries instead of appending a second column with the same name.
df.withColumn(
    'tip_pct',
    df.tip / df.total_bill,
).where(
    df.time == 'Dinner'
).explain()

== Physical Plan ==
*(1) Project [total_bill#0, tip#1, sex#2, smoker#3, day#4, time#5, size#6L, (tip#1 / total_bill#0) AS tip_pct#1124]
+- *(1) Filter (isnotnull(time#5) AND (time#5 = Dinner))
   +- *(1) Scan ExistingRDD[total_bill#0,tip#1,sex#2,smoker#3,day#4,time#5,size#6L]




In [59]:
from pyspark.sql.functions import expr

`expr` lets us mix fragments of SQL into our DataFrame operations.

In [61]:
# expr() parses SQL fragments into Columns: one computes tip_pct, the other is
# the row filter (Sunday dinners only).
# withColumn replaces any tip_pct column that df already carries, so the output
# has exactly one tip_pct column whether or not df was widened earlier.
df.withColumn(
    'tip_pct',
    expr('tip / total_bill'),
).where(
    expr('day = "Sun" AND time = "Dinner"')
).show()

+----------+----+------+------+---+------+----+-------------------+
|total_bill| tip|   sex|smoker|day|  time|size|            tip_pct|
+----------+----+------+------+---+------+----+-------------------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|0.05944673337257211|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|0.16054158607350097|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|0.16658733936220846|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2| 0.1397804054054054|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|0.14680764538430255|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|0.18623962040332148|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|0.22805017103762829|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|0.11607142857142858|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|0.13031914893617022|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2| 0.2185385656292287|
|     10.27|1.71|  Male|    No|Sun|Dinner|   2| 0.1665043816942551|
|     35.26| 5.0|Female|    No|Sun|Dinner|   4|0