# Spark API

In [1]:
import pydataset # for demonstration
import pyspark

In [2]:
# Obtain the SparkSession used for all DataFrame work in this notebook.
# getOrCreate() hands back an existing session if one is already running, otherwise it starts one.
spark = pyspark.sql.SparkSession.builder.getOrCreate()

In [4]:
# Load the example "tips" dataset as a pandas DataFrame and preview its first five rows.
tips = pydataset.data('tips')
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3
3,21.01,3.5,Male,No,Sun,Dinner,3
4,23.68,3.31,Male,No,Sun,Dinner,2
5,24.59,3.61,Female,No,Sun,Dinner,4


In [5]:
# Convert a pandas DataFrame into a Spark DataFrame and bind it to `tips`.
# The dataset is loaded inline instead of being read from the current `tips` binding,
# so re-running this cell after `tips` already refers to a Spark DataFrame still works.
tips = spark.createDataFrame(pydataset.data('tips')) # any pandas dataframe

In [8]:
# .show() prints the rows as a text table and returns None, so `tips2` ends up
# holding None rather than a DataFrame (the type check further down confirms this).
tips2 = tips.show()

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2|
|     10.27|1.71|  Male|    No|Sun|Dinner|   2|
|     35.26| 5.0|Female|    No|Sun|Dinner|   4|
|     15.42|1.57|  Male|    No|Sun|Dinner|   2|
|     18.43| 3.0|  Male|    No|Sun|Dinner|   4|
|     14.83|3.02|Female|    No|Sun|Dinner|   2|
|     21.58|3.92|  Male|    No|Sun|Dinner|   2|
|     10.33|1.67|Female|    No|Sun|Dinner|   3|
|     16.29|3.71|  Male|    No|Sun|Dinne

In [9]:
# Produces no cell output: tips2 is None.
tips2

In [10]:
# Confirm what .show() returned.
type(tips2)

NoneType

In [12]:
# Unlike .show(), .head(n) returns the first n rows to Python as an indexable
# collection (a list of Row objects, per the PySpark docs) instead of printing them.
first_five_rows = tips.head(5)

In [13]:
# Pick the first of the returned rows.
first_row = first_five_rows[0]

In [17]:
# Fields of a row can be read as attributes named after the columns.
first_row.total_bill

16.99

In [18]:
# The repr of a Spark DataFrame lists column names and types only, not the data.
tips

DataFrame[total_bill: double, tip: double, sex: string, smoker: string, day: string, time: string, size: bigint]

In [20]:
# Project a subset of columns by name; .show() with no argument prints the top 20 rows.
tips.select('total_bill', 'smoker', 'day').show()

+----------+------+---+
|total_bill|smoker|day|
+----------+------+---+
|     16.99|    No|Sun|
|     10.34|    No|Sun|
|     21.01|    No|Sun|
|     23.68|    No|Sun|
|     24.59|    No|Sun|
|     25.29|    No|Sun|
|      8.77|    No|Sun|
|     26.88|    No|Sun|
|     15.04|    No|Sun|
|     14.78|    No|Sun|
|     10.27|    No|Sun|
|     35.26|    No|Sun|
|     15.42|    No|Sun|
|     18.43|    No|Sun|
|     14.83|    No|Sun|
|     21.58|    No|Sun|
|     10.33|    No|Sun|
|     16.29|    No|Sun|
|     16.97|    No|Sun|
|     20.65|    No|Sat|
+----------+------+---+
only showing top 20 rows



In [22]:
# Arithmetic between DataFrame columns yields a Column expression, not computed values.
tip_percentage = tips.tip / tips.total_bill

In [30]:
# .alias() returns a new Column carrying the given name; the result is not stored here.
tip_percentage.alias('tip_perc')

Column<'(tip / total_bill) AS tip_perc'>

In [33]:
# The original expression is still unnamed: .alias() did not rename it in place.
tip_percentage

Column<'(tip / total_bill)'>

In [32]:
# The DataFrame is unchanged too: building Column expressions does not add columns to it.
tips

DataFrame[total_bill: double, tip: double, sex: string, smoker: string, day: string, time: string, size: bigint]

In [34]:
# Add the tip percentage as a new column named tip_perc, keeping every existing column ('*').
# select() returns a new DataFrame, so the result is re-assigned to `tips`.
# tip_perc is dropped first (a no-op when the column is absent) so the cell can be re-run:
# without it, a second run would append a duplicate tip_perc column and make later
# references to tips.tip_perc ambiguous.
tips = tips.drop('tip_perc').select('*', tip_percentage.alias('tip_perc'))

In [35]:
# After the re-assignment, tips includes the tip_perc column.
tips

DataFrame[total_bill: double, tip: double, sex: string, smoker: string, day: string, time: string, size: bigint, tip_perc: double]

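In Spark, DataFrames are not modified in place: transformations such as `.select` return a new DataFrame, so you must re-assign the result to keep the change!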

Can we use pandas/numpy methods (e.g. `np.where`) with a Spark DataFrame? No, but there are ways to do the same thing with the Spark API.

In [41]:
# Show the original size column next to a copy cast to string.
# NOTE: the cast column keeps the name `size`, so the result has two columns with the
# same name; add .alias(...) to the cast column if it needs to be referenced by name.
tips.select('size', tips.size.cast('string'))

DataFrame[size: bigint, size: string]

In [43]:
from pyspark.sql.functions import regexp_replace, regexp_extract

In [50]:
# Column expression that extracts capture group 1 of the pattern from `time`:
# ^(\w) matches a single word character at the start of the string.
regexp_extract('time', r'^(\w)', 1)

Column<'regexp_extract(time, ^(\w), 1)'>

In [52]:
# Compare the original `time` value with two regex-derived columns.
# The un-aliased regexp_replace column gets an auto-generated name (see the output header).
tips.select(
    'time',
    regexp_replace('time', r'.{3}$', ''), # args: column, pattern, replacement -> removes the last 3 characters
    regexp_extract('time', r'^(\w)', 1).alias('first_letter') # capture group 1: the leading character
).show(10)

+------+--------------------------------+------------+
|  time|regexp_replace(time, .{3}$, , 1)|first_letter|
+------+--------------------------------+------------+
|Dinner|                             Din|           D|
|Dinner|                             Din|           D|
|Dinner|                             Din|           D|
|Dinner|                             Din|           D|
|Dinner|                             Din|           D|
|Dinner|                             Din|           D|
|Dinner|                             Din|           D|
|Dinner|                             Din|           D|
|Dinner|                             Din|           D|
|Dinner|                             Din|           D|
+------+--------------------------------+------------+
only showing top 10 rows



In [57]:
# Size of the DataFrame as a (number of columns, number of rows) tuple.
len(tips.columns), tips.count()

(8, 244)

In [58]:
# NOTE: importing `sum` by name rebinds Python's built-in sum() to the Spark column
# function for the rest of the notebook; the built-in stays reachable as builtins.sum.
from pyspark.sql.functions import mean, sum

In [60]:
# Selecting an aggregate function reduces the DataFrame to a single summary row.
tips.select(mean('total_bill')).show()

+------------------+
|   avg(total_bill)|
+------------------+
|19.785942622950813|
+------------------+



In [61]:
from pyspark.sql.functions import concat, lit

In [64]:
# Build labels such as "Party of 4" or "Party of 3".
# lit() wraps the Python string as a literal column so it can be concatenated
# with the values of the `size` column.
tips.select(
    'size',
    concat(lit('Party of '), 'size'),
).show()

+----+-----------------------+
|size|concat(Party of , size)|
+----+-----------------------+
|   2|             Party of 2|
|   3|             Party of 3|
|   3|             Party of 3|
|   2|             Party of 2|
|   4|             Party of 4|
|   4|             Party of 4|
|   2|             Party of 2|
|   4|             Party of 4|
|   2|             Party of 2|
|   2|             Party of 2|
|   2|             Party of 2|
|   4|             Party of 4|
|   2|             Party of 2|
|   4|             Party of 4|
|   2|             Party of 2|
|   2|             Party of 2|
|   3|             Party of 3|
|   3|             Party of 3|
|   3|             Party of 3|
|   3|             Party of 3|
+----+-----------------------+
only showing top 20 rows



In [66]:
from pyspark.sql.functions import when

In [69]:
# when(condition, value).otherwise(default) builds a CASE WHEN ... ELSE ... END column expression.
when(tips.tip_perc > .20, 'good tip!').otherwise('not a good tip')

Column<'CASE WHEN (tip_perc > 0.2) THEN good tip! ELSE not a good tip END'>

In [70]:
# Label each row by whether its tip exceeded 20% of the bill, shown next to tip_perc.
tips.select(
    'tip_perc',
    when(tips.tip_perc > .20, 'good tip!').otherwise('not a good tip').alias('tip_description')
).show()

+-------------------+---------------+
|           tip_perc|tip_description|
+-------------------+---------------+
|0.05944673337257211| not a good tip|
|0.16054158607350097| not a good tip|
|0.16658733936220846| not a good tip|
| 0.1397804054054054| not a good tip|
|0.14680764538430255| not a good tip|
|0.18623962040332148| not a good tip|
|0.22805017103762829|      good tip!|
|0.11607142857142858| not a good tip|
|0.13031914893617022| not a good tip|
| 0.2185385656292287|      good tip!|
| 0.1665043816942551| not a good tip|
|0.14180374361883155| not a good tip|
|0.10181582360570687| not a good tip|
|0.16277807921866522| not a good tip|
|0.20364126770060686|      good tip!|
|0.18164967562557924| not a good tip|
| 0.1616650532429816| not a good tip|
|0.22774708410067526|      good tip!|
|0.20624631703005306|      good tip!|
|0.16222760290556903| not a good tip|
+-------------------+---------------+
only showing top 20 rows



In [72]:
# Keep only the rows where the tip exceeded 20% of the bill.
tips.where(tips.tip_perc > .2).show(10)

+----------+----+------+------+---+------+----+-------------------+
|total_bill| tip|   sex|smoker|day|  time|size|           tip_perc|
+----------+----+------+------+---+------+----+-------------------+
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|0.22805017103762829|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2| 0.2185385656292287|
|     14.83|3.02|Female|    No|Sun|Dinner|   2|0.20364126770060686|
|     16.29|3.71|  Male|    No|Sun|Dinner|   3|0.22774708410067526|
|     16.97| 3.5|Female|    No|Sun|Dinner|   3|0.20624631703005306|
|     17.92|4.08|  Male|    No|Sat|Dinner|   2|0.22767857142857142|
|     13.94|3.06|  Male|    No|Sun|Dinner|   2|0.21951219512195122|
|     22.23| 5.0|  Male|    No|Sun|Dinner|   2|0.22492127755285649|
|     10.29| 2.6|Female|    No|Sun|Dinner|   2| 0.2526724975704568|
|     18.29|3.76|  Male|   Yes|Sat|Dinner|   4|0.20557681793329688|
+----------+----+------+------+---+------+----+-------------------+
only showing top 10 rows



In [73]:
# .filter is interchangeable with .where: the same condition returns the same rows.
tips.filter(tips.tip_perc > .2).show(10)

+----------+----+------+------+---+------+----+-------------------+
|total_bill| tip|   sex|smoker|day|  time|size|           tip_perc|
+----------+----+------+------+---+------+----+-------------------+
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|0.22805017103762829|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2| 0.2185385656292287|
|     14.83|3.02|Female|    No|Sun|Dinner|   2|0.20364126770060686|
|     16.29|3.71|  Male|    No|Sun|Dinner|   3|0.22774708410067526|
|     16.97| 3.5|Female|    No|Sun|Dinner|   3|0.20624631703005306|
|     17.92|4.08|  Male|    No|Sat|Dinner|   2|0.22767857142857142|
|     13.94|3.06|  Male|    No|Sun|Dinner|   2|0.21951219512195122|
|     22.23| 5.0|  Male|    No|Sun|Dinner|   2|0.22492127755285649|
|     10.29| 2.6|Female|    No|Sun|Dinner|   2| 0.2526724975704568|
|     18.29|3.76|  Male|   Yes|Sat|Dinner|   4|0.20557681793329688|
+----------+----+------+------+---+------+----+-------------------+
only showing top 10 rows



In [74]:
# Sort ascending by party size. Rows with the same size are not returned in a fixed
# order: the recorded .orderBy output for the same sort lists the ties differently.
tips.sort(tips.size).show()

+----------+----+------+------+----+------+----+-------------------+
|total_bill| tip|   sex|smoker| day|  time|size|           tip_perc|
+----------+----+------+------+----+------+----+-------------------+
|     10.07|1.83|Female|    No|Thur| Lunch|   1| 0.1817279046673287|
|      8.58|1.92|  Male|   Yes| Fri| Lunch|   1|0.22377622377622378|
|      3.07| 1.0|Female|   Yes| Sat|Dinner|   1|0.32573289902280134|
|      7.25| 1.0|Female|    No| Sat|Dinner|   1|0.13793103448275862|
|     14.83|3.02|Female|    No| Sun|Dinner|   2|0.20364126770060686|
|      21.7| 4.3|  Male|    No| Sat|Dinner|   2|0.19815668202764977|
|      8.77| 2.0|  Male|    No| Sun|Dinner|   2|0.22805017103762829|
|     19.65| 3.0|Female|    No| Sat|Dinner|   2|0.15267175572519084|
|     13.27| 2.5|Female|   Yes| Sat|Dinner|   2|0.18839487565938207|
|     10.27|1.71|  Male|    No| Sun|Dinner|   2| 0.1665043816942551|
|     15.42|1.57|  Male|    No| Sun|Dinner|   2|0.10181582360570687|
|     23.68|3.31|  Male|    No| Su

In [75]:
# .orderBy performs the same ascending sort as .sort; only the order among rows with
# equal size differs from the recorded .sort output.
tips.orderBy(tips.size).show()

+----------+----+------+------+----+------+----+-------------------+
|total_bill| tip|   sex|smoker| day|  time|size|           tip_perc|
+----------+----+------+------+----+------+----+-------------------+
|      3.07| 1.0|Female|   Yes| Sat|Dinner|   1|0.32573289902280134|
|      8.58|1.92|  Male|   Yes| Fri| Lunch|   1|0.22377622377622378|
|      7.25| 1.0|Female|    No| Sat|Dinner|   1|0.13793103448275862|
|     10.07|1.83|Female|    No|Thur| Lunch|   1| 0.1817279046673287|
|     10.59|1.61|Female|   Yes| Sat|Dinner|   2|0.15203021718602455|
|     14.48| 2.0|  Male|   Yes| Sun|Dinner|   2|0.13812154696132597|
|     13.39|2.61|Female|    No| Sun|Dinner|   2|0.19492158327109782|
|       9.6| 4.0|Female|   Yes| Sun|Dinner|   2| 0.4166666666666667|
|     13.81| 2.0|  Male|    No| Sun|Dinner|   2| 0.1448225923244026|
|     17.51| 3.0|Female|   Yes| Sun|Dinner|   2|0.17133066818960593|
|     20.76|2.24|  Male|    No| Sun|Dinner|   2|0.10789980732177264|
|     12.66| 2.5|  Male|    No| Su

Filtering and Sorting:

- `.filter` or `.where` to filter rows based on a condition
- `.sort` or `.orderBy` to sort the data

In [77]:
# Calling .desc() on the column reverses the sort order: largest parties first.
tips.orderBy(tips.size.desc()).show(5)

+----------+---+------+------+----+------+----+-------------------+
|total_bill|tip|   sex|smoker| day|  time|size|           tip_perc|
+----------+---+------+------+----+------+----+-------------------+
|     27.05|5.0|Female|    No|Thur| Lunch|   6|0.18484288354898337|
|      34.3|6.7|  Male|    No|Thur| Lunch|   6|0.19533527696793004|
|      29.8|4.2|Female|    No|Thur| Lunch|   6|0.14093959731543623|
|     48.17|5.0|  Male|    No| Sun|Dinner|   6|0.10379904504878555|
|     20.69|5.0|  Male|    No| Sun|Dinner|   5| 0.2416626389560174|
+----------+---+------+------+----+------+----+-------------------+
only showing top 5 rows



In [78]:
from pyspark.sql.functions import asc, desc

In [79]:
# Sort on several keys using column names: largest parties first, and within each
# party size the smallest total_bill first.
tips.orderBy(desc('size'), asc('total_bill')).show()

+----------+----+------+------+----+------+----+-------------------+
|total_bill| tip|   sex|smoker| day|  time|size|           tip_perc|
+----------+----+------+------+----+------+----+-------------------+
|     27.05| 5.0|Female|    No|Thur| Lunch|   6|0.18484288354898337|
|      29.8| 4.2|Female|    No|Thur| Lunch|   6|0.14093959731543623|
|      34.3| 6.7|  Male|    No|Thur| Lunch|   6|0.19533527696793004|
|     48.17| 5.0|  Male|    No| Sun|Dinner|   6|0.10379904504878555|
|     20.69| 5.0|  Male|    No| Sun|Dinner|   5| 0.2416626389560174|
|     28.15| 3.0|  Male|   Yes| Sat|Dinner|   5|0.10657193605683837|
|     29.85|5.14|Female|    No| Sun|Dinner|   5| 0.1721943048576214|
|     30.46| 2.0|  Male|   Yes| Sun|Dinner|   5|0.06565988181221273|
|     41.19| 5.0|  Male|    No|Thur| Lunch|   5|0.12138868657441128|
|     16.49| 2.0|  Male|    No| Sun|Dinner|   4|0.12128562765312312|
|     17.81|2.34|  Male|    No| Sat|Dinner|   4|0.13138686131386862|
|     18.29|3.76|  Male|   Yes| Sa