In [3]:
import numpy as np
import pandas as pd
import pyspark

In [4]:
spark = pyspark.sql.SparkSession.builder.getOrCreate()

In [5]:
np.random.seed(456)

pandas_dataframe = pd.DataFrame(
    dict(n=np.arange(20), group=np.random.choice(list("abc"), 20))
)
pandas_dataframe

Unnamed: 0,n,group
0,0,b
1,1,b
2,2,c
3,3,a
4,4,c
5,5,c
6,6,a
7,7,b
8,8,a
9,9,b


In [6]:
df = spark.createDataFrame(pandas_dataframe)
df

DataFrame[n: bigint, group: string]

In [7]:
df.show()

+---+-----+
|  n|group|
+---+-----+
|  0|    b|
|  1|    b|
|  2|    c|
|  3|    a|
|  4|    c|
|  5|    c|
|  6|    a|
|  7|    b|
|  8|    a|
|  9|    b|
| 10|    b|
| 11|    a|
| 12|    b|
| 13|    a|
| 14|    b|
| 15|    b|
| 16|    c|
| 17|    c|
| 18|    a|
| 19|    c|
+---+-----+



In [8]:
df.group

Column<b'group'>

In [10]:
df.select(df.group).show()

+-----+
|group|
+-----+
|    b|
|    b|
|    c|
|    a|
|    c|
|    c|
|    a|
|    b|
|    a|
|    b|
|    b|
|    a|
|    b|
|    a|
|    b|
|    b|
|    c|
|    c|
|    a|
|    c|
+-----+



In [13]:
df.select(df.n+1).show()

+-------+
|(n + 1)|
+-------+
|      1|
|      2|
|      3|
|      4|
|      5|
|      6|
|      7|
|      8|
|      9|
|     10|
|     11|
|     12|
|     13|
|     14|
|     15|
|     16|
|     17|
|     18|
|     19|
|     20|
+-------+



In [15]:
df.describe().show()

+-------+-----------------+-----+
|summary|                n|group|
+-------+-----------------+-----+
|  count|               20|   20|
|   mean|              9.5| null|
| stddev|5.916079783099616| null|
|    min|                0|    a|
|    max|               19|    c|
+-------+-----------------+-----+



In [27]:
df.describe().select("n","summary").show()

+-----------------+-------+
|                n|summary|
+-----------------+-------+
|               20|  count|
|              9.5|   mean|
|5.916079783099616| stddev|
|                0|    min|
|               19|    max|
+-----------------+-------+



In [37]:
# df.select("n","summary").describe.show()

In [38]:
from pydataset import data

In [39]:
mpg = spark.createDataFrame(data("mpg"))
mpg.show()

+------------+------------------+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|             model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+------------------+-----+----+---+----------+---+---+---+---+-------+
|        audi|                a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|                a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|                a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|                a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|                a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
|        audi|                a4|  2.8|1999|  6|manual(m5)|  f| 18| 26|  p|compact|
|        audi|                a4|  3.1|2008|  6|  auto(av)|  f| 18| 27|  p|compact|
|        audi|        a4 quattro|  1.8|1999|  4|manual(m5)|  4| 18| 26|  p|compact|
|        audi|        a4 quattro|  1.8|1999|  4|  auto(l5)|  4| 16| 25|  p|c

In [42]:
mpg.select(mpg.model, "manufacturer", mpg.hwy.alias("hw_mileage")).show()

+------------------+------------+----------+
|             model|manufacturer|hw_mileage|
+------------------+------------+----------+
|                a4|        audi|        29|
|                a4|        audi|        29|
|                a4|        audi|        31|
|                a4|        audi|        30|
|                a4|        audi|        26|
|                a4|        audi|        26|
|                a4|        audi|        27|
|        a4 quattro|        audi|        26|
|        a4 quattro|        audi|        25|
|        a4 quattro|        audi|        28|
|        a4 quattro|        audi|        27|
|        a4 quattro|        audi|        25|
|        a4 quattro|        audi|        25|
|        a4 quattro|        audi|        25|
|        a4 quattro|        audi|        25|
|        a6 quattro|        audi|        24|
|        a6 quattro|        audi|        25|
|        a6 quattro|        audi|        23|
|c1500 suburban 2wd|   chevrolet|        20|
|c1500 sub

In [43]:
mpg.show(5)

+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|   a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|   a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|   a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
only showing top 5 rows



In [60]:
avg_mile_column = ((mpg.cty + mpg.hwy) /2).alias('avg_mileage')
avg_mile_column

Column<b'((cty + hwy) / 2) AS `avg_mileage`'>

In [61]:
mpg.select('*', avg_mile_column).show()

+------------+------------------+-----+----+---+----------+---+---+---+---+-------+-----------+
|manufacturer|             model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|avg_mileage|
+------------+------------------+-----+----+---+----------+---+---+---+---+-------+-----------+
|        audi|                a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|       23.5|
|        audi|                a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|       25.0|
|        audi|                a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|       25.5|
|        audi|                a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|       25.5|
|        audi|                a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|       21.0|
|        audi|                a4|  2.8|1999|  6|manual(m5)|  f| 18| 26|  p|compact|       22.0|
|        audi|                a4|  3.1|2008|  6|  auto(av)|  f| 18| 27|  p|compact|       22.5|
|        audi|        a4 quattro|  1.8|1

In [62]:
from pyspark.sql.functions import col

In [63]:
col

<function pyspark.sql.functions._create_function.<locals>._(col)>

In [67]:
col('hwy')

Column<b'hwy'>

In [68]:
mpg.select(col('hwy'))

DataFrame[hwy: bigint]

In [69]:
just_hwy_and_cty = mpg.select('hwy','cty')

In [71]:
just_hwy_and_cty.show(5)

+---+---+
|hwy|cty|
+---+---+
| 29| 18|
| 29| 21|
| 31| 20|
| 30| 21|
| 26| 16|
+---+---+
only showing top 5 rows



In [72]:
mpg.select('*', mpg.cyl.cast('string').alias('cyl_string')).show()

+------------+------------------+-----+----+---+----------+---+---+---+---+-------+----------+
|manufacturer|             model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|cyl_string|
+------------+------------------+-----+----+---+----------+---+---+---+---+-------+----------+
|        audi|                a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|         4|
|        audi|                a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|         4|
|        audi|                a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|         4|
|        audi|                a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|         4|
|        audi|                a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|         6|
|        audi|                a4|  2.8|1999|  6|manual(m5)|  f| 18| 26|  p|compact|         6|
|        audi|                a4|  3.1|2008|  6|  auto(av)|  f| 18| 27|  p|compact|         6|
|        audi|        a4 quattro|  1.8|1999|  4|ma

In [77]:
mpg.select(mpg.manufacturer).show()

+------------+
|manufacturer|
+------------+
|        audi|
|        audi|
|        audi|
|        audi|
|        audi|
|        audi|
|        audi|
|        audi|
|        audi|
|        audi|
|        audi|
|        audi|
|        audi|
|        audi|
|        audi|
|        audi|
|        audi|
|        audi|
|   chevrolet|
|   chevrolet|
+------------+
only showing top 20 rows



In [80]:
from pyspark.sql.functions import min, max

In [86]:
min([1,2,3])

Py4JError: An error occurred while calling z:org.apache.spark.sql.functions.min. Trace:
py4j.Py4JException: Method min([class java.util.ArrayList]) does not exist
	at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:318)
	at py4j.reflection.ReflectionEngine.getMethod(ReflectionEngine.java:339)
	at py4j.Gateway.invoke(Gateway.java:276)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)



In [87]:
mpg.show(5)

+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|   a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|   a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|   a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
only showing top 5 rows



In [88]:
textdf = spark.createDataFrame(
    pd.DataFrame(
        {
            "address": [
                "600 Navarro St ste 600, San Antonio, TX 78205",
                "3130 Broadway St, San Antonio, TX 78209",
                "303 Pearl Pkwy, San Antonio, TX 78215",
                "1255 SW Loop 410, San Antonio, TX 78227",
            ]
        }
    )
)

textdf.show(truncate=False)

+---------------------------------------------+
|address                                      |
+---------------------------------------------+
|600 Navarro St ste 600, San Antonio, TX 78205|
|3130 Broadway St, San Antonio, TX 78209      |
|303 Pearl Pkwy, San Antonio, TX 78215        |
|1255 SW Loop 410, San Antonio, TX 78227      |
+---------------------------------------------+



In [89]:
# extract house/unit number

In [90]:
from pyspark.sql.functions import regexp_extract, regexp_replace

In [107]:
textdf.select(
    'address',
    regexp_extract('address', r'^(\d+)', 1).alias('unit_no'),
    regexp_extract('address', r'^\d+\s+(.*?),', 1).alias('street'),
    regexp_replace(
        regexp_replace('address', r'^\d+\s+.*?,\s+', ''),
        r'\s+\d+$', ''
        ).alias('city_state'),
    regexp_extract('address', r'(\d+(-\d+)?)$', 1).alias('zip')
).show(truncate=False)

+---------------------------------------------+-------+------------------+---------------+-----+
|address                                      |unit_no|street            |city_state     |zip  |
+---------------------------------------------+-------+------------------+---------------+-----+
|600 Navarro St ste 600, San Antonio, TX 78205|600    |Navarro St ste 600|San Antonio, TX|78205|
|3130 Broadway St, San Antonio, TX 78209      |3130   |Broadway St       |San Antonio, TX|78209|
|303 Pearl Pkwy, San Antonio, TX 78215        |303    |Pearl Pkwy        |San Antonio, TX|78215|
|1255 SW Loop 410, San Antonio, TX 78227      |1255   |SW Loop 410       |San Antonio, TX|78227|
+---------------------------------------------+-------+------------------+---------------+-----+



In [110]:
import pyspark

#tab to see builtin functions
pyspark.sql.functions

<module 'pyspark.sql.functions' from '/usr/local/anaconda3/lib/python3.7/site-packages/pyspark/sql/functions.py'>

In [112]:
mpg.show()

+------------+------------------+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|             model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+------------------+-----+----+---+----------+---+---+---+---+-------+
|        audi|                a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|                a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|                a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|                a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|                a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
|        audi|                a4|  2.8|1999|  6|manual(m5)|  f| 18| 26|  p|compact|
|        audi|                a4|  3.1|2008|  6|  auto(av)|  f| 18| 27|  p|compact|
|        audi|        a4 quattro|  1.8|1999|  4|manual(m5)|  4| 18| 26|  p|compact|
|        audi|        a4 quattro|  1.8|1999|  4|  auto(l5)|  4| 16| 25|  p|c

In [115]:
mpg.createOrReplaceTempView('mpg')

In [116]:
spark.sql('''
 SELECT distinct class
 from mpg
 ''').show()

+----------+
|     class|
+----------+
|subcompact|
|   compact|
|   minivan|
|       suv|
|   midsize|
|    pickup|
|   2seater|
+----------+



In [119]:
mpg.where(mpg['class'] == 'minivan').show(5)

+------------+-----------+-----+----+---+--------+---+---+---+---+-------+
|manufacturer|      model|displ|year|cyl|   trans|drv|cty|hwy| fl|  class|
+------------+-----------+-----+----+---+--------+---+---+---+---+-------+
|       dodge|caravan 2wd|  2.4|1999|  4|auto(l3)|  f| 18| 24|  r|minivan|
|       dodge|caravan 2wd|  3.0|1999|  6|auto(l4)|  f| 17| 24|  r|minivan|
|       dodge|caravan 2wd|  3.3|1999|  6|auto(l4)|  f| 16| 22|  r|minivan|
|       dodge|caravan 2wd|  3.3|1999|  6|auto(l4)|  f| 16| 22|  r|minivan|
|       dodge|caravan 2wd|  3.3|2008|  6|auto(l4)|  f| 17| 24|  r|minivan|
+------------+-----------+-----+----+---+--------+---+---+---+---+-------+
only showing top 5 rows



In [120]:
mpg.show()

+------------+------------------+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|             model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+------------------+-----+----+---+----------+---+---+---+---+-------+
|        audi|                a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|                a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|                a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|                a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|                a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
|        audi|                a4|  2.8|1999|  6|manual(m5)|  f| 18| 26|  p|compact|
|        audi|                a4|  3.1|2008|  6|  auto(av)|  f| 18| 27|  p|compact|
|        audi|        a4 quattro|  1.8|1999|  4|manual(m5)|  4| 18| 26|  p|compact|
|        audi|        a4 quattro|  1.8|1999|  4|  auto(l5)|  4| 16| 25|  p|c

In [128]:
from pyspark.sql.functions import when

In [134]:
mpg.select(
    'cty',
    when(mpg.cty >= 20, 'good mileage').otherwise('bad mileage').alias('mileage')
    ).show()

+---+------------+
|cty|     mileage|
+---+------------+
| 18| bad mileage|
| 21|good mileage|
| 20|good mileage|
| 21|good mileage|
| 16| bad mileage|
| 18| bad mileage|
| 18| bad mileage|
| 18| bad mileage|
| 16| bad mileage|
| 20|good mileage|
| 19| bad mileage|
| 15| bad mileage|
| 17| bad mileage|
| 17| bad mileage|
| 15| bad mileage|
| 15| bad mileage|
| 17| bad mileage|
| 16| bad mileage|
| 14| bad mileage|
| 11| bad mileage|
+---+------------+
only showing top 20 rows



In [138]:
mpg.select(
    mpg.displ,
    (
        when(mpg.displ < 2, 'small')
        .when(mpg.displ < 3, 'medium')
        .otherwise('large')
        .alias('engine_size')
    )).show()

+-----+-----------+
|displ|engine_size|
+-----+-----------+
|  1.8|      small|
|  1.8|      small|
|  2.0|     medium|
|  2.0|     medium|
|  2.8|     medium|
|  2.8|     medium|
|  3.1|      large|
|  1.8|      small|
|  1.8|      small|
|  2.0|     medium|
|  2.0|     medium|
|  2.8|     medium|
|  2.8|     medium|
|  3.1|      large|
|  3.1|      large|
|  2.8|     medium|
|  3.1|      large|
|  4.2|      large|
|  5.3|      large|
|  5.3|      large|
+-----+-----------+
only showing top 20 rows



In [141]:
mpg.show(5)

+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|   a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|   a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|   a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
only showing top 5 rows



In [143]:
mpg.sort('hwy').show()

+------------+--------------------+-----+----+---+----------+---+---+---+---+------+
|manufacturer|               model|displ|year|cyl|     trans|drv|cty|hwy| fl| class|
+------------+--------------------+-----+----+---+----------+---+---+---+---+------+
|       dodge|         durango 4wd|  4.7|2008|  8|  auto(l5)|  4|  9| 12|  e|   suv|
|       dodge|   dakota pickup 4wd|  4.7|2008|  8|  auto(l5)|  4|  9| 12|  e|pickup|
|        jeep|  grand cherokee 4wd|  4.7|2008|  8|  auto(l5)|  4|  9| 12|  e|   suv|
|       dodge| ram 1500 pickup 4wd|  4.7|2008|  8|manual(m6)|  4|  9| 12|  e|pickup|
|       dodge| ram 1500 pickup 4wd|  4.7|2008|  8|  auto(l5)|  4|  9| 12|  e|pickup|
|   chevrolet|     k1500 tahoe 4wd|  5.3|2008|  8|  auto(l4)|  4| 11| 14|  e|   suv|
|        jeep|  grand cherokee 4wd|  6.1|2008|  8|  auto(l5)|  4| 11| 14|  p|   suv|
|       dodge|         durango 4wd|  5.9|1999|  8|  auto(l4)|  4| 11| 15|  r|   suv|
|      toyota|land cruiser wago...|  4.7|1999|  8|  auto(l4)|  4|

In [146]:
from pyspark.sql.functions import desc

In [145]:
mpg.orderBy('hwy').show(truncate=False)

+------------+----------------------+-----+----+---+----------+---+---+---+---+------+
|manufacturer|model                 |displ|year|cyl|trans     |drv|cty|hwy|fl |class |
+------------+----------------------+-----+----+---+----------+---+---+---+---+------+
|dodge       |ram 1500 pickup 4wd   |4.7  |2008|8  |manual(m6)|4  |9  |12 |e  |pickup|
|dodge       |ram 1500 pickup 4wd   |4.7  |2008|8  |auto(l5)  |4  |9  |12 |e  |pickup|
|jeep        |grand cherokee 4wd    |4.7  |2008|8  |auto(l5)  |4  |9  |12 |e  |suv   |
|dodge       |dakota pickup 4wd     |4.7  |2008|8  |auto(l5)  |4  |9  |12 |e  |pickup|
|dodge       |durango 4wd           |4.7  |2008|8  |auto(l5)  |4  |9  |12 |e  |suv   |
|chevrolet   |k1500 tahoe 4wd       |5.3  |2008|8  |auto(l4)  |4  |11 |14 |e  |suv   |
|jeep        |grand cherokee 4wd    |6.1  |2008|8  |auto(l5)  |4  |11 |14 |p  |suv   |
|chevrolet   |k1500 tahoe 4wd       |5.7  |1999|8  |auto(l4)  |4  |11 |15 |r  |suv   |
|dodge       |ram 1500 pickup 4wd   |5.2  |

In [149]:
mpg.sort(desc('hwy')).show()

+------------+------------+-----+----+---+----------+---+---+---+---+----------+
|manufacturer|       model|displ|year|cyl|     trans|drv|cty|hwy| fl|     class|
+------------+------------+-----+----+---+----------+---+---+---+---+----------+
|  volkswagen|       jetta|  1.9|1999|  4|manual(m5)|  f| 33| 44|  d|   compact|
|  volkswagen|  new beetle|  1.9|1999|  4|manual(m5)|  f| 35| 44|  d|subcompact|
|  volkswagen|  new beetle|  1.9|1999|  4|  auto(l4)|  f| 29| 41|  d|subcompact|
|      toyota|     corolla|  1.8|2008|  4|manual(m5)|  f| 28| 37|  r|   compact|
|       honda|       civic|  1.8|2008|  4|  auto(l5)|  f| 24| 36|  c|subcompact|
|       honda|       civic|  1.8|2008|  4|  auto(l5)|  f| 25| 36|  r|subcompact|
|      toyota|     corolla|  1.8|1999|  4|manual(m5)|  f| 26| 35|  r|   compact|
|      toyota|     corolla|  1.8|2008|  4|  auto(l4)|  f| 26| 35|  r|   compact|
|       honda|       civic|  1.8|2008|  4|manual(m5)|  f| 26| 34|  r|subcompact|
|       honda|       civic| 

In [154]:
mpg.sort(mpg.hwy.desc()).show()

+------------+----------+-----+----+---+----------+---+---+---+---+----------+
|manufacturer|     model|displ|year|cyl|     trans|drv|cty|hwy| fl|     class|
+------------+----------+-----+----+---+----------+---+---+---+---+----------+
|  volkswagen|new beetle|  1.9|1999|  4|manual(m5)|  f| 35| 44|  d|subcompact|
|  volkswagen|     jetta|  1.9|1999|  4|manual(m5)|  f| 33| 44|  d|   compact|
|  volkswagen|new beetle|  1.9|1999|  4|  auto(l4)|  f| 29| 41|  d|subcompact|
|      toyota|   corolla|  1.8|2008|  4|manual(m5)|  f| 28| 37|  r|   compact|
|       honda|     civic|  1.8|2008|  4|  auto(l5)|  f| 25| 36|  r|subcompact|
|       honda|     civic|  1.8|2008|  4|  auto(l5)|  f| 24| 36|  c|subcompact|
|      toyota|   corolla|  1.8|1999|  4|manual(m5)|  f| 26| 35|  r|   compact|
|      toyota|   corolla|  1.8|2008|  4|  auto(l4)|  f| 26| 35|  r|   compact|
|       honda|     civic|  1.8|2008|  4|manual(m5)|  f| 26| 34|  r|subcompact|
|      toyota|   corolla|  1.8|1999|  4|  auto(l4)| 

In [155]:
mpg.sort(desc("class"), mpg.cyl.asc(), mpg.hwy.desc()).show()

+------------+------------------+-----+----+---+----------+---+---+---+---+-----+
|manufacturer|             model|displ|year|cyl|     trans|drv|cty|hwy| fl|class|
+------------+------------------+-----+----+---+----------+---+---+---+---+-----+
|      subaru|      forester awd|  2.5|2008|  4|manual(m5)|  4| 20| 27|  r|  suv|
|      subaru|      forester awd|  2.5|2008|  4|  auto(l4)|  4| 20| 26|  r|  suv|
|      subaru|      forester awd|  2.5|1999|  4|manual(m5)|  4| 18| 25|  r|  suv|
|      subaru|      forester awd|  2.5|2008|  4|manual(m5)|  4| 19| 25|  p|  suv|
|      subaru|      forester awd|  2.5|1999|  4|  auto(l4)|  4| 18| 24|  r|  suv|
|      subaru|      forester awd|  2.5|2008|  4|  auto(l4)|  4| 18| 23|  p|  suv|
|      toyota|       4runner 4wd|  2.7|1999|  4|manual(m5)|  4| 15| 20|  r|  suv|
|      toyota|       4runner 4wd|  2.7|1999|  4|  auto(l4)|  4| 16| 20|  r|  suv|
|        jeep|grand cherokee 4wd|  3.0|2008|  6|  auto(l5)|  4| 17| 22|  d|  suv|
|        jeep|gr

In [156]:
mpg.groupby('cyl').mean().show()

+---+------------------+------------------+--------+------------------+-----------------+
|cyl|        avg(displ)|         avg(year)|avg(cyl)|          avg(cty)|         avg(hwy)|
+---+------------------+------------------+--------+------------------+-----------------+
|  6| 3.408860759493671| 2002.873417721519|     6.0| 16.21518987341772|22.82278481012658|
|  5|               2.5|            2008.0|     5.0|              20.5|            28.75|
|  8|5.1328571428571435|2004.5285714285715|     8.0|12.571428571428571|17.62857142857143|
|  4| 2.145679012345679|            2003.0|     4.0|21.012345679012345|28.80246913580247|
+---+------------------+------------------+--------+------------------+-----------------+



In [158]:
mpg.groupby('class').pivot('cyl').count().show()

+----------+----+----+----+----+
|     class|   4|   5|   6|   8|
+----------+----+----+----+----+
|subcompact|  21|   2|   7|   5|
|   compact|  32|   2|  13|null|
|   minivan|   1|null|  10|null|
|       suv|   8|null|  16|  38|
|   midsize|  16|null|  23|   2|
|    pickup|   3|null|  10|  20|
|   2seater|null|null|null|   5|
+----------+----+----+----+----+



In [162]:
mpg.sort('manufacturer').groupby('manufacturer').pivot('drv').count().show()
# mpg.groupby('manufacturer').pivot('drv').count().sort('manufacturer').show()

+------------+----+----+----+
|manufacturer|   4|   f|   r|
+------------+----+----+----+
|        audi|  11|   7|null|
|   chevrolet|   4|   5|  10|
|       dodge|  26|  11|null|
|        ford|  13|null|  12|
|       honda|null|   9|null|
|     hyundai|null|  14|null|
|        jeep|   8|null|null|
|  land rover|   4|null|null|
|     lincoln|null|null|   3|
|     mercury|   4|null|null|
|      nissan|   4|   9|null|
|     pontiac|null|   5|null|
|      subaru|  14|null|null|
|      toyota|  15|  19|null|
|  volkswagen|null|  27|null|
+------------+----+----+----+



In [164]:
from pyspark.sql.functions import mean

In [167]:
mpg.rollup('cyl', 'class').agg(mean('hwy')).sort('class','cyl').show()

+----+----------+------------------+
| cyl|     class|          avg(hwy)|
+----+----------+------------------+
|null|      null| 23.44017094017094|
|   4|      null| 28.80246913580247|
|   5|      null|             28.75|
|   6|      null| 22.82278481012658|
|   8|      null| 17.62857142857143|
|   8|   2seater|              24.8|
|   4|   compact|          29.46875|
|   5|   compact|              29.0|
|   6|   compact|25.307692307692307|
|   4|   midsize|           29.1875|
|   6|   midsize| 26.26086956521739|
|   8|   midsize|              24.0|
|   4|   minivan|              24.0|
|   6|   minivan|              22.2|
|   4|    pickup|20.666666666666668|
|   6|    pickup|              17.9|
|   8|    pickup|              15.8|
|   4|subcompact| 30.80952380952381|
|   5|subcompact|              28.5|
|   6|subcompact|24.714285714285715|
+----+----------+------------------+
only showing top 20 rows

