# Spark API Exercises

In [1]:
# imports
import pyspark
import pandas as pd
import numpy as np

from pyspark.sql.functions import *

In [2]:
# get the active SparkSession, or start a new one if none exists yet
spark = (
    pyspark.sql.SparkSession
    .builder
    .getOrCreate()
)

# ------------------------------------------------------------------------
## Exercise 1:

Within your `codeup-data-science` directory, create a new repo named `spark-exercises`. This will be where you do your work for this module. Create a repository on GitHub with the same name, and link your local repository to GitHub.

Save this work in your `spark-exercises` repo. Then add, commit, and push your changes.

Create a jupyter notebook or python script named `spark101` for this exercise.

Create a spark data frame that contains your favorite programming languages.

- Create a dataframe with one column named `language`
> Hint: Start with a pandas dataframe. Maybe use a dictionary?
- View the schema of the dataframe
- Output the shape of the dataframe
- Show the first 5 records in the dataframe

# ------------------------------------------------------------------------

Create a dataframe with one column named `language`

In [3]:
# build a pandas dataframe: one row per language, plus whether it has been studied
languages = pd.DataFrame({
    'language': ['python', 'java', 'html', 'c', 'r', 'php', 'scala'],
    'studied': ['yes', 'no', 'no', 'no', 'no', 'no', 'no'],
})
# display the dataframe
languages

Unnamed: 0,language,studied
0,python,yes
1,java,no
2,html,no
3,c,no
4,r,no
5,php,no
6,scala,no


In [4]:
# convert the pandas dataframe into a spark dataframe
df = spark.createDataFrame(data=languages)

# the bare repr lists each column name with its datatype
df

DataFrame[language: string, studied: string]

View the schema of the dataframe

In [5]:
# print the schema tree: column names, datatypes and nullability
df.printSchema()

root
 |-- language: string (nullable = true)
 |-- studied: string (nullable = true)



Output the shape of the dataframe

In [6]:
# spark dataframes have no .shape attribute, so report rows (count) x columns
print(
    'DataFrame shape: ',
    df.count(),
    ' x ',
    len(df.columns),
)

DataFrame shape:  7  x  2


Show the first five records in the dataframe

In [7]:
# display only the first five rows
df.show(n=5)

+--------+-------+
|language|studied|
+--------+-------+
|  python|    yes|
|    java|     no|
|    html|     no|
|       c|     no|
|       r|     no|
+--------+-------+
only showing top 5 rows



# ------------------------------------------------------------------------
## Exercise 2:

Load the `mpg` dataset as a spark dataframe.

a. Create 1 column of output that contains a message like the one below for each record:

    The 1999 audi a4 has a 4 cylinder engine.

> Hint: You will need to concatenate values that already exist in the data with string literals

b. Transform the trans column so that it only contains either manual or auto.

> Hint: Consider spark string methods and `when().otherwise()` chaining
# ------------------------------------------------------------------------

In [8]:
# pydataset's `data` loader returns the named dataset as a pandas dataframe
from pydataset import data

# load the mpg dataset into a spark dataframe and preview it
mpg = spark.createDataFrame(data('mpg'))
mpg.show()

+------------+------------------+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|             model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+------------------+-----+----+---+----------+---+---+---+---+-------+
|        audi|                a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|                a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|                a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|                a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|                a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
|        audi|                a4|  2.8|1999|  6|manual(m5)|  f| 18| 26|  p|compact|
|        audi|                a4|  3.1|2008|  6|  auto(av)|  f| 18| 27|  p|compact|
|        audi|        a4 quattro|  1.8|1999|  4|manual(m5)|  4| 18| 26|  p|compact|
|        audi|        a4 quattro|  1.8|1999|  4|  auto(l5)|  4| 16| 25|  p|c

In [9]:
# print the mpg schema tree: column names, datatypes and nullability
mpg.printSchema()

root
 |-- manufacturer: string (nullable = true)
 |-- model: string (nullable = true)
 |-- displ: double (nullable = true)
 |-- year: long (nullable = true)
 |-- cyl: long (nullable = true)
 |-- trans: string (nullable = true)
 |-- drv: string (nullable = true)
 |-- cty: long (nullable = true)
 |-- hwy: long (nullable = true)
 |-- fl: string (nullable = true)
 |-- class: string (nullable = true)



a. Create 1 column of output that contains a message like the one below for each record:

    The 1999 audi a4 has a 4 cylinder engine.

In [10]:
# describe each car in one sentence: string literals (lit) are concatenated
# with the year, manufacturer, model and cyl column values (col)
(
    mpg
    .select(
        concat(
            lit('The '), col('year'),
            lit(' '), col('manufacturer'),
            lit(' '), col('model'),
            lit(' has a '), col('cyl'),
            lit(' cylinder engine.'),
        ).alias('summary_description')
    )
    .show(11, truncate=False)
)

+-------------------------------------------------+
|summary_description                              |
+-------------------------------------------------+
|The 1999 audi a4 has a 4 cylinder engine.        |
|The 1999 audi a4 has a 4 cylinder engine.        |
|The 2008 audi a4 has a 4 cylinder engine.        |
|The 2008 audi a4 has a 4 cylinder engine.        |
|The 1999 audi a4 has a 6 cylinder engine.        |
|The 1999 audi a4 has a 6 cylinder engine.        |
|The 2008 audi a4 has a 6 cylinder engine.        |
|The 1999 audi a4 quattro has a 4 cylinder engine.|
|The 1999 audi a4 quattro has a 4 cylinder engine.|
|The 2008 audi a4 quattro has a 4 cylinder engine.|
|The 2008 audi a4 quattro has a 4 cylinder engine.|
+-------------------------------------------------+
only showing top 11 rows



b. Transform the trans column so that it only contains either manual or auto.

> Hint: Consider spark string methods and `when().otherwise()` chaining

In [11]:
# peek at the first two rows before transforming the trans column
mpg.show(n=2)

+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
only showing top 2 rows



In [12]:
# expose the dataframe to spark.sql() as a temporary view named 'mpg'
mpg.createOrReplaceTempView(name='mpg')

In [13]:
# run a Spark SQL query against the registered 'mpg' view, keeping only the
# first four characters of trans ('auto' / 'manu')
# NOTE: this rebinds `mpg` to the query result, which has no `year` column
mpg = spark.sql('''
SELECT manufacturer, model, displ, cyl, LEFT(trans, 4) as trans, drv, cty, hwy, fl, class
FROM mpg
''')
mpg.show()

+------------+------------------+-----+---+-----+---+---+---+---+-------+
|manufacturer|             model|displ|cyl|trans|drv|cty|hwy| fl|  class|
+------------+------------------+-----+---+-----+---+---+---+---+-------+
|        audi|                a4|  1.8|  4| auto|  f| 18| 29|  p|compact|
|        audi|                a4|  1.8|  4| manu|  f| 21| 29|  p|compact|
|        audi|                a4|  2.0|  4| manu|  f| 20| 31|  p|compact|
|        audi|                a4|  2.0|  4| auto|  f| 21| 30|  p|compact|
|        audi|                a4|  2.8|  6| auto|  f| 16| 26|  p|compact|
|        audi|                a4|  2.8|  6| manu|  f| 18| 26|  p|compact|
|        audi|                a4|  3.1|  6| auto|  f| 18| 27|  p|compact|
|        audi|        a4 quattro|  1.8|  4| manu|  4| 18| 26|  p|compact|
|        audi|        a4 quattro|  1.8|  4| auto|  4| 16| 25|  p|compact|
|        audi|        a4 quattro|  2.0|  4| manu|  4| 20| 28|  p|compact|
|        audi|        a4 quattro|  2.0

In [14]:
# sanity check: selecting every column from the query result still works
mpg.select('*').show()

+------------+------------------+-----+---+-----+---+---+---+---+-------+
|manufacturer|             model|displ|cyl|trans|drv|cty|hwy| fl|  class|
+------------+------------------+-----+---+-----+---+---+---+---+-------+
|        audi|                a4|  1.8|  4| auto|  f| 18| 29|  p|compact|
|        audi|                a4|  1.8|  4| manu|  f| 21| 29|  p|compact|
|        audi|                a4|  2.0|  4| manu|  f| 20| 31|  p|compact|
|        audi|                a4|  2.0|  4| auto|  f| 21| 30|  p|compact|
|        audi|                a4|  2.8|  6| auto|  f| 16| 26|  p|compact|
|        audi|                a4|  2.8|  6| manu|  f| 18| 26|  p|compact|
|        audi|                a4|  3.1|  6| auto|  f| 18| 27|  p|compact|
|        audi|        a4 quattro|  1.8|  4| manu|  4| 18| 26|  p|compact|
|        audi|        a4 quattro|  1.8|  4| auto|  4| 16| 25|  p|compact|
|        audi|        a4 quattro|  2.0|  4| manu|  4| 20| 28|  p|compact|
|        audi|        a4 quattro|  2.0

In [15]:
# output a single column aliased 'trans': rows whose trans is 'auto' keep
# 'auto', every other row is labelled 'manual'
(
    mpg
    .select(
        when(mpg.trans == 'auto', 'auto')
        .otherwise('manual')
        .alias('trans')
    )
    .show()
)

+------+
| trans|
+------+
|  auto|
|manual|
|manual|
|  auto|
|  auto|
|manual|
|  auto|
|manual|
|  auto|
|manual|
|  auto|
|  auto|
|manual|
|  auto|
|manual|
|  auto|
|  auto|
|  auto|
|  auto|
|  auto|
+------+
only showing top 20 rows



In [16]:
# reload the full, unmodified mpg dataset into a fresh spark dataframe
mpg = spark.createDataFrame(data=data('mpg'))

In [17]:
# ALTERNATE SOLUTION
# Classify by the text of the trans code rather than by its length. A length
# test only works by coincidence ('auto(l5)' is 8 characters, 'manual(m5)' is
# 10) and would mislabel any automatic code that is 10+ characters long.
# Values starting with 'auto' become 'automatic'; everything else 'manual'.
(
    mpg
    .select(
        mpg.trans,
        when(mpg.trans.startswith('auto'), 'automatic')
        .otherwise('manual')
        .alias('transmission'),
    )
    .show()
)

+----------+------------+
|     trans|transmission|
+----------+------------+
|  auto(l5)|   automatic|
|manual(m5)|      manual|
|manual(m6)|      manual|
|  auto(av)|   automatic|
|  auto(l5)|   automatic|
|manual(m5)|      manual|
|  auto(av)|   automatic|
|manual(m5)|      manual|
|  auto(l5)|   automatic|
|manual(m6)|      manual|
|  auto(s6)|   automatic|
|  auto(l5)|   automatic|
|manual(m5)|      manual|
|  auto(s6)|   automatic|
|manual(m6)|      manual|
|  auto(l5)|   automatic|
|  auto(s6)|   automatic|
|  auto(s6)|   automatic|
|  auto(l4)|   automatic|
|  auto(l4)|   automatic|
+----------+------------+
only showing top 20 rows



In [18]:
# ALTERNATE SOLUTION ('transform' the trans column)
# Rebuild the table with trans replaced by a cleaned 'transmission' column.
# The label is decided by the prefix of the trans code, not its length: a
# length test only works by coincidence ('auto(l5)' is 8 characters,
# 'manual(m5)' is 10) and would mislabel longer automatic codes.
(
    mpg
    .select(
        mpg.manufacturer,
        mpg.model,
        mpg.displ,
        mpg.cyl,
        when(mpg.trans.startswith('auto'), 'auto')
        .otherwise('manual')
        .alias('transmission'),
        mpg.drv,
        mpg.cty,
        mpg.hwy,
        mpg.fl,
        # `class` is a Python keyword, so attribute access (mpg.class) is a
        # syntax error; reference the column by name instead
        col('class'),
    )
    .show()
)

+------------+------------------+-----+---+------------+---+---+---+---+-------+
|manufacturer|             model|displ|cyl|transmission|drv|cty|hwy| fl|  class|
+------------+------------------+-----+---+------------+---+---+---+---+-------+
|        audi|                a4|  1.8|  4|        auto|  f| 18| 29|  p|compact|
|        audi|                a4|  1.8|  4|      manual|  f| 21| 29|  p|compact|
|        audi|                a4|  2.0|  4|      manual|  f| 20| 31|  p|compact|
|        audi|                a4|  2.0|  4|        auto|  f| 21| 30|  p|compact|
|        audi|                a4|  2.8|  6|        auto|  f| 16| 26|  p|compact|
|        audi|                a4|  2.8|  6|      manual|  f| 18| 26|  p|compact|
|        audi|                a4|  3.1|  6|        auto|  f| 18| 27|  p|compact|
|        audi|        a4 quattro|  1.8|  4|      manual|  4| 18| 26|  p|compact|
|        audi|        a4 quattro|  1.8|  4|        auto|  4| 16| 25|  p|compact|
|        audi|        a4 qua

# ------------------------------------------------------------------------
## Exercise 3: 

Load the `tips` dataset as a spark dataframe.

a. What percentage of observations are smokers?
> Hint: `.groupBy()` and `.withColumn()` are useful functions here

b. Create a column that contains the tip percentage
> Hint: `.withColumn()` is useful here

c. Calculate the average tip percentage for each combination of sex and smoker.
> Hint: Chain additional functions off the answer to part b 

# ------------------------------------------------------------------------

# ------------------------------------------------------------------------
## Exercise 4:

Use the seattle weather dataset referenced in the lesson to answer the questions below.

- Convert the temperatures to fahrenheit.
- Which month has the most rain, on average?
- Which year was the windiest?
- What is the most frequent type of weather in January?
- What is the average high and low temperature on sunny days in July in 2013 and 2014?
- What percentage of days were rainy in q3 of 2015?
- For each year, find what percentage of days it rained (had non-zero precipitation).

In [19]:
# import the vega_datasets loader
# NOTE: this rebinds `data`, which earlier cells imported from pydataset
from vega_datasets import data

# load the seattle weather data with pandas, cast the date column to string,
# and hand the result straight to spark
weather = spark.createDataFrame(
    data.seattle_weather().assign(date=lambda pdf: pdf.date.astype(str))
)

# preview the spark dataframe
weather.show()

+----------+-------------+--------+--------+----+-------+
|      date|precipitation|temp_max|temp_min|wind|weather|
+----------+-------------+--------+--------+----+-------+
|2012-01-01|          0.0|    12.8|     5.0| 4.7|drizzle|
|2012-01-02|         10.9|    10.6|     2.8| 4.5|   rain|
|2012-01-03|          0.8|    11.7|     7.2| 2.3|   rain|
|2012-01-04|         20.3|    12.2|     5.6| 4.7|   rain|
|2012-01-05|          1.3|     8.9|     2.8| 6.1|   rain|
|2012-01-06|          2.5|     4.4|     2.2| 2.2|   rain|
|2012-01-07|          0.0|     7.2|     2.8| 2.3|   rain|
|2012-01-08|          0.0|    10.0|     2.8| 2.0|    sun|
|2012-01-09|          4.3|     9.4|     5.0| 3.4|   rain|
|2012-01-10|          1.0|     6.1|     0.6| 3.4|   rain|
|2012-01-11|          0.0|     6.1|    -1.1| 5.1|    sun|
|2012-01-12|          0.0|     6.1|    -1.7| 1.9|    sun|
|2012-01-13|          0.0|     5.0|    -2.8| 1.3|    sun|
|2012-01-14|          4.1|     4.4|     0.6| 5.3|   snow|
|2012-01-15|  