# Exercises

### 1.) Create a spark data frame that contains your favorite programming languages.

The name of the column should be language

View the schema of the dataframe

Output the shape of the dataframe

Show the first 5 records in the dataframe

In [1]:
import pyspark

spark = pyspark.sql.SparkSession.builder.getOrCreate()

In [2]:
import pandas as pd
import numpy as np

np.random.seed(123)

table_data = {"language" : ["Python", "Ruby", "HTML", "Java", "C++", "RedLion", "Honeywell"],
             "experience_level" : [2, 1, 3, 1, 0, 3, 3],
             "teachers" : ["Ryan", "Zach", "Self", "Ryan", "Zach", "Self", "Randy"]}

pandas_df = pd.DataFrame(table_data, columns=["language", "experience_level", "teachers"])

pandas_df

Unnamed: 0,language,experience_level,teachers
0,Python,2,Ryan
1,Ruby,1,Zach
2,HTML,3,Self
3,Java,1,Ryan
4,C++,0,Zach
5,RedLion,3,Self
6,Honeywell,3,Randy


In [3]:
# Turning it into a Spark dataframe:

df = spark.createDataFrame(pandas_df)
df

DataFrame[language: string, experience_level: bigint, teachers: string]

In [4]:
# Show the schema (structure)

df.printSchema()

root
 |-- language: string (nullable = true)
 |-- experience_level: long (nullable = true)
 |-- teachers: string (nullable = true)



In [5]:
# Show the shape

df_shape = df.count(), len(df.columns)

print(df.count(), "rows", len(df.columns), "columns")



# def spark_shape(self):
#     return (self.count(), len(self.columns))
# pyspark.sql.dataframe.DataFrame.shape = spark_shape

7 rows 3 columns


In [6]:
df.show(5)

+--------+----------------+--------+
|language|experience_level|teachers|
+--------+----------------+--------+
|  Python|               2|    Ryan|
|    Ruby|               1|    Zach|
|    HTML|               3|    Self|
|    Java|               1|    Ryan|
|     C++|               0|    Zach|
+--------+----------------+--------+
only showing top 5 rows



In [7]:
df.describe()

DataFrame[summary: string, language: string, experience_level: string, teachers: string]

In [8]:
df.describe().show()

+-------+--------+------------------+--------+
|summary|language|  experience_level|teachers|
+-------+--------+------------------+--------+
|  count|       7|                 7|       7|
|   mean|    null|1.8571428571428572|    null|
| stddev|    null|1.2149857925879117|    null|
|    min|     C++|                 0|   Randy|
|    max|    Ruby|                 3|    Zach|
+-------+--------+------------------+--------+



### 2.) Load the mpg dataset as a spark dataframe.

Create 1 column of output that contains a message like the one below:


The 1999 audi a4 has a 4 cylinder engine.
For each vehicle.

Transform the trans column so that it only contains either manual or auto.

In [9]:
# Loading mpg

from pydataset import data

mpg = spark.createDataFrame(data("mpg"))

mpg.show(10)
# from pydataset import data

# mpg = spark.createDataFrame(data("mpg"))
# mpg.show(5)

+------------+----------+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|     model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+----------+-----+----+---+----------+---+---+---+---+-------+
|        audi|        a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|        a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|        a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|        a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|        a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
|        audi|        a4|  2.8|1999|  6|manual(m5)|  f| 18| 26|  p|compact|
|        audi|        a4|  3.1|2008|  6|  auto(av)|  f| 18| 27|  p|compact|
|        audi|a4 quattro|  1.8|1999|  4|manual(m5)|  4| 18| 26|  p|compact|
|        audi|a4 quattro|  1.8|1999|  4|  auto(l5)|  4| 16| 25|  p|compact|
|        audi|a4 quattro|  2.0|2008|  4|manual(m6)|  4| 20| 28|  p|compact|
+-----------

In [10]:
mpg.printSchema()

root
 |-- manufacturer: string (nullable = true)
 |-- model: string (nullable = true)
 |-- displ: double (nullable = true)
 |-- year: long (nullable = true)
 |-- cyl: long (nullable = true)
 |-- trans: string (nullable = true)
 |-- drv: string (nullable = true)
 |-- cty: long (nullable = true)
 |-- hwy: long (nullable = true)
 |-- fl: string (nullable = true)
 |-- class: string (nullable = true)



In [11]:
# creating a column of output with that reading:

from pyspark.sql.functions import col, expr, lit, when


mpg = mpg.withColumn('details', lit('these are the details'))

display(mpg)

# # The 1999 audi a4 has a 4 cylinder engine. For each vehicle.

# from pyspark.sql.functions import col, expr, lit, when

# def car_description(year, model, car):
    
#     date = mpg.select(mpg.year)
#     maker = mpg.select(mpg.manufacturer)
#     style = mpg.select(mpg.model)
#     pistons = mpg.select(mpg.cyl)

#     return f"The {date} {maker} {style} has a {pistons} engine."

# car_description("2008", "a4 quattro", "audi" )

DataFrame[manufacturer: string, model: string, displ: double, year: bigint, cyl: bigint, trans: string, drv: string, cty: bigint, hwy: bigint, fl: string, class: string, details: string]

In [12]:
mpg.describe().show()

+-------+------------+-----------------+------------------+-----------------+------------------+----------+---+------------------+-----------------+----+-------+--------------------+
|summary|manufacturer|            model|             displ|             year|               cyl|     trans|drv|               cty|              hwy|  fl|  class|             details|
+-------+------------+-----------------+------------------+-----------------+------------------+----------+---+------------------+-----------------+----+-------+--------------------+
|  count|         234|              234|               234|              234|               234|       234|234|               234|              234| 234|    234|                 234|
|   mean|        null|             null| 3.471794871794873|           2003.5| 5.888888888888889|      null|4.0|16.858974358974358|23.44017094017094|null|   null|                null|
| stddev|        null|             null|1.2919590310839348|4.509646313320409|1.611534

**^^ Need Help with this one.  Spent WAYYYY too long on it.**

In [13]:
# transform the columns

mpg.explain() # '.explain()' shows us how spark is thinking about our data

== Physical Plan ==
*(1) Project [manufacturer#403, model#404, displ#405, year#406L, cyl#407L, trans#408, drv#409, cty#410L, hwy#411L, fl#412, class#413, these are the details AS details#459]
+- Scan ExistingRDD[manufacturer#403,model#404,displ#405,year#406L,cyl#407L,trans#408,drv#409,cty#410L,hwy#411L,fl#412,class#413]


In [14]:
mpg.select(mpg.trans).explain() # shows us how Spark is thinking about our 'trans' column

== Physical Plan ==
*(1) Project [trans#408]
+- Scan ExistingRDD[manufacturer#403,model#404,displ#405,year#406L,cyl#407L,trans#408,drv#409,cty#410L,hwy#411L,fl#412,class#413]


# Maggie's Walkthrough

### 1.) Create a spark data frame that contains your favorite programming languages.

The name of the column should be language

View the schema of the dataframe

Output the shape of the dataframe

Show the first 5 records in the dataframe

In [15]:
import pyspark

import pandas as pd

from pyspark.sql.functions import *

In [16]:
# Create dataframe of languages w/ one (1) column named 'language'

pd_df = pd.DataFrame({"language" : ["r", "python", "sql", "english", "spanish", "french", "julia", "pig latin", "lorem ipsum"]})

df = spark.createDataFrame(pd_df)

df

DataFrame[language: string]

In [17]:
# View the structure, or "schema"

df.printSchema()

root
 |-- language: string (nullable = true)



In [18]:
# View the shape (her way)

print("DataFrame Shape: ", df.count(), " rows X " , len(df.columns), " column")


DataFrame Shape:  9  rows X  1  column


In [19]:
# The first 5 records:

df.show(5)

+--------+
|language|
+--------+
|       r|
|  python|
|     sql|
| english|
| spanish|
+--------+
only showing top 5 rows



### 2.) Load the mpg dataset as a spark dataframe.

Create 1 column of output that contains a message like the one below:


The 1999 audi a4 has a 4 cylinder engine.
For each vehicle.

Transform the trans column so that it only contains either manual or auto.

In [20]:
# Create 1 column of output that contains a message like the one below:
# "The 1999 audi a4 has a 4 cylinder engine. For each vehicle."

# * NB: the 'mpg' data was loaded from pydataset in the cells above, with 
# mpg = spark.createDataFrame(pydataset.data('mpg'))

mpg.select(concat(
    lit('The '),
    col('year'), 
    lit(' '), 
    col('manufacturer'), 
    lit(' '), 
    col('model'), 
    lit(' has a '), 
    col('cyl'), 
    lit(' cylinder engine.')).alias('vehicle_cylinder_desc')).show(truncate = False)

# so instead of f-strings, Spark seems to use 'col.'  If you want the data from that 
# column, just 'col('column_name').'  Whatever words go between the column data are 
# represented with 'lit('words_I_want_to_use').'

+--------------------------------------------------------------+
|vehicle_cylinder_desc                                         |
+--------------------------------------------------------------+
|The 1999 audi a4 has a 4 cylinder engine.                     |
|The 1999 audi a4 has a 4 cylinder engine.                     |
|The 2008 audi a4 has a 4 cylinder engine.                     |
|The 2008 audi a4 has a 4 cylinder engine.                     |
|The 1999 audi a4 has a 6 cylinder engine.                     |
|The 1999 audi a4 has a 6 cylinder engine.                     |
|The 2008 audi a4 has a 6 cylinder engine.                     |
|The 1999 audi a4 quattro has a 4 cylinder engine.             |
|The 1999 audi a4 quattro has a 4 cylinder engine.             |
|The 2008 audi a4 quattro has a 4 cylinder engine.             |
|The 2008 audi a4 quattro has a 4 cylinder engine.             |
|The 1999 audi a4 quattro has a 6 cylinder engine.             |
|The 1999 audi a4 quattro

In [21]:
# Transforming 'trans' column to only show either 'manual' or 'auto.'  
# Apparently, there are a lot of ways to do this, but Maggie's going with 
# regex extract, replace, and when

mpg.select(
    regexp_extract('trans', r'^(\w+)\(', 1).alias('trans_extract'),
    # she's regexing from the start of the string (^) and all the letters in the
    # capture group (\w+) and ASK HERE
    regexp_replace('trans', r'\(.+$', '').alias('trans_replace'),
    when(mpg.trans.like('auto%'), 'auto').otherwise('manual').alias('trans_when')).show()

+-------------+-------------+----------+
|trans_extract|trans_replace|trans_when|
+-------------+-------------+----------+
|         auto|         auto|      auto|
|       manual|       manual|    manual|
|       manual|       manual|    manual|
|         auto|         auto|      auto|
|         auto|         auto|      auto|
|       manual|       manual|    manual|
|         auto|         auto|      auto|
|       manual|       manual|    manual|
|         auto|         auto|      auto|
|       manual|       manual|    manual|
|         auto|         auto|      auto|
|         auto|         auto|      auto|
|       manual|       manual|    manual|
|         auto|         auto|      auto|
|       manual|       manual|    manual|
|         auto|         auto|      auto|
|         auto|         auto|      auto|
|         auto|         auto|      auto|
|         auto|         auto|      auto|
|         auto|         auto|      auto|
+-------------+-------------+----------+
only showing top

### 3.) Load the tips dataset as a spark dataframe.

- What percentage of observations are smokers?

- Create a column that contains the tip percentage

- Calculate the average tip percentage for each combination of sex and smoker.

In [22]:
# load the dataset

from pydataset import data

tips = spark.createDataFrame(data("tips"))

tips.show(5)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
+----------+----+------+------+---+------+----+
only showing top 5 rows



In [23]:
# % of total are smokers?

tips.groupBy("smoker").count().withColumn("percent", 
                        concat(round((col('count')/tips.count()*100), 0).cast("int"),
                        lit("%"))).show()

# group 'tips' by smoker count, add column "percent," round the math that makes 
# "percent," make the data in that column an integer (.cast("int")), and add
# the literal '%' sign to the number (lit("%")

+------+-----+-------+
|smoker|count|percent|
+------+-----+-------+
|    No|  151|    62%|
|   Yes|   93|    38%|
+------+-----+-------+



In [28]:
smoker_prop = tips.groupBy("smoker").count()
smoker_prop.show()

smoker_prop.withColumn("percent", round((col("count") / tips.count() * 100), 0)).show()

# "percent" is the new column, and all the stuff after "percent" is the stuff that's in
# that new column

+------+-----+
|smoker|count|
+------+-----+
|    No|  151|
|   Yes|   93|
+------+-----+

+------+-----+-------+
|smoker|count|percent|
+------+-----+-------+
|    No|  151|   62.0|
|   Yes|   93|   38.0|
+------+-----+-------+



In [29]:
tips.withColumn("tip_percent", col("tip") / col("total_bill") * 100).show()

+----------+----+------+------+---+------+----+------------------+
|total_bill| tip|   sex|smoker|day|  time|size|       tip_percent|
+----------+----+------+------+---+------+----+------------------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|5.9446733372572105|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|16.054158607350097|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|16.658733936220845|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2| 13.97804054054054|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|14.680764538430255|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4| 18.62396204033215|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2| 22.80501710376283|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|11.607142857142858|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|13.031914893617023|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2|21.853856562922868|
|     10.27|1.71|  Male|    No|Sun|Dinner|   2| 16.65043816942551|
|     35.26| 5.0|Female|    No|Sun|Dinner|   4|14.180374361883

In [40]:
tips.groupBy("sex", "smoker").agg(mean(col("tip_percent")).alias("avg_tip_pct")).show()

AnalysisException: "cannot resolve '`tip_percent`' given input columns: [day, size, tip, sex, time, total_bill, smoker];;\n'Aggregate [sex#1259, smoker#1260], [sex#1259, smoker#1260, avg('tip_percent) AS avg_tip_pct#1516]\n+- LogicalRDD [total_bill#1257, tip#1258, sex#1259, smoker#1260, day#1261, time#1262, size#1263L], false\n"

### 4.) Use the seattle weather dataset referenced in the lesson to answer the questions below.

- Convert the temperatures to farenheight.

- Which month has the most rain, on average?

- Which year was the windiest?

- What is the most frequent type of weather in January?

- What is the average high and low tempurature on sunny days in July in 2013 and 2014?

- What percentage of days were rainy in q3 of 2015?

- For each year, find what percentage of days it rained (had non-zero precipitation).


In [41]:
# Get the dataset

import vega_datasets

from vega_datasets import data

weather = data.seattle_weather()

weather = spark.createDataFrame(weather)

weather.show(4)

+-------------------+-------------+--------+--------+----+-------+
|               date|precipitation|temp_max|temp_min|wind|weather|
+-------------------+-------------+--------+--------+----+-------+
|2012-01-01 00:00:00|          0.0|    12.8|     5.0| 4.7|drizzle|
|2012-01-02 00:00:00|         10.9|    10.6|     2.8| 4.5|   rain|
|2012-01-03 00:00:00|          0.8|    11.7|     7.2| 2.3|   rain|
|2012-01-04 00:00:00|         20.3|    12.2|     5.6| 4.7|   rain|
+-------------------+-------------+--------+--------+----+-------+
only showing top 4 rows



In [42]:
# Convert temps from C to F:

weather = (weather.withColumn("temp_max", (col("temp_max") * 9/5 +32))
          .withColumn("temp_min", (col("temp_min") * 9/5 + 32)))

# add column ("temp_max"), adjust from Celsius to Farenheit, do this again, and 
# assign all the math to the variable "weather.""

weather.show(5)

+-------------------+-------------+--------+--------+----+-------+
|               date|precipitation|temp_max|temp_min|wind|weather|
+-------------------+-------------+--------+--------+----+-------+
|2012-01-01 00:00:00|          0.0|   55.04|    41.0| 4.7|drizzle|
|2012-01-02 00:00:00|         10.9|   51.08|   37.04| 4.5|   rain|
|2012-01-03 00:00:00|          0.8|   53.06|   44.96| 2.3|   rain|
|2012-01-04 00:00:00|         20.3|   53.96|   42.08| 4.7|   rain|
|2012-01-05 00:00:00|          1.3|   48.02|   37.04| 6.1|   rain|
+-------------------+-------------+--------+--------+----+-------+
only showing top 5 rows



In [43]:
# month with the most rain on average

(weather
.withColumn("month", month("date"))
.withColumn("year", year("date"))
.groupBy("month", "year")
.agg(sum("precipitation").alias("total_monthly_precipitation"))
.groupBy("month")
.agg(mean("total_monthly_precipitation").alias("avg_monthly_rain"))
.sort(col("avg_monthly_rain").desc())
.first()
)

Row(month=11, avg_monthly_rain=160.625)

In [44]:
# windiest year

(weather
.withColumn("year", year("date"))
.groupby("year")
.agg(sum("wind").alias("total_winds"))
.sort(col("total_winds").desc())
.first()
)

Row(year=2012, total_winds=1244.6999999999998)

In [45]:
# most frequent type of weather in January

(weather
.withColumn("month", month("date"))
.filter(col("month") == 1)
.groupBy("weather")
.count()
.sort(col("count").desc())
.show()
)

+-------+-----+
|weather|count|
+-------+-----+
|    fog|   38|
|   rain|   35|
|    sun|   33|
|drizzle|   10|
|   snow|    8|
+-------+-----+



In [46]:
# avg hi and low temps on sunny days in July 2013 and 2014

(weather
.filter(month("date") == 7)
.filter(year("date") > 2012)
.filter(year("date") < 2015)
.filter(col("weather") == lit("sun"))
.agg(avg("temp_max").alias("average_hi_temp"), avg("temp_min").alias("average_low_temp"))
.show()
)


+-----------------+-----------------+
|  average_hi_temp| average_low_temp|
+-----------------+-----------------+
|80.29192307692308|57.52884615384615|
+-----------------+-----------------+



In [47]:
# percentage of rainy days in Q3 of 2015

# tip: measure a rainy day by weather == rain

(weather
.filter(year("date") == 2015)
.filter(quarter("date") == 3)
.select(when(col("weather") == "rain", 1).otherwise(0).alias("rain"))
.agg(mean("rain"))
.show()
)

+--------------------+
|           avg(rain)|
+--------------------+
|0.021739130434782608|
+--------------------+



In [48]:
# % of days it rained (had non-zero precipitation) for each year:

(weather
.filter(year("date") == 2015)
.filter(quarter("date") == 3)
.select(when(col("precipitation") > 0, 1).otherwise(0).alias("rain"))
.agg(mean("rain"))
.show()
)

+-------------------+
|          avg(rain)|
+-------------------+
|0.18478260869565216|
+-------------------+

