# Exercises
Within your `codeup-data-science` directory, create a new repo named `spark-exercises`. This will be where you do your work for this module. Create a repository on GitHub with the same name, and link your local repository to GitHub.

Save this work in your `spark-exercises` repo. Then add, commit, and push your changes.

Create a jupyter notebook or python script named `spark101` for this exercise.

In [146]:
import pandas as pd
import numpy as np
import pyspark
from pyspark.sql.functions import min, max, mean, concat, lit, when, substring_index, count, round, sum
from pydataset import data
from pyspark.sql.functions import asc, desc, col, substring, expr

In [2]:
spark = pyspark.sql.SparkSession.builder.getOrCreate()

# 1. Create a spark data frame that contains your favorite programming languages.

In [3]:
pd_df = pd.DataFrame({
    
    "language": ["python", "sql", "html", "java", "js", "cplusplus"]
})

In [4]:
pd_df

Unnamed: 0,language
0,python
1,sql
2,html
3,java
4,js
5,cplusplus


In [5]:
df = spark.createDataFrame(pd_df)
df

DataFrame[language: string]

- The name of the column should be `language`
- View the schema of the dataframe

In [6]:
df.printSchema()

root
 |-- language: string (nullable = true)



- Output the shape of the dataframe

In [7]:
df.describe().show()

+-------+---------+
|summary| language|
+-------+---------+
|  count|        6|
|   mean|     null|
| stddev|     null|
|    min|cplusplus|
|    max|      sql|
+-------+---------+



In [58]:
print(df.count(), "rows", len(df.columns), "columns")

6 rows 1 columns


- Show the first 5 records in the dataframe

In [8]:
df.show(5)

+--------+
|language|
+--------+
|  python|
|     sql|
|    html|
|    java|
|      js|
+--------+
only showing top 5 rows



# 2. Load the `mpg` dataset as a spark dataframe.

In [9]:
mpg = data('mpg')

In [10]:
mpg.head(3)

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,cty,hwy,fl,class
1,audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact
2,audi,a4,1.8,1999,4,manual(m5),f,21,29,p,compact
3,audi,a4,2.0,2008,4,manual(m6),f,20,31,p,compact


In [11]:
smpg = spark.createDataFrame(mpg)
smpg

DataFrame[manufacturer: string, model: string, displ: double, year: bigint, cyl: bigint, trans: string, drv: string, cty: bigint, hwy: bigint, fl: string, class: string]

## a. Create 1 column of output that contains a message like the one below:
`The 1999 audi a4 has a 4 cylinder engine.`
For each vehicle.

In [12]:
smpg.select(concat(lit('The '), 
                 'year',
                 lit(' '),
                 'model',
                 lit(' has a '),
                 'cyl',
                 lit(' cylinder engine.'),
                ).alias('summary_sentence')).show(5)

+--------------------+
|    summary_sentence|
+--------------------+
|The 1999 a4 has a...|
|The 1999 a4 has a...|
|The 2008 a4 has a...|
|The 2008 a4 has a...|
|The 1999 a4 has a...|
+--------------------+
only showing top 5 rows



In [159]:
smpg.select(concat(lit('The '), 
                 'year',
                 lit(' '),
                 'model',
                 lit(' has a '),
                 'cyl',
                 lit(' cylinder engine.'),
                ).alias('summary_sentence')).show(truncate=False)

+----------------------------------------------------+
|summary_sentence                                    |
+----------------------------------------------------+
|The 1999 a4 has a 4 cylinder engine.                |
|The 1999 a4 has a 4 cylinder engine.                |
|The 2008 a4 has a 4 cylinder engine.                |
|The 2008 a4 has a 4 cylinder engine.                |
|The 1999 a4 has a 6 cylinder engine.                |
|The 1999 a4 has a 6 cylinder engine.                |
|The 2008 a4 has a 6 cylinder engine.                |
|The 1999 a4 quattro has a 4 cylinder engine.        |
|The 1999 a4 quattro has a 4 cylinder engine.        |
|The 2008 a4 quattro has a 4 cylinder engine.        |
|The 2008 a4 quattro has a 4 cylinder engine.        |
|The 1999 a4 quattro has a 6 cylinder engine.        |
|The 1999 a4 quattro has a 6 cylinder engine.        |
|The 2008 a4 quattro has a 6 cylinder engine.        |
|The 2008 a4 quattro has a 6 cylinder engine.        |
|The 1999 

## b. Transform the `trans` column so that it only contains either `manual` or `auto`.

In [13]:
smpg.printSchema()

root
 |-- manufacturer: string (nullable = true)
 |-- model: string (nullable = true)
 |-- displ: double (nullable = true)
 |-- year: long (nullable = true)
 |-- cyl: long (nullable = true)
 |-- trans: string (nullable = true)
 |-- drv: string (nullable = true)
 |-- cty: long (nullable = true)
 |-- hwy: long (nullable = true)
 |-- fl: string (nullable = true)
 |-- class: string (nullable = true)



In [14]:
smpg.where(smpg.trans == 'auto').show()

+------------+-----+-----+----+---+-----+---+---+---+---+-----+
|manufacturer|model|displ|year|cyl|trans|drv|cty|hwy| fl|class|
+------------+-----+-----+----+---+-----+---+---+---+---+-----+
+------------+-----+-----+----+---+-----+---+---+---+---+-----+



In [15]:
smpg.select('trans', substring_index(smpg.trans, '(', 1).alias('trimmed')).show()

+----------+-------+
|     trans|trimmed|
+----------+-------+
|  auto(l5)|   auto|
|manual(m5)| manual|
|manual(m6)| manual|
|  auto(av)|   auto|
|  auto(l5)|   auto|
|manual(m5)| manual|
|  auto(av)|   auto|
|manual(m5)| manual|
|  auto(l5)|   auto|
|manual(m6)| manual|
|  auto(s6)|   auto|
|  auto(l5)|   auto|
|manual(m5)| manual|
|  auto(s6)|   auto|
|manual(m6)| manual|
|  auto(l5)|   auto|
|  auto(s6)|   auto|
|  auto(s6)|   auto|
|  auto(l4)|   auto|
|  auto(l4)|   auto|
+----------+-------+
only showing top 20 rows



# 3. Load the `tips` dataset as a spark dataframe.

In [16]:
tip = data('tips')

In [17]:
tip.head(2)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
1,16.99,1.01,Female,No,Sun,Dinner,2
2,10.34,1.66,Male,No,Sun,Dinner,3


In [18]:
tips = spark.createDataFrame(tip)
tips

DataFrame[total_bill: double, tip: double, sex: string, smoker: string, day: string, time: string, size: bigint]

## a. What percentage of observations are smokers?

In [19]:
tips.filter(tips.smoker == 'Yes').count()

93

In [20]:
tips.count()

244

In [21]:
((tips.filter(tips.smoker == 'Yes').count()) / (tips.count())) * 100

38.114754098360656

## b. Create a column that contains the tip percentage

In [22]:
tips.select((tips.tip / tips.total_bill).alias('tip_pct')).show()

+-------------------+
|            tip_pct|
+-------------------+
|0.05944673337257211|
|0.16054158607350097|
|0.16658733936220846|
| 0.1397804054054054|
|0.14680764538430255|
|0.18623962040332148|
|0.22805017103762829|
|0.11607142857142858|
|0.13031914893617022|
| 0.2185385656292287|
| 0.1665043816942551|
|0.14180374361883155|
|0.10181582360570687|
|0.16277807921866522|
|0.20364126770060686|
|0.18164967562557924|
| 0.1616650532429816|
|0.22774708410067526|
|0.20624631703005306|
|0.16222760290556903|
+-------------------+
only showing top 20 rows



In [23]:
tips = tips.select('*', (tips.tip / tips.total_bill).alias('tip_pct'))

In [37]:
tips.select(round(tips.tip_pct, 4).alias('r_tip_pct')).show()

+---------+
|r_tip_pct|
+---------+
|   0.0594|
|   0.1605|
|   0.1666|
|   0.1398|
|   0.1468|
|   0.1862|
|   0.2281|
|   0.1161|
|   0.1303|
|   0.2185|
|   0.1665|
|   0.1418|
|   0.1018|
|   0.1628|
|   0.2036|
|   0.1816|
|   0.1617|
|   0.2277|
|   0.2062|
|   0.1622|
+---------+
only showing top 20 rows



In [38]:
tips = tips.select('*', round(tips.tip_pct, 4).alias('r_tip_pct'))

In [39]:
tips.show()

+----------+----+------+------+---+------+----+-------------------+---------+
|total_bill| tip|   sex|smoker|day|  time|size|            tip_pct|r_tip_pct|
+----------+----+------+------+---+------+----+-------------------+---------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|0.05944673337257211|   0.0594|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|0.16054158607350097|   0.1605|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|0.16658733936220846|   0.1666|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2| 0.1397804054054054|   0.1398|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|0.14680764538430255|   0.1468|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|0.18623962040332148|   0.1862|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|0.22805017103762829|   0.2281|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|0.11607142857142858|   0.1161|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|0.13031914893617022|   0.1303|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2| 0.21853856562922

## c. Calculate the average tip percentage for each combination of sex and smoker.

In [42]:
tips.groupBy('smoker').pivot('sex').agg(round(mean('tip_pct'), 4)).show()

+------+------+------+
|smoker|Female|  Male|
+------+------+------+
|    No|0.1569|0.1607|
|   Yes|0.1822|0.1528|
+------+------+------+



In [41]:
tips.groupBy('smoker').pivot('sex').agg(round(mean('r_tip_pct'), 4)).show()

+------+------+------+
|smoker|Female|  Male|
+------+------+------+
|    No|0.1569|0.1607|
|   Yes|0.1821|0.1528|
+------+------+------+



# 4. Use the seattle weather dataset referenced in the lesson to answer the questions below.

In [44]:
from vega_datasets import data

In [45]:
weather = data.seattle_weather().assign(date=lambda df: df.date.astype(str))
sea = spark.createDataFrame(weather)
sea.show(6)

+----------+-------------+--------+--------+----+-------+
|      date|precipitation|temp_max|temp_min|wind|weather|
+----------+-------------+--------+--------+----+-------+
|2012-01-01|          0.0|    12.8|     5.0| 4.7|drizzle|
|2012-01-02|         10.9|    10.6|     2.8| 4.5|   rain|
|2012-01-03|          0.8|    11.7|     7.2| 2.3|   rain|
|2012-01-04|         20.3|    12.2|     5.6| 4.7|   rain|
|2012-01-05|          1.3|     8.9|     2.8| 6.1|   rain|
|2012-01-06|          2.5|     4.4|     2.2| 2.2|   rain|
+----------+-------------+--------+--------+----+-------+
only showing top 6 rows



## a. Convert the temperatures to fahrenheit.

In [47]:
sea.select((sea.temp_max * (9/5)) + 32).show()

+-----------------------+
|((temp_max * 1.8) + 32)|
+-----------------------+
|     55.040000000000006|
|                  51.08|
|                  53.06|
|                  53.96|
|     48.019999999999996|
|                  39.92|
|                  44.96|
|                   50.0|
|                  48.92|
|     42.980000000000004|
|     42.980000000000004|
|     42.980000000000004|
|                   41.0|
|                  39.92|
|                  33.98|
|                  35.06|
|                  37.94|
|                   32.0|
|                  30.02|
|                  44.96|
+-----------------------+
only showing top 20 rows



In [52]:
maxtf = ((sea.temp_max * 1.8) + 32).alias('max_temp_F')

In [53]:
mintf = ((sea.temp_min * 1.8) + 32).alias('min_temp_F')

In [56]:
sea.select('*', round(maxtf,2).alias('max_temp_F'), round(mintf, 2).alias('min_temp_F')).show(4)

+----------+-------------+--------+--------+----+-------+----------+----------+
|      date|precipitation|temp_max|temp_min|wind|weather|max_temp_F|min_temp_F|
+----------+-------------+--------+--------+----+-------+----------+----------+
|2012-01-01|          0.0|    12.8|     5.0| 4.7|drizzle|     55.04|      41.0|
|2012-01-02|         10.9|    10.6|     2.8| 4.5|   rain|     51.08|     37.04|
|2012-01-03|          0.8|    11.7|     7.2| 2.3|   rain|     53.06|     44.96|
|2012-01-04|         20.3|    12.2|     5.6| 4.7|   rain|     53.96|     42.08|
+----------+-------------+--------+--------+----+-------+----------+----------+
only showing top 4 rows



## b. Which month has the most rain, on AVERAGE?

In [61]:
sea.printSchema

<bound method DataFrame.printSchema of DataFrame[date: string, precipitation: double, temp_max: double, temp_min: double, wind: double, weather: string]>

In [57]:
from pyspark.sql.functions import month, year, quarter

In [92]:
(sea.withColumn("month", month("date"))
.groupBy("month")
.agg(round(mean("precipitation"), 2).alias("avg_rainfall"))
.sort(desc("avg_rainfall"))
.show(1))

+-----+------------+
|month|avg_rainfall|
+-----+------------+
|   11|        5.35|
+-----+------------+
only showing top 1 row



In [75]:
rain_df = (sea.withColumn("month", month("date"))
.groupBy("month")
.agg(round(sum("precipitation"), 2).alias("total_rainfall"))
.sort("month")
)

In [90]:
rain_df.sort(desc('total_rainfall')).show(1)

+-----+--------------+
|month|total_rainfall|
+-----+--------------+
|   11|         642.5|
+-----+--------------+
only showing top 1 row



## c. Which year was the windiest?

In [145]:
(sea.withColumn("year", month("date"))
.groupBy("year")
.agg(round(mean("wind"), 2).alias("avg_windspeed"))
.sort(desc("avg_windspeed"))
.show(1)
)

+----+-------------+
|year|avg_windspeed|
+----+-------------+
|   2|         3.79|
+----+-------------+
only showing top 1 row



In [94]:
(sea.withColumn("month", month("date"))
.groupBy("month")
.agg(round(mean("wind"), 2).alias("avg_windspeed"))
.sort(desc("avg_windspeed"))
.show(1)
)

+-----+-------------+
|month|avg_windspeed|
+-----+-------------+
|    2|         3.79|
+-----+-------------+
only showing top 1 row



In [96]:
wind_df = (sea.withColumn("month", month("date"))
.groupBy("month")
.agg(round(mean("wind"), 2).alias("avg_windspeed"))
.sort("month")
)

In [97]:
wind_df.sort(desc('avg_windspeed')).show(1)

+-----+-------------+
|month|avg_windspeed|
+-----+-------------+
|    2|         3.79|
+-----+-------------+
only showing top 1 row



## d. What is the most frequent type of weather in January?

In [104]:
sea.select('*', substring('date', 6, 2).alias('month')).show()

+----------+-------------+--------+--------+----+-------+-----+
|      date|precipitation|temp_max|temp_min|wind|weather|month|
+----------+-------------+--------+--------+----+-------+-----+
|2012-01-01|          0.0|    12.8|     5.0| 4.7|drizzle|   01|
|2012-01-02|         10.9|    10.6|     2.8| 4.5|   rain|   01|
|2012-01-03|          0.8|    11.7|     7.2| 2.3|   rain|   01|
|2012-01-04|         20.3|    12.2|     5.6| 4.7|   rain|   01|
|2012-01-05|          1.3|     8.9|     2.8| 6.1|   rain|   01|
|2012-01-06|          2.5|     4.4|     2.2| 2.2|   rain|   01|
|2012-01-07|          0.0|     7.2|     2.8| 2.3|   rain|   01|
|2012-01-08|          0.0|    10.0|     2.8| 2.0|    sun|   01|
|2012-01-09|          4.3|     9.4|     5.0| 3.4|   rain|   01|
|2012-01-10|          1.0|     6.1|     0.6| 3.4|   rain|   01|
|2012-01-11|          0.0|     6.1|    -1.1| 5.1|    sun|   01|
|2012-01-12|          0.0|     6.1|    -1.7| 1.9|    sun|   01|
|2012-01-13|          0.0|     5.0|    -

In [105]:
sea = sea.select('*', substring('date', 6, 2).alias('month'))

In [115]:
sea.crosstab('weather', 'month').show()

+-------------+---+---+---+---+---+---+---+---+---+---+---+---+
|weather_month| 01| 02| 03| 04| 05| 06| 07| 08| 09| 10| 11| 12|
+-------------+---+---+---+---+---+---+---+---+---+---+---+---+
|         snow|  8|  3|  6|  1|  0|  0|  0|  0|  0|  0|  0|  5|
|          fog| 38| 36| 36| 34| 25| 14| 13| 16| 40| 55| 50| 54|
|          sun| 33| 30| 42| 61| 82| 85| 89| 94| 71| 45| 42| 40|
|         rain| 35| 40| 37| 20| 16| 19| 14|  6|  4| 20| 25| 23|
|      drizzle| 10|  4|  3|  4|  1|  2|  8|  8|  5|  4|  3|  2|
+-------------+---+---+---+---+---+---+---+---+---+---+---+---+



In [118]:
sea.filter(sea.month == '01').crosstab('weather', 'month').show()

+-------------+---+
|weather_month| 01|
+-------------+---+
|         snow|  8|
|          fog| 38|
|          sun| 33|
|         rain| 35|
|      drizzle| 10|
+-------------+---+



In [117]:
sea.filter(sea.month == '01').crosstab('weather', 'month').sort(desc('01')).show()

+-------------+---+
|weather_month| 01|
+-------------+---+
|          fog| 38|
|         rain| 35|
|          sun| 33|
|      drizzle| 10|
|         snow|  8|
+-------------+---+



In [119]:
sea.filter(sea.month == '01').crosstab('weather', 'month').sort(desc('01')).show(1)

+-------------+---+
|weather_month| 01|
+-------------+---+
|          fog| 38|
+-------------+---+
only showing top 1 row



## e. What is the average high and low temperature on sunny days in July in 2013 and 2014?

In [121]:
sea.select('*', substring('date', 1, 4).alias('year')).show()

+----------+-------------+--------+--------+----+-------+-----+----+
|      date|precipitation|temp_max|temp_min|wind|weather|month|year|
+----------+-------------+--------+--------+----+-------+-----+----+
|2012-01-01|          0.0|    12.8|     5.0| 4.7|drizzle|   01|2012|
|2012-01-02|         10.9|    10.6|     2.8| 4.5|   rain|   01|2012|
|2012-01-03|          0.8|    11.7|     7.2| 2.3|   rain|   01|2012|
|2012-01-04|         20.3|    12.2|     5.6| 4.7|   rain|   01|2012|
|2012-01-05|          1.3|     8.9|     2.8| 6.1|   rain|   01|2012|
|2012-01-06|          2.5|     4.4|     2.2| 2.2|   rain|   01|2012|
|2012-01-07|          0.0|     7.2|     2.8| 2.3|   rain|   01|2012|
|2012-01-08|          0.0|    10.0|     2.8| 2.0|    sun|   01|2012|
|2012-01-09|          4.3|     9.4|     5.0| 3.4|   rain|   01|2012|
|2012-01-10|          1.0|     6.1|     0.6| 3.4|   rain|   01|2012|
|2012-01-11|          0.0|     6.1|    -1.1| 5.1|    sun|   01|2012|
|2012-01-12|          0.0|     6.1

In [122]:
sea = sea.select('*', substring('date', 1, 4).alias('year'))

In [125]:
sea.filter(sea.month == "07").show()

+----------+-------------+--------+--------+----+-------+-----+----+
|      date|precipitation|temp_max|temp_min|wind|weather|month|year|
+----------+-------------+--------+--------+----+-------+-----+----+
|2012-07-01|          0.0|    20.0|    12.2| 2.3|   rain|   07|2012|
|2012-07-02|          2.0|    18.9|    11.7| 2.1|   rain|   07|2012|
|2012-07-03|          5.8|    18.3|    10.6| 6.0|   rain|   07|2012|
|2012-07-04|          0.0|    20.6|     9.4| 3.8|    sun|   07|2012|
|2012-07-05|          0.0|    24.4|    10.6| 3.1|drizzle|   07|2012|
|2012-07-06|          0.0|    25.0|    11.1| 2.1|    sun|   07|2012|
|2012-07-07|          0.0|    26.7|    12.8| 3.8|    sun|   07|2012|
|2012-07-08|          0.0|    28.3|    14.4| 2.8|   rain|   07|2012|
|2012-07-09|          1.5|    25.0|    12.8| 2.0|   rain|   07|2012|
|2012-07-10|          0.0|    23.9|    11.1| 2.3|drizzle|   07|2012|
|2012-07-11|          0.0|    27.8|    13.3| 2.9|    fog|   07|2012|
|2012-07-12|          0.0|    25.6

In [130]:
sea.filter(sea.month == "07").where(sea.weather == "sun").show()

+----------+-------------+--------+--------+----+-------+-----+----+
|      date|precipitation|temp_max|temp_min|wind|weather|month|year|
+----------+-------------+--------+--------+----+-------+-----+----+
|2012-07-04|          0.0|    20.6|     9.4| 3.8|    sun|   07|2012|
|2012-07-06|          0.0|    25.0|    11.1| 2.1|    sun|   07|2012|
|2012-07-07|          0.0|    26.7|    12.8| 3.8|    sun|   07|2012|
|2012-07-17|          0.0|    21.7|    15.0| 2.6|    sun|   07|2012|
|2012-07-18|          0.0|    21.1|    14.4| 2.9|    sun|   07|2012|
|2012-07-19|          0.0|    25.0|    14.4| 2.2|    sun|   07|2012|
|2012-07-21|          0.0|    23.9|    13.9| 2.3|    sun|   07|2012|
|2012-07-24|          0.0|    23.3|    12.2| 4.3|    sun|   07|2012|
|2012-07-25|          0.0|    26.7|    12.8| 2.6|    sun|   07|2012|
|2012-07-29|          0.0|    22.8|    15.0| 2.0|    sun|   07|2012|
|2012-07-30|          0.0|    19.4|    13.3| 3.0|    sun|   07|2012|
|2012-07-31|          0.0|    22.8

In [131]:
(sea.filter(sea.month == "07")
.where(sea.weather == "sun")
.where((sea.year == "2013") | (sea.year == "2014"))
.show())

+----------+-------------+--------+--------+----+-------+-----+----+
|      date|precipitation|temp_max|temp_min|wind|weather|month|year|
+----------+-------------+--------+--------+----+-------+-----+----+
|2013-07-01|          0.0|    31.7|    18.3| 2.3|    sun|   07|2013|
|2013-07-02|          0.0|    28.3|    15.6| 3.0|    sun|   07|2013|
|2013-07-03|          0.0|    26.1|    16.7| 3.2|    sun|   07|2013|
|2013-07-05|          0.0|    23.3|    13.9| 2.6|    sun|   07|2013|
|2013-07-06|          0.0|    26.1|    13.3| 2.2|    sun|   07|2013|
|2013-07-07|          0.0|    23.9|    13.9| 2.9|    sun|   07|2013|
|2013-07-08|          0.0|    26.7|    13.3| 2.8|    sun|   07|2013|
|2013-07-09|          0.0|    30.0|    15.0| 2.5|    sun|   07|2013|
|2013-07-10|          0.0|    22.2|    13.9| 2.6|    sun|   07|2013|
|2013-07-11|          0.0|    22.8|    12.2| 3.0|    sun|   07|2013|
|2013-07-12|          0.0|    19.4|    13.3| 2.2|    sun|   07|2013|
|2013-07-13|          0.0|    26.1

In [137]:
(sea.filter(sea.month == "07")
.where(sea.weather == "sun")
.where((sea.year == "2013") | (sea.year == "2014"))
.groupBy('year').agg(round(mean('temp_max'), 2).alias('avg_max_temp_sunny_july'))
.show())

+----+-----------------------+
|year|avg_max_temp_sunny_july|
+----+-----------------------+
|2014|                  27.09|
|2013|                  26.59|
+----+-----------------------+



In [139]:
maxtemp7 = (sea.filter(sea.month == "07")
.where(sea.weather == "sun")
.where((sea.year == "2013") | (sea.year == "2014"))
.groupBy('year').agg(round(mean('temp_max'), 2).alias('avg_max_temp_sunny_july'))
)

In [138]:
(sea.filter(sea.month == "07")
.where(sea.weather == "sun")
.where((sea.year == "2013") | (sea.year == "2014"))
.groupBy('year').agg(round(mean('temp_min'), 2).alias('avg_min_temp_sunny_july'))
.show())

+----+-----------------------+
|year|avg_min_temp_sunny_july|
+----+-----------------------+
|2014|                   14.4|
|2013|                  13.98|
+----+-----------------------+



In [140]:
mintemp7 = (sea.filter(sea.month == "07")
.where(sea.weather == "sun")
.where((sea.year == "2013") | (sea.year == "2014"))
.groupBy('year').agg(round(mean('temp_min'), 2).alias('avg_min_temp_sunny_july'))
)

In [144]:
mintemp7.join(maxtemp7, on='year').show()

+----+-----------------------+-----------------------+
|year|avg_min_temp_sunny_july|avg_max_temp_sunny_july|
+----+-----------------------+-----------------------+
|2014|                   14.4|                  27.09|
|2013|                  13.98|                  26.59|
+----+-----------------------+-----------------------+



## f. What percentage of days were rainy in q3 of 2015?

In [149]:
(sea.filter(sea.year == "2015")
.where((sea.weather == "rain") | (sea.weather == "drizzle"))
.show())

+----------+-------------+--------+--------+----+-------+-----+----+
|      date|precipitation|temp_max|temp_min|wind|weather|month|year|
+----------+-------------+--------+--------+----+-------+-----+----+
|2015-01-18|         21.3|    13.9|     7.2| 6.6|   rain|   01|2015|
|2015-04-01|          5.1|    12.8|     5.6| 3.2|   rain|   04|2015|
|2015-06-15|          0.0|    30.0|    16.1| 3.5|drizzle|   06|2015|
|2015-07-06|          0.0|    29.4|    15.6| 3.2|drizzle|   07|2015|
|2015-07-08|          0.0|    30.0|    14.4| 1.9|drizzle|   07|2015|
|2015-08-12|          7.6|    28.3|    16.7| 2.7|   rain|   08|2015|
|2015-08-14|         30.5|    18.3|    15.0| 5.2|   rain|   08|2015|
|2015-08-19|          0.0|    31.7|    16.1| 2.1|drizzle|   08|2015|
|2015-08-22|          0.0|    26.7|    12.2| 2.5|drizzle|   08|2015|
|2015-08-23|          0.0|    27.8|    13.9| 1.8|drizzle|   08|2015|
|2015-10-06|          0.0|    18.3|    10.0| 2.6|drizzle|   10|2015|
|2015-10-25|          8.9|    19.4

In [152]:
(sea.filter(sea.year == "2015")
.withColumn("quarter", quarter("date"))
.crosstab("quarter", "year")
.sort("quarter_year")
.show())

+------------+----+
|quarter_year|2015|
+------------+----+
|           1|  90|
|           2|  91|
|           3|  92|
|           4|  92|
+------------+----+



In [153]:
quarters15 = (sea.filter(sea.year == "2015")
.withColumn("quarter", quarter("date"))
.crosstab("quarter", "year")
.sort("quarter_year")
)

In [156]:
(sea.filter(sea.year == "2015")).count()

365

In [158]:
(quarters15['2015'] / (sea.filter(sea.year == "2015")).count()).show()

TypeError: 'Column' object is not callable

In [None]:
quarters15.select()

## g. For each year, find what percentage of days it rained (had non-zero precipitation).