In [22]:
import pandas as pd
import numpy as np

import pyspark
from pydataset import data

#### Create a spark data frame that contains your favorite programming languages.

- The name of the column should be language
- View the schema of the dataframe
- Output the shape of the dataframe
- Show the first 5 records in the dataframe

In [4]:
# Obtain the SparkSession shared by every cell below (an existing one is reused).
spark = (
    pyspark.sql.SparkSession
    .builder
    .getOrCreate()
)

In [18]:
# Single-column pandas frame of languages; the column is named `language`.
langs = pd.DataFrame(
    {"language": ["python", "r", "java", "spark", "natural", "php"]}
)

In [19]:
# Convert the pandas frame of languages into a Spark DataFrame.
df = spark.createDataFrame(langs)

# View the schema (column names and the types Spark inferred).
df.printSchema()

# Spark DataFrames have no `.shape`; report it as (row count, column count).
print((df.count(), len(df.columns)))

In [20]:
# Show the first 5 records of the languages DataFrame.
df.show(n=5)

+--------+
|language|
+--------+
|  python|
|       r|
|    java|
|   spark|
| natural|
+--------+
only showing top 5 rows



#### Load the mpg dataset as a spark dataframe.

- Create 1 column of output that contains a message like the one below:
    - The 1999 audi a4 has a 4 cylinder engine.
    - For each vehicle.

- Transform the trans column so that it only contains either manual or auto.

In [40]:
# Fetch the 'mpg' sample dataset via `data(...)` and hand it to Spark.
mpg = spark.createDataFrame(
    data=data('mpg'),
)

In [46]:
from pyspark.sql.functions import concat, lit

In [49]:
# Quick look at the first 3 rows to see the available columns.
mpg.show(n=3)

+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|   a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
only showing top 3 rows



In [60]:
# Build one sentence per vehicle, e.g. "The 1999 audi a4 has a 4 cylinder engine."
# Literal fragments are wrapped in lit() so they can be concatenated with columns.
mpg.select(
    concat(
        lit('The '), mpg['year'],
        lit(' '), mpg['manufacturer'],
        lit(' '), mpg['model'],
        lit(' has a '), mpg['cyl'],
        lit(' cylinder engine.'),
    ).alias("description")
).show(truncate=False)

+--------------------------------------------------------------+
|description                                                   |
+--------------------------------------------------------------+
|The 1999 audi a4 has a 4 cylinder engine.                     |
|The 1999 audi a4 has a 4 cylinder engine.                     |
|The 2008 audi a4 has a 4 cylinder engine.                     |
|The 2008 audi a4 has a 4 cylinder engine.                     |
|The 1999 audi a4 has a 6 cylinder engine.                     |
|The 1999 audi a4 has a 6 cylinder engine.                     |
|The 2008 audi a4 has a 6 cylinder engine.                     |
|The 1999 audi a4 quattro has a 4 cylinder engine.             |
|The 1999 audi a4 quattro has a 4 cylinder engine.             |
|The 2008 audi a4 quattro has a 4 cylinder engine.             |
|The 2008 audi a4 quattro has a 4 cylinder engine.             |
|The 1999 audi a4 quattro has a 6 cylinder engine.             |
|The 1999 audi a4 quattro

In [61]:
from pyspark.sql.functions import regexp_extract

In [63]:
# Re-inspect the first 5 rows; note the `trans` values look like "auto(l5)".
mpg.show(n=5)

+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|   a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|   a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|   a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
only showing top 5 rows



In [81]:
# Keep only the first run of word characters in `trans`
# (e.g. "auto(l5)" -> "auto", "manual(m5)" -> "manual").
mpg.select(
    regexp_extract(mpg['trans'], r'(\w+)', 1).alias('transmission'),
).show()

+------------+
|transmission|
+------------+
|        auto|
|      manual|
|      manual|
|        auto|
|        auto|
|      manual|
|        auto|
|      manual|
|        auto|
|      manual|
|        auto|
|        auto|
|      manual|
|        auto|
|      manual|
|        auto|
|        auto|
|        auto|
|        auto|
|        auto|
+------------+
only showing top 20 rows



#### Load the tips dataset as a spark dataframe.
- What percentage of observations are smokers?
- Create a column that contains the tip percentage
- Calculate the average tip percentage for each combination of sex and smoker

In [84]:
# Fetch the 'tips' sample dataset via `data(...)` and hand it to Spark.
tips = spark.createDataFrame(
    data=data('tips'),
)

In [85]:
# Preview the tips data (20 rows is the default page size of show()).
tips.show(n=20)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2|
|     10.27|1.71|  Male|    No|Sun|Dinner|   2|
|     35.26| 5.0|Female|    No|Sun|Dinner|   4|
|     15.42|1.57|  Male|    No|Sun|Dinner|   2|
|     18.43| 3.0|  Male|    No|Sun|Dinner|   4|
|     14.83|3.02|Female|    No|Sun|Dinner|   2|
|     21.58|3.92|  Male|    No|Sun|Dinner|   2|
|     10.33|1.67|Female|    No|Sun|Dinner|   3|
|     16.29|3.71|  Male|    No|Sun|Dinne

In [101]:
# Share of observations whose smoker flag is 'Yes', expressed as a 0-1 fraction
# of all rows (multiply by 100 to read it as a percentage).
(
    tips.where(tips['smoker'] == 'Yes').count()
    / tips.count()
)

0.38114754098360654

In [110]:
from pyspark.sql.functions import round

In [155]:
# Tip percentage: tip divided by total bill, rounded to 2 decimal places.
# NOTE: `round` here is the Spark column function imported above, not the builtin.
tips.select(
    round(tips['tip'] / tips['total_bill'], 2).alias('percent'),
).show()

+-------+
|percent|
+-------+
|   0.06|
|   0.16|
|   0.17|
|   0.14|
|   0.15|
|   0.19|
|   0.23|
|   0.12|
|   0.13|
|   0.22|
|   0.17|
|   0.14|
|    0.1|
|   0.16|
|    0.2|
|   0.18|
|   0.16|
|   0.23|
|   0.21|
|   0.16|
+-------+
only showing top 20 rows



In [131]:
from pyspark.sql.functions import mean

In [154]:
# Average tip percentage (mean of the per-row tip / total_bill ratio)
# for each combination of sex and smoker.
(
    tips
    .groupby('sex', 'smoker')
    .agg(
        mean(tips['tip'] / tips['total_bill']).alias('tip_percent')
    )
    .show()
)

+------+------+-------------------+
|   sex|smoker|        tip_percent|
+------+------+-------------------+
|  Male|    No| 0.1606687151291298|
|  Male|   Yes| 0.1527711752024851|
|Female|    No| 0.1569209707691836|
|Female|   Yes|0.18215035269941035|
+------+------+-------------------+



#### Use the seattle weather dataset referenced in the lesson to answer the questions below.

- Convert the temperatures to Fahrenheit.
- Which month has the most rain, on average?
- Which year was the windiest?
- What is the most frequent type of weather in January?
- What are the average high and low temperatures on sunny days in July of 2013 and 2014?
- What percentage of days were rainy in Q3 of 2015?
- For each year, find what percentage of days it rained (had non-zero precipitation).

In [159]:
from vega_datasets import data

In [174]:
# Load the Seattle weather sample and store its `date` column as plain strings
# before the frame is handed to Spark.
weather = data('seattle_weather').assign(
    date=lambda frame: frame['date'].astype(str)
)

In [175]:
# Rebind `df` to the weather data as a Spark DataFrame.
df = spark.createDataFrame(
    data=weather,
)

In [177]:
# First 5 weather observations.
df.show(n=5)

+----------+-------------+--------+--------+----+-------+
|      date|precipitation|temp_max|temp_min|wind|weather|
+----------+-------------+--------+--------+----+-------+
|2012-01-01|          0.0|    12.8|     5.0| 4.7|drizzle|
|2012-01-02|         10.9|    10.6|     2.8| 4.5|   rain|
|2012-01-03|          0.8|    11.7|     7.2| 2.3|   rain|
|2012-01-04|         20.3|    12.2|     5.6| 4.7|   rain|
|2012-01-05|          1.3|     8.9|     2.8| 6.1|   rain|
+----------+-------------+--------+--------+----+-------+
only showing top 5 rows



In [183]:
# Convert the temperatures to Fahrenheit using F = C * 9/5 + 32
# (assumes the source columns are in Celsius -- confirm against the dataset docs).
df.select(
    (df['temp_max'] * 9 / 5 + 32).alias('temp_max_F'),
    (df['temp_min'] * 9 / 5 + 32).alias('temp_min_F'),
).show(n=5)

+----------+----------+
|temp_max_F|temp_min_F|
+----------+----------+
|     55.04|      41.0|
|     51.08|     37.04|
|     53.06|     44.96|
|     53.96|     42.08|
|     48.02|     37.04|
+----------+----------+
only showing top 5 rows



In [192]:
from pyspark.sql.functions import month, year, quarter

In [202]:
# Questions for this section:
#   - Which month has the most rain, on average?
#   - Which year was the windiest?
#   - What is the most frequent type of weather in January?
#   - What is the average high and low temperature on sunny days in July in 2013 and 2014?
#   - What percentage of days were rainy in Q3 of 2015?
#   - For each year, find what percentage of days it rained (had non-zero precipitation).
#
# This cell only derives a `month` column from `date` and sums every numeric
# column per month (the derived `month` column itself is summed as well).
df.withColumn('month', month(df['date'])).groupby('month').sum().show()

+-----+------------------+------------------+------------------+------------------+----------+
|month|sum(precipitation)|     sum(temp_max)|     sum(temp_min)|         sum(wind)|sum(month)|
+-----+------------------+------------------+------------------+------------------+----------+
|   12|             622.7|1016.1000000000001|412.29999999999995|448.69999999999993|      1488|
|    1|465.99999999999994|1020.4000000000001| 334.3999999999999|             389.2|       124|
|    6|             132.9|2688.0000000000005|1469.2999999999997| 375.7000000000001|       720|
|    3|             606.2|1536.0000000000002|             602.5|443.90000000000003|       372|
|    5|             207.5|2392.7000000000003|1192.1999999999998|             386.9|       620|
|    9|235.49999999999997|            2630.9|            1483.0|             355.6|      1080|
|    4|             375.4|            1802.4|             763.5|             422.9|       480|
|    8|             163.7|3237.8999999999996|     