In [1]:
import pyspark

# Reuse the active Spark session if one exists; otherwise start a new one.
spark = pyspark.sql.SparkSession.builder.getOrCreate()

In [2]:
import pandas as pd
import numpy as np

import pyspark.sql.functions as F

1. Create a spark data frame that contains your favorite programming languages.

    - The name of the column should be language
    - View the schema of the dataframe
    - Output the shape of the dataframe
    - Show the first 5 records in the dataframe

In [3]:
# Build a one-column pandas DataFrame of favourite programming languages
df = pd.DataFrame({
    'language': ['python', 'sql', 'html', 'java', 'javascript',
                 'c', 'c++', 'basic', 'markdown', 'php'],
})

In [4]:
# Convert the pandas DataFrame into a Spark DataFrame.
# The isinstance guard keeps this cell re-runnable: `df` is rebound to the Spark
# DataFrame here, and createDataFrame rejects input that is already a Spark
# DataFrame, so running the cell a second time would otherwise fail.
if isinstance(df, pd.DataFrame):
    df = spark.createDataFrame(df)


In [5]:
# Print each column's name, type and nullability for the languages DataFrame
df.printSchema()

root
 |-- language: string (nullable = true)



In [6]:
# Shape of the DataFrame as (row count, column count)
row_count = df.count()
column_count = len(df.columns)
print((row_count, column_count))

(10, 1)


In [7]:
# Display the first 5 rows of the languages DataFrame
df.show(5)

+----------+
|  language|
+----------+
|    python|
|       sql|
|      html|
|      java|
|javascript|
+----------+
only showing top 5 rows



2. Load the mpg dataset as a spark dataframe.

    a. For each vehicle, create 1 column of output that contains a message like the one below:


> The 1999 audi a4 has a 4 cylinder engine.

    b. Transform the trans column so that it only contains either manual or auto.

In [8]:
# Load the mpg dataset with pydataset's `data` loader and hand it to Spark.
# NOTE: `data` is rebound further down by `from vega_datasets import data`,
# so this cell only works if it runs before that import.
from pydataset import data
mpg = spark.createDataFrame(data('mpg'))

In [9]:
# Build one descriptive sentence per vehicle, e.g.
# "The 1999 audi a4 has a 4 cylinder engine"
statement_parts = [
    F.lit('The '),
    mpg.year,
    F.lit(" "),
    mpg.manufacturer,
    F.lit(" "),
    mpg.model,
    F.lit(" has a "),
    mpg.cyl,
    F.lit(" cylinder engine"),
]
mpg = mpg.withColumn("statement", F.concat(*statement_parts))

In [10]:
# truncate=False prints each statement in full instead of cutting long strings
# short; show() still prints only the first 20 rows unless a row count is passed.
mpg.select(mpg.statement).show(truncate=False)

+-------------------------------------------------------------+
|statement                                                    |
+-------------------------------------------------------------+
|The 1999 audi a4 has a 4 cylinder engine                     |
|The 1999 audi a4 has a 4 cylinder engine                     |
|The 2008 audi a4 has a 4 cylinder engine                     |
|The 2008 audi a4 has a 4 cylinder engine                     |
|The 1999 audi a4 has a 6 cylinder engine                     |
|The 1999 audi a4 has a 6 cylinder engine                     |
|The 2008 audi a4 has a 6 cylinder engine                     |
|The 1999 audi a4 quattro has a 4 cylinder engine             |
|The 1999 audi a4 quattro has a 4 cylinder engine             |
|The 2008 audi a4 quattro has a 4 cylinder engine             |
|The 2008 audi a4 quattro has a 4 cylinder engine             |
|The 1999 audi a4 quattro has a 6 cylinder engine             |
|The 1999 audi a4 quattro has a 6 cylind

In [11]:
# Strip the parenthesised suffix from trans (e.g. "auto(l5)" -> "auto") and
# show the result next to the original column for comparison
trans_base = F.regexp_replace(mpg.trans, r'\(.+\)', "").alias('new_trans')
mpg.select('trans', trans_base).show(5)

+----------+---------+
|     trans|new_trans|
+----------+---------+
|  auto(l5)|     auto|
|manual(m5)|   manual|
|manual(m6)|   manual|
|  auto(av)|     auto|
|  auto(l5)|     auto|
+----------+---------+
only showing top 5 rows



3. Load the tips dataset as a spark dataframe.

    a. What percentage of observations are smokers?
    
    b. Create a column that contains the tip percentage
    
    c. Calculate the average tip percentage for each combination of sex and smoker.

In [33]:
# Load the tips dataset with the pydataset `data` loader imported earlier.
# This must run before `data` is rebound by the vega_datasets import further down.
tips = spark.createDataFrame(data('tips'))

In [34]:
# Preview the first 5 rows of the tips DataFrame
tips.show(5)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
+----------+----+------+------+---+------+----+
only showing top 5 rows



In [35]:
# Share of observations where the customer is a smoker
n_smokers = tips.where(tips.smoker == 'Yes').count()
n_total = tips.count()

# Build the one-row Spark DataFrame straight from the Python value. Going through
# a pandas frame bound to `df` overwrote the languages DataFrame created at the
# top of the notebook, which broke re-running those earlier cells.
smoke = spark.createDataFrame([(n_smokers / n_total,)], ['smoker_percent'])

# format_number works on DataFrame columns, which is why the ratio is wrapped in a
# DataFrame; for a single number a plain Python f-string would do as well.
# The value is a fraction between 0 and 1 (it is not multiplied by 100).
smoke.select(F.format_number('smoker_percent', 2).alias('percent_smokers')).show()

smoke.printSchema()

+---------------+
|percent_smokers|
+---------------+
|           0.38|
+---------------+

root
 |-- smoker_percent: double (nullable = true)



In [53]:
# Add the tip as a fraction of the total bill
tip_share = tips['tip'] / tips['total_bill']
tips = tips.withColumn('tip_percentage', tip_share)

tips.show(2)

+----------+----+------+------+---+------+----+-------------------+
|total_bill| tip|   sex|smoker|day|  time|size|     tip_percentage|
+----------+----+------+------+---+------+----+-------------------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|0.05944673337257211|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|0.16054158607350097|
+----------+----+------+------+---+------+----+-------------------+
only showing top 2 rows



In [54]:
# Mean tip percentage for every (sex, smoker) combination
by_sex_and_smoker = tips.groupBy('sex', 'smoker')
by_sex_and_smoker.mean('tip_percentage').show()

+------+------+-------------------+
|   sex|smoker|avg(tip_percentage)|
+------+------+-------------------+
|  Male|    No| 0.1606687151291298|
|  Male|   Yes| 0.1527711752024851|
|Female|    No| 0.1569209707691836|
|Female|   Yes|0.18215035269941035|
+------+------+-------------------+



In [18]:
# Mean tip percentage as a pivot table: one row per sex, one column per smoker value.
# Requires the tip_percentage column to have been added to `tips` already.
tips.groupby('sex').pivot('smoker').mean('tip_percentage').show()

+------+------------------+-------------------+
|   sex|                No|                Yes|
+------+------------------+-------------------+
|Female|0.1569209707691836|0.18215035269941035|
|  Male|0.1606687151291298| 0.1527711752024851|
+------+------------------+-------------------+



4. Use the seattle weather dataset referenced in the lesson to answer the questions below.

    - Convert the temperatures to Fahrenheit.
    - Which month has the most rain, on average?
    - Which year was the windiest?
    - What is the most frequent type of weather in January?
    - What is the average high and low temperature on sunny days in July in 2013 and 2014?
    - What percentage of days were rainy in q3 of 2015?
    - For each year, find what percentage of days it rained (had non-zero precipitation).

In [55]:
from vega_datasets import data

In [56]:
# `data` here is the vega_datasets loader imported just above; it shadows the
# pydataset `data` that was used to load mpg and tips earlier in the notebook.
weather = spark.createDataFrame(data('seattle-weather'))

In [57]:
# Apply the Celsius-to-Fahrenheit formula to both temperature columns, rounded
# to whole degrees. This only displays the converted values; `weather` itself
# is left unchanged.
temp_max_f = F.expr("ROUND(temp_max * (9/5) +32) AS temp_max_f")
temp_min_f = F.expr("ROUND(temp_min * (9/5) + 32) AS temp_min_f")
weather.select(temp_max_f, temp_min_f).show(2)

+----------+----------+
|temp_max_f|temp_min_f|
+----------+----------+
|      55.0|      41.0|
|      51.0|      37.0|
+----------+----------+
only showing top 2 rows



In [58]:
# Preview the first 2 rows of the weather DataFrame
weather.show(2)

+-------------------+-------------+--------+--------+----+-------+
|               date|precipitation|temp_max|temp_min|wind|weather|
+-------------------+-------------+--------+--------+----+-------+
|2012-01-01 00:00:00|          0.0|    12.8|     5.0| 4.7|drizzle|
|2012-01-02 00:00:00|         10.9|    10.6|     2.8| 4.5|   rain|
+-------------------+-------------+--------+--------+----+-------+
only showing top 2 rows



In [59]:
# Inspect the column types of the weather DataFrame
weather.printSchema()

root
 |-- date: timestamp (nullable = true)
 |-- precipitation: double (nullable = true)
 |-- temp_max: double (nullable = true)
 |-- temp_min: double (nullable = true)
 |-- wind: double (nullable = true)
 |-- weather: string (nullable = true)



In [60]:
# Which month has the most rain, on average?
# Average precipitation per calendar month, wettest month first.
monthly_rain = (
    weather
    .groupBy(F.month("date").alias("month"))
    .agg(F.mean("precipitation").alias('avg_rainfall'))
)
monthly_rain.sort(F.desc('avg_rainfall')).show()
    

+-----+-------------------+
|month|       avg_rainfall|
+-----+-------------------+
|   11|  5.354166666666667|
|   12|  5.021774193548388|
|    3|  4.888709677419355|
|   10|  4.059677419354839|
|    1| 3.7580645161290316|
|    2|  3.734513274336283|
|    4|  3.128333333333333|
|    9| 1.9624999999999997|
|    5| 1.6733870967741935|
|    8| 1.3201612903225806|
|    6| 1.1075000000000002|
|    7|0.38870967741935486|
+-----+-------------------+



In [62]:
# Which year was the windiest?
# Average wind per year, windiest year first.
yearly_wind = (
    weather
    .groupBy(F.year('date').alias('year'))
    .agg(F.mean('wind').alias('avg_wind'))
)
yearly_wind.sort(F.desc('avg_wind')).show()

+----+------------------+
|year|          avg_wind|
+----+------------------+
|2012| 3.400819672131147|
|2014|3.3876712328767136|
|2015|  3.15972602739726|
|2013|3.0158904109589044|
+----+------------------+



In [61]:
# What is the most frequent type of weather in January?
# Count the January observations per weather type, most common first.
january = weather.filter(F.month('date') == 1)
weather_counts = january.groupBy('weather').agg(F.count('weather').alias('frequency'))
weather_counts.sort(F.desc('frequency')).show()

+-------+---------+
|weather|frequency|
+-------+---------+
|    fog|       38|
|   rain|       35|
|    sun|       33|
|drizzle|       10|
|   snow|        8|
+-------+---------+



In [63]:
# What is the average high and low temperature on sunny days in July in 2013 and 2014?
# avg min temp
# Narrow down to the sunny July days of 2013-2014 first, then convert and average.
sunny_july = weather.filter(
    (F.month('date') == 7)
    & (F.year('date') > 2012)
    & (F.year('date') < 2015)
    & (weather.weather == 'sun')
)
(sunny_july
    .select(
        F.expr("ROUND(temp_max * (9/5) +32) AS temp_max_f"),
        F.expr("ROUND(temp_min * (9/5) + 32) AS temp_min_f"))
    .agg(F.mean('temp_min_f'))
).show()

+-----------------+
|  avg(temp_min_f)|
+-----------------+
|57.53846153846154|
+-----------------+



In [65]:
# avg max temp (shown together with the avg min temp)
# Narrow down to the sunny July days of 2013-2014 first, then convert and average.
sunny_july = weather.filter(
    (F.month('date') == 7)
    & (F.year('date') > 2012)
    & (F.year('date') < 2015)
    & (weather.weather == 'sun')
)
(sunny_july
    .select(
        F.expr("ROUND(temp_max * (9/5) +32) AS temp_max_f"),
        F.expr("ROUND(temp_min * (9/5) + 32) AS temp_min_f"))
    .agg(F.mean('temp_max_f'), F.mean('temp_min_f'))
).show()

+-----------------+-----------------+
|  avg(temp_max_f)|  avg(temp_min_f)|
+-----------------+-----------------+
|80.28846153846153|57.53846153846154|
+-----------------+-----------------+



In [66]:
# What percentage of days were rainy in q3 of 2015?
# Q3 = July through September. "Rainy" here means the weather label is 'rain'.
q3_2015 = weather.where(
    (F.year('date') == 2015)
    & (F.month('date') > 6)
    & (F.month('date') < 10)
)

total_days = q3_2015.count()
rainy_days = q3_2015.where(weather.weather == 'rain').count()

# Result is a fraction between 0 and 1
rainy_days / total_days

0.021739130434782608

In [67]:
# For each year, find what percentage of days it rained (had non-zero precipitation).
# Step 1: count days with and without precipitation per year. Pivoting on the
# boolean it_rained column yields one `true` and one `false` count column.
rained_flag = (weather.precipitation > 0).alias('it_rained')
rain_table = (
    weather
    .select('date', rained_flag)
    .groupBy(F.year('date').alias('year'))
    .pivot('it_rained')
    .agg(F.count('it_rained'))
)

In [68]:
# Step 2: rainy days as a share of all days in the year, formatted to 2 decimals
rainy = rain_table['true']
dry = rain_table['false']
rainy_share = F.format_number(rainy / (rainy + dry), 2)
rain_table.withColumn('percent_rainy_days', rainy_share).show()

+----+-----+----+------------------+
|year|false|true|percent_rainy_days|
+----+-----+----+------------------+
|2015|  221| 144|              0.39|
|2013|  213| 152|              0.42|
|2014|  215| 150|              0.41|
|2012|  189| 177|              0.48|
+----+-----+----+------------------+

