In [1]:
import pandas as pd
import numpy as np
import pyspark
import pydataset
from pyspark.sql.functions import sum, mean, concat, lit, regexp_extract, regexp_replace, when
from vega_datasets import data
from pyspark.sql.functions import month, year, quarter
from pyspark.sql.functions import *

np.random.seed(123)

In [2]:
spark = pyspark.sql.SparkSession.builder.getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/05/19 15:11:02 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


1. Create a spark data frame that contains your favorite programming languages.

- The name of the column should be language
- View the schema of the dataframe
- Output the shape of the dataframe
- Show the first 5 records in the dataframe

In [3]:
pandas_dataframe = pd.DataFrame(
    {
        "language": np.random.choice(list(['Python', 'SQL','R','Java', 'JavaScript', 'Swift', 'C#']), 25)
    }
)

In [4]:
# create spark df 
df = spark.createDataFrame(pandas_dataframe)

In [5]:
# view the dataframe's schema
df.printSchema()

root
 |-- language: string (nullable = true)



In [6]:
# output the shape of the dataframe
print((df.count(), len(df.columns)))

[Stage 0:>                                                          (0 + 8) / 8]

(25, 1)


                                                                                

In [7]:
# Show the first five records in the dataframe
df.show(5)

+----------+
|  language|
+----------+
|        C#|
|     Swift|
|        C#|
|         R|
|JavaScript|
+----------+
only showing top 5 rows



***
2. Load the mpg dataset as a spark dataframe.

- a. Create 1 column of output that contains a message like the one below:
```
The 1999 audi a4 has a 4 cylinder engine.
```
For each vehicle.

- b. Transform the trans column so that it only contains either manual or auto.

In [9]:
mpg = spark.createDataFrame(pydataset.data("mpg"))
mpg.show(5)

+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|   a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|   a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|   a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
only showing top 5 rows



In [10]:
mpg.select(
    concat(
        lit("The "),
        col("year"),
        lit(" "),
        col("manufacturer"),
        lit(" "),
        col("model"),
        lit(" has a "),
        col("cyl"),
        lit(" cylinder engine."),
    ).alias("vehicle_cylinder_desc")
).show(truncate=False)

+--------------------------------------------------------------+
|vehicle_cylinder_desc                                         |
+--------------------------------------------------------------+
|The 1999 audi a4 has a 4 cylinder engine.                     |
|The 1999 audi a4 has a 4 cylinder engine.                     |
|The 2008 audi a4 has a 4 cylinder engine.                     |
|The 2008 audi a4 has a 4 cylinder engine.                     |
|The 1999 audi a4 has a 6 cylinder engine.                     |
|The 1999 audi a4 has a 6 cylinder engine.                     |
|The 2008 audi a4 has a 6 cylinder engine.                     |
|The 1999 audi a4 quattro has a 4 cylinder engine.             |
|The 1999 audi a4 quattro has a 4 cylinder engine.             |
|The 2008 audi a4 quattro has a 4 cylinder engine.             |
|The 2008 audi a4 quattro has a 4 cylinder engine.             |
|The 1999 audi a4 quattro has a 6 cylinder engine.             |
|The 1999 audi a4 quattro

In [11]:
#select trans column
#replace contents in trans that start with special character, then some characters, to a special character being the last
#replace with nothing
#name the column 
#show
mpg.select('trans',regexp_replace('trans', r'\(\w+\)$', '').alias('transmission')).show()

+----------+------------+
|     trans|transmission|
+----------+------------+
|  auto(l5)|        auto|
|manual(m5)|      manual|
|manual(m6)|      manual|
|  auto(av)|        auto|
|  auto(l5)|        auto|
|manual(m5)|      manual|
|  auto(av)|        auto|
|manual(m5)|      manual|
|  auto(l5)|        auto|
|manual(m6)|      manual|
|  auto(s6)|        auto|
|  auto(l5)|        auto|
|manual(m5)|      manual|
|  auto(s6)|        auto|
|manual(m6)|      manual|
|  auto(l5)|        auto|
|  auto(s6)|        auto|
|  auto(s6)|        auto|
|  auto(l4)|        auto|
|  auto(l4)|        auto|
+----------+------------+
only showing top 20 rows



***
3. Load the tips dataset as a spark dataframe.

- What percentage of observations are smokers?
- Create a column that contains the tip percentage
- Calculate the average tip percentage for each combination of sex and smoker.

In [28]:
#tips data into spark dataframe
tips = spark.createDataFrame(pydataset.data("tips"))

In [24]:
#the number of smokers
smoker_count = tips.filter(tips.smoker == 'Yes').count()
print('There are', smoker_count, 'smokers from the dataset.')
#smoker percentage will be the number of smokers divided by number of entries
smoke_percentage = ((tips.filter(tips.smoker == 'Yes').count())/(tips.count()))*100
print('Smokers account for', smoke_percentage, 'percent of the dataframe')

There are 93 smokers from the dataset.
Smokers account for 38.114754098360656 percent of the dataframe


In [25]:
# create a column that contains the tip percentage
tip_percentage = round(((col("tip") / col("total_bill"))) * 100,2)

tips.select(
    col("tip").alias("tip_amount"),
    tips.total_bill.alias("bill"),
    tip_percentage.alias("tip%"),
).show(5)

+----------+-----+-----+
|tip_amount| bill| tip%|
+----------+-----+-----+
|      1.01|16.99| 5.94|
|      1.66|10.34|16.05|
|       3.5|21.01|16.66|
|      3.31|23.68|13.98|
|      3.61|24.59|14.68|
+----------+-----+-----+
only showing top 5 rows



In [29]:
#just select, put the calculation in, and give it an alias if you wish
tips = tips.select('*', (tips.tip / tips.total_bill).alias('tip_pct'))
tips.show(5)

+----------+----+------+------+---+------+----+-------------------+
|total_bill| tip|   sex|smoker|day|  time|size|            tip_pct|
+----------+----+------+------+---+------+----+-------------------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|0.05944673337257211|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|0.16054158607350097|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|0.16658733936220846|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2| 0.1397804054054054|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|0.14680764538430255|
+----------+----+------+------+---+------+----+-------------------+
only showing top 5 rows



In [30]:
#group by sex and smoker-status, then calculate the mean tip pct for each group.
tips.groupby('sex').pivot('smoker').mean('tip_pct').show()

+------+------------------+-------------------+
|   sex|                No|                Yes|
+------+------------------+-------------------+
|Female|0.1569209707691836|0.18215035269941032|
|  Male|0.1606687151291298|0.15277117520248512|
+------+------------------+-------------------+



***
4. Use the seattle weather dataset referenced in the lesson to answer the questions below.

- Convert the temperatures to fahrenheit.
- Which month has the most rain, on average?
- Which year was the windiest?
- What is the most frequent type of weather in January?
- What is the average high and low temperature on sunny days in July in 2013 and 2014?
- What percentage of days were rainy in q3 of 2015?
- For each year, find what percentage of days it rained (had non-zero precipitation).

In [3]:
#import data from vega
from vega_datasets import data

weather = data.seattle_weather().assign(date=lambda df: df.date.astype(str))
weather = spark.createDataFrame(weather)
weather.show(5)

[Stage 0:>                                                          (0 + 1) / 1]

+----------+-------------+--------+--------+----+-------+
|      date|precipitation|temp_max|temp_min|wind|weather|
+----------+-------------+--------+--------+----+-------+
|2012-01-01|          0.0|    12.8|     5.0| 4.7|drizzle|
|2012-01-02|         10.9|    10.6|     2.8| 4.5|   rain|
|2012-01-03|          0.8|    11.7|     7.2| 2.3|   rain|
|2012-01-04|         20.3|    12.2|     5.6| 4.7|   rain|
|2012-01-05|          1.3|     8.9|     2.8| 6.1|   rain|
+----------+-------------+--------+--------+----+-------+
only showing top 5 rows



                                                                                

In [4]:
def convert_temp(celsius):
    '''
    converts celcius to fahrenheit
    '''
    fahrenheit = (celsius*(9/5)) + 32
    return fahrenheit

In [5]:
#convert min temps and max temps to fahrenheit
weather = weather.select('*', convert_temp(weather.temp_max).alias('fahrenheit_max'), convert_temp(weather.temp_min).alias('fahrenheit_min'))

In [40]:
weather.show()

+----------+-------------+--------+--------+----+-------+------------------+------------------+
|      date|precipitation|temp_max|temp_min|wind|weather|    fahrenheit_max|    fahrenheit_min|
+----------+-------------+--------+--------+----+-------+------------------+------------------+
|2012-01-01|          0.0|    12.8|     5.0| 4.7|drizzle|55.040000000000006|              41.0|
|2012-01-02|         10.9|    10.6|     2.8| 4.5|   rain|             51.08|             37.04|
|2012-01-03|          0.8|    11.7|     7.2| 2.3|   rain|             53.06|             44.96|
|2012-01-04|         20.3|    12.2|     5.6| 4.7|   rain|             53.96|             42.08|
|2012-01-05|          1.3|     8.9|     2.8| 6.1|   rain|48.019999999999996|             37.04|
|2012-01-06|          2.5|     4.4|     2.2| 2.2|   rain|             39.92|             35.96|
|2012-01-07|          0.0|     7.2|     2.8| 2.3|   rain|             44.96|             37.04|
|2012-01-08|          0.0|    10.0|     

In [41]:
# which month has the most rain on average?

(weather.withColumn('month', month('date'))
    .groupBy('month')
    .agg(mean('precipitation').alias('avg_rain'))
    .sort(desc('avg_rain'))
    .show(1))

+-----+-----------------+
|month|         avg_rain|
+-----+-----------------+
|   11|5.354166666666667|
+-----+-----------------+
only showing top 1 row



22/05/18 15:45:26 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 1017976 ms exceeds timeout 120000 ms
22/05/18 15:45:26 WARN SparkContext: Killing executors is not supported by current scheduler.


In [6]:
# which year was the windiest?
(
    weather.withColumn("year", year("date"))
    .groupBy("year")
    .agg(sum("wind").alias("total_wind"))
    .sort("total_wind")
    .show()
)

+----+------------------+
|year|        total_wind|
+----+------------------+
|2013|1100.8000000000006|
|2015|1153.3000000000002|
|2014|1236.5000000000007|
|2012|            1244.7|
+----+------------------+



In [7]:
# what is the most frequent type of weather in January
weather = weather.withColumn('month', month('date'))
weather.crosstab('month', 'weather').show()
# It's close, but fog takes the cake

+-------------+-------+---+----+----+---+
|month_weather|drizzle|fog|rain|snow|sun|
+-------------+-------+---+----+----+---+
|            5|      1| 25|  16|   0| 82|
|           10|      4| 55|  20|   0| 45|
|            1|     10| 38|  35|   8| 33|
|            6|      2| 14|  19|   0| 85|
|            9|      5| 40|   4|   0| 71|
|            2|      4| 36|  40|   3| 30|
|           12|      2| 54|  23|   5| 40|
|            7|      8| 13|  14|   0| 89|
|            3|      3| 36|  37|   6| 42|
|           11|      3| 50|  25|   0| 42|
|            8|      8| 16|   6|   0| 94|
|            4|      4| 34|  20|   1| 61|
+-------------+-------+---+----+----+---+



In [8]:
# what is the average high and low temperature in july on sunny days for 2013 & 2014
weather = weather.withColumn('year', year('date'))
weather.filter(expr(
    '(year == 2013 OR year == 2014) AND month == 07')).groupby('weather').pivot('year').mean('temp_max').show()

+-------+------------------+------------------+
|weather|              2013|              2014|
+-------+------------------+------------------+
|    fog| 22.96666666666667|25.439999999999998|
|   rain|              22.2|              29.4|
|    sun|26.585185185185193|            27.092|
+-------+------------------+------------------+



In [10]:
weather.filter(expr(
    '(year == 2013 OR year == 2014) AND month == 07')).groupby('weather').pivot('year').mean('temp_min').show()

+-------+------------------+------------------+
|weather|              2013|              2014|
+-------+------------------+------------------+
|    fog|13.133333333333335|14.440000000000001|
|   rain|              15.0|              15.0|
|    sun|13.981481481481483|14.400000000000002|
+-------+------------------+------------------+



In [11]:
print('''
july '13: avg low for sunny days - 13.98 ; avg high - 26.59
july '14: avg low for sunny days - 14.40 ; avg high - 27.09
''')



july '13: avg low for sunny days - 13.98 ; avg high - 26.59
july '14: avg low for sunny days - 14.40 ; avg high - 27.09



In [13]:
# what percentage of days were rainy in q3 of 2015 
weather = weather.withColumn('quarter', quarter('date'))
weather.show(5)

+----------+-------------+--------+--------+----+-------+------------------+--------------+-----+----+-------+
|      date|precipitation|temp_max|temp_min|wind|weather|    fahrenheit_max|fahrenheit_min|month|year|quarter|
+----------+-------------+--------+--------+----+-------+------------------+--------------+-----+----+-------+
|2012-01-01|          0.0|    12.8|     5.0| 4.7|drizzle|55.040000000000006|          41.0|    1|2012|      1|
|2012-01-02|         10.9|    10.6|     2.8| 4.5|   rain|             51.08|         37.04|    1|2012|      1|
|2012-01-03|          0.8|    11.7|     7.2| 2.3|   rain|             53.06|         44.96|    1|2012|      1|
|2012-01-04|         20.3|    12.2|     5.6| 4.7|   rain|             53.96|         42.08|    1|2012|      1|
|2012-01-05|          1.3|     8.9|     2.8| 6.1|   rain|48.019999999999996|         37.04|    1|2012|      1|
+----------+-------------+--------+--------+----+-------+------------------+--------------+-----+----+-------+
o

In [14]:
rain_q3_2015 = weather.filter(expr('year == 2015 AND quarter == 3'))
rain_q3_2015.show(5)

+----------+-------------+--------+--------+----+-------+-----------------+-----------------+-----+----+-------+
|      date|precipitation|temp_max|temp_min|wind|weather|   fahrenheit_max|   fahrenheit_min|month|year|quarter|
+----------+-------------+--------+--------+----+-------+-----------------+-----------------+-----+----+-------+
|2015-07-01|          0.0|    32.2|    17.2| 4.3|    sun|89.96000000000001|            62.96|    7|2015|      3|
|2015-07-02|          0.0|    33.9|    17.8| 3.4|    sun|            93.02|64.03999999999999|    7|2015|      3|
|2015-07-03|          0.0|    33.3|    17.8| 2.6|    sun|            91.94|64.03999999999999|    7|2015|      3|
|2015-07-04|          0.0|    33.3|    15.0| 2.9|    sun|            91.94|             59.0|    7|2015|      3|
|2015-07-05|          0.0|    32.8|    16.7| 2.1|    sun|91.03999999999999|            62.06|    7|2015|      3|
+----------+-------------+--------+--------+----+-------+-----------------+-----------------+---

In [15]:
rain_q3_2015.where(rain_q3_2015.weather=='rain').count() / rain_q3_2015.count()

0.021739130434782608

In [16]:
# For each year, find what percentage of days it rained (had non-zero precipitation)
weather.crosstab('year','weather').show()

+------------+-------+---+----+----+---+
|year_weather|drizzle|fog|rain|snow|sun|
+------------+-------+---+----+----+---+
|        2012|     31|  5| 191|  21|118|
|        2013|     16| 82|  60|   2|205|
|        2014|      0|151|   3|   0|211|
|        2015|      7|173|   5|   0|180|
+------------+-------+---+----+----+---+



In [17]:
def percent_day_rain(string):
    pct_rain = (weather.where(expr(string)).where(expr('precipitation > 0')).count()) / (weather.where(expr(string)).count())
    return pct_rain

In [19]:
print(f" The percentage of rainy days in 2012 is {percent_day_rain('year==2012')}")
print(f" The percentage of rainy days in 2013 is {percent_day_rain('year==2013')}")
print(f" The percentage of rainy days in 2014 is {percent_day_rain('year==2014')}")
print(f" The percentage of rainy days in 2015 is {percent_day_rain('year==2015')}")

 The percentage of rainy days in 2012 is 0.48360655737704916
 The percentage of rainy days in 2013 is 0.41643835616438357
 The percentage of rainy days in 2014 is 0.410958904109589
 The percentage of rainy days in 2015 is 0.39452054794520547
