In [1]:
import numpy as np
import pandas as pd
import pyspark
spark = pyspark.sql.SparkSession.builder.getOrCreate()

from pyspark.sql.functions import col, expr

from pydataset import data
from pyspark.sql.functions import regexp_extract, regexp_replace
from pyspark.sql.functions import *

Create a spark data frame that contains your favorite programming languages.
- The name of the column should be <code>language</code>
- View the schema of the dataframe
- Output the shape of the dataframe
- Show the first 5 records in the dataframe

In [2]:
programming2021 = pd.DataFrame(
    {'language': ['c++', 'javascript', 'PHP', 'python', 'go']}
)
programming2021

Unnamed: 0,language
0,c++
1,javascript
2,PHP
3,python
4,go


In [3]:
programming = spark.createDataFrame(programming2021)

In [4]:
# view the schema

programming.printSchema()

root
 |-- language: string (nullable = true)



In [5]:
# output the shape

print((programming.count(), len(programming.columns)))

(5, 1)


In [6]:
# show the first 5 reords

programming.show(5)

+----------+
|  language|
+----------+
|       c++|
|javascript|
|       PHP|
|    python|
|        go|
+----------+



Load the <code>mpg</code> dataset as a spark dataframe.
- Create 1 column of output that contains a message like the one below: <br>
    <code>The 1999 audi a4 has a 4 cylinder engine.</code> <br>
    For each vehicle.
- Transform the <code>trans</code> column so that it only contains either <code>manual</code> or <code>auto</code>

In [7]:
mpg = spark.createDataFrame(data('mpg'))
mpg.show()

+------------+------------------+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|             model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+------------------+-----+----+---+----------+---+---+---+---+-------+
|        audi|                a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|                a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|                a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|                a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|                a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
|        audi|                a4|  2.8|1999|  6|manual(m5)|  f| 18| 26|  p|compact|
|        audi|                a4|  3.1|2008|  6|  auto(av)|  f| 18| 27|  p|compact|
|        audi|        a4 quattro|  1.8|1999|  4|manual(m5)|  4| 18| 26|  p|compact|
|        audi|        a4 quattro|  1.8|1999|  4|  auto(l5)|  4| 16| 25|  p|c

In [8]:
# Create 1 column of output that contains a message like the one below:

# The 1999 audi a4 has a 4 cylinder engine.

mpg.select(
    concat(
        lit('The '),
        col('year'),
        lit(' '),
        col('manufacturer'),
        lit(' '),
        col('model'),
        lit(' has a '),
        col('cyl'),
        lit('cylinder engine.')
    ).alias('vehicle_cyl')).show(truncate=False)

+-------------------------------------------------------------+
|vehicle_cyl                                                  |
+-------------------------------------------------------------+
|The 1999 audi a4 has a 4cylinder engine.                     |
|The 1999 audi a4 has a 4cylinder engine.                     |
|The 2008 audi a4 has a 4cylinder engine.                     |
|The 2008 audi a4 has a 4cylinder engine.                     |
|The 1999 audi a4 has a 6cylinder engine.                     |
|The 1999 audi a4 has a 6cylinder engine.                     |
|The 2008 audi a4 has a 6cylinder engine.                     |
|The 1999 audi a4 quattro has a 4cylinder engine.             |
|The 1999 audi a4 quattro has a 4cylinder engine.             |
|The 2008 audi a4 quattro has a 4cylinder engine.             |
|The 2008 audi a4 quattro has a 4cylinder engine.             |
|The 1999 audi a4 quattro has a 6cylinder engine.             |
|The 1999 audi a4 quattro has a 6cylinde

In [9]:
# Transform the trans column so that it only contains either manual or auto.

mpg.select(
    'trans',
    when(
        mpg.trans.like("auto%"), "auto"
    ).otherwise("manual").alias("when + like")
).show()

+----------+-----------+
|     trans|when + like|
+----------+-----------+
|  auto(l5)|       auto|
|manual(m5)|     manual|
|manual(m6)|     manual|
|  auto(av)|       auto|
|  auto(l5)|       auto|
|manual(m5)|     manual|
|  auto(av)|       auto|
|manual(m5)|     manual|
|  auto(l5)|       auto|
|manual(m6)|     manual|
|  auto(s6)|       auto|
|  auto(l5)|       auto|
|manual(m5)|     manual|
|  auto(s6)|       auto|
|manual(m6)|     manual|
|  auto(l5)|       auto|
|  auto(s6)|       auto|
|  auto(s6)|       auto|
|  auto(l4)|       auto|
|  auto(l4)|       auto|
+----------+-----------+
only showing top 20 rows



Load the <code>tips</code> dataset as a spark dataframe.
- What percentage of observations are smokers?
- Create a column that contains the tip percentage
- Calculate the average tip percentage for each combination of sex and smoker.

In [10]:
# tips as a spark dataframe

tips = spark.createDataFrame(data('tips'))
tips.show()

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2|
|     10.27|1.71|  Male|    No|Sun|Dinner|   2|
|     35.26| 5.0|Female|    No|Sun|Dinner|   4|
|     15.42|1.57|  Male|    No|Sun|Dinner|   2|
|     18.43| 3.0|  Male|    No|Sun|Dinner|   4|
|     14.83|3.02|Female|    No|Sun|Dinner|   2|
|     21.58|3.92|  Male|    No|Sun|Dinner|   2|
|     10.33|1.67|Female|    No|Sun|Dinner|   3|
|     16.29|3.71|  Male|    No|Sun|Dinne

In [11]:
# what percentage of observations are smokers?
# group by first

tips.groupBy('smoker').count().show()

+------+-----+
|smoker|count|
+------+-----+
|    No|  151|
|   Yes|   93|
+------+-----+



In [16]:
# find percentage

tips.groupBy("smoker").count().withColumn(
    "percent",
    concat(round((col("count") / tips.count() * 100), 0).cast("int"), lit("%")),
).show()


+------+-----+-------+
|smoker|count|percent|
+------+-----+-------+
|    No|  151|    62%|
|   Yes|   93|    38%|
+------+-----+-------+



In [20]:
# create a column that shows the tip percentage.

tips.withColumn('tip_percentage',col('tip') / col('total_bill')).show()

+----------+----+------+------+---+------+----+-------------------+
|total_bill| tip|   sex|smoker|day|  time|size|     tip_percentage|
+----------+----+------+------+---+------+----+-------------------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|0.05944673337257211|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|0.16054158607350097|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|0.16658733936220846|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2| 0.1397804054054054|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|0.14680764538430255|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|0.18623962040332148|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|0.22805017103762829|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|0.11607142857142858|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|0.13031914893617022|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2| 0.2185385656292287|
|     10.27|1.71|  Male|    No|Sun|Dinner|   2| 0.1665043816942551|
|     35.26| 5.0|Female|    No|Sun|Dinner|   4|0

In [21]:
# calculate the average tip percentage for each combination of sex and smoker

(
    tips.withColumn('tip_percentage', col('tip') / col('total_bill'))
    .groupby('sex')
    .pivot('smoker')
    .agg(round(mean('tip_percentage'), 4))
    .show()
)

+------+------+------+
|   sex|    No|   Yes|
+------+------+------+
|Female|0.1569|0.1822|
|  Male|0.1607|0.1528|
+------+------+------+



Using the weather dataset
- Convert the temperatures to fahrenheit.
- Which month has the most rain, on average?
- Which year was the windiest?
- What is the most frequent type of weather in January?
- What is the average high and low temperature on sunny days in July in 2013 and 2014?
- What percentage of days were rainy in q3 of 2015?
- For each year, find what percentage of days it rained (had non-zero precipitation).

In [25]:
from vega_datasets import data

weather = data.seattle_weather()

Did you mean:
water, Wheat, deaths, waders, Weimar, winter, watervoles, bcdeter, veteran, newpainters, Snow.deaths


TypeError: 'NoneType' object is not iterable