In [21]:
import pandas as pd
import numpy as np
import pyspark
from pydataset import data
from pyspark.sql.functions import sum, mean, concat, lit, regexp_extract, regexp_replace, when

np.random.seed(123)

In [3]:
# start spark session
spark = pyspark.sql.SparkSession.builder.getOrCreate()
spark

## 1.Create a spark data frame that contains your favorite programming languages.

- The name of the column should be language
- View the schema of the dataframe
- Output the shape of the dataframe
- Show the first 5 records in the dataframe

In [9]:
pandas_dataframe = pd.DataFrame(
    {
        "language": np.random.choice(list(['Python', 'SQL','React','Java']), 20)
    }
)

In [11]:
# create spark df and show first 5 records
df = spark.createDataFrame(pandas_dataframe)
df.show(5)

+--------+
|language|
+--------+
|   React|
|     SQL|
|   React|
|   React|
|  Python|
+--------+
only showing top 5 rows



In [16]:
# view schema of df
df.printSchema()

root
 |-- language: string (nullable = true)



In [17]:
# get shape of df
print((df.count(), len(df.columns)))

(20, 1)


## 2. Load the mpg dataset as a spark dataframe.

In [36]:
mpg = spark.createDataFrame(data("mpg"))

In [37]:
mpg.head()

Row(manufacturer='audi', model='a4', displ=1.8, year=1999, cyl=4, trans='auto(l5)', drv='f', cty=18, hwy=29, fl='p', class='compact')

### a. Create 1 column of output that contains a message like the one below for each vehicle:

The 1999 audi a4 has a 4 cylinder engine.


In [32]:
# mpg.select('*', lit('The 1999 audi a4 has a 4 cylinder engine.').alias('output')).show(5)

### b. Transform the trans column so that it only contains either manual or auto.

In [38]:
mpg = mpg.select("*", mpg.trans, regexp_extract("trans", r"(.\w+)", 1).alias('transformed_trans'))
mpg.show(5)

+------------+-----+-----+----+---+----------+---+---+---+---+-------+----------+-----------------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|     trans|transformed_trans|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+----------+-----------------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|  auto(l5)|             auto|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|manual(m5)|           manual|
|        audi|   a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|manual(m6)|           manual|
|        audi|   a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|  auto(av)|             auto|
|        audi|   a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|  auto(l5)|             auto|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+----------+-----------------+
only showing top 5 rows



## 3. Load the tips dataset as a spark dataframe.

In [39]:
tips = spark.createDataFrame(data('tips'))

### a. What percentage of observations are smokers?

In [43]:
tips.show(3)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
+----------+----+------+------+---+------+----+
only showing top 3 rows



In [49]:
tips.createOrReplaceTempView("tips")

In [51]:
spark.sql(
    """
SELECT ((SELECT COUNT(smoker)
FROM tips
WHERE smoker = 'Yes') / 
(SELECT COUNT(smoker)
FROM tips)*100) as pct_smokers
FROM tips
"""
).show(1)

+------------------+
|       pct_smokers|
+------------------+
|38.114754098360656|
+------------------+
only showing top 1 row

