In [1]:
import pyspark
spark = pyspark.sql.SparkSession.builder.getOrCreate()

In [2]:
import pandas as pd
import numpy as np

from pydataset import data


# Exercises

### 1. Create a spark data frame that contains your favorite programming languages.

- The name of the column should be `language`
- View the schema of the dataframe
- Output the shape of the dataframe
- Show the first 5 records in the dataframe

In [3]:
lang_df = pd.DataFrame({"language": ['python', 'spark', 'sql', 'swift', 'javascript']})

In [4]:
lang_df

Unnamed: 0,language
0,python
1,spark
2,sql
3,swift
4,javascript


In [5]:
sp_df = spark.createDataFrame(lang_df)
sp_df

DataFrame[language: string]

In [6]:
sp_df.printSchema()

root
 |-- language: string (nullable = true)



In [7]:
#printing the shape of the spark df:
print(sp_df.count(), len(sp_df.columns))

5 1


In [8]:
sp_df.show(5)

+----------+
|  language|
+----------+
|    python|
|     spark|
|       sql|
|     swift|
|javascript|
+----------+



### 2 Load the mpg dataset as a spark dataframe.

a. Create 1 column of output that contains a message like the one below:

`The 1999 audi a4 has a 4 cylinder engine. For each vehicle.`

b. Transform the trans column so that it only contains either manual or auto.

In [9]:
from pyspark.sql.functions import round, concat, sum, min, max, count, avg, mean
from pyspark.sql.functions import lit
from pyspark.sql.functions import when

In [10]:
# importing the data:
mpg = spark.createDataFrame(data("mpg"))
mpg.show(5)

+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|   a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|   a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|   a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
only showing top 5 rows



In [12]:
from pyspark.sql.functions import regexp_extract, regexp_replace

In [13]:
# Use lit

mpg.select(concat(lit("The "), mpg.year, lit(" "), mpg.manufacturer, lit(" "), mpg.model, lit(" has a "), mpg.cyl, lit(" engine")).alias("cylinders")).show(1)

+--------------------+
|           cylinders|
+--------------------+
|The 1999 audi a4 ...|
+--------------------+
only showing top 1 row



In [14]:
mpg.select(mpg.trans, when(mpg.trans.contains("auto"), "auto").otherwise("manual").alias("Trans")).show(5)

+----------+------+
|     trans| Trans|
+----------+------+
|  auto(l5)|  auto|
|manual(m5)|manual|
|manual(m6)|manual|
|  auto(av)|  auto|
|  auto(l5)|  auto|
+----------+------+
only showing top 5 rows



In [None]:
# Testing out how to use the regex for this problem. Although I think I'll just end up using basic python usage:



#### Scratch area

In [20]:
mpg.select(sum(mpg.hwy)/count(mpg.hwy).alias("avg_1"), avg(mpg.hwy).alias("avg_2")).show()

+----------------------------------+-----------------+
|(sum(hwy) / count(hwy) AS `avg_1`)|            avg_2|
+----------------------------------+-----------------+
|                 23.44017094017094|23.44017094017094|
+----------------------------------+-----------------+



In [23]:
mpg.select(concat(mpg.manufacturer, lit(" "), lit(mpg.model))).show(5)

+------------------------------+
|concat(manufacturer,  , model)|
+------------------------------+
|                       audi a4|
|                       audi a4|
|                       audi a4|
|                       audi a4|
|                       audi a4|
+------------------------------+
only showing top 5 rows

