# Exercises

### 1.) Create a spark data frame that contains your favorite programming languages.

The name of the column should be `language`.

View the schema of the dataframe

Output the shape of the dataframe

Show the first 5 records in the dataframe

In [2]:
import pyspark

# Reuse the active SparkSession when one exists; otherwise start a new one.
spark = (
    pyspark.sql.SparkSession
    .builder
    .getOrCreate()
)

In [11]:
import pandas as pd
import numpy as np

# Seed NumPy's global RNG (nothing in this cell draws random numbers).
np.random.seed(123)

# Column name -> values. The three lists are positionally aligned:
# index i in each list describes the same language.
table_data = {
    "language": ["Python", "Ruby", "HTML", "Java", "C++", "RedLion", "Honeywell"],
    "experience_level": [2, 1, 3, 1, 0, 3, 3],
    "teachers": ["Ryan", "Zach", "Self", "Ryan", "Zach", "Self", "Randy"],
}

# Pin the column order explicitly instead of relying on dict ordering.
column_order = ["language", "experience_level", "teachers"]
pandas_df = pd.DataFrame(data=table_data, columns=column_order)

pandas_df

Unnamed: 0,language,experience_level,teachers
0,Python,2,Ryan
1,Ruby,1,Zach
2,HTML,3,Self
3,Java,1,Ryan
4,C++,0,Zach
5,RedLion,3,Self
6,Honeywell,3,Randy


In [12]:
# Convert the pandas frame into a Spark DataFrame; no schema is passed,
# so Spark derives the column types from the pandas data.
df = spark.createDataFrame(data=pandas_df)

df

DataFrame[language: string, experience_level: bigint, teachers: string]

In [25]:
# Show the schema (structure): prints a tree with each column's name,
# data type, and nullability.

df.printSchema()

root
 |-- language: string (nullable = true)
 |-- experience_level: long (nullable = true)
 |-- teachers: string (nullable = true)



In [46]:
# Show the shape
# Spark DataFrames have no `.shape` attribute: the row count comes from the
# `count()` action, and the column count from the length of `df.columns`.
# The pair is computed once and reused, so `count()` is not executed a second
# time just to print the result.

df_shape = df.count(), len(df.columns)
n_rows, n_cols = df_shape

print(n_rows, "rows", n_cols, "columns")

7 rows 3 columns


In [10]:
# Show the first 5 records in the dataframe.
df.show(n=5)

+---------+----------------+--------+
|languages|experience_level|teachers|
+---------+----------------+--------+
|   Python|               2|    Ryan|
|     Ruby|               1|    Zach|
|     HTML|               3|    Self|
|     Java|               1|    Ryan|
|      C++|               0|    Zach|
+---------+----------------+--------+
only showing top 5 rows



In [13]:
# describe() returns another DataFrame (summary + one string column per input
# column), so the notebook echoes only its schema here; call .show() on the
# result to print the actual statistics.
df.describe()

DataFrame[summary: string, language: string, experience_level: string, teachers: string]

In [15]:
# Print the summary statistics (count, mean, stddev, min, max) for every column.
summary_stats = df.describe()
summary_stats.show()

+-------+--------+------------------+--------+
|summary|language|  experience_level|teachers|
+-------+--------+------------------+--------+
|  count|       7|                 7|       7|
|   mean|    null|1.8571428571428572|    null|
| stddev|    null|1.2149857925879117|    null|
|    min|     C++|                 0|   Randy|
|    max|    Ruby|                 3|    Zach|
+-------+--------+------------------+--------+



### 2.) Load the mpg dataset as a spark dataframe.

Create 1 column of output that contains, for each vehicle, a message like the one below:


The 1999 audi a4 has a 4 cylinder engine.

Transform the trans column so that it only contains either manual or auto.

In [33]:
# Loading mpg

from pydataset import data

# Fetch the "mpg" dataset first, then hand it to Spark for conversion.
mpg_source = data("mpg")
mpg = spark.createDataFrame(mpg_source)

# Preview the first 10 rows.
mpg.show(n=10)

+------------+----------+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|     model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+----------+-----+----+---+----------+---+---+---+---+-------+
|        audi|        a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|        a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|        a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|        a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|        a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
|        audi|        a4|  2.8|1999|  6|manual(m5)|  f| 18| 26|  p|compact|
|        audi|        a4|  3.1|2008|  6|  auto(av)|  f| 18| 27|  p|compact|
|        audi|a4 quattro|  1.8|1999|  4|manual(m5)|  4| 18| 26|  p|compact|
|        audi|a4 quattro|  1.8|1999|  4|  auto(l5)|  4| 16| 25|  p|compact|
|        audi|a4 quattro|  2.0|2008|  4|manual(m6)|  4| 20| 28|  p|compact|
+-----------

In [47]:
# Inspect the column names, data types, and nullability of the mpg Spark DataFrame.
mpg.printSchema()

root
 |-- manufacturer: string (nullable = true)
 |-- model: string (nullable = true)
 |-- displ: double (nullable = true)
 |-- year: long (nullable = true)
 |-- cyl: long (nullable = true)
 |-- trans: string (nullable = true)
 |-- drv: string (nullable = true)
 |-- cty: long (nullable = true)
 |-- hwy: long (nullable = true)
 |-- fl: string (nullable = true)
 |-- class: string (nullable = true)



In [52]:
# Create a column of output that reads, for each vehicle, like:
#   "The 1999 audi a4 has a 4 cylinder engine."

from pyspark.sql.functions import col, concat, expr, lit, when

# concat() joins column expressions row by row, so the sentence is built from
# the row's own year / manufacturer / model / cyl values. Fixed text must be
# wrapped in lit() to become a column expression. The numeric columns (year,
# cyl) are cast to string explicitly rather than relying on implicit coercion.
mpg = mpg.withColumn(
    'details',
    concat(
        lit('The '),
        col('year').cast('string'),
        lit(' '),
        col('manufacturer'),
        lit(' '),
        col('model'),
        lit(' has a '),
        col('cyl').cast('string'),
        lit(' cylinder engine.'),
    ),
)

# display() on a Spark DataFrame only echoes its schema; to read the generated
# sentences use: mpg.select('details').show(5, truncate=False)
display(mpg)

DataFrame[manufacturer: string, model: string, displ: double, year: bigint, cyl: bigint, trans: string, drv: string, cty: bigint, hwy: bigint, fl: string, class: string, details: string]

In [54]:
# Print summary statistics (count, mean, stddev, min, max) for every mpg column.
mpg_summary = mpg.describe()
mpg_summary.show()

+-------+------------+-----------------+------------------+-----------------+------------------+----------+---+------------------+-----------------+----+-------+--------------------+
|summary|manufacturer|            model|             displ|             year|               cyl|     trans|drv|               cty|              hwy|  fl|  class|             details|
+-------+------------+-----------------+------------------+-----------------+------------------+----------+---+------------------+-----------------+----+-------+--------------------+
|  count|         234|              234|               234|              234|               234|       234|234|               234|              234| 234|    234|                 234|
|   mean|        null|             null| 3.471794871794873|           2003.5| 5.888888888888889|      null|4.0|16.858974358974358|23.44017094017094|null|   null|                null|
| stddev|        null|             null|1.2919590310839348|4.509646313320409|1.611534

**^^ Need Help with this one.  Spent WAYYYY too long on it.**

In [43]:
# Transform the `trans` column so that it only contains either "manual" or "auto".
# The raw values carry a gearbox suffix, e.g. "auto(l5)" or "manual(m5)", so a
# prefix test decides the category. Any value matching neither prefix is left
# as-is instead of being silently mislabelled.

from pyspark.sql.functions import col, when

# The result goes into a new frame so that `mpg` itself is not overwritten and
# this cell can be re-run safely.
mpg_trans = mpg.withColumn(
    'trans',
    when(col('trans').startswith('auto'), 'auto')
    .when(col('trans').startswith('manual'), 'manual')
    .otherwise(col('trans')),
)

mpg.explain() # '.explain()' shows us how spark is thinking about our data

== Physical Plan ==
Scan ExistingRDD[manufacturer#409,model#410,displ#411,year#412L,cyl#413L,trans#414,drv#415,cty#416L,hwy#417L,fl#418,class#419]


In [45]:
# Print the physical plan Spark builds for selecting only the 'trans' column.
trans_only = mpg.select("trans")
trans_only.explain()

== Physical Plan ==
*(1) Project [trans#414]
+- Scan ExistingRDD[manufacturer#409,model#410,displ#411,year#412L,cyl#413L,trans#414,drv#415,cty#416L,hwy#417L,fl#418,class#419]


In [None]:
### 3.) 