In [1]:
# Install Additional Python Libraries
!pip install -r requirements.txt

In [2]:
# Register the extra Spark packages before any Spark session is created.
# Per the recorded output, spark_submit sets the PYSPARK_SUBMIT_ARGS environment
# variable to "--packages <coordinates> pyspark-shell".
from spark_libs import spark_submit

packages = [
    "com.databricks:spark-csv_2.11:1.5.0",
]
spark_submit(packages=packages)

Adding environment variable `PYSPARK_SUBMIT_ARGS`
--packages com.databricks:spark-csv_2.11:1.5.0 pyspark-shell


In [3]:
from pyspark.sql import SparkSession
from pyspark import SparkFiles
from pyspark.sql import DataFrame
import pyspark.sql.functions as F

In [4]:
# Reuse the active Spark session if one exists, otherwise start a new one
# registered under the application name below.
app_name = "demographics-filtering"
spark = (
    SparkSession.builder
    .appName(app_name)
    .getOrCreate()
)

In [5]:
# Fetch the demographics CSV from its S3 URL and hand it to Spark as a job file.
url = "https://s3.amazonaws.com/dataviz-curriculum/day_1/demographics.csv"
spark.sparkContext.addFile(url)

# SparkFiles.get resolves the local path of the file registered via addFile.
# The first line of the CSV is treated as the header and column types are inferred.
df = (
    spark.read
    .format("com.databricks.spark.csv")
    .options(header="true", inferSchema="true")
    .load(SparkFiles.get("demographics.csv"))
)

# Print the inferred column names and types as a sanity check on the load.
df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- height_meter: double (nullable = true)
 |-- weight_kg: integer (nullable = true)
 |-- children: integer (nullable = true)
 |-- occupation: string (nullable = true)
 |-- academic_degree: string (nullable = true)
 |-- salary: integer (nullable = true)
 |-- location: string (nullable = true)



In [6]:
# Show the first row of the DataFrame.
# Called without an argument, head() returns a single Row object (see the
# recorded output) rather than a list of rows.
df.head()

Row(id=0, name='Darlena Avila', age=58, height_meter=1.87, weight_kg=53, children=1, occupation='Choreographer', academic_degree='PhD', salary=68, location='South Dakota')

In [7]:
# Which occupation has the highest salary?
# Sort by salary, largest first, and keep only the top row.
# NOTE(review): if several rows share the maximum salary, only one of them is
# shown and which one is not guaranteed.
(
    df
    .orderBy(F.col("Salary").desc())
    .select("occupation", "Salary")
    .limit(1)
    .show()
)

+-----------------+------+
|       occupation|Salary|
+-----------------+------+
|Medical Physicist|    90|
+-----------------+------+



In [8]:
# Which occupation has the lowest salary?
# Sort by salary, smallest first, and keep only the top row.
# NOTE(review): if several rows share the minimum salary, only one of them is
# shown and which one is not guaranteed.
(
    df
    .orderBy(F.col("Salary").asc())
    .select("occupation", "Salary")
    .limit(1)
    .show()
)

+--------------+------+
|    occupation|Salary|
+--------------+------+
|Window Dresser|    65|
+--------------+------+



In [9]:
# What is the mean salary across the whole dataset?
# The alias only changes the header of the single-cell result table.
(
    df
    .select(
        F.mean("Salary").alias("average Salary"),
    )
    .show()
)

+--------------+
|average Salary|
+--------------+
|        77.738|
+--------------+



In [10]:
# What are the largest and smallest values in the salary column?
# Both aggregates are computed in one select, giving a one-row, two-column table.
(
    df
    .select(
        F.max("Salary"),
        F.min("Salary"),
    )
    .show()
)

+-----------+-----------+
|max(Salary)|min(Salary)|
+-----------+-----------+
|         90|         65|
+-----------+-----------+



In [11]:
# List the occupation of every row whose salary is above 80.
# Salary values appear to be expressed in thousands (the first row shows 68),
# so 80 here stands for "80k".
# show() prints only the first 20 matches, and an occupation can appear more
# than once because rows are not de-duplicated.
(
    df
    .filter("Salary > 80")
    .select("occupation")
    .show()
)

+--------------------+
|          occupation|
+--------------------+
|              Hawker|
|       Choreographer|
|          Millwright|
|   Medical Physicist|
|           Scientist|
|     Claims Adjustor|
| Planning Technician|
|       Booking Clerk|
|      Sub-Postmaster|
|        Shelf Filler|
|             Chemist|
|        Betting Shop|
|     Hire Car Driver|
|    Heating Engineer|
|    Vehicle Assessor|
|   Building Surveyor|
|Advertising Contr...|
|   Medical Physicist|
|            Labourer|
|   Technical Analyst|
+--------------------+
only showing top 20 rows



In [12]:
# BONUS
# What is the average age and height for each academic degree type?
# HINT: You will need to use `groupby` to solve this
#
# Pass the two columns of interest to avg() so Spark only aggregates those,
# instead of averaging every numeric column (id, weight, salary, ...) and then
# discarding most of the results. The result columns are still named
# `avg(age)` and `avg(height_meter)`.
avg_df = df.groupBy("academic_degree").avg("age", "height_meter")
avg_df.show()

+---------------+------------------+------------------+
|academic_degree|          avg(age)| avg(height_meter)|
+---------------+------------------+------------------+
|            PhD| 43.15976331360947|1.7438165680473379|
|         Master|43.139318885448915|1.7549226006191951|
|       Bachelor| 42.51032448377581| 1.757227138643069|
+---------------+------------------+------------------+



In [13]:
# Using agg: same per-degree averages as above, but with explicit aggregate
# expressions so each result column can be given a readable alias.
#
# The DataFrame is assigned first and shown in a separate statement: show()
# prints the table and returns None, so chaining it into the assignment would
# leave `avg_df` bound to None instead of the aggregated DataFrame.
avg_df = (
    df
    .groupBy("academic_degree")
    .agg(
        F.avg("age").alias("average age"),
        F.avg("height_meter").alias("average height"),
    )
)
avg_df.show()

+---------------+------------------+------------------+
|academic_degree|       average age|    average height|
+---------------+------------------+------------------+
|            PhD| 43.15976331360947|1.7438165680473379|
|         Master|43.139318885448915|1.7549226006191951|
|       Bachelor| 42.51032448377581| 1.757227138643069|
+---------------+------------------+------------------+

