In [1]:
# Install Additional Python Libraries
!pip install -r requirements.txt

In [2]:
# Install additional Java libraries: request the spark-csv package for PySpark
from spark_libs import spark_submit
# Package coordinates in the `group:artifact:version` form used by `--packages`
packages = ["com.databricks:spark-csv_2.11:1.5.0"]
# Per the cell output, this sets the PYSPARK_SUBMIT_ARGS environment variable
# to "--packages <coordinates> pyspark-shell"
spark_submit(packages=packages)

Adding environment variable `PYSPARK_SUBMIT_ARGS`
--packages com.databricks:spark-csv_2.11:1.5.0 pyspark-shell


In [3]:
from pyspark.sql import SparkSession
from pyspark import SparkFiles
from pyspark.sql import DataFrame
import pyspark.sql.functions as F

In [4]:
# Reuse the active Spark session if there is one; otherwise start a new
# session registered under this application name.
app_name = "demographics"
spark = (
    SparkSession.builder
    .appName(app_name)
    .getOrCreate()
)

In [5]:
# Register the remote CSV (hosted on S3) with Spark; SparkFiles.get() below
# resolves the path of the downloaded copy.
url = "https://s3.amazonaws.com/dataviz-curriculum/day_1/demographics.csv"
spark.sparkContext.addFile(url)

# Load the file through the spark-csv data source: the first line supplies the
# column names, fields are comma-separated, and column types are inferred.
df = (
    spark.read
    .format("com.databricks.spark.csv")
    .option("header", "true")
    .option("sep", ",")
    .option("inferSchema", "true")
    .load(SparkFiles.get("demographics.csv"))
)

# Peek at the first row of the DataFrame
df.head()

Row(id=0, name='Darlena Avila', age=58, height_meter=1.87, weight_kg=53, children=1, occupation='Choreographer', academic_degree='PhD', salary=68, location='South Dakota')

In [6]:
# Display the DataFrame's column names (a Python list of strings)
df.columns

['id',
 'name',
 'age',
 'height_meter',
 'weight_kg',
 'children',
 'occupation',
 'academic_degree',
 'salary',
 'location']

In [7]:
# Render the first 10 rows as a formatted text table
df.show(n=10)

+---+--------------------+---+------------+---------+--------+------------------+---------------+------+-------------+
| id|                name|age|height_meter|weight_kg|children|        occupation|academic_degree|salary|     location|
+---+--------------------+---+------------+---------+--------+------------------+---------------+------+-------------+
|  0|       Darlena Avila| 58|        1.87|       53|       1|     Choreographer|            PhD|    68| South Dakota|
|  1|            Yan Boyd| 65|         1.8|       40|       0|         Cellarman|       Bachelor|    73|     Delaware|
|  2|         Joette Lane| 32|         1.8|       73|       1|Veterinary Surgeon|         Master|    69| South Dakota|
|  3|        Jazmine Hunt| 61|        1.79|       89|       0|            Hawker|            PhD|    88|    Louisiana|
|  4|      Remedios Gomez| 23|        1.64|       51|       2|     Choreographer|       Bachelor|    83|West Virginia|
|  5|        Myung Brewer| 20|        1.68|     

In [8]:
# Summary statistics (count, mean, stddev, min, max) for the age,
# height_meter, and weight_kg columns only
(
    df.select("age", "height_meter", "weight_kg")
    .describe()
    .show()
)

+-------+------------------+------------------+------------------+
|summary|               age|      height_meter|         weight_kg|
+-------+------------------+------------------+------------------+
|  count|              1000|              1000|              1000|
|   mean|            42.933|1.7519499999999995|            64.011|
| stddev|14.255445581556843|0.1436897499623555|15.005733939099779|
|    min|                18|               1.5|                38|
|    max|                67|               2.0|                90|
+-------+------------------+------------------+------------------+



In [9]:
# Print the schema as a tree to see each column's inferred type and nullability
df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- height_meter: double (nullable = true)
 |-- weight_kg: integer (nullable = true)
 |-- children: integer (nullable = true)
 |-- occupation: string (nullable = true)
 |-- academic_degree: string (nullable = true)
 |-- salary: integer (nullable = true)
 |-- location: string (nullable = true)



In [10]:
# Rename the `salary` column to `Salary (1k)` and show only this new column.
#
# The schema spells the source column in lowercase, so the exact name is used
# here instead of relying on Spark's case-insensitive column resolution
# (the default; it can be switched off with spark.sql.caseSensitive=true).
#
# The guard keeps this cell safe to re-run: once the derived `Salary` column
# exists, renaming again would match it and leave two columns named
# `Salary (1k)`, which makes the select below ambiguous.
if "Salary (1k)" not in df.columns:
    df = df.withColumnRenamed("salary", "Salary (1k)")
df.select("Salary (1k)").show()

+-----------+
|Salary (1k)|
+-----------+
|         68|
|         73|
|         69|
|         88|
|         83|
|         65|
|         72|
|         65|
|         87|
|         72|
|         73|
|         90|
|         78|
|         69|
|         75|
|         77|
|         76|
|         90|
|         79|
|         77|
+-----------+
only showing top 20 rows



In [11]:
# Derive a `Salary` column holding the full amount (`Salary (1k)` * 1000),
# then display it next to the column it was computed from.
df = df.withColumn("Salary", F.col("Salary (1k)") * 1000)
df.select("Salary", "Salary (1k)").show()

+------+-----------+
|Salary|Salary (1k)|
+------+-----------+
| 68000|         68|
| 73000|         73|
| 69000|         69|
| 88000|         88|
| 83000|         83|
| 65000|         65|
| 72000|         72|
| 65000|         65|
| 87000|         87|
| 72000|         72|
| 73000|         73|
| 90000|         90|
| 78000|         78|
| 69000|         69|
| 75000|         75|
| 77000|         77|
| 76000|         76|
| 90000|         90|
| 79000|         79|
| 77000|         77|
+------+-----------+
only showing top 20 rows

