**Udacity Data Engineering Capstone Project**<br/>
Avraam Marimpis <avraam.marimpis@gmail.com>, October 2020

- - -


__Load and Sample.ipynb__

In this notebook, we will load the raw data from CSV into Spark with the given predefined schemas.

We will make smaller samples of the datasets and store them into Parquet files (with proper partitioning if able) either locally, for development purposes, or in S3 buckets.

# Imports

In [1]:
import sys
sys.path.append('config/')

import config
import data as cnf_data

sys.path.append("common/")
import aws_dwh

In [2]:
import pyspark.sql.types as t
import pyspark.sql.functions as fn

In [3]:
import sqlite3

In [4]:
import pandas as pd

In [5]:
import json

# Clean up

In [6]:
# Empty the artifacts directory so that every run starts from a clean slate.
# Done in Python rather than `!rm -rf {config.ARTIFACTS}/*`: with an empty
# ARTIFACTS setting the shell form expands to `rm -rf /*`, and it also breaks
# on paths that contain spaces.
import shutil
from pathlib import Path

artifacts_dir = Path(config.ARTIFACTS)
resolved_artifacts_dir = artifacts_dir.resolve()
if not str(config.ARTIFACTS).strip() or resolved_artifacts_dir.parent == resolved_artifacts_dir:
    # Empty setting or the filesystem root: never wipe those.
    raise ValueError(f"Refusing to clean unsafe artifacts path: {config.ARTIFACTS!r}")

if artifacts_dir.is_dir():
    # Materialise the listing first so entries are not deleted while iterating.
    for entry in list(artifacts_dir.iterdir()):
        if entry.name.startswith("."):
            continue  # the shell glob `*` did not match hidden entries either
        if entry.is_dir() and not entry.is_symlink():
            shutil.rmtree(entry)
        else:
            entry.unlink()

# Add AWS/S3 JARs to Spark

In [7]:
# Load the AWS / data-warehouse settings through the project's aws_dwh helper.
# Later cells read dwh['aws'] (access_key_id, secret_access_key, region) and
# dwh['s3']['bucket-1'] (name, FQN) from the returned object.
dwh = aws_dwh.parse_dwh()

In [8]:
# This notebook never imports SparkSession nor creates `spark` itself
# (presumably the kernel pre-defines both — confirm). Import explicitly and
# guard the stop() so the cell also runs on a fresh, plain Python kernel.
from pyspark.sql import SparkSession

# Stop an already running session, if any, before building the one below.
if "spark" in globals():
    spark.stop()

spark = (
    SparkSession.builder
    .appName("my_app")
    .config('spark.sql.codegen.wholeStage', False)
    .config("spark.driver.extraClassPath", "/home/vagrant/opt/libs/aws-java-sdk-1.7.4.jar")
    .config("spark.jars", "/home/vagrant/opt/libs/hadoop-aws-2.7.2.jar")
    .getOrCreate()
)

# Hadoop-level settings for the s3a:// filesystem; credentials, region and
# endpoint come from the parsed dwh configuration.
hadoop_conf = spark._jsc.hadoopConfiguration()
hadoop_conf.set("fs.s3a.access.key", dwh['aws']['access_key_id'])
hadoop_conf.set("fs.s3a.secret.key", dwh['aws']['secret_access_key'])

hadoop_conf.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
# hadoop_conf.set("fs.s3a.impl", "org.apache.hadoop.fs.s3native.NativeS3FileSystem")
hadoop_conf.set("com.amazonaws.services.s3.enableV4", "true")
hadoop_conf.set("fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.BasicAWSCredentialsProvider")
hadoop_conf.set("fs.s3a.endpoint", f"s3.{dwh['aws']['region']}.amazonaws.com")

# The same V4-signing switch is additionally set as a JVM system property.
sc = spark.sparkContext
sc.setSystemProperty("com.amazonaws.services.s3.enableV4", "true")

# Load and sample dataset "Wildfires" 

In [9]:
# Path of the wildfires SQLite database inside the raw-data directory.
sqlite_fname = f"{config.RAW_DATA}/FPA_FOD_20170508.sqlite"

In [10]:
# Open a connection to the wildfires SQLite file.
conn = sqlite3.connect(sqlite_fname)

In [11]:
# Cursor on the connection. The query further down is executed through
# pd.read_sql with `conn`, so in this notebook the cursor is only closed again.
cursor = conn.cursor()

In [12]:
# SQL that selects 25 columns from the `fires` table. There is no WHERE
# clause, so every row of the table is returned.
q = ("""
    SELECT
        OBJECTID,
        FOD_ID,
        FPA_ID,
        FIRE_CODE,
        FIRE_NAME,
        ICS_209_INCIDENT_NUMBER,
        ICS_209_NAME,
        MTBS_ID,
        MTBS_FIRE_NAME,
        COMPLEX_NAME,
        FIRE_YEAR,
        DISCOVERY_DATE,
        DISCOVERY_DOY,
        DISCOVERY_TIME,
        STAT_CAUSE_CODE,
        STAT_CAUSE_DESCR,
        CONT_DATE,
        CONT_DOY,
        CONT_TIME,
        FIRE_SIZE,
        FIRE_SIZE_CLASS,
        STATE,
        COUNTY,
        FIPS_CODE,
        FIPS_NAME
    FROM
        fires;
""")

In [13]:
# Run the query on the SQLite connection and load the result into a pandas DataFrame.
df = pd.read_sql(q, conn)

In [14]:
# The data is already in the pandas DataFrame, so release both the cursor and
# the SQLite connection itself (previously the connection was left open).
cursor.close()
conn.close()

In [15]:
# Julian date of the Unix epoch (timestamp 0 seconds); subtracting it from a
# Julian date yields the number of days since the epoch.
epoch = pd.to_datetime(0, unit='s').to_julian_date()

In [16]:
# Shift DISCOVERY_DATE by the epoch's Julian date and read the result as days
# since the epoch (unit='D'). Assumes the column holds Julian dates — confirm
# against the dataset documentation.
DISCOVERY_DATE_converted = pd.to_datetime(df['DISCOVERY_DATE'] - epoch, unit='D')

In [17]:
# Same conversion for CONT_DATE: offset by the epoch's Julian date, then read
# as days since the epoch. Assumes the column holds Julian dates — confirm.
CONT_DATE_converted = pd.to_datetime(df['CONT_DATE'] - epoch, unit='D')

In [18]:
# Add the converted discovery dates as a new column; the original column is kept.
df['DISCOVERY_DATE_converted'] = DISCOVERY_DATE_converted

In [19]:
# Add the converted CONT_DATE values as a new column; the original column is kept.
df['CONT_DATE_converted'] = CONT_DATE_converted

In [20]:
# Number of wildfire rows loaded, before any sampling.
len(df)

1880465

In [21]:
# Optional down-sampling, switched on by the data configuration.
if cnf_data.DATASET_SAMPLE:
    # Draw the configured fraction from every COUNTY group separately; the
    # fixed random_state keeps the draw repeatable.
    # NOTE(review): groupby() leaves out rows whose COUNTY is null by default,
    # so such rows never reach the sample — confirm that this is intended.
    sample_wildfires = (
        df
        .groupby("COUNTY")
        .sample(
            frac=cnf_data.DATASET_SAMPLE_FRAC,
            random_state=1,
        )
    )
    df = sample_wildfires

In [22]:
# Number of wildfire rows after the optional sampling step.
len(df)

60026

In [23]:
# Persist the (possibly sampled) wildfires frame as CSV, without the index column.
if cnf_data.DATASET_STORE != "local":
    # Any store setting other than "local": upload the CSV to S3.
    # Imported here so they are only required on this path.
    from io import StringIO
    import boto3

    aws_s3 = boto3.resource(
        's3',
        aws_access_key_id=dwh['aws']['access_key_id'],
        aws_secret_access_key=dwh['aws']['secret_access_key'],
        region_name=dwh['aws']['region'],
    )

    # Serialise into an in-memory text buffer and upload its content as one object.
    buffer = StringIO()
    df.to_csv(buffer, sep=",", index=False)
    aws_s3.Object(dwh['s3']['bucket-1']['name'], "sample_wildfires.csv").put(Body=buffer.getvalue())
else:
    # Local store: write the CSV file into the artifacts directory.
    df.to_csv(path_or_buf=f"{config.ARTIFACTS}/sample_wildfires.csv", index=False)

# Load and sample dataset "Air Quality"

In [24]:
# Explicit schema for the daily air-quality CSV files, written as
# (column name, Spark type) pairs in declaration order. Every field uses the
# StructField default of nullable=True.
# NOTE(review): state_code is read as an integer while county_code and
# site_num are strings; an integer cannot keep leading zeros — confirm intended.
schema = t.StructType([
    t.StructField(col_name, col_type)
    for col_name, col_type in (
        ('state_code', t.IntegerType()),
        ('county_code', t.StringType()),
        ('site_num', t.StringType()),
        ('parameter_code', t.IntegerType()),
        ('poc', t.IntegerType()),
        ('latitude', t.FloatType()),
        ('longitude', t.FloatType()),
        ('datum', t.StringType()),
        ('parameter_name', t.StringType()),
        ('sample_duration', t.StringType()),
        ('pollutant_standard', t.StringType()),
        ('date_local', t.StringType()),
        ('units_of_measure', t.StringType()),
        ('event_type', t.StringType()),
        ('observation_count', t.IntegerType()),
        ('observation_percent', t.StringType()),
        ('arithmetic_mean', t.FloatType()),
        ('first_max_value', t.FloatType()),
        ('first_max_hour', t.IntegerType()),
        ('aqi', t.IntegerType()),
        ('method_code', t.IntegerType()),
        ('method_name', t.StringType()),
        ('local_site_name', t.StringType()),
        ('address', t.StringType()),
        ('state_name', t.StringType()),
        ('county_name', t.StringType()),
        ('city_name', t.StringType()),
        ('cbsa_name', t.StringType()),
        ('date_of_last_change', t.DateType()),
    )
])

In [25]:
# Read every raw file matching co_daily_*.csv with the explicit schema above;
# header=True makes Spark treat the first line of each file as a header, not data.
df = spark.read.csv(f"{config.RAW_DATA}/co_daily_*.csv", header=True, schema=schema)

In [26]:
# Row count of the loaded air-quality data, before any sampling.
df.count()

1437692

In [27]:
# Optional down-sampling, switched on by the data configuration.
if cnf_data.DATASET_SAMPLE:
    # Sample WITHOUT replacement so that no source row can appear more than
    # once in the sample (with replacement, rows may be duplicated), and pass a
    # fixed seed — the same value the wildfires sample uses — so the draw is
    # repeatable for the same input.
    sample = df.sample(
        withReplacement=False,
        fraction=cnf_data.DATASET_SAMPLE_FRAC,
        seed=1,
    )
    df = sample

In [28]:
# Preview the first 10 rows, transposed so that each column is shown as a row.
df.limit(10).toPandas().T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
state_code,38,46,38,38,46,38,38,38,5,46
county_code,015,127,015,015,127,015,015,015,093,127
site_num,0003,0001,0003,0003,0001,0003,0003,0003,0005,0001
parameter_code,42101,42101,42101,42101,42101,42101,42101,42101,42101,42101
poc,1,3,1,1,3,1,1,1,1,3
latitude,46.8254,42.7515,46.8254,46.8254,42.7515,46.8254,46.8254,46.8254,35.8909,42.7515
longitude,-100.768,-96.7072,-100.768,-100.768,-96.7072,-100.768,-100.768,-100.768,-89.784,-96.7072
datum,NAD83,WGS84,NAD83,NAD83,WGS84,NAD83,NAD83,NAD83,WGS84,WGS84
parameter_name,Carbon monoxide,Carbon monoxide,Carbon monoxide,Carbon monoxide,Carbon monoxide,Carbon monoxide,Carbon monoxide,Carbon monoxide,Carbon monoxide,Carbon monoxide
sample_duration,1 HOUR,1 HOUR,1 HOUR,8-HR RUN AVG END HOUR,8-HR RUN AVG END HOUR,8-HR RUN AVG END HOUR,8-HR RUN AVG END HOUR,1 HOUR,1 HOUR,8-HR RUN AVG END HOUR


In [29]:
if cnf_data.DATASET_STORE == "local":
    !rm -rf {config.ARTIFACTS}/sample_air_quality
    df.write.mode("overwrite").parquet("/tmp/sample_air_quality/")
    !mv /tmp/sample_air_quality/ {config.ARTIFACTS}/
else:
    df.write.mode("overwrite").parquet(f"{dwh['s3']['bucket-1']['FQN']}/sample_air_quality/")

# Load and sample dataset "US Droughts"

In [30]:
# Explicit schema for the US droughts CSV, written as (column name, Spark type)
# pairs in declaration order. Every field uses the StructField default of
# nullable=True.
schema = t.StructType([
    t.StructField(col_name, col_type)
    for col_name, col_type in (
        ("releaseDate", t.DateType()),
        ("FIPS", t.LongType()),
        ("county", t.StringType()),
        ("state", t.StringType()),
        ("NONE", t.FloatType()),
        ("D0", t.FloatType()),
        ("D1", t.FloatType()),
        ("D2", t.FloatType()),
        ("D3", t.FloatType()),
        ("D4", t.FloatType()),
        ("validStart", t.DateType()),
        ("validEnd", t.DateType()),
        ("domStatisticFormatID", t.IntegerType()),
    )
])

1. Clean the "county" field by removing the text " County" from it; the result is stored in a new "county_cleaned" column.

In [31]:
# Read the raw droughts CSV with the explicit schema above; header=True makes
# Spark treat the first line as a header, not data.
df = spark.read.csv(f"{config.RAW_DATA}/us-droughts.csv", header=True, schema=schema)

In [32]:
# Derive "county_cleaned" from "county": regexp_replace removes every match of
# the pattern " County" (the pattern is not anchored to the end of the value),
# then trim strips surrounding spaces. The original "county" column is kept.
df = df.withColumn("county_cleaned", fn.trim(fn.regexp_replace(fn.col("county"), " County", "")))

In [33]:
# # New partition columns
# part_cols = {
#     "part_year": fn.year(fn.col("releaseDate")),
#     "part_month": fn.month(fn.col("releaseDate"))
# }

# for new_col, col_fn in part_cols.items():
#     df = df.withColumn(new_col, col_fn)

In [34]:
# Optional down-sampling, switched on by the data configuration.
if cnf_data.DATASET_SAMPLE:
    # Sample WITHOUT replacement so that no source row can appear more than
    # once in the sample (with replacement, rows may be duplicated), and pass a
    # fixed seed — the same value the wildfires sample uses — so the draw is
    # repeatable for the same input.
    sample = df.sample(
        withReplacement=False,
        fraction=cnf_data.DATASET_SAMPLE_FRAC,
        seed=1,
    )
    df = sample

In [35]:
if cnf_data.DATASET_STORE == "local":
    !rm -rf {config.ARTIFACTS}/sample_us_droughts
    df.write.mode("overwrite").parquet("/tmp/sample_us_droughts")
    !mv /tmp/sample_us_droughts/ {config.ARTIFACTS}/
else:
    df.write.mode("overwrite").parquet(f"{dwh['s3']['bucket-1']['FQN']}/sample_us_droughts")

# Load and sample dataset "Global Temperatures"

In [36]:
# Explicit schema for the land-temperatures-by-state CSV, built field by field;
# StructType.add creates each field with the default nullable=True.
schema = (
    t.StructType()
    .add("dt", t.DateType())
    .add("AverageTemperature", t.FloatType())
    .add("AverageTemperatureUncertainty", t.FloatType())
    .add("State", t.StringType())
    .add("Country", t.StringType())
)

1. We need to keep only the rows whose country is "United States".

In [37]:
# Read the raw temperatures CSV with the explicit schema above; header=True
# makes Spark treat the first line as a header, not data.
df = spark.read.csv(f"{config.RAW_DATA}/GlobalLandTemperaturesByState.csv", header=True, schema=schema)

In [38]:
# Row count of the loaded temperatures data, before filtering by country.
df.count()

645675

In [39]:
# Keep only the rows whose Country equals "United States".
df = df.filter(fn.col("Country") == "United States")

In [40]:
# After the filter every remaining row has the same Country value, so drop the column.
df = df.drop("Country")

In [41]:
# Row count after keeping only the "United States" rows.
df.count()

149745

In [42]:
# Print one row as a quick check of the remaining columns.
df.limit(1).show()

+----------+------------------+-----------------------------+-------+
|        dt|AverageTemperature|AverageTemperatureUncertainty|  State|
+----------+------------------+-----------------------------+-------+
|1743-11-01|            10.722|                        2.898|Alabama|
+----------+------------------+-----------------------------+-------+



In [43]:
# # New partition columns
# part_cols = {
#     "part_year": fn.year(fn.col("dt")),
#     "part_month": fn.month(fn.col("dt"))
# }

# for new_col, col_fn in part_cols.items():
#     df = df.withColumn(new_col, col_fn)

In [44]:
# Optional down-sampling, switched on by the data configuration.
if cnf_data.DATASET_SAMPLE:
    # Sample WITHOUT replacement so that no source row can appear more than
    # once in the sample (with replacement, rows may be duplicated), and pass a
    # fixed seed — the same value the wildfires sample uses — so the draw is
    # repeatable for the same input.
    sample = df.sample(
        withReplacement=False,
        fraction=cnf_data.DATASET_SAMPLE_FRAC,
        seed=1,
    )
    df = sample

In [45]:
# Row count after the country filter and the optional sampling step.
df.count()

7551

In [46]:
if cnf_data.DATASET_STORE == "local":
    !rm -rf {config.ARTIFACTS}/sample_global_temperatures
    df.write.mode("overwrite").parquet("/tmp/sample_global_temperatures")
    !mv /tmp/sample_global_temperatures/ {config.ARTIFACTS}/
else:
    df.write.mode("overwrite").parquet(f"{dwh['s3']['bucket-1']['FQN']}/sample_global_temperatures")