# Lab 2 - Automated Data Quality Assurance Pipeline
### Luke Zaruba
### GIS 5572: ArcGIS II
### 2023-03-09


In this lab, the goal is to build a pipeline that will extract data, perform QAQC operations on the data, and then save the data locally in a File Geodatabase, before saving it to a PostgreSQL database hosted on Google Cloud.

The way in which I do this is as follows:
1. Use OOP to abstract different parts of the pipeline such as:
    - Loading Data (RasterLoader, VectorLoader, DataLoader)
    - Processing Data (RasterProcessor, VectorProcessor)
    - Exporting Data (DataExporter)
2. Take any given input dataset and put it into the DataLoader, which standardizes the format (to NumPy Arrays, which can represent both raster and vector data)


In [2]:
# Packages for Processing Data
import pandas as pd
import numpy as np
import arcpy

# Packages for Accessing Data
import requests
from io import BytesIO
from zipfile import ZipFile

import os

In [4]:
# Daily weather observations for the MN_RWIS station network, January 2023,
# requested as GeoJSON from the mesonet.agron.iastate.edu API.
weather_url = r"https://mesonet.agron.iastate.edu/api/1/daily.geojson?network=MN_RWIS&month=1&year=2023"

# EDD Maps BMSB data - requested and received via email, so it is read from a
# local CSV path rather than downloaded.
bmsb_csv = r"C:\gitFiles\GIS5572\Lab2\Data\eddmaps_bmsb.csv"

# MN NLCD 2019 land cover, distributed as a zipped TIF.
landcover_url = r"https://resources.gisdata.mn.gov/pub/gdrs/data/pub/us_mn_state_dnr/biota_landcover_nlcd_mn_2019/tif_biota_landcover_nlcd_mn_2019.zip"

# MN 30 m digital elevation model, distributed as a zipped file geodatabase.
elevation_url = r"https://resources.gisdata.mn.gov/pub/gdrs/data/pub/us_mn_state_dnr/elev_30m_digital_elevation_model/fgdb_elev_30m_digital_elevation_model.zip"

## Raster Pipeline

In [7]:
class RasterQAQC:
    """Download, convert, and quality-check a single raster dataset.

    Intended call order is ``unzip`` -> ``reformat`` -> ``check_raster``.
    ``reformat`` records the raster to inspect in ``self.file_path``;
    ``check_raster`` reads it from there and prints its findings.

    Attributes:
        url: Address of the zipped dataset to download.
        file_type: ``"TIF"`` or ``"GDB"`` (case-insensitive), describing how
            the raster is stored inside the archive.
        output_dir: Folder that receives the archive, its extracted contents,
            and any converted TIFF.
        file_path: Path of the raster to check, or ``None`` until
            ``reformat`` has run.
    """

    def __init__(self, url, file_type, output_dir):
        self.url = url
        self.file_type = file_type
        self.output_dir = output_dir
        # Populated by reformat(); check_raster() refuses to run without it.
        self.file_path = None

    def unzip(self, zip_file_name):
        """Download ``self.url`` and extract it into ``self.output_dir``.

        Args:
            zip_file_name: Name (without ``.zip``) for the downloaded archive.

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        # The original assumed the folder already existed.
        os.makedirs(self.output_dir, exist_ok=True)
        zip_path = os.path.join(self.output_dir, f"{zip_file_name}.zip")

        # Stream to disk in chunks: statewide rasters are large, and holding
        # the whole response body in memory is unnecessary.
        with requests.get(self.url, stream=True, timeout=60) as response:
            # Fail here instead of writing an HTML error page to the .zip.
            response.raise_for_status()
            with open(zip_path, "wb") as wf:
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    wf.write(chunk)

        # Extract to File
        with ZipFile(zip_path, "r") as uf:
            uf.extractall(self.output_dir)

    def reformat(self, file_path):
        """Make the raster available as a TIFF and remember its path.

        For ``"TIF"`` inputs the path is used as is. For ``"GDB"`` inputs the
        raster is copied out of the geodatabase into ``self.output_dir``.

        Args:
            file_path: Path of the source raster.

        Raises:
            ValueError: If ``self.file_type`` is neither ``"TIF"`` nor
                ``"GDB"``.
        """
        file_type = str(self.file_type).upper()

        if file_type == "TIF":
            self.file_path = file_path

        elif file_type == "GDB":
            # Split Input File Path to Name and Dir
            _, file_name = os.path.split(file_path)

            # Geodatabase raster names carry no extension; give the output an
            # explicit .tif so the copy in the folder is written as a TIFF.
            if os.path.splitext(file_name)[1].lower() not in (".tif", ".tiff"):
                file_name = f"{file_name}.tif"

            out_path = os.path.join(self.output_dir, file_name)

            # "TIFF" must go in by keyword: the third positional parameter of
            # CopyRaster is config_keyword, not format.
            arcpy.management.CopyRaster(file_path, out_path, format="TIFF")

            # Only record the path once the copy has succeeded.
            self.file_path = out_path

        else:
            raise ValueError(
                f"Unsupported file_type {self.file_type!r}; expected 'TIF' or 'GDB'."
            )

    def _raster_property(self, property_type):
        """Return one GetRasterProperties value for ``self.file_path`` as a string."""
        result = arcpy.management.GetRasterProperties(self.file_path, property_type)
        return result.getOutput(0)

    def _raster_float(self, property_type):
        """Return one numeric GetRasterProperties value as a float."""
        return float(self._raster_property(property_type))

    def check_raster(self, expected_cell_size, categorical=True, expected_srid=None, xmin=None, ymin=None, xmax=None, ymax=None):
        """Print a QA/QC report for the raster at ``self.file_path``.

        Checks, in order: NoData presence, cell size, outliers (continuous
        rasters only), coordinate system, and containment in a bounding box.

        Args:
            expected_cell_size: Expected cell width and height, in the
                raster's own units.
            categorical: When true, the outlier check is skipped.
            expected_srid: Value accepted by ``arcpy.SpatialReference`` for
                the expected coordinate system; ``None`` only reports the
                actual one.
            xmin, ymin, xmax, ymax: Bounding box in the raster's coordinate
                system. The check runs only when all four are given.

        Raises:
            RuntimeError: If ``reformat`` has not set ``self.file_path`` yet.
        """
        if getattr(self, "file_path", None) is None:
            raise RuntimeError("No raster to check; call reformat() before check_raster().")

        import math  # local import keeps this block self-contained

        # Check for Null Values
        null_values = self._raster_property("ANYNODATA")

        print(f"Null values exist: {null_values}.")

        # Check if Cell Size is Correct. The sizes are parsed from text, so
        # compare with a tolerance rather than exact float equality.
        x_size = self._raster_float("CELLSIZEX")
        y_size = self._raster_float("CELLSIZEY")

        if math.isclose(x_size, expected_cell_size) and math.isclose(y_size, expected_cell_size):
            print("Actual spatial resolution matches expected spatial resolution.")
        else:
            print("Actual spatial resolution does not match expected spatial resolution.")

        # If Dataset is not Categorical, Check if there are Outliers
        if not categorical:
            mean_val = self._raster_float("MEAN")
            std_val = self._raster_float("STD")

            max_val = self._raster_float("MAXIMUM")
            min_val = self._raster_float("MINIMUM")

            # Check if Min < Mean - 3 Std Devs or if Max > Mean + 3 Std Devs
            lower_limit = mean_val - (3 * std_val)
            upper_limit = mean_val + (3 * std_val)

            if min_val < lower_limit or max_val > upper_limit:
                print("Outliers exist within the dataset. Values exist outside of +- 3 standard deviations of the mean.")
            else:
                print("Outliers do not exist within the dataset. No values +- 3 standard deviations of the mean.")
        else:
            print("Raster is categorical. Not checking for outliers.")

        # Check CRS of Raster
        sr = arcpy.Describe(self.file_path).spatialReference

        if expected_srid is None:
            print(f"Coordinate system of the raster is: {sr}")
        else:
            arcpy_expected_sr = arcpy.SpatialReference(expected_srid)

            # Keep the original object comparison, but also accept a matching
            # non-zero WKID so two separately built objects for the same
            # coordinate system are recognised as equal.
            same_sr = arcpy_expected_sr == sr or (
                sr.factoryCode != 0 and sr.factoryCode == arcpy_expected_sr.factoryCode
            )

            if same_sr:
                print("Actual coordinate system matches expected coordinate system.")
            else:
                print("Actual coordinate system does not match expected coordinate system.")
                print(f"Coordinate system of the raster is: {sr}")

        # Check if Raster is within the Bounding Box (four coordinates in the
        # raster's own coordinate system)
        if all(bound is not None for bound in (xmin, ymin, xmax, ymax)):
            # getOutput() returns text; convert before comparing with the
            # numeric bounds (str < number raises TypeError in Python 3).
            left = self._raster_float("LEFT")
            bottom = self._raster_float("BOTTOM")
            right = self._raster_float("RIGHT")
            top = self._raster_float("TOP")

            if left < xmin or bottom < ymin or right > xmax or top > ymax:
                print("Raster is not completely contained withing the bounding box coordinates.")
            else:
                print("Raster is completely contained withing the bounding box coordinates.")
        else:
            print("Not checking bounding box.")


In [9]:
# Build the QA/QC helper for the elevation dataset (stored as a geodatabase)
dem_qaqc = RasterQAQC(
    url=elevation_url,
    file_type="GDB",
    output_dir=r"C:\gitFiles\GIS5572\Lab2\Data\dem",
)

# Download the archive as dem_zip.zip and extract it into the output folder
dem_qaqc.unzip(zip_file_name="dem_zip")

KeyboardInterrupt: 

In [None]:

# Reformat Raster
#dem_qaqc.reformat(r"C:\gitFiles\GIS5572\Lab2\Data\dem\mn_elevation.tif")