# Lab 2 - Automated Data Quality Assurance Pipeline
### Luke Zaruba
### GIS 5572: ArcGIS II
### 2023-03-09


In this lab, the goal is to build a pipeline that will extract data, perform QAQC operations on the data, and then save the data locally in a File Geodatabase, before saving it to a PostgreSQL database hosted on Google Cloud.

The way in which I do this is as follows:
1. Use OOP to abstract different parts of the pipeline such as:
    - Loading Data (RasterLoader, VectorLoader, DataLoader)
    - Processing Data (RasterProcessor, VectorProcessor)
    - Exporting Data (DataExporter)
2. Take any given input dataset and put it into the DataLoader, which standardizes the format (to NumPy Arrays, which can represent both raster and vector data)


In [None]:
import pandas as pd
import numpy as np
import arcpy
import requests
import os

In [None]:
class DataLoader:
    """Load a raster or vector dataset from disk into a NumPy array.

    Args:
        filepath: Path of the dataset to read.
        data_type: Either ``"raster"`` or ``"vector"``; selects the arcpy
            converter used by :meth:`load_data`.
    """

    def __init__(self, filepath, data_type):
        self.filepath = filepath
        self.data_type = data_type
        # Filled in by load_data(); stays None until then.
        self.data = None

    def load_data(self):
        """Read ``filepath`` and store the resulting array in ``self.data``.

        Raises:
            ValueError: If ``data_type`` is neither ``"raster"`` nor
                ``"vector"``. Failing here avoids handing ``None`` to the
                later pipeline stages.
        """
        if self.data_type == "raster":
            self.data = arcpy.RasterToNumPyArray(self.filepath)
        elif self.data_type == "vector":
            # The feature class converter lives in arcpy.da and requires a
            # field list; "*" requests every field.
            # NOTE(review): arcpy may reject nulls in integer fields unless a
            # null_value is supplied - confirm against the datasets in use.
            self.data = arcpy.da.FeatureClassToNumPyArray(self.filepath, "*")
        else:
            raise ValueError(
                f"Unsupported data_type {self.data_type!r}; "
                "expected 'raster' or 'vector'"
            )


"""
>>> landcover_loader = DataLoader(r"file/path/lc.tif", "raster")
>>> landcover_loader.load_data()
>>> type(landcover_loader.data)
<class 'numpy.ndarray'>

>>> landcover_processor = RasterDataProcessor(landcover_loader.data)
>>>
"""

In [None]:
class RasterDataProcessor:
    """Compute summary statistics and run quality checks on raster data.

    ``data`` may be a plain NumPy array of cell values or a raster object
    that additionally exposes ``noDataValue``, ``meanCellWidth``,
    ``meanCellHeight``, ``spatialReference`` and ``extent``. The metadata
    checks only run when those properties are available.
    """

    def __init__(self, data):
        self.data = data
        # Results, populated by calculate_stats() / check_quality().
        self.stats = None
        self.spatial_resolution = None
        self.crs = None
        self.bbox = None

    def _as_array(self):
        """Return the cell values as a NumPy array."""
        if isinstance(self.data, np.ndarray):
            return self.data
        return arcpy.RasterToNumPyArray(self.data)

    def calculate_stats(self):
        """Store ``DataFrame.describe()`` of the cell values in ``self.stats``."""
        self.stats = pd.DataFrame(self._as_array()).describe()

    def check_quality(self):
        """Print warnings for NoData cells, pixel shape, CRS and extent.

        Sets ``spatial_resolution``, ``crs`` and ``bbox`` when the input
        carries raster metadata; otherwise those attributes stay ``None``
        and a single warning explains that the checks were skipped.
        """
        # Check for missing or null values. The comparison is done on the
        # cell array so the count is element-wise.
        nodata_val = getattr(self.data, "noDataValue", None)
        if nodata_val is not None:
            null_count = np.count_nonzero(self._as_array() == nodata_val)
            if null_count > 0:
                print(f"Warning: {null_count} null or missing values detected")

        # A bare array has no georeferencing, so nothing below can be checked.
        if not hasattr(self.data, "spatialReference"):
            print(
                "Warning: no raster metadata available; skipping resolution, "
                "coordinate system and extent checks"
            )
            return

        # Validate spatial resolution
        cell_size_x = self.data.meanCellWidth
        cell_size_y = self.data.meanCellHeight
        self.spatial_resolution = (cell_size_x, cell_size_y)
        if cell_size_x != cell_size_y:
            print("Warning: non-square pixels detected")

        # Validate coordinate system. arcpy reports an undefined spatial
        # reference with the name "Unknown" rather than an empty string.
        self.crs = self.data.spatialReference
        if not self.crs.name or self.crs.name == "Unknown":
            print("Warning: unknown coordinate system")
        else:
            print(f"Coordinate system: {self.crs.name}")

        # Record the bounding box
        extent = self.data.extent
        self.bbox = (extent.XMin, extent.YMin, extent.XMax, extent.YMax)

        # Degree limits only make sense for geographic coordinates;
        # projected extents are in linear units and would always trip them.
        if getattr(self.crs, "type", None) == "Geographic" and (
            self.bbox[0] < -180
            or self.bbox[1] < -90
            or self.bbox[2] > 180
            or self.bbox[3] > 90
        ):
            print("Warning: data extends beyond the valid geographic extent (-180,-90,180,90)")

In [None]:
class VectorDataProcessor:
    """Compute summary statistics and run quality checks on vector data.

    ``data`` may be in-memory (a NumPy structured array or a pandas
    DataFrame) or anything arcpy accepts as a feature class input, such as
    a dataset path. Geometry, coordinate system and extent checks need the
    arcpy-readable form; in-memory data only gets the null check.
    """

    # Fields that are never scanned for nulls.
    _IGNORED_FIELDS = ("OID", "Shape", "Shape_Length", "Shape_Area")
    # arcpy field types treated as numeric.
    _NUMERIC_FIELD_TYPES = ("Integer", "SmallInteger", "Single", "Double")

    def __init__(self, data):
        self.data = data
        # Results, populated by calculate_stats() / check_quality().
        self.stats = None
        self.crs = None
        self.bbox = None
        self.topology_errors = None

    def _is_in_memory(self):
        """Return True when ``data`` is an array/DataFrame, not a dataset."""
        return isinstance(self.data, (np.ndarray, pd.DataFrame))

    def _numeric_field_names(self):
        """List the dataset's numeric, non-system field names."""
        return [
            field.name
            for field in arcpy.ListFields(self.data)
            if field.name not in self._IGNORED_FIELDS
            and field.type in self._NUMERIC_FIELD_TYPES
        ]

    def _to_dataframe(self):
        """Return the attribute values as a DataFrame."""
        if isinstance(self.data, pd.DataFrame):
            return self.data
        if isinstance(self.data, np.ndarray):
            if self.data.dtype.names:
                # Multi-valued fields (e.g. an x/y Shape pair) cannot become
                # a single DataFrame column, so only scalar fields are kept.
                return pd.DataFrame(
                    {
                        name: self.data[name]
                        for name in self.data.dtype.names
                        if self.data[name].ndim == 1
                    }
                )
            return pd.DataFrame(self.data)
        names = self._numeric_field_names()
        if not names:
            return pd.DataFrame()
        # A cursor yields None for nulls, which pandas turns into NaN.
        with arcpy.da.SearchCursor(self.data, names) as cursor:
            return pd.DataFrame(list(cursor), columns=names)

    def calculate_stats(self):
        """Store ``DataFrame.describe()`` of the attributes in ``self.stats``."""
        df = self._to_dataframe()
        # describe() raises on a frame without columns; keep the empty frame.
        self.stats = df.describe() if len(df.columns) else df

    def check_quality(self):
        """Print warnings for nulls, geometry problems, CRS and extent.

        Sets ``topology_errors``, ``crs`` and ``bbox`` when ``data`` is an
        arcpy-readable dataset; for in-memory data those stay ``None`` and
        a single warning explains that the checks were skipped.
        """
        # Check for missing or null values
        df = self._to_dataframe()
        if self._is_in_memory():
            df = df.select_dtypes(include="number")
        for name, null_count in df.isna().sum().items():
            if name in self._IGNORED_FIELDS:
                continue
            if null_count > 0:
                print(f"Warning: {null_count} null or missing values detected in field {name}")

        if self._is_in_memory():
            print(
                "Warning: in-memory data has no geometry metadata; skipping "
                "topology, coordinate system and extent checks"
            )
            return

        # Validate topology. Every tool run emits messages, so the number of
        # rows in the problem report is what signals a real problem.
        result = arcpy.CheckGeometry_management(self.data)
        problem_table = result.getOutput(0)
        problem_count = int(arcpy.GetCount_management(problem_table).getOutput(0))
        if problem_count > 0:
            self.topology_errors = result.getMessages()
            print("Warning: topology errors detected")

        description = arcpy.Describe(self.data)

        # Validate coordinate system. arcpy reports an undefined spatial
        # reference with the name "Unknown" rather than an empty string.
        self.crs = description.spatialReference
        if not self.crs.name or self.crs.name == "Unknown":
            print("Warning: unknown coordinate system")
        else:
            print(f"Coordinate system: {self.crs.name}")

        # Record the bounding box
        extent = description.extent
        self.bbox = (extent.XMin, extent.YMin, extent.XMax, extent.YMax)

        # Degree limits only make sense for geographic coordinates;
        # projected extents are in linear units and would always trip them.
        if getattr(self.crs, "type", None) == "Geographic" and (
            self.bbox[0] < -180
            or self.bbox[1] < -90
            or self.bbox[2] > 180
            or self.bbox[3] > 90
        ):
            print("Warning: data extends beyond the valid geographic extent (-180,-90,180,90)")

        # TODO: remove outliers within a given field (not implemented yet)



In [None]:
class DataExporter:
    """Write a processing result to ``filepath``.

    Args:
        data: A pandas DataFrame (written as a table) or any object that
            provides a ``save(path)`` method.
        filepath: Destination path handed to arcpy / ``save``.
    """

    def __init__(self, data, filepath):
        self.data = data
        self.filepath = filepath

    @staticmethod
    def _to_structured_array(frame):
        """Convert a DataFrame, index included, to a NumPy record array.

        Object-typed columns (such as the row labels produced by
        ``describe()``) are cast to fixed-width text so that no field ends
        up with a generic Python-object dtype.
        """
        flat = frame.reset_index()
        flat.columns = [str(name) for name in flat.columns]
        text_dtypes = {
            name: "U255" for name in flat.columns if flat[name].dtype.kind == "O"
        }
        return flat.to_records(index=False, column_dtypes=text_dtypes)

    def export_data(self):
        """Export ``data`` to ``filepath``.

        Raises:
            ValueError: If ``data`` is ``None`` (nothing was computed).
        """
        if self.data is None:
            raise ValueError("No data to export; compute the data before exporting")
        if isinstance(self.data, pd.DataFrame):
            # A DataFrame of statistics has no geometry, so it is written as
            # a plain table; the feature class writer needs shape fields.
            arcpy.da.NumPyArrayToTable(
                self._to_structured_array(self.data), self.filepath
            )
        else:
            self.data.save(self.filepath)


In [None]:
class QAQC:
    """Run the load -> process -> export pipeline for one dataset.

    Args:
        filepath: Path of the dataset to check.
        data_type: ``"raster"`` or ``"vector"``; selects the processor class.
    """

    def __init__(self, filepath, data_type):
        self.filepath = filepath
        self.data_type = data_type
        self.dataloader = DataLoader(filepath, data_type)
        # Created in run_pipeline(): the processor needs the loaded data and
        # the exporter needs the computed statistics, neither of which
        # exists yet at construction time.
        self.dataprocessor = None
        self.dataexporter = None

    def _make_processor(self, data):
        """Build the processor matching ``data_type``.

        Raises:
            ValueError: If ``data_type`` is neither ``"raster"`` nor
                ``"vector"``.
        """
        if self.data_type == "raster":
            return RasterDataProcessor(data)
        if self.data_type == "vector":
            return VectorDataProcessor(data)
        raise ValueError(
            f"Unsupported data_type {self.data_type!r}; "
            "expected 'raster' or 'vector'"
        )

    def run_pipeline(self):
        """Load the data, compute stats, run the checks and export the stats."""
        self.dataloader.load_data()
        self.dataprocessor = self._make_processor(self.dataloader.data)
        self.dataprocessor.calculate_stats()
        self.dataprocessor.check_quality()
        self.dataexporter = DataExporter(
            self.dataprocessor.stats, f"{self.filepath}_stats"
        )
        self.dataexporter.export_data()