### Lab 2 - Data Quality and Model Evaluation
#### Mattie Gisselbeck
#### GIS 5572 - ArcGIS II
#### March 20, 2023

The objective of this lab was to build a pipeline that extracts data from various platforms, performs QA/QC operations on the imported data, saves the data to a local geodatabase, and then saves it to a PostgreSQL database hosted on Google Cloud.
<br>
<br>
Data Sources

[Digital Elevation Model (DEM) - 30 Meter Resolution, Minnesota Geospatial Commons](https://gisdata.mn.gov/dataset/elev-30m-digital-elevation-model) <br>
[NLCD Land Cover, 2019, Minnesota Geospatial Commons](https://gisdata.mn.gov/dataset/biota-landcover-nlcd-mn-2019) <br>
[Brown Marmorated Stink Bug (BMSB) Observances, Minnesota, EDDMapS](https://www.eddmaps.org/distribution/viewmap.cfm?sub=9328) <br>
[RWIS Daily Weather (Minnesota RWIS), Iowa Environmental Mesonet (IEM)](https://mesonet.agron.iastate.edu/api/1/docs#/)

In [None]:
import io
import json
import math
import os
import warnings
import zipfile

import arcgis
import arcpy
import fiona
import pandas as pd
import psycopg2
import requests
import shapefile
from fiona.crs import from_epsg
from shapely import geometry
from shapely.geometry import Point, shape

In [None]:
# NLCD Land Cover (2019), Minnesota - Minnesota Geospatial Commons
# (GeoTIFF on local disk)
landcover_path = r"/Users/mattiegisselbeck/Documents/GIS5572/Lab2/NLCD_2019_Land_Cover.tif"

# Digital Elevation Model (DEM), Minnesota - Minnesota Geospatial Commons
# (raster dataset stored inside a file geodatabase)
dem_path = r"/Users/mattiegisselbeck/Documents/GIS5572/Lab2/elev_30m_digital_elevation_model.gdb/digital_elevation_model_30m"

# Brown Marmorated Stink Bug (BMSB) Observances, Minnesota - EDDMapS
# (CSV file on local disk)
bmsb_path = r"/Users/mattiegisselbeck/Documents/GIS5572/Lab2/BMSBObservances_EDDMapS.csv"

In [None]:
# Output file geodatabase (FGDB) path
# NOTE(review): later cells build their output paths from `out_local`, which is
# not defined anywhere in this notebook - presumably it is meant to be this
# path; confirm.
local_path = r"/Users/mattiegisselbeck/Documents/GIS5572/Lab2/Lab2.gdb" 

## NLCD Land Cover and Digital Elevation Model (DEM), Minnesota - Minnesota Geospatial Commons
### Quality Assurance (QA) 

In [None]:
def check_raster(file_path, categorical=True, expected_cell_size=None, expected_srid=None, xmin=None, ymin=None, xmax=None, ymax=None, tolerance=1.0):
    """
    Print a quality report for a raster dataset before any fixes are applied.

    The function only reports; it does not modify the raster and returns None.

    Parameters
    ----------
    file_path : str
        Path to the raster dataset.
    categorical : bool
        When True the outlier check is skipped.
    expected_cell_size : float or None
        Expected cell width and height, in the raster's linear unit.
        When None the actual cell size is printed instead of compared.
    expected_srid : int or None
        Expected spatial-reference factory code (e.g. an EPSG code).
        When None the raster's coordinate system is printed instead.
    xmin, ymin, xmax, ymax : float or None
        Bounding box the raster should fall inside, expressed in the SAME
        coordinate system (and therefore the same units) as the raster.
        The bounding-box check is skipped unless all four are given.
    tolerance : float
        How far (in coordinate-system units) the raster may extend past the
        bounding box and still be reported as a near miss rather than as
        not contained.
    """
    import math  # local import: only needed for the tolerant cell-size comparison

    def raster_property(name):
        """Return one Get Raster Properties value as the string arcpy reports."""
        return arcpy.management.GetRasterProperties(file_path, name).getOutput(0)

    # Check for Null Values ("1" means at least one NoData cell exists)
    if raster_property("ANYNODATA") == "1":
        print("Null values exist.")
    else:
        print("Null values do not exist.")

    # Check if Cell Size is Correct
    x_size = float(raster_property("CELLSIZEX"))
    y_size = float(raster_property("CELLSIZEY"))

    if expected_cell_size is None:
        print(f"Spatial resolution of the raster is: {x_size} x {y_size}")
    elif (math.isclose(x_size, expected_cell_size, abs_tol=1e-9)
          and math.isclose(y_size, expected_cell_size, abs_tol=1e-9)):
        print("Actual spatial resolution matches expected spatial resolution.")
    else:
        print("Actual spatial resolution does not match expected spatial resolution.")

    # If Dataset is not Categorical, Check if there are Outliers
    if categorical:
        print("Raster is categorical. Not checking for outliers.")
    else:
        mean_val = float(raster_property("MEAN"))
        std_val = float(raster_property("STD"))
        max_val = float(raster_property("MAXIMUM"))
        min_val = float(raster_property("MINIMUM"))

        # Check if Min < Mean - 3 Std Devs or if Max > Mean + 3 Std Devs
        if min_val < mean_val - 3 * std_val or max_val > mean_val + 3 * std_val:
            print("Outliers exist within the dataset. Values exist outside of +- 3 standard deviations of the mean.")
        else:
            print("Outliers do not exist within the dataset. No values +- 3 standard deviations of the mean.")

    # Check CRS of Raster
    sr = arcpy.Describe(file_path).spatialReference

    if expected_srid is None:
        print(f"Coordinate system of the raster is: {sr.name} ({sr.factoryCode})")
    elif arcpy.SpatialReference(expected_srid).factoryCode == sr.factoryCode:
        print("Actual coordinate system matches expected coordinate system.")
    else:
        print("Actual coordinate system does not match expected coordinate system.")
        print(f"Coordinate system of the raster is: {sr.factoryCode}")

    # Check if Raster is within Bounding Box (given in the raster's own CS)
    if any(value is None for value in (xmin, ymin, xmax, ymax)):
        print("Not checking bounding box.")
        return

    left = float(raster_property("LEFT"))
    bottom = float(raster_property("BOTTOM"))
    right = float(raster_property("RIGHT"))
    top = float(raster_property("TOP"))

    # Largest distance by which any raster edge sticks out of the box
    # (zero or negative when the raster is fully inside).
    overshoot = max(xmin - left, ymin - bottom, right - xmax, top - ymax)

    if overshoot <= 0:
        print("Raster is completely contained within the bounding box coordinates.")
    elif overshoot <= tolerance:
        print(f"Raster extends past the bounding box by no more than {tolerance} coordinate-system unit(s). Use caution and consider inspecting manually.")
    else:
        print("Raster is not completely contained within the bounding box coordinates.")


In [None]:
# Checking NLCD Land Cover
# The bounding box must be in the raster's own coordinate system. The raster
# is expected in EPSG:26915 (metres), so the Minnesota envelope is given in
# those units (the same envelope the notebook passes to Clip) rather than in
# decimal degrees, which could never match a projected extent.
check_raster(
    landcover_path,
    categorical=True,
    expected_cell_size=30,
    expected_srid=26915,
    xmin=132660,
    ymin=4774410,
    xmax=791819,
    ymax=5491608,
)

In [None]:
# Checking Digital Elevation Model (DEM)
# `dem_path` is the variable defined with the input paths (`elevation_path`
# does not exist). The bounding box is given in EPSG:26915 metres so that it
# is comparable with the projected raster extent.
check_raster(
    dem_path,
    categorical=False,
    expected_cell_size=30,
    expected_srid=26915,
    xmin=132660,
    ymin=4774410,
    xmax=791819,
    ymax=5491608,
)

The extent of the NLCD Land Cover raster extends beyond the state boundary, so it was clipped to the Minnesota bounding box.

In [None]:
# Clipping NLCD Land Cover to Minnesota BBox
# The envelope string is "xmin ymin xmax ymax"; the result is written into the
# output file geodatabase (`local_path`) as "clipped_landcover".
arcpy.management.Clip(
    landcover_path,
    "132660 4774410 791819 5491608",
    os.path.join(local_path, "clipped_landcover"),
)

## Brown Marmorated Stink Bug (BMSB) Observances, Minnesota - EDDMapS
### Quality Assurance (QA)

In [None]:
# Loading in BMSB Observations
bmsb_df_raw = pd.read_csv(bmsb_path)

# Create Copy DF with only Certain Columns
bmsb_df = bmsb_df_raw[["objectid", "ObsDate", "Location", "Latitude", "Longitude", "NumCollect"]].copy()

# Filter where Location Contains 'Minnesota'
# (na=False treats a missing Location as "not Minnesota" instead of producing
# a NaN mask that cannot be used for indexing; .copy() makes the column
# assignments below operate on an independent frame)
bmsb_df = bmsb_df[bmsb_df["Location"].str.contains("Minnesota", na=False)].copy()

# Fill 'NumCollect' Nulls with 1
# (assign the result back: an in-place fillna on a selected column is a
# chained assignment and is not guaranteed to update the frame)
bmsb_df["NumCollect"] = bmsb_df["NumCollect"].fillna(1)

# Drop Rows with Null 'Latitude' or 'Longitude'
bmsb_df = bmsb_df.dropna(subset=["Latitude", "Longitude"])

# Convert Data Types
bmsb_df["Location"] = bmsb_df["Location"].astype(str)
bmsb_df["ObsDate"] = pd.to_datetime(bmsb_df["ObsDate"])
bmsb_df["NumCollect"] = bmsb_df["NumCollect"].astype(int)

# Reconfigure the Location Column to just show County Name:
# strip double quotes, then keep the text before the first comma
bmsb_df["County"] = (
    bmsb_df["Location"].str.replace('"', "", regex=False).str.split(",").str[0]
)
bmsb_df = bmsb_df.drop(columns=["Location"])

# Drop Rows where 'NumCollect' < 1
# (the previous `~series < 1` parsed as `(~series) < 1`, i.e. a bitwise NOT of
# the counts, and therefore did not remove the intended rows)
bmsb_df = bmsb_df.loc[bmsb_df["NumCollect"] >= 1]

# Drop Rows where 'NumCollect' are Outliers (> 1 Std Dev above the Mean)
numMean = bmsb_df["NumCollect"].mean()
numStd = bmsb_df["NumCollect"].std()

# The standard deviation is NaN when fewer than two rows remain; skip the
# filter then, otherwise every comparison would be False and drop all rows.
if pd.notna(numStd):
    bmsb_df = bmsb_df.loc[bmsb_df["NumCollect"] <= numMean + numStd]

# Drop Rows where Lat/Lon are Outside MN BBox
inside_bbox = (
    (bmsb_df["Longitude"] > -97.5)
    & (bmsb_df["Longitude"] < -89.0)
    & (bmsb_df["Latitude"] > 43.0)
    & (bmsb_df["Latitude"] < 49.5)
)
bmsb_df = bmsb_df.loc[inside_bbox]

# Result
bmsb_df

In [None]:
# Convert BMSB Observations from DF to SEDF (x = Longitude, y = Latitude)
bmsb_sedf = arcgis.GeoAccessor.from_xy(bmsb_df, "Longitude", "Latitude")

# Convert BMSB Observations from SEDF to FC inside the output file
# geodatabase (`local_path`)
bmsb_sedf.spatial.to_featureclass(location=os.path.join(local_path, "bmsb_observations"))

## Daily Weather, Minnesota - Iowa Environmental Mesonet
### GET Request

In [None]:
# Request weather data from the weather stations of the RWIS Minnesota Network
# for one arbitrary day; this response is only used to collect each station's
# identifier and coordinates.
link = r'https://mesonet.agron.iastate.edu/api/1/daily.geojson?date=2023-03-15&network=MN_RWIS'
response = requests.get(link, timeout=60)
response.raise_for_status()  # fail loudly on an HTTP error instead of parsing an error page
info = response.json()

# Store each station and its coordinates
location = [
    {
        'station': feature['properties']['station'],
        'coordinates': feature['geometry']['coordinates'],
    }
    for feature in info['features']
]

# Request daily weather data by year (2023)
url = r'https://mesonet.agron.iastate.edu/api/1/daily.geojson?network=MN_RWIS&year=2023'
response = requests.get(url, timeout=60)
response.raise_for_status()
weather = response.json()

# Delete all the unnecessary weather variables and keep only minimum and maximum temperature
delete = [
    "tmpf_est",
    "precip",
    "precip_est",
    "max_gust",
    "snow",
    "snowd",
    "min_rh",
    "max_rh",
    "min_dwpf",
    "max_dwpf",
    "min_feel",
    "max_feel",
    "min_rstage",
    "max_rstage",
    "temp_hour",
    "max_gust_localts",
    "max_drct",
    "avg_feel",
    "avg_sknt",
    "vector_avg_drct",
    "id",
]

for feature in weather['features']:
    properties = feature['properties']
    for key in delete:
        # pop with a default: a property missing from the response is simply
        # skipped instead of raising KeyError
        properties.pop(key, None)

In [None]:
# Request Minnesota State Boundary from Minnesota Geospatial Commons
mnboundary_url = "https://resources.gisdata.mn.gov/pub/gdrs/data/pub/us_mn_state_dot/bdry_state/shp_bdry_state.zip"

# Folder the archive is unpacked into. The notebook never defined `wksp`, so
# the current working directory is used; relative shapefile names used by
# other cells then resolve to the same folder.
wksp = os.getcwd()

# Downloading a static file is a GET, not a POST
mnboundary = requests.get(mnboundary_url, timeout=120)
mnboundary.raise_for_status()
with zipfile.ZipFile(io.BytesIO(mnboundary.content)) as archive:
    archive.extractall(wksp)

# Project Boundaries_of_Minnesota.shp to WGS84 (EPSG:4326)
# NOTE(review): assumes the archive contains "Boundaries_of_Minnesota.shp" at
# its top level - confirm against the extracted contents.
sr = arcpy.SpatialReference(4326)
arcpy.management.Project(
    os.path.join(wksp, 'Boundaries_of_Minnesota.shp'),
    os.path.join(wksp, 'MinnesotaBoundry.shp'),
    sr,
)

### Quality Assurance (QA)

Note: the optimum temperature range for BMSB is 62–99 °F, but a different range (10–50 °F) is used here to test the code, because the temperatures recorded so far in 2023 are still below the optimum range due to winter.

In [None]:
def ranges_intersect(range1, range2):
    """
    Return True if the two ranges intersect, False otherwise.

    Each argument is either a (minimum, maximum) tuple or an ascending
    ``range`` object. The bounds are read from the first and last element, so
    a ``range`` is interpreted by its actual end points (index 1 of a
    ``range`` is its second element, not its maximum). Both bounds are
    inclusive, so ranges that merely touch count as intersecting.

    An empty sequence (for example ``range(5, 5)``) intersects nothing.
    """
    if len(range1) == 0 or len(range2) == 0:
        return False

    low1, high1 = range1[0], range1[-1]
    low2, high2 = range2[0], range2[-1]

    # Disjoint only when one range ends before the other begins
    return not (high1 < low2 or high2 < low1)
    
# Define optimal temperature for BMSB
# The trailing "#62" / "#99" comments record alternative bounds - presumably
# the real optimum range that the working values temporarily replace; confirm.
lower_temp = 10 #62
upper_temp = 50 #99
# upper_temp + 1 so that the range object includes upper_temp itself
opt_temp = range(lower_temp, upper_temp+1)

In [None]:
# Read in the shapefile data for Minnesota.
# "MinnesotaBoundry.shp" is the WGS84 boundary written by the Project step of
# this notebook; the station coordinates tested below are presumably
# longitude/latitude as well - confirm.
with shapefile.Reader("MinnesotaBoundry.shp") as sf:
    shapes = sf.shapes()
state_border = shapes[0]

# Create a shapely Polygon object from the state border shape
border_polygon = shape(state_border)

# Readings that are not useful (kept in a list for inspection)
wrong = []
# Readings that pass every check
useful = []

for feature in weather['features']:
    min_tmpf = feature['properties']['min_tmpf']
    max_tmpf = feature['properties']['max_tmpf']

    # Readings whose temp readings are None
    if min_tmpf is None or max_tmpf is None:
        wrong.append(feature)
        continue

    # Stations outside of Minnesota
    point = Point(feature['geometry']['coordinates'])
    if not border_polygon.contains(point):
        wrong.append(feature)
        continue

    # Readings whose min and max temp are the same. This is an indicator of wrong data
    if min_tmpf == max_tmpf:
        wrong.append(feature)
        continue

    # Readings not representative of the broader region if max and min temp
    # are similar: the whole-degree span from floor(min) to ceil(max) is 1
    lower_limit = math.floor(min_tmpf)
    upper_limit = math.ceil(max_tmpf)
    if upper_limit - lower_limit == 1:
        wrong.append(feature)
        continue

    useful.append(feature)

# Keep only the useful readings (built in one pass instead of calling
# list.remove once per rejected reading)
weather['features'] = useful

In [None]:
# Checking the Monthly Average Temperature

# Collect the property dictionaries and load them into a data frame
stations = [feature['properties'] for feature in weather['features']]
df = pd.DataFrame(stations)

# Remove the day part from the date leaving only year and month
# (first 7 characters, e.g. "2023-03"); done on the whole column at once
# because per-row `df['date'][i] = ...` is a chained assignment that pandas
# does not guarantee to write back
df['date'] = df['date'].str[:7]

# Get monthly average min and max temperature for each station
# (numeric_only=True: only numeric columns can be averaged)
grouped = df.groupby(['station', 'date', 'name']).mean(numeric_only=True).reset_index()

# Return data to a list of dictionaries, one per station and month
mean = grouped.to_dict('records')

In [None]:
# Add the geometry to the stations
# Index the coordinates by station id once, instead of scanning the whole
# `location` list for every monthly record.
coords_by_station = {entry['station']: entry['coordinates'] for entry in location}

mean_tmp = [
    {
        'type': 'Feature',
        'properties': record,
        'geometry': {'type': 'Point', 'coordinates': coords_by_station[record['station']]},
    }
    for record in mean
    if record['station'] in coords_by_station  # stations without coordinates are skipped
]

# Remove monthly average temperature if outside of the optimum range
bad_data = []
in_range = []
for feature in mean_tmp:
    lower_limit = math.floor(feature['properties']['min_tmpf'])
    upper_limit = math.ceil(feature['properties']['max_tmpf'])
    # Pass explicit (minimum, maximum) tuples: that is the contract documented
    # by ranges_intersect, and it also works when floor(min) == ceil(max),
    # where a range object would be empty.
    if ranges_intersect((lower_temp, upper_temp), (lower_limit, upper_limit)):
        in_range.append(feature)
    else:
        # Temperature outside the optimum range is flagged
        bad_data.append(feature)

# Keep only the useful readings
mean_tmp = in_range

In [None]:
# Layout of the output shapefile: point geometry plus the station attributes
schema = {
    'geometry': 'Point',
    'properties': {
        'station': 'str',
        'date': 'str',
        'name': 'str',
        'max_tmpf': 'float',
        'min_tmpf': 'float',
    },
}

with fiona.open("stations.shp", 'w', crs=from_epsg(4326), driver='ESRI Shapefile', schema=schema) as output:
    for feature in mean_tmp:
        # Build the point geometry from the stored coordinates
        point = Point(feature['geometry']['coordinates'])
        # Attribute values for this station/month
        prop = feature['properties']
        # Write the row (geometry + attributes in GeoJSON format)
        output.write({'geometry': geometry.mapping(point), 'properties': prop})

#### Save Weather Data to PostgreSQL Database (Google Cloud)

In [None]:
# Connect to PostgreSQL Database (Google Cloud)
# Connection settings are read from the standard PG* environment variables so
# that credentials do not have to live in the notebook. The literal values
# are only fallbacks that keep the notebook running exactly as before.
# NOTE(review): remove the fallback password before sharing this notebook.
connection = psycopg2.connect(
    host=os.environ.get('PGHOST', '34.27.219.64'),
    port=os.environ.get('PGPORT', '5432'),
    database=os.environ.get('PGDATABASE', 'lab1'),
    user=os.environ.get('PGUSER', 'postgres'),
    password=os.environ.get('PGPASSWORD', 'student'),
)

In [None]:
# Shapefile written by the previous step
data = "stations.shp"
# Fields read from the shapefile; the last one returns the geometry as WKT
fields = ["station", "date", "name", "max_tmpf", "min_tmpf", "Shape@WKT"]

create_table_sql = """
    CREATE TABLE stations (
        id SERIAL,
        station VARCHAR,
        date VARCHAR,
        name VARCHAR,
        max_tmpf DOUBLE PRECISION,
        min_tmpf DOUBLE PRECISION)
"""

add_geometry_sql = """
    SELECT AddGeometryColumn('stations', 'geom', 4326, 'POINT', 2)
"""

# Values are passed as query parameters, so psycopg2 does the quoting
insert_sql = (
    "INSERT INTO stations (station, date, name, max_tmpf, min_tmpf, geom) "
    "VALUES (%s, %s, %s, %s, %s, ST_GeomFromText(%s, 4326))"
)

try:
    # `with connection` commits when the block succeeds and rolls back when it
    # raises, so a failed load does not leave a half-built table behind.
    with connection:
        with connection.cursor() as cursor:
            cursor.execute("DROP TABLE IF EXISTS stations")
            cursor.execute(create_table_sql)
            cursor.execute(add_geometry_sql)

            # use arcpy to get attribute data, populate PostGIS using psycopg2
            with arcpy.da.SearchCursor(data, fields) as da_cursor:
                for row in da_cursor:
                    station, date, name, max_tmpf, min_tmpf, wkt = row
                    cursor.execute(insert_sql, (station, date, name, max_tmpf, min_tmpf, wkt))
finally:
    # Close database connection even when the load fails
    connection.close()

#### Export from Local FGDB to PostgreSQL (Google Cloud)

In [None]:
# Set up SDE Connection using PGAdmin & Catalog Pane in ArcGIS Pro
sde = r"/Users/mattiegisselbeck/Documents/GIS5572/Lab2/Lab2.sde"

# Export Vectors to Postgres
# Inputs are read from the output file geodatabase (`local_path`); the tools
# take a semicolon-separated list of datasets.
# NOTE(review): the visible cells write "bmsb_observations" into the
# geodatabase but write the weather stations to "stations.shp"; nothing here
# creates "daily_weather" - confirm it exists before running.
vector_inputs = [
    os.path.join(local_path, "daily_weather"),
    os.path.join(local_path, "bmsb_observations"),
]
arcpy.conversion.FeatureClassToGeodatabase(";".join(vector_inputs), sde)

# Export Rasters to Postgres
# NOTE(review): the Clip step writes "clipped_landcover", and the DEM is only
# referenced at its source path; nothing here creates "land_cover" or
# "digital_elevation_model_30m" inside this geodatabase - confirm.
raster_inputs = [
    os.path.join(local_path, "land_cover"),
    os.path.join(local_path, "digital_elevation_model_30m"),
]
arcpy.conversion.RasterToGeodatabase(";".join(raster_inputs), sde)