# Setup

In [13]:
import os
import geopandas as gpd
import pandas as pd

### Source files downloaded from Google Cloud Storage to ../data/tabular/
* gs://mpg-aerial-survey/ground_truth/horsepile/processed/horse_pile_presence.csv
* gs://mpg-aerial-survey/ground_truth/horsepile/processed/horse_pile_absence.csv

In [14]:
# Directory (relative to this notebook) that holds the downloaded tabular
# source files; the CSV reads below join their file names onto this path.
tabular_dir = "../data/tabular/"

# Wrangle

### Key metadata notes:
- idx
- Presence
- Zone
- Period
- Description
- Datetime
- Latitude
- Longitude
- Ellipsoidal height

## Features

In [20]:
# Load the raw presence and absence tables from the tabular data directory.
presence_csv = os.path.join(tabular_dir, "horse_pile_presence.csv")
absence_csv = os.path.join(tabular_dir, "horse_pile_absence.csv")

df_presence = pd.read_csv(presence_csv)
df_absence = pd.read_csv(absence_csv)

In [21]:
# Label every row with a binary 'Presence' flag: 1 for rows from the presence
# table, 0 for rows from the absence table.
for frame, flag in ((df_presence, 1), (df_absence, 0)):
    frame['Presence'] = flag

In [22]:
# Parse the presence 'Averaging start' strings and normalise them to UTC.
# format='mixed' lets pandas infer the format of each string individually.
presence_start = pd.to_datetime(df_presence['Averaging start'], format='mixed')

# tz_convert raises TypeError on tz-naive values, so timestamps that were
# parsed without a UTC offset are first localized to US/Mountain, the same
# zone the absence timestamps are localized to.
# NOTE(review): assumes offset-less presence timestamps are Mountain local
# time -- confirm against the source CSV.
if presence_start.dt.tz is None:
    presence_start = presence_start.dt.tz_localize('US/Mountain')

df_presence['Averaging start'] = presence_start.dt.tz_convert('UTC')

In [23]:
# Absence rows all receive one fixed timestamp: 2024-07-24 12:00:00 local
# Mountain time, stored as UTC.
# NOTE(review): 'US/Mountain' observes daylight saving, so this July timestamp
# is localized as MDT (UTC-6) rather than MST (UTC-7) -- confirm that is intended.
absence_local_noon = pd.to_datetime('2024-07-24 12:00:00')  # tz-naive scalar
df_absence['Averaging start'] = absence_local_noon  # broadcast to every row
df_absence['Averaging start'] = (
    df_absence['Averaging start']
    .dt.tz_localize('US/Mountain')  # attach the local zone
    .dt.tz_convert('UTC')           # express the same instant in UTC
)

In [24]:
# Give the timestamp column the same final name, 'Datetime', in both tables
# so they line up when concatenated.
for frame in (df_presence, df_absence):
    frame.rename(columns={'Averaging start': 'Datetime'}, inplace=True)

In [25]:
# Stack the presence rows on top of the absence rows.
# ignore_index=True gives the combined table a fresh 0..n-1 index; without it
# each input keeps its own labels, so the result would carry duplicate index
# values that make label-based selection and alignment ambiguous.
df_ground_truth = pd.concat([df_presence, df_absence], ignore_index=True)

In [None]:
# Assign each row a sequential identifier as a zero-padded, four-character
# string ('0000', '0001', ...), in the current row order.
n_rows = len(df_ground_truth)
df_ground_truth['idx'] = [str(row_number).zfill(4) for row_number in range(n_rows)]

In [28]:
# Restrict the table to the columns carried through to the exported dataset.
relevant_columns = [
    'idx',
    'Presence',
    'Zone',
    'Period',
    'Datetime',
    'Latitude',
    'Longitude',
    'Ellipsoidal height',
]
df_relevant = df_ground_truth[relevant_columns]

In [29]:
# Build point geometries from the longitude/latitude columns (x = longitude,
# y = latitude).
wgs84_points = gpd.points_from_xy(df_relevant['Longitude'], df_relevant['Latitude'])

# Wrap the table as a GeoDataFrame and declare its CRS as WGS84 (EPSG:4326),
# the system the coordinates are recorded in.
gdf = gpd.GeoDataFrame(df_relevant, geometry=wgs84_points).set_crs(epsg=4326)

# Reproject to EPSG:32611 (UTM zone 11N).
gdf_utm = gdf.to_crs(epsg=32611)

In [30]:
# Expose the projected point coordinates as plain columns: x of the
# reprojected geometry becomes 'Easting', y becomes 'Northing'.
utm_points = gdf_utm.geometry
gdf_utm['Easting'] = utm_points.x
gdf_utm['Northing'] = utm_points.y

In [51]:
# Final column layout: the geographic pair (Latitude, Longitude) sits directly
# before the projected pair (Easting, Northing), with geometry last.
column_order = [
    'idx', 'Presence', 'Zone', 'Period', 'Datetime',
    'Latitude', 'Longitude',
    'Easting', 'Northing',
    'Ellipsoidal height',
    'geometry',
]
gdf_utm = gdf_utm[column_order]

In [None]:
# Sanity check: print the column names, non-null counts and dtypes of the final frame
gdf_utm.info()

In [None]:
# Preview the first rows of the final frame
gdf_utm.head()

# Export

In [54]:
# Write the final GeoDataFrame to ../data/vector/groundtruth.geojson.
processed_dir = "../data/vector"

# Create the output directory if it is missing; to_file does not create
# parent directories and would otherwise fail on a fresh checkout.
os.makedirs(processed_dir, exist_ok=True)

# NOTE(review): the geometry written here is in EPSG:32611 (UTM), whereas
# RFC 7946 GeoJSON expects WGS84 longitude/latitude -- confirm downstream
# readers expect projected coordinates.
gdf_utm.to_file(os.path.join(processed_dir, "groundtruth.geojson"), driver="GeoJSON")

## Output saved as GeoJSON to ../data/vector/groundtruth.geojson
