# Setup

In [1]:
import os
import geopandas as gpd
import pandas as pd
import subprocess
import uuid

In [2]:
# External data download directory
raw_dir = "../data/tabular/raw"

# Create raw data directory if it doesn't exist
os.makedirs(raw_dir, exist_ok=True)

# Source

In [3]:
# list all sources of ground truth files
src_presence = ["gs://mpg-aerial-survey/ground_truth/horsepile/processed/horse_pile_presence.csv"]
src_absence = ["gs://mpg-aerial-survey/ground_truth/horsepile/processed/horse_pile_absence.csv"]

In [None]:
for file_path in src_presence:
    dest_filename = os.path.join(raw_dir, os.path.basename(file_path))
    subprocess.run([
        "gcloud", "storage", "cp",
        file_path,
        dest_filename
    ])
    print(f"Downloaded {file_path} to {dest_filename}")

# Download absence files
for file_path in src_absence:
    dest_filename = os.path.join(raw_dir, os.path.basename(file_path))
    subprocess.run([
        "gcloud", "storage", "cp",
        file_path,
        dest_filename
    ])
    print(f"Downloaded {file_path} to {dest_filename}")

# Wrangle

### Key metadata notes:
- Point_Index
- Presence
- Zone
- Period
- Description
- Datetime
- Latitude
- Longitude
- Ellipsoidal height

## Features

In [20]:
# Read presence and absence files in DataFrames
df_presence = pd.read_csv(os.path.join(raw_dir, "horse_pile_presence.csv"))
df_absence = pd.read_csv(os.path.join(raw_dir, "horse_pile_absence.csv"))

In [21]:
# Create presence/absence column (1 or 0)
df_presence['Presence'] = 1
df_absence['Presence'] = 0

In [22]:
# Cast presence Averaging start to MST datetime and convert to UTC
df_presence['Averaging start'] = pd.to_datetime(df_presence['Averaging start'], format='mixed')
df_presence['Averaging start'] = df_presence['Averaging start'].dt.tz_convert('UTC')

In [23]:
# Set absence Averaging start to 240724 12:00:00 MST cast to UTC datetime 
df_absence['Averaging start'] = pd.to_datetime('2024-07-24 12:00:00')  # Create naive datetime
df_absence['Averaging start'] = df_absence['Averaging start'].dt.tz_localize('US/Mountain')  # Localize to MST
df_absence['Averaging start'] = df_absence['Averaging start'].dt.tz_convert('UTC')  # Convert to UTC

In [24]:
# Rename Averaging start to datetime
df_presence.rename(columns={'Averaging start': 'Datetime'}, inplace=True)
df_absence.rename(columns={'Averaging start': 'Datetime'}, inplace=True)

In [25]:
# Create Point_Index column with p_\d\d\d and a_\d\d\d with \d\d\d being the existing index
df_presence['Point_Index'] = 'present_' + df_presence.index.astype(str).str.zfill(3)
df_absence['Point_Index'] = 'absent_' + df_absence.index.astype(str).str.zfill(3)

## Concatenate

In [32]:
# Concatenate presence and absence files
df_ground_truth = pd.concat([df_presence, df_absence])

# Print total records
print(f"Total records: {len(df_ground_truth)}")
print(f"Presence records: {len(df_presence)}")
print(f"Absence records: {len(df_absence)}")


Total records: 1600
Presence records: 884
Absence records: 716


In [33]:
# Keep relevant columns
df_relevant = df_ground_truth[['Point_Index', 'Presence', 'Zone', 'Period', 
                               'Datetime', 'Latitude', 'Longitude', 'Ellipsoidal height']]

In [53]:
df_relevant.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1600 entries, 0 to 715
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype              
---  ------              --------------  -----              
 0   Point_Index         1600 non-null   object             
 1   Presence            1600 non-null   int64              
 2   Zone                884 non-null    float64            
 3   Period              884 non-null    float64            
 4   Datetime            1600 non-null   datetime64[ns, UTC]
 5   Latitude            1600 non-null   float64            
 6   Longitude           1600 non-null   float64            
 7   Ellipsoidal height  1600 non-null   float64            
dtypes: datetime64[ns, UTC](1), float64(5), int64(1), object(1)
memory usage: 112.5+ KB


In [54]:
df_relevant.head()

Unnamed: 0_level_0,Presence,Zone,Period,Datetime,Latitude,Longitude,Ellipsoidal height
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
f3bb1f,1,1.0,1.0,2024-04-25 06:25:00.600000+00:00,46.678295,-114.007977,1182.161
699cc7,1,1.0,1.0,2024-04-25 06:25:24.200000+00:00,46.678261,-114.00797,1181.317
37a932,1,1.0,1.0,2024-04-25 06:26:14+00:00,46.678299,-114.008067,1181.695
40b8a4,1,1.0,1.0,2024-04-25 06:26:49.400000+00:00,46.678331,-114.00822,1180.652
75915b,1,1.0,1.0,2024-04-25 06:27:05.200000+00:00,46.67828,-114.008122,1180.683


## Convert to geopandas dataframe
Metadata as features, point geometry in EPSG:32611

In [48]:
# Convert to geopandas dataframe
gdf = gpd.GeoDataFrame(df_relevant, geometry=gpd.points_from_xy(df_relevant.Longitude, df_relevant.Latitude))
# Set to WGS84 since that's what the coordinates are
gdf.set_crs(epsg=4326, inplace=True)
# Convert to UTM Zone 11N
gdf_utm = gdf.to_crs(epsg=32611)

In [50]:
# Use geopandas to create easting and northing columns from latitude and longitude
gdf_utm['Easting'] = gdf_utm.geometry.x
gdf_utm['Northing'] = gdf_utm.geometry.y

In [51]:
# Reorder columns placing Latitude and Longitude and Easting and Northing next to each other
gdf_utm = gdf_utm[['Point_Index', 'Presence', 'Zone', 'Period', 'Datetime', 
                   'Latitude', 'Longitude', 'Easting', 'Northing', 
                   'Ellipsoidal height', 'geometry']]

In [52]:
gdf_utm.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 1600 entries, 0 to 715
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype              
---  ------              --------------  -----              
 0   Point_Index         1600 non-null   object             
 1   Presence            1600 non-null   int64              
 2   Zone                884 non-null    float64            
 3   Period              884 non-null    float64            
 4   Datetime            1600 non-null   datetime64[ns, UTC]
 5   Latitude            1600 non-null   float64            
 6   Longitude           1600 non-null   float64            
 7   Easting             1600 non-null   float64            
 8   Northing            1600 non-null   float64            
 9   Ellipsoidal height  1600 non-null   float64            
 10  geometry            1600 non-null   geometry           
dtypes: datetime64[ns, UTC](1), float64(7), geometry(1), int64(1), object(1)
memory usage: 150.0+ 

In [53]:
gdf_utm.head()

Unnamed: 0,Point_Index,Presence,Zone,Period,Datetime,Latitude,Longitude,Easting,Northing,Ellipsoidal height,geometry
0,present_000,1,1.0,1.0,2024-04-25 06:25:00.600000+00:00,46.678295,-114.007977,728825.99902,5173764.0,1182.161,POINT (728825.999 5173763.794)
1,present_001,1,1.0,1.0,2024-04-25 06:25:24.200000+00:00,46.678261,-114.00797,728826.721929,5173760.0,1181.317,POINT (728826.722 5173760.049)
2,present_002,1,1.0,1.0,2024-04-25 06:26:14+00:00,46.678299,-114.008067,728819.157326,5173764.0,1181.695,POINT (728819.157 5173764.064)
3,present_003,1,1.0,1.0,2024-04-25 06:26:49.400000+00:00,46.678331,-114.00822,728807.322341,5173767.0,1180.652,POINT (728807.322 5173767.093)
4,present_004,1,1.0,1.0,2024-04-25 06:27:05.200000+00:00,46.67828,-114.008122,728814.979743,5173762.0,1180.683,POINT (728814.98 5173761.785)


# Export

In [61]:
# Export to data/processed/vector/groundtruth.geojson
processed_dir = "../data/processed"
gdf_utm.to_file(os.path.join(processed_dir, "vector", "groundtruth.geojson"), driver="GeoJSON")

## save as geojson in data/vector/groundtruth.geojson
