# Setup

In [34]:
import os
import geopandas as gpd
import pandas as pd
import subprocess
import uuid

In [9]:
# External data download directory
raw_dir = "../data/raw"

# Create raw data directory if it doesn't exist
os.makedirs(raw_dir, exist_ok=True)

# Source

In [4]:
# list all sources of ground truth files
src_presence = ["gs://mpg-aerial-survey/ground_truth/horsepile/processed/horse_pile_presence.csv"]
src_absence = ["gs://mpg-aerial-survey/ground_truth/horsepile/processed/horse_pile_absence.csv"]

In [None]:
# Download presence files
for file_path in src_presence:
    dest_filename = os.path.join(raw_dir, os.path.basename(file_path))
    subprocess.run([
        "gsutil", "cp",
        file_path,
        dest_filename
    ])
    print(f"Downloaded {file_path} to {dest_filename}")

# Download absence files
for file_path in src_absence:
    dest_filename = os.path.join(raw_dir, os.path.basename(file_path))
    subprocess.run([
        "gsutil", "cp",
        file_path,
        dest_filename
    ])
    print(f"Downloaded {file_path} to {dest_filename}")



# Wrangle

### Key metadata notes:
- index
- Presence
- Zone
- Period
- Datetime
- Latitude
- Longitude
- Ellipsoidal height

## Features

In [37]:
# Read presence and absence files in DataFrames
df_presence = pd.read_csv(os.path.join(raw_dir, "horse_pile_presence.csv"))
df_absence = pd.read_csv(os.path.join(raw_dir, "horse_pile_absence.csv"))

In [38]:
# Create presence/absence column (1 or 0)
df_presence['Presence'] = 1
df_absence['Presence'] = 0

In [39]:
# Cast presence Averaging start to MST datetime and convert to UTC
df_presence['Averaging start'] = pd.to_datetime(df_presence['Averaging start'], format='mixed')
df_presence['Averaging start'] = df_presence['Averaging start'].dt.tz_convert('UTC')

In [40]:
# Set absence Averaging start to 240724 12:00:00 MST cast to UTC datetime 
df_absence['Averaging start'] = pd.to_datetime('2024-07-24 12:00:00')  # Create naive datetime
df_absence['Averaging start'] = df_absence['Averaging start'].dt.tz_localize('US/Mountain')  # Localize to MST
df_absence['Averaging start'] = df_absence['Averaging start'].dt.tz_convert('UTC')  # Convert to UTC

In [41]:
# Rename Averaging start to datetime
df_presence.rename(columns={'Averaging start': 'Datetime'}, inplace=True)
df_absence.rename(columns={'Averaging start': 'Datetime'}, inplace=True)

In [48]:
# Create unique 6-character index for presence and absence
df_presence['index'] = [str(uuid.uuid4())[:6] for _ in range(len(df_presence))]
df_absence['index'] = [str(uuid.uuid4())[:6] for _ in range(len(df_absence))]
# Set index
df_presence.set_index('index', inplace=True)
df_absence.set_index('index', inplace=True)

## Concatenate

In [50]:
# Concatenate presence and absence files
df_ground_truth = pd.concat([df_presence, df_absence])

# Print total records
print(f"Total records: {len(df_ground_truth)}")
print(f"Presence records: {len(df_presence)}")
print(f"Absence records: {len(df_absence)}")


Total records: 1600
Presence records: 884
Absence records: 716


In [51]:
# Keep relevant columns
df_relevant = df_ground_truth[['Presence', 'Zone', 'Period', 'Datetime', 'Latitude', 'Longitude', 'Ellipsoidal height']]

In [53]:
df_relevant.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1600 entries, f3bb1f to 83689e
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype              
---  ------              --------------  -----              
 0   Presence            1600 non-null   int64              
 1   Zone                884 non-null    float64            
 2   Period              884 non-null    float64            
 3   Datetime            1600 non-null   datetime64[ns, UTC]
 4   Latitude            1600 non-null   float64            
 5   Longitude           1600 non-null   float64            
 6   Ellipsoidal height  1600 non-null   float64            
dtypes: datetime64[ns, UTC](1), float64(5), int64(1)
memory usage: 100.0+ KB


In [54]:
df_relevant.head()

Unnamed: 0_level_0,Presence,Zone,Period,Datetime,Latitude,Longitude,Ellipsoidal height
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
f3bb1f,1,1.0,1.0,2024-04-25 06:25:00.600000+00:00,46.678295,-114.007977,1182.161
699cc7,1,1.0,1.0,2024-04-25 06:25:24.200000+00:00,46.678261,-114.00797,1181.317
37a932,1,1.0,1.0,2024-04-25 06:26:14+00:00,46.678299,-114.008067,1181.695
40b8a4,1,1.0,1.0,2024-04-25 06:26:49.400000+00:00,46.678331,-114.00822,1180.652
75915b,1,1.0,1.0,2024-04-25 06:27:05.200000+00:00,46.67828,-114.008122,1180.683


## Convert to geopandas dataframe
Metadata as features, point geometry in EPSG:32611

In [None]:
# Convert to geopandas dataframe
df_gdf = gpd.GeoDataFrame(df_relevant, geometry=gpd.points_from_xy(df_relevant.Longitude, df_relevant.Latitude))
df_gdf.crs = {'init': 'epsg:32611'}

In [58]:
df_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 1600 entries, f3bb1f to 83689e
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype              
---  ------              --------------  -----              
 0   Presence            1600 non-null   int64              
 1   Zone                884 non-null    float64            
 2   Period              884 non-null    float64            
 3   Datetime            1600 non-null   datetime64[ns, UTC]
 4   Latitude            1600 non-null   float64            
 5   Longitude           1600 non-null   float64            
 6   Ellipsoidal height  1600 non-null   float64            
 7   geometry            1600 non-null   geometry           
dtypes: datetime64[ns, UTC](1), float64(5), geometry(1), int64(1)
memory usage: 177.0+ KB


In [59]:
df_gdf.head()

Unnamed: 0_level_0,Presence,Zone,Period,Datetime,Latitude,Longitude,Ellipsoidal height,geometry
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
f3bb1f,1,1.0,1.0,2024-04-25 06:25:00.600000+00:00,46.678295,-114.007977,1182.161,POINT (-114.008 46.678)
699cc7,1,1.0,1.0,2024-04-25 06:25:24.200000+00:00,46.678261,-114.00797,1181.317,POINT (-114.008 46.678)
37a932,1,1.0,1.0,2024-04-25 06:26:14+00:00,46.678299,-114.008067,1181.695,POINT (-114.008 46.678)
40b8a4,1,1.0,1.0,2024-04-25 06:26:49.400000+00:00,46.678331,-114.00822,1180.652,POINT (-114.008 46.678)
75915b,1,1.0,1.0,2024-04-25 06:27:05.200000+00:00,46.67828,-114.008122,1180.683,POINT (-114.008 46.678)


# Export

## save as geojson in data/vector/groundtruth.geojson
