In [1]:
import os
from glob import glob
import pandas as pd
import geopandas as gpd
import datetime

In [71]:
delivery_name = 'Billabong'
tile_polygons_path = f'/mnt/datapool1/datapool1/datasets/cgsat/polygons/{delivery_name}'

tile_polygons = glob(os.path.join(tile_polygons_path, 'MSS', '*.geojson'))

tile_polygons_df = pd.DataFrame(tile_polygons, columns=['tile_polygon_path'])
tile_polygons_df['tile_polygon_name'] = tile_polygons_df['tile_polygon_path'].apply(lambda x: os.path.basename(x).replace('.geojson', ''))
tile_polygons_df['sattelite'] = tile_polygons_df['tile_polygon_name'].apply(lambda x: x.split('_')[1])
tile_polygons_df['date'] = tile_polygons_df['tile_polygon_name'].apply(lambda x: datetime.datetime.strptime(x.split('_')[3], '%Y%m%d').date())
tile_polygons_df.sort_values(by='date', inplace=True)
tile_polygons_df


Unnamed: 0,tile_polygon_path,tile_polygon_name,sattelite,date
1,/mnt/datapool1/datapool1/datasets/cgsat/polygo...,Billabong_JL1KF02B03_0028_20250126_1360599_MSS...,JL1KF02B03,2025-01-26
52,/mnt/datapool1/datapool1/datasets/cgsat/polygo...,Billabong_JL1KF02B03_0029_20250126_1360595_MSS...,JL1KF02B03,2025-01-26
27,/mnt/datapool1/datapool1/datasets/cgsat/polygo...,Billabong_JL1KF02B03_0028_20250126_1360595_MSS...,JL1KF02B03,2025-01-26
47,/mnt/datapool1/datapool1/datasets/cgsat/polygo...,Billabong_JL1KF02B03_0029_20250126_1360599_MSS...,JL1KF02B03,2025-01-26
0,/mnt/datapool1/datapool1/datasets/cgsat/polygo...,Billabong_JL1KF02B04_0019_20250206_1360592_MSS...,JL1KF02B04,2025-02-06
...,...,...,...,...
18,/mnt/datapool1/datapool1/datasets/cgsat/polygo...,Billabong_JL1KF02B04_0014_20250206_1360594_MSS...,JL1KF02B04,2025-02-06
24,/mnt/datapool1/datapool1/datasets/cgsat/polygo...,Billabong_JL1KF02B04_0017_20250206_1360593_MSS...,JL1KF02B04,2025-02-06
67,/mnt/datapool1/datapool1/datasets/cgsat/polygo...,Billabong_JL1KF02B04_0018_20250206_1360598_MSS...,JL1KF02B04,2025-02-06
59,/mnt/datapool1/datapool1/datasets/cgsat/polygo...,Billabong_JL1KF02B02_0009_20250208_1360596_MSS...,JL1KF02B02,2025-02-08


In [72]:
areas = []
total_area = 0
for group_name, group in tile_polygons_df.groupby(['sattelite', 'date']):
    sattelite, date = group_name
    print(f'Sattelite: {sattelite}, Date: {date}')
    # print(group)
    group_area = 0
    for tile_polygon_path in group['tile_polygon_path']:
        tile_polygon = gpd.read_file(tile_polygon_path)
        tile_polygon.to_crs(epsg=3857, inplace=True)  # Convert to Web Mercator
        group_area += tile_polygon.geometry.area.sum()
    group_area /= 10_000
    total_area += group_area
    print(f'Total area for {sattelite} on {date}: {group_area} ha')
    print('-' * 40)
    areas.append({
        'sattelite': sattelite,
        'date': date,
        'area': group_area
    })
print(f'Total area for all tiles: {total_area} ha')

Sattelite: JL1KF02B02, Date: 2025-02-08
Total area for JL1KF02B02 on 2025-02-08: 16541.037184663437 ha
----------------------------------------
Sattelite: JL1KF02B03, Date: 2025-01-26
Total area for JL1KF02B03 on 2025-01-26: 122587.0232286119 ha
----------------------------------------
Sattelite: JL1KF02B04, Date: 2025-02-06
Total area for JL1KF02B04 on 2025-02-06: 1415598.4485079504 ha
----------------------------------------
Total area for all tiles: 1554726.5089212258 ha


In [73]:
areas_df = pd.DataFrame(areas)
areas_df.sort_values(by=['date'], inplace=True)
areas_df['area'] = areas_df['area'].round()
areas_df

Unnamed: 0,sattelite,date,area
1,JL1KF02B03,2025-01-26,122587.0
2,JL1KF02B04,2025-02-06,1415598.0
0,JL1KF02B02,2025-02-08,16541.0


In [3]:
def get_polygon_area(file, decimals=6, clip_to=None, dissolve=False):
    """Calculate area of polygons in hectares, optionally clipped to a boundary
    
    Args:
        file (str): Path to vector file or geodataframe
        decimals (int): Number of decimal places to round to
        clip_to (GeoDataFrame, optional): GeoDataFrame to clip geometries to
        
    Returns:
        str: Area in hectares as string
    """
    if isinstance(file, str):
        # Check if file exists
        if not os.path.exists(file):
            raise FileNotFoundError(f"File {file} does not exist.")
        gdf = gpd.read_file(file)
    elif isinstance(file, gpd.GeoDataFrame):
        gdf = file
    else:
        raise ValueError("Input must be a file path or a GeoDataFrame.")
    # gdf['geometry'] = gdf['geometry'].apply(lambda geom: geometrycollection_to_multipolygon(geom))
    if clip_to is not None:
        # Handle clip_to as either string file path or GeoDataFrame
        if isinstance(clip_to, str):
            clip_gdf = gpd.read_file(clip_to)
        else:
            clip_gdf = clip_to

        # Ensure same CRS
        if gdf.crs != clip_gdf.crs:
            clip_gdf = clip_gdf.to_crs(gdf.crs)
        
        # Perform clip
        gdf = gpd.clip(gdf, clip_gdf)

    if dissolve:
        gdf = gdf.dissolve()
    return round(gdf.geometry.area.sum() / 10000, decimals)

def get_train_test_areas(train_test_polygons):
    """
    Get the train and test areas from the train_test_polygons files
    """
    train_test_areas = {'train': 0, 'test': 0}
    for file in train_test_polygons:
        file_gdf = gpd.read_file(file)
        train_gdf = file_gdf[file_gdf['train'] == 1]
        test_gdf = file_gdf[file_gdf['test'] == 1]
        train_test_areas['train'] += get_polygon_area(train_gdf)
        train_test_areas['test'] += get_polygon_area(test_gdf)
    print(f"Train area: {train_test_areas['train']} ha")
    print(f"Test area: {train_test_areas['test']} ha")
    print(f"Train percentage: {train_test_areas['train'] / (train_test_areas['train'] + train_test_areas['test']) * 100:.2f}%")
    print(f"Test percentage: {train_test_areas['test'] / (train_test_areas['train'] + train_test_areas['test']) * 100:.2f}%")


In [4]:
# train_test_polygons = glob('/mnt/datapool1/datapool1/datasets/nn_datasets/polygons/Tenterfield/*.geojson')
# train_test_polygons = glob('/mnt/datapool1/datapool1/datasets/nn_datasets/polygons/MulgaLands/*.geojson')
# train_test_polygons = glob('/mnt/datapool1/datapool1/datasets/nn_datasets/polygons/KI2020/*.geojson')
# train_test_polygons = glob('/mnt/datapool1/datapool1/datasets/nn_datasets/polygons/KI2025/*.geojson')
# train_test_polygons = glob('/mnt/datapool1/datapool1/datasets/nn_datasets/polygons/WASouth/*.geojson')
# train_test_polygons = glob('/mnt/datapool1/datapool1/datasets/nn_datasets/polygons/Billabong/*.geojson')
train_test_polygons = glob('/mnt/datapool1/datapool1/datasets/nn_datasets/polygons/gogango/*.geojson')
get_train_test_areas(train_test_polygons)


Train area: 7294.840603 ha
Test area: 1843.575894 ha
Train percentage: 79.83%
Test percentage: 20.17%
