In [None]:
"""
This script downloads raw EOPatches following the crop data_requirements.
By default, it uses S3BucketInputTask. To run it, please set the data
folders accordingly.
"""

import logging
import os
import time

import geopandas as gpd


import sys
sys.path.append("/agrilearn_app/agrilearn/")

from agrilearn.commons.downloader import EoPatchDownloader
from agrilearn.commons.utils.env_utils import load_agrilearn_dotenv
from agrilearn.mvp.gpkg_utils import (
    convert_gpkg_period_to_start_end_dates,
    get_gpkg_state_information,
)
from agrilearn.crop_classification import config as crop_config

In [None]:
# Project helper imported from agrilearn.commons.utils.env_utils; presumably
# loads the project's .env settings into the environment -- confirm.
load_agrilearn_dotenv()

# Passed as ``workers`` to the downloader further down.
WORKERS = 2

# Input locations.
BASE_PATH = "/agrilearn_app/datasets"
DATASET_PATH = "/agrilearn_app/datasets/cana-v1/geopackage/cana_train_fields_filtered.gpkg"

# Output locations, all rooted at BASE_PATH.
EOPATCHES_FOLDER = BASE_PATH + "/eopatchs/processed/SUGAR_CANE/train/"
LOGS_FOLDER = BASE_PATH + "/logs"
OUTPUT_FOLDER = BASE_PATH + "/output"

# Create every output folder up front; folders that already exist are kept.
for folder in (EOPATCHES_FOLDER, LOGS_FOLDER, OUTPUT_FOLDER):
    os.makedirs(folder, exist_ok=True)

In [None]:
# Read the geopackage referenced by DATASET_PATH into a GeoDataFrame.
gdf = gpd.read_file(DATASET_PATH)

In [None]:
# Preview the first rows to sanity-check the loaded columns.
gdf.head()

In [None]:
# Dataset size as (rows, columns).
gdf.shape

In [None]:
# Show rows whose "eopath_location" repeats the value of an earlier row.
# That column is later handed to the downloader as the EOPatch path column, so
# duplicates presumably point at the same EOPatch -- confirm.
gdf[gdf.duplicated('eopath_location')]

In [None]:
# Number of rows per distinct value of the "state" column.
gdf['state'].value_counts()

In [None]:
# Build the table of samples to download from the loaded geopackage.
# NOTE(review): the helper is defined in agrilearn.mvp.gpkg_utils; the following
# cell reads "start_season" and "end_season" columns from its result.
download_samples = convert_gpkg_period_to_start_end_dates(gdf)

In [None]:
# Separate the samples whose season dates are missing.
# ``isna()`` is used instead of ``.values == None``: the elementwise ``== None``
# comparison only matches literal ``None`` objects, so missing values stored as
# NaN/NaT (e.g. when the column has a datetime dtype) would not be flagged and
# would be passed on to the downloader without valid dates.
missing_season_dates = (
    download_samples["start_season"].isna()
    | download_samples["end_season"].isna()
)
error_gdf = download_samples[missing_season_dates]
error_gdf.shape

In [None]:
# Keep only the samples whose index label is absent from error_gdf.
is_error_row = download_samples.index.isin(error_gdf.index)
download_samples = download_samples[~is_error_row]
download_samples.shape

In [None]:
# Create logger to measure download time.
logger = logging.getLogger("download_logger")
logger.setLevel(level=logging.INFO)

log_file_path = os.path.join(LOGS_FOLDER, "log_download.log")

# ``logging.getLogger`` returns the same logger object on every call, so
# re-running this cell would otherwise stack one more FileHandler each time and
# every message would be written to the log file several times. Remove (and
# close) the handler previously attached for this file before adding a new one.
for stale_handler in list(logger.handlers):
    if (
        isinstance(stale_handler, logging.FileHandler)
        and stale_handler.baseFilename == os.path.abspath(log_file_path)
    ):
        logger.removeHandler(stale_handler)
        stale_handler.close()

fh = logging.FileHandler(log_file_path)
fh_formatter = logging.Formatter(
    "%(asctime)s %(levelname)s %(lineno)d:%(filename)s(%(process)d) - %(message)s"
)
fh.setFormatter(fh_formatter)
logger.addHandler(fh)


# The timed span covers building the downloader as well as the download itself.
start_time = time.time()
logger.info("Starting download.")

# May be None when SENTINEL_CACHE_FOLDER is not set in the environment.
sentinel_cache_folder = os.environ.get("SENTINEL_CACHE_FOLDER")

downloader = EoPatchDownloader(
    EOPATCHES_FOLDER,
    crop_config.DATA_REQUIREMENTS,
    provider_name="element84",
    cache_folder=sentinel_cache_folder,
)

# Keyword options forwarded unchanged to ``downloader.download``.
download_options = dict(
    remove_failed_samples=True,
    eopatch_path_collumn_name="eopath_location",
    workers=WORKERS,
    logs_folder=LOGS_FOLDER,
    skip_if_eopatch_exists=True,
    save_report=True,
    save_logs=True,
)
final_gdf = downloader.download(download_samples, **download_options)

# Despite its name, ``end_time`` holds the elapsed wall-clock seconds.
end_time = time.time() - start_time

logger.info("Downloaded finished.")
logger.info("Time of execution (seconds): %.2f", end_time)

# Write the downloader's result to the output folder.
final_output_path = os.path.join(OUTPUT_FOLDER, "final_valid_crops.gpkg")
final_gdf.to_file(final_output_path)

# The error file is only written when at least one sample was set aside.
if len(error_gdf):
    error_output_path = os.path.join(OUTPUT_FOLDER, "error_valid_crops.gpkg")
    error_gdf.to_file(error_output_path)
