In [1]:
# Load the autoreload extension so that modules edited on disk are reloaded
# automatically, without the need to restart the kernel.
%load_ext autoreload
# Mode 2: reload all modules (except those excluded via %aimport) before running each cell.
%autoreload 2

### Imports

In [2]:
import boto3
import geopandas as gpd
from io import BytesIO
import pandas as pd
from folium import GeoJson
import folium
import numpy as np
from tqdm import tqdm
from glob import glob
#import local modules
import sys
sys.path.append("/agrilearn_app/agrilearn/")
from agrilearn.utils import s3_utils, str_utils
from agrilearn.mvp import gpkg_utils

ModuleNotFoundError: No module named 'boto3'

### Global Variables

In [None]:
# Input GeoPackage read in section 1.
DATASET_PATH = "/agrilearn_app/datasets/base/geopackage/processed/CORN_73080_SOYBEAN_29670_COTTON_1632_RICE_1172.gpkg"

# Destination GeoPackage written in section 3.
# This constant used to be commented out, which made the final cells fail with NameError on a
# fresh kernel. The previous (disabled) value was
# "/agrilearn_app/datasets/v1/geopackage/cana_fields_checked.gpkg", whose name does not match
# this input file.
# NOTE(review): the file name below is derived from the input name — confirm the intended
# destination before running the save cell.
OUTPUT_DATASET_PATH = "/agrilearn_app/datasets/v1/geopackage/CORN_73080_SOYBEAN_29670_COTTON_1632_RICE_1172_checked.gpkg"

## 1. Read Datasets

In [None]:
# Read the GeoPackage at DATASET_PATH into a GeoDataFrame.
df = gpd.read_file(DATASET_PATH)

In [None]:
# Preview the first rows of the loaded dataset.
df.head()

In [None]:
# (number of rows, number of columns) of the loaded dataset.
df.shape

In [None]:
# Share of rows per 'dataset_part' value, expressed as a percentage.
df['dataset_part'].value_counts(normalize=True)*100

In [None]:
# Row count per (monitoring_class, dataset_part) pair. 'period' is only the column being
# counted, so the resulting 'period' column holds the number of non-null 'period' values.
# Sorted by class and then by count, both descending.
report_train_val_test = (
    df.groupby(['monitoring_class', 'dataset_part'], as_index=False)
    .agg({'period': 'count'})
    .sort_values(['monitoring_class', 'period'], ascending=False)
)
report_train_val_test

## 2. Check EOPatches from Local Path

In [None]:
# Recursive glob pattern covering everything under the processed eopatch directory.
EOPATCH_LOCAL_PATH = "/agrilearn_app/datasets/base/eopatch/processed/**"
# Keep only the entries whose path ends with the eopatch folder name.
eopatches_path = list(
    filter(
        lambda candidate: candidate.endswith('/eopatch_0_col-0_row-0'),
        glob(EOPATCH_LOCAL_PATH, recursive=True),
    )
)

In [None]:
# Number of eopatch folders found on disk.
len(eopatches_path)

In [None]:
# Attach one local eopatch path to each row of df.
# NOTE(review): a list is assigned positionally, so this relies on eopatches_path having
# exactly len(df) entries AND on glob returning them in the same order as df's rows.
# glob does not guarantee any ordering — confirm, or join on a shared key instead.
df['local_eopatch_path'] = eopatches_path

In [None]:
# Count rows per path prefix, i.e. the part of each path before the first '/start_'.
df['local_eopatch_path'].str.split('/start_').str[0].value_counts()

### 2.1 Check the total number of missing images based on the expected time frequency

In [None]:
# Ask the project helper for the missing-image report of every local eopatch, using a '5D' frequency.
# NOTE(review): presumably returns a dict keyed by eopatch path with one dict of metrics per
# entry — confirm against gpkg_utils.get_number_of_missing_images.
dic_missing_images = gpkg_utils.get_number_of_missing_images(EOPATCH_IDS=df['local_eopatch_path'].values, freq='5D')

In [None]:
# Transpose so that each top-level key of the dict becomes a row, then move that key out of
# the index into a 'path' column (reset_index(names=...) needs pandas >= 1.5).
df_missing_images = pd.DataFrame(dic_missing_images).T.reset_index(names='path')

In [None]:
# Number of eopatches per 'status_missing_images' value.
df_missing_images['status_missing_images'].value_counts()

In [None]:
# Display the missing-image report (pandas truncates the rendering of long frames).
df_missing_images

In [None]:
# Persist the missing-image report as a ';'-separated CSV, floats rounded to two decimals,
# without the index column.
df_missing_images.to_csv(
    '/agrilearn_app/datasets/v1/csvs/get_number_of_missing_images_based_freq5D.csv',
    index=False,
    sep=';',
    float_format='%.2f',
)

In [None]:
# Preview the first rows of the missing-image report.
df_missing_images.head()

In [None]:
# Copy the missing-image metrics from the report onto the main frame.
# Assigning a Series aligns on the index, not on position: if the report's index differs
# from df's (e.g. fewer rows in the report), the copy would silently yield NaN or
# mismatched rows. Fail loudly instead of writing corrupted columns.
assert df.index.equals(df_missing_images.index), (
    "df and df_missing_images have different indexes; "
    "the missing-image columns cannot be copied row by row"
)
df['total_imagens_do_intervalo'] = df_missing_images['total_imagens_do_intervalo']
df['total_de_imagem'] = df_missing_images['total_de_imagem']
df['status_missing_images'] = df_missing_images['status_missing_images']

### 2.2 Check for missing dates (gaps) in the time series

In [None]:
# Ask the project helper for the time-series gap report of every local eopatch.
# NOTE(review): presumably returns a dict keyed by eopatch path with one dict of metrics per
# entry — confirm against gpkg_utils.get_time_series_missing.
dic_get_time_series_missing = gpkg_utils.get_time_series_missing(EOPATCH_IDS=df['local_eopatch_path'].values)

In [None]:
# Build the gap report: one row per top-level key of the dict, with that key exposed as a
# 'path' column, and 'mean_dates_diff' cast to float.
df_time_series_missing = (
    pd.DataFrame(dic_get_time_series_missing)
    .T
    .reset_index(names='path')
    .astype({'mean_dates_diff': 'float'})
)

In [None]:
# Number of eopatches per 'status_series_missing' value.
df_time_series_missing['status_series_missing'].value_counts()

In [None]:
# Persist the gap report as a ';'-separated CSV, floats rounded to two decimals,
# without the index column.
df_time_series_missing.to_csv(
    '/agrilearn_app/datasets/v1/csvs/get_time_series_missing.csv',
    float_format='%.2f',
    index=False,
    sep=';',
)

In [None]:
# Copy the time-series gap metrics from the report onto the main frame.
# Assigning a Series aligns on the index, not on position: if the report's index differs
# from df's (e.g. fewer rows in the report), the copy would silently yield NaN or
# mismatched rows. Fail loudly instead of writing corrupted columns.
assert df.index.equals(df_time_series_missing.index), (
    "df and df_time_series_missing have different indexes; "
    "the time-series columns cannot be copied row by row"
)
df['days_gap'] = df_time_series_missing['days_gap']
df['dates_diff'] = df_time_series_missing['dates_diff']
df['mean_dates_diff'] = df_time_series_missing['mean_dates_diff']
df['status_series_missing'] = df_time_series_missing['status_series_missing']

In [None]:
# Rows ordered by 'total_de_imagem', smallest values first.
df.sort_values('total_de_imagem', ascending=True)

In [None]:
# Rows ordered by 'mean_dates_diff', smallest values first.
df.sort_values('mean_dates_diff', ascending=True)

## 3. Combine the results and save the new GPKG

In [None]:
# Display the destination path as a sanity check before writing.
OUTPUT_DATASET_PATH

In [None]:
# Write the frame, including the columns added above, to OUTPUT_DATASET_PATH as a GeoPackage
# using the fiona engine.
df.to_file(OUTPUT_DATASET_PATH, driver='GPKG', engine='fiona')

In [None]:
# Display the final frame (pandas truncates the rendering of long frames).
df