In [34]:
import sys
sys.path.append("../agrilearn/submodules/commons/")
from agrilearn.commons.s3 import s3_utils
from agrilearn.crop_classification import evaluate_utils, yaml_utils, processing
import geopandas as gpd
import pandas as pd
import os

## 1. Define Variables and Read Data

In [41]:
# Root directory of the experiment; every derived path below hangs off it.
MAIN_PATH = "/agrilearn_app/output/experiment_12/data/"
# Input geopackage for this run.
# NOTE(review): despite the name, this points into "geopackage/processed/", not
# "geopackage/raw/" — any code that splits this path on "/raw/" will not find it.
GEOPACKAGE_RAW_PATH = os.path.join(MAIN_PATH, "geopackage/processed/SOYBEAN_48257_CORN_33809_SUGAR_CANE_17640_WHEAT_4226_COTTON_3178_RICE_925_test.gpkg")

# Destination directory for the processed eopatches of this experiment.
NEW_EOPATCH_PATH_PROCESSED = os.path.join(MAIN_PATH, "eopatch/processed/")
#NEW_EOPATCH_PATH_INPUT_MODEL = os.path.join(MAIN_PATH, "eopatch/input_model/")

# Source directory the processed eopatches are taken from.
OLD_EOPATCH_PATH_PROCESSED = "/agrilearn_app/datasets/baselines/testes_de_concordancia/eopatch/processed_v2/"
#OLD_EOPATCH_PATH_INPUT_MODEL = "/agrilearn_app/datasets/SUGAR_CANE/eopatch/input_model/"

# Directory where the overview CSV reports are written.
REPORT_PATH = os.path.join(MAIN_PATH, "reports")

# Canonical name of the column that holds each record's eopatch location.
LABEL_EOPATCH_LOCATION = "eopatch_location" 

In [42]:
# Load the input geopackage; all later cells read from and mutate this frame.
df = gpd.read_file(GEOPACKAGE_RAW_PATH)

In [43]:
df.shape

(343, 36)

In [44]:
df.head()

Unnamed: 0,interest_area_id,period,start_season,emergence_date,gt_class,end_season,monitoring_class,state,peak_start,peak_end,...,dataset_part,geopackage,geopackage_class,eopatch_processed_exists,len_data,len_bands,len_time_interval,len_timestamp,error_processed,geometry
0,35,2020/2021,2020-10-01,,PASTURE,2021-05-01,SOYBEAN,GO,2020-12-01,2021-02-01,...,test,start_2020-10-01_end_2021-05-01_monitoring_cla...,SOYBEAN,True,4,10,2,42,,"MULTIPOLYGON (((-47.30475 -17.01249, -47.30447..."
1,36,2020/2021,2020-10-01,,PASTURE,2021-05-01,SOYBEAN,GO,2020-12-01,2021-02-01,...,test,start_2020-10-01_end_2021-05-01_monitoring_cla...,SOYBEAN,True,4,10,2,42,,"MULTIPOLYGON (((-47.54068 -17.00806, -47.54041..."
2,37,2020/2021,2020-10-01,,PASTURE,2021-05-01,SOYBEAN,GO,2020-12-01,2021-02-01,...,test,start_2020-10-01_end_2021-05-01_monitoring_cla...,SOYBEAN,True,4,10,2,42,,"MULTIPOLYGON (((-47.26563 -16.6992, -47.26474 ..."
3,38,2020/2021,2020-10-01,,PASTURE,2021-05-01,SOYBEAN,MG,2020-12-01,2021-02-01,...,test,start_2020-10-01_end_2021-05-01_monitoring_cla...,SOYBEAN,True,4,10,2,42,,"MULTIPOLYGON (((-47.19169 -17.17247, -47.19154..."
4,39,2020/2021,2020-10-01,,PASTURE,2021-05-01,SOYBEAN,MG,2020-12-01,2021-02-01,...,test,start_2020-10-01_end_2021-05-01_monitoring_cla...,SOYBEAN,True,4,10,2,42,,"MULTIPOLYGON (((-47.09819 -17.33197, -47.09819..."


In [45]:
# The input file spells the column 'eopath_location' (missing "c"); copy it to the
# canonical LABEL_EOPATCH_LOCATION name used by the rest of the notebook.
df[LABEL_EOPATCH_LOCATION] = df['eopath_location']

In [46]:
# Remove the misspelled source column now that its values live under the canonical name.
df.drop(columns=['eopath_location'], inplace=True)

## 2. Processing Geopackage

### 2.1 Drop All-NaN Columns

In [47]:
# Drop columns that hold no data at all, then report how many were removed.
shape_before = df.shape[1]
df.dropna(axis=1, how='all', inplace=True)
removed_columns = shape_before - df.shape[1]
removed_percentage = removed_columns / shape_before * 100
print(f"Removed Columns: {removed_columns}, Percentage: {removed_percentage:.2f}%")

Removed Columns: 1, Percentage: 2.78%


### 2.2 Check NaN values

In [48]:
# Compute, per column, the number of null records and the percentage of null records.
null_counts = df.isna().sum()
df_check_NaN = pd.DataFrame({
    'Null Count': null_counts,
    'NaN percentage': null_counts / df.shape[0] * 100,
})
# Worst-affected columns first.
df_check_NaN = df_check_NaN.sort_values('NaN percentage', ascending=False)
df_check_NaN.head()

Unnamed: 0,Null Count,NaN percentage
last_date_crop_rnn,50,14.577259
crop_confidence_maha_sits,50,14.577259
emergence_date,50,14.577259
crop_distance_score_sits,50,14.577259
start_season,0,0.0


### 2.3 Convert All Polygon Geometries to MultiPolygon

In [49]:
from shapely.geometry import Polygon, MultiPolygon

In [50]:
def _to_multipolygon(geom):
    """Wrap a single Polygon in a MultiPolygon; return any other geometry unchanged."""
    if isinstance(geom, Polygon):
        return MultiPolygon([geom])
    return geom


# Normalise the geometry column so that plain Polygons become MultiPolygons.
df['geometry'] = df['geometry'].apply(_to_multipolygon)

### 2.4 Check Duplicates

In [51]:
# Two records are duplicates when they share the same geometry and the same period.
subset_duplicate = ['geometry', 'period']
df['duplicates_id'] = None

# Boolean mask of every row that belongs to a duplicate group (keep=False marks all members).
duplicate_mask = df.duplicated(subset=subset_duplicate, keep=False)
# Explicit copy: assigning a column on a boolean-filtered slice raises
# SettingWithCopyWarning and is ambiguous about which object gets modified.
duplicados = df[duplicate_mask].copy()
# Give every duplicate group an identifier (ngroup numbers the groups starting at 0).
duplicados['duplicates_id'] = duplicados.groupby(subset_duplicate).ngroup()
# Write only the identifiers back (index-aligned) instead of df.update(), which
# would rewrite every column of the frame.
df.loc[duplicate_mask, 'duplicates_id'] = duplicados['duplicates_id']
df.sort_values('duplicates_id', inplace=True)

In [52]:
# Spot-check one duplicate group. Group ids come from ngroup() and start at 0,
# so id 1 is the second group; an empty result means fewer than two groups exist.
df.loc[df['duplicates_id'].eq(1)]

Unnamed: 0,interest_area_id,period,start_season,emergence_date,gt_class,end_season,monitoring_class,state,peak_start,peak_end,...,geopackage,geopackage_class,eopatch_processed_exists,len_data,len_bands,len_time_interval,len_timestamp,geometry,eopatch_location,duplicates_id


In [53]:
# Keep the first record of every (geometry, period) group and report how many rows were dropped.
shape_before = df.shape[0]
df.drop_duplicates(subset=subset_duplicate, inplace=True)
removed_records = shape_before - df.shape[0]
removed_percentage = removed_records / shape_before * 100
print(f"Removed records: {removed_records}, Percentage: {removed_percentage:.2f}%")

Removed records: 0, Percentage: 0.00%


### 2.5 Feature Engineering

In [54]:
def _season_type(period):
    """Return 'safrinha' when both years of a 'YYYY/YYYY' period are equal, else 'safra'."""
    years = period.split('/')
    if years[0] == years[1]:
        return 'safrinha'
    return 'safra'


# A period that starts and ends in the same year is labelled 'safrinha'; otherwise 'safra'.
df['safra'] = df['period'].apply(_season_type)

In [55]:
df['safra'].value_counts()

safra
safra       216
safrinha    127
Name: count, dtype: int64

## 3. Move EOPatches to the New Directory

In [56]:
# Remove the eopatch-check columns that came with the input geopackage: the same
# column names are recomputed later in df_processed and merged back in, so leaving
# them here would produce clashing columns in that merge.
# The helper logs each column it deletes and reports (rather than fails on) columns
# that are absent. Its return value is not used — presumably it mutates df in place;
# verify against the helper's implementation.
processing.check_and_delete_columns(df, columns_to_drop=['eopatch_processed_exists', 'len_data', 'len_bands', 'len_time_interval', 'len_timestamp', 'error_processed'])

Coluna 'eopatch_processed_exists' deletada.
Coluna 'len_data' deletada.
Coluna 'len_bands' deletada.
Coluna 'len_time_interval' deletada.
Coluna 'len_timestamp' deletada.
Coluna 'error_processed' não existe no DataFrame.


#### A) PROCESSED

In [57]:
# For every record in df, check the eopatch under the old directory and relocate it
# to the new experiment directory. The returned frame has one row per record with the
# columns miss_geopackage, error_processed and rsync_output.
# NOTE(review): the exact copy/move semantics live in the project helper — the
# 'rsync_output' column suggests an rsync-based copy; confirm before relying on it.
df_results_processed = processing.check_load_eopatch_and_change_eopatch_dir(df,
                                        eopatch_path=OLD_EOPATCH_PATH_PROCESSED,
                                        new_eopatch_path=NEW_EOPATCH_PATH_PROCESSED,
                                        label_monitoring_class='monitoring_class',
                                        label_eopatch_path=LABEL_EOPATCH_LOCATION)

2025-04-22 23:39:27,143 - INFO - Iniciando a execução da função 'check_load_eopatch_and_change_eopatch_dir'


  0%|          | 0/343 [00:00<?, ?it/s]

2025-04-22 23:41:15,743 - INFO - Tempo de execução da função 'check_load_eopatch_and_change_eopatch_dir': 108.60 segundos


In [58]:
df_results_processed

Unnamed: 0,miss_geopackage,error_processed,rsync_output
0,start_2020-10-01_end_2021-05-01_monitoring_cla...,,
1,start_2020-10-01_end_2021-05-01_monitoring_cla...,,
2,start_2020-10-01_end_2021-05-01_monitoring_cla...,,
3,start_2020-10-01_end_2021-05-01_monitoring_cla...,,
4,start_2020-10-01_end_2021-05-01_monitoring_cla...,,
...,...,...,...
338,start_2023-02-01_end_2023-10-01_monitoring_cla...,,
339,start_2023-02-01_end_2023-10-01_monitoring_cla...,,
340,start_2023-02-01_end_2023-10-01_monitoring_cla...,,
341,start_2023-02-01_end_2023-10-01_monitoring_cla...,,


#### B) INPUT_MODEL

In [59]:
# df_results_input = processing.check_load_eopatch_and_change_eopatch_dir(df,
#                                         eopatch_path=OLD_EOPATCH_PATH_INPUT_MODEL,
#                                         new_eopatch_path=NEW_EOPATCH_PATH_INPUT_MODEL,
#                                         label_monitoring_class='monitoring_class',
#                                         label_eopatch_path='eopath_location')

In [60]:
# df_results_input['miss_geopackage'].nunique()

In [61]:
# df_results_input['error_processed'].value_counts()

## 4. Check processed eopatch

In [62]:
# Short tag describing the dataset composition (e.g. 'SOYBEAN_216_CORN_127');
# it is embedded in the report file name further down.
# Presumably built from per-class record counts — verify against the helper.
string_name = processing.get_geopackage_name(df)
string_name

'SOYBEAN_216_CORN_127'

In [63]:
df.head()

Unnamed: 0,interest_area_id,period,start_season,emergence_date,gt_class,end_season,monitoring_class,state,peak_start,peak_end,...,crop_distance_score_sits,crop_confidence_maha_sits,set,dataset_part,geopackage,geopackage_class,geometry,eopatch_location,duplicates_id,safra
0,35,2020/2021,2020-10-01,,PASTURE,2021-05-01,SOYBEAN,GO,2020-12-01,2021-02-01,...,,,run_2082_mvp_344_teste_concordancia_crop_120.gpkg,test,start_2020-10-01_end_2021-05-01_monitoring_cla...,SOYBEAN,"MULTIPOLYGON (((-47.30475 -17.01249, -47.30447...",start_2020-10-01_end_2021-05-01_monitoring_cla...,,safra
1,36,2020/2021,2020-10-01,,PASTURE,2021-05-01,SOYBEAN,GO,2020-12-01,2021-02-01,...,,,run_2082_mvp_344_teste_concordancia_crop_120.gpkg,test,start_2020-10-01_end_2021-05-01_monitoring_cla...,SOYBEAN,"MULTIPOLYGON (((-47.54068 -17.00806, -47.54041...",start_2020-10-01_end_2021-05-01_monitoring_cla...,,safra
2,37,2020/2021,2020-10-01,,PASTURE,2021-05-01,SOYBEAN,GO,2020-12-01,2021-02-01,...,,,run_2082_mvp_344_teste_concordancia_crop_120.gpkg,test,start_2020-10-01_end_2021-05-01_monitoring_cla...,SOYBEAN,"MULTIPOLYGON (((-47.26563 -16.6992, -47.26474 ...",start_2020-10-01_end_2021-05-01_monitoring_cla...,,safra
3,38,2020/2021,2020-10-01,,PASTURE,2021-05-01,SOYBEAN,MG,2020-12-01,2021-02-01,...,,,run_2082_mvp_344_teste_concordancia_crop_120.gpkg,test,start_2020-10-01_end_2021-05-01_monitoring_cla...,SOYBEAN,"MULTIPOLYGON (((-47.19169 -17.17247, -47.19154...",start_2020-10-01_end_2021-05-01_monitoring_cla...,,safra
4,39,2020/2021,2020-10-01,,PASTURE,2021-05-01,SOYBEAN,MG,2020-12-01,2021-02-01,...,,,run_2082_mvp_344_teste_concordancia_crop_120.gpkg,test,start_2020-10-01_end_2021-05-01_monitoring_cla...,SOYBEAN,"MULTIPOLYGON (((-47.09819 -17.33197, -47.09819...",start_2020-10-01_end_2021-05-01_monitoring_cla...,,safra


#### A) PROCESSED

In [64]:
# Inspect the relocated eopatches for every record in df. Returns:
#   df_processed          - one row per eopatch (eopatch_location_id, eopatch_processed_exists,
#                           len_data, len_bands, len_time_interval, len_timestamp, error_processed)
#   df_overview_processed - aggregate counts and percentages over all eopatches
df_processed, df_overview_processed = processing.check_processed_eopatches_features_based_geopackage(df, 
                                                                                         eopatch_path=NEW_EOPATCH_PATH_PROCESSED,
                                                                                         label_eopatch_path=LABEL_EOPATCH_LOCATION)

2025-04-22 23:41:15,924 - INFO - Iniciando a execução da função 'check_processed_eopatches_features_based_geopackage'


  0%|          | 0/343 [00:00<?, ?it/s]

2025-04-22 23:41:21,012 - INFO - Tempo de execução da função 'check_processed_eopatches_features_based_geopackage': 5.09 segundos


In [65]:
df_overview_processed

Unnamed: 0,0
total_eopatches,343.0
eopatch_processed_exists,343.0
eopatches_not_exist,0.0
percentage_eopatches_not_exist,0.0
len_data_exist,343.0
percentage_len_data_exist,100.0
len_bands_exist,343.0
percentage_len_bands_exist,100.0
len_time_interval_exist,343.0
percentage_len_time_interval_exist,100.0


In [66]:
# Persist the overview table as CSV; create the report directory on first use.
os.makedirs(REPORT_PATH, exist_ok=True)
# The dataset tag is part of the file name so reports of different datasets do not overwrite each other.
final_processed_report_name = f"{REPORT_PATH}/overview_{string_name}_processed.csv"
df_overview_processed.to_csv(final_processed_report_name)
# Display the path that was written.
final_processed_report_name

'/agrilearn_app/datasets/baselines/testes_de_concordancia/reports/overview_SOYBEAN_216_CORN_127_processed.csv'

In [67]:
# Sanity check: list the eopatches that could not be found (expected to be empty).
df_processed.loc[df_processed['eopatch_processed_exists'].eq(False)]

Unnamed: 0,eopatch_location_id,eopatch_processed_exists,len_data,len_bands,len_time_interval,len_timestamp,error_processed


In [68]:
# List the eopatches that were found, together with their measured dimensions.
df_processed.loc[df_processed['eopatch_processed_exists'].eq(True)]

Unnamed: 0,eopatch_location_id,eopatch_processed_exists,len_data,len_bands,len_time_interval,len_timestamp,error_processed
0,start_2020-10-01_end_2021-05-01_monitoring_cla...,True,4,10,2,42,
1,start_2020-10-01_end_2021-05-01_monitoring_cla...,True,4,10,2,42,
2,start_2020-10-01_end_2021-05-01_monitoring_cla...,True,4,10,2,42,
3,start_2020-10-01_end_2021-05-01_monitoring_cla...,True,4,10,2,42,
4,start_2020-10-01_end_2021-05-01_monitoring_cla...,True,4,10,2,42,
...,...,...,...,...,...,...,...
338,start_2023-02-01_end_2023-10-01_monitoring_cla...,True,4,10,2,41,
339,start_2023-02-01_end_2023-10-01_monitoring_cla...,True,4,10,2,43,
340,start_2023-02-01_end_2023-10-01_monitoring_cla...,True,4,10,2,41,
341,start_2023-02-01_end_2023-10-01_monitoring_cla...,True,4,10,2,43,


#### B) INPUT_MODEL

In [69]:
# df_input_model, df_overview_input = processing.check_input_model_eopatches_features_based_geopackage(df, 
#                                                                                                      eopatch_path=NEW_EOPATCH_PATH_INPUT_MODEL,
#                                                                                                      label_eopatch_path=LABEL_EOPATCH_LOCATION)

In [70]:
# df_overview_input

In [71]:
# df_input_model[(df_input_model['label_unique'].notna()) & 
#                 (df_input_model['geopackage_class'] != df_input_model['label_unique'])]

In [72]:
# os.makedirs(REPORT_PATH, exist_ok=True)
# final_processed_report_name = f"{REPORT_PATH}/overview_{string_name}_input_model.csv"
# df_overview_input.to_csv(final_processed_report_name)
# final_processed_report_name

## 5. Merge Data

In [73]:
df_processed.shape

(343, 7)

In [74]:
#df_input_model.shape

In [75]:
# Use pd.merge to join the eopatch check results onto the geopackage records.
# Left join: every record of df is kept, matched on its eopatch location.
# NOTE(review): the row count only stays equal to len(df) if 'eopatch_location_id'
# is unique in df_processed — compare the shape below against df.shape.
df_merged = pd.merge(df, df_processed, 
                     left_on=LABEL_EOPATCH_LOCATION, 
                     right_on='eopatch_location_id', 
                     how='left')
df_merged.shape

(343, 39)

In [76]:
# Sanity check after the merge: records whose eopatch is missing (expected to be empty).
df_merged.loc[df_merged['eopatch_processed_exists'].eq(False)]

Unnamed: 0,interest_area_id,period,start_season,emergence_date,gt_class,end_season,monitoring_class,state,peak_start,peak_end,...,eopatch_location,duplicates_id,safra,eopatch_location_id,eopatch_processed_exists,len_data,len_bands,len_time_interval,len_timestamp,error_processed


In [77]:
# df_full = pd.merge(df_merged, 
#                    df_input_model, 
#                    left_on=LABEL_EOPATCH_LOCATION, 
#                    right_on='eopatch_location_id', how='left')
# df_full.shape

## 6. Save data

In [78]:
# Build the output path "<geopackage root>/processed/<dataset tag>_.gpkg".
result_string = processing.get_geopackage_name(df_merged, label_monitoring_class='monitoring_class')
if '/raw/' in GEOPACKAGE_RAW_PATH:
    # Usual layout: the input lives under ".../geopackage/raw/...".
    geopackage_root = GEOPACKAGE_RAW_PATH.split('/raw/')[0]
else:
    # The input is not under a "raw/" directory (e.g. it already sits in "processed/").
    # Splitting on '/raw/' would then return the whole file path and yield
    # ".../<file>.gpkg/processed/...", a directory that does not exist.
    # Fall back to the parent of the directory that contains the input file.
    geopackage_root = os.path.dirname(os.path.dirname(GEOPACKAGE_RAW_PATH))
GEOPACKAGE_PROCESSED_PATH = os.path.join(geopackage_root, "processed", result_string + "_.gpkg")
GEOPACKAGE_PROCESSED_PATH

'/agrilearn_app/datasets/baselines/testes_de_concordancia/geopackage/processed/SOYBEAN_216_CORN_127_.gpkg'

In [79]:
# Write the merged records (geopackage attributes + eopatch check results) as a GeoPackage.
df_merged.to_file(GEOPACKAGE_PROCESSED_PATH, 
                driver='GPKG', 
                engine='fiona')

## df_full.to_file(GEOPACKAGE_PROCESSED_PATH, 
#                 driver='GPKG', 
#                 engine='fiona')