In [5]:
import sys
sys.path.append("../agrilearn/submodules/commons/")
from agrilearn.commons.s3 import s3_utils
from agrilearn.crop_classification import evaluate_utils, yaml_utils, processing
import geopandas as gpd
import os
import pandas as pd

## 1. Define Variables and Read Data

In [6]:
# Root of the 2020/2021 SOYBEAN dataset; every "new" location below lives under it.
MAIN_PATH = "/agrilearn_app/datasets/SOYBEAN/2020_2021/"

# Raw geopackage that drives this notebook.
GEOPACKAGE_RAW_PATH = os.path.join(MAIN_PATH, "geopackage", "raw", "SOYBEAN_29250.gpkg")

# Legacy EOPatch directories (the source side when the EOPatches are moved).
OLD_EOPATCH_PATH_PROCESSED = "/agrilearn_app/datasets/SOYBEAN/eopatch/processed/"
OLD_EOPATCH_PATH_INPUT_MODEL = "/agrilearn_app/datasets/SOYBEAN/eopatch/input_model/"

# Season-specific EOPatch directories (the destination side). The empty last
# component keeps the trailing separator on the joined path.
NEW_EOPATCH_PATH_PROCESSED = os.path.join(MAIN_PATH, "eopatch", "processed", "")
NEW_EOPATCH_PATH_INPUT_MODEL = os.path.join(MAIN_PATH, "eopatch", "input_model", "")

# Folder that receives the overview CSV reports.
REPORT_PATH = os.path.join(MAIN_PATH, "reports")

# Geopackage column used to look up each EOPatch.
LABEL_EOPATCH_LOCATION = "eopath_location"

In [7]:
# Load the raw geopackage from disk; every later step in this notebook works on `df`.
df = gpd.read_file(GEOPACKAGE_RAW_PATH)

In [8]:
df.shape

(29250, 73)

In [9]:
df.head()

Unnamed: 0,monitoring_class,period,fonte,state,area,micro,eopath_location,start_season,end_season,peak_start,...,obs_emergence,obs_senescence,obs_harvest,created_by,area_id,last_date_crop_rnn,crop_distance_score_sits,crop_confidence_maha_sits,set,geometry
0,SOYBEAN,2020/2021,Agrosatélite - Grãos Brasil 2020/2021,MA,,,start_2020-10-01_end_2021-07-01_monitoring_cla...,2020-10-01,2021-07-01,2020-12-01,...,,,,,,,,,,"POLYGON ((-43.42963 -5.56194, -43.4298 -5.5633..."
1,SOYBEAN,2020/2021,Agrosatélite - Grãos Brasil 2020/2021,SP,11126104.0,,start_2020-09-01_end_2021-05-01_monitoring_cla...,2020-09-01,2021-06-01,2020-11-01,...,,,,,,,,,,"MULTIPOLYGON (((-48.41515 -23.58491, -48.41567..."
2,SOYBEAN,2020/2021,Agrosatélite - Grãos Brasil 2020/2021,SP,11126104.0,,start_2020-09-01_end_2021-05-01_monitoring_cla...,2020-09-01,2021-06-01,2020-11-01,...,,,,,,,,,,"POLYGON ((-48.40032 -23.57863, -48.40037 -23.5..."
3,SOYBEAN,2020/2021,Agrosatélite - Grãos Brasil 2020/2021,SP,11126104.0,,start_2020-09-01_end_2021-05-01_monitoring_cla...,2020-09-01,2021-06-01,2020-11-01,...,,,,,,,,,,"POLYGON ((-48.40597 -23.5784, -48.40597 -23.57..."
4,SOYBEAN,2020/2021,Agrosatélite - Grãos Brasil 2020/2021,SP,11126104.0,,start_2020-09-01_end_2021-05-01_monitoring_cla...,2020-09-01,2021-06-01,2020-11-01,...,,,,,,,,,,"POLYGON ((-48.42565 -23.59011, -48.42561 -23.5..."


## 2. Processing Geopackage

### 2.1 Drop All-NaN Columns

In [10]:
# Remove the columns that carry no data at all (every value is NaN) and
# report how many were dropped, both as a count and as a share of the total.
shape_before = len(df.columns)
df.dropna(axis='columns', how='all', inplace=True)
print(f"Removed Columns: {shape_before - df.shape[1]}, Percentage: {(shape_before - df.shape[1]) / shape_before * 100:.2f}%")

Removed Columns: 17, Percentage: 23.29%


### 2.2 Check NaN values

In [11]:
# Compute, per column, the number of null records and the percentage of rows they represent
null_counts = df.isna().sum()
df_check_NaN = pd.DataFrame({
    'Null Count': null_counts,
    'NaN percentage': null_counts / len(df) * 100,
})
# Worst-covered columns first
df_check_NaN = df_check_NaN.sort_values('NaN percentage', ascending=False)
df_check_NaN.head()

Unnamed: 0,Null Count,NaN percentage
last_date_crop_rnn,28622,97.852991
crop_distance_score_sits,28621,97.849573
crop_confidence_maha_sits,28621,97.849573
emergence_date,28620,97.846154
emergence_score,28620,97.846154


### 2.3 Transform all Geometry to Multipolygon

In [12]:
from shapely.geometry import Polygon, MultiPolygon

In [13]:
def _as_multipolygon(geom):
    """Wrap a single Polygon in a one-part MultiPolygon; return anything else unchanged."""
    if isinstance(geom, Polygon):
        return MultiPolygon([geom])
    return geom


# Normalise the geometry column so plain polygons become multipolygons.
df['geometry'] = df['geometry'].apply(_as_multipolygon)

### 2.4 Check Duplicates

In [14]:
# Tag every row whose EOPatch location occurs more than once with a group id,
# so the duplicated records can be inspected side by side.
df['duplicates_id'] = None
# keep=False flags *all* members of a duplicated group. The .copy() makes
# `duplicados` an independent frame, so the column assignment below does not
# write into a slice of `df` (which raises SettingWithCopyWarning).
duplicados = df[df.duplicated(subset=LABEL_EOPATCH_LOCATION, keep=False)].copy()
# Add an identifier shared by the rows that have the same EOPatch location
duplicados['duplicates_id'] = duplicados.groupby(LABEL_EOPATCH_LOCATION).ngroup()
# Write the ids back into the original DataFrame (aligned on the index).
# Only the new column is passed: the other columns are identical in both
# frames, so there is no reason to rewrite them.
df.update(duplicados[['duplicates_id']])
# Sorting by the id places the members of each duplicated group next to each other.
df.sort_values('duplicates_id', inplace=True)

In [15]:
duplicados.shape

(29250, 57)

In [16]:
df[df['duplicates_id']==0]

Unnamed: 0,monitoring_class,period,fonte,state,area,eopath_location,start_season,end_season,peak_start,peak_end,...,obs_senescence,obs_harvest,created_by,area_id,last_date_crop_rnn,crop_distance_score_sits,crop_confidence_maha_sits,set,geometry,duplicates_id
21090,SOYBEAN,2020/2021,Agrosatélite - Grãos Brasil 2020/2021,SP,698856,start_2020-09-01_end_2021-05-01_monitoring_cla...,2020-09-01,2021-06-01,2020-11-01,2021-03-01,...,,,,,,,,,"MULTIPOLYGON (((-45.29003 -22.87902, -45.29003...",0
6781,SOYBEAN,2020/2021,Agrosatélite - Grãos Brasil 2020/2021,SP,698856,start_2020-09-01_end_2021-05-01_monitoring_cla...,2020-09-01,2021-06-01,2020-11-01,2021-03-01,...,,,,,,,,,"MULTIPOLYGON (((-45.29003 -22.87902, -45.29003...",0


In [17]:
# Collapse the rows that share both geometry and period, keeping the first
# occurrence of each, and report how many records were removed.
shape_before = len(df)
df.drop_duplicates(subset=['geometry', 'period'], keep='first', inplace=True)
print(f"Removed records: {shape_before - df.shape[0]}, Percentage: {(shape_before - df.shape[0]) / shape_before * 100:.2f}%")

Removed records: 14941, Percentage: 51.08%


### 2.5 Feature Engineering

In [18]:
def _label_season(period):
    """Return 'safrinha' when the two '/'-separated parts of `period` are equal, else 'safra'."""
    years = period.split('/')
    if years[0] == years[1]:
        return 'safrinha'
    return 'safra'


# A period such as '2020/2021' spans two different years -> 'safra';
# one whose two parts are the same year -> 'safrinha'.
df['safra'] = df['period'].apply(_label_season)

In [19]:
df['safra'].value_counts()

safra
safra    14309
Name: count, dtype: int64

## 3. Move EOPatches to the New Directory

#### A) PROCESSED

In [20]:
# Run the check-and-relocate helper for the "processed" EOPatches, from the
# legacy directory to the season-specific one. The recorded run took about
# 37 minutes for 14,309 rows.
# NOTE(review): the helper's internals are not visible here. Judging by its
# name and the `rsync_output` column of its result, it loads each EOPatch and
# transfers it; confirm in agrilearn.crop_classification.processing.
df_results_processed = processing.check_load_eopatch_and_change_eopatch_dir(
    df,
    eopatch_path=OLD_EOPATCH_PATH_PROCESSED,
    new_eopatch_path=NEW_EOPATCH_PATH_PROCESSED,
    label_monitoring_class='monitoring_class',
    label_eopatch_path='eopath_location',
)

2025-04-19 00:45:18,789 - INFO - Iniciando a execução da função 'check_load_eopatch_and_change_eopatch_dir'


  0%|          | 0/14309 [00:00<?, ?it/s]

2025-04-19 01:22:29,698 - INFO - Tempo de execução da função 'check_load_eopatch_and_change_eopatch_dir': 2230.91 segundos


In [21]:
df_results_processed

Unnamed: 0,miss_geopackage,error_processed,rsync_output
0,start_2020-09-01_end_2021-05-01_monitoring_cla...,,
1,start_2020-09-01_end_2021-05-01_monitoring_cla...,,
2,start_2020-09-01_end_2021-05-01_monitoring_cla...,,
3,start_2020-09-01_end_2021-05-01_monitoring_cla...,,
4,start_2020-09-01_end_2021-05-01_monitoring_cla...,,
...,...,...,...
14304,start_2021-04-01_end_2021-11-01_monitoring_cla...,,
14305,start_2021-04-01_end_2021-11-01_monitoring_cla...,,
14306,start_2021-04-01_end_2021-11-01_monitoring_cla...,,
14307,start_2021-04-01_end_2021-11-01_monitoring_cla...,,


#### B) INPUT_MODEL

In [22]:
# Same check-and-relocate step, this time for the "input_model" EOPatches.
# The recorded run took about 14 minutes for 14,309 rows.
# NOTE(review): the helper's behaviour is inferred from its name and result
# columns only; confirm in agrilearn.crop_classification.processing.
df_results_input = processing.check_load_eopatch_and_change_eopatch_dir(
    df,
    eopatch_path=OLD_EOPATCH_PATH_INPUT_MODEL,
    new_eopatch_path=NEW_EOPATCH_PATH_INPUT_MODEL,
    label_monitoring_class='monitoring_class',
    label_eopatch_path='eopath_location',
)

2025-04-19 01:22:29,727 - INFO - Iniciando a execução da função 'check_load_eopatch_and_change_eopatch_dir'


  0%|          | 0/14309 [00:00<?, ?it/s]

2025-04-19 01:36:32,089 - INFO - Tempo de execução da função 'check_load_eopatch_and_change_eopatch_dir': 842.36 segundos


In [23]:
df_results_input

Unnamed: 0,miss_geopackage,error_processed,rsync_output
0,start_2020-09-01_end_2021-05-01_monitoring_cla...,,
1,start_2020-09-01_end_2021-05-01_monitoring_cla...,,
2,start_2020-09-01_end_2021-05-01_monitoring_cla...,,
3,start_2020-09-01_end_2021-05-01_monitoring_cla...,,
4,start_2020-09-01_end_2021-05-01_monitoring_cla...,,
...,...,...,...
14304,start_2021-04-01_end_2021-11-01_monitoring_cla...,,
14305,start_2021-04-01_end_2021-11-01_monitoring_cla...,,
14306,start_2021-04-01_end_2021-11-01_monitoring_cla...,,
14307,start_2021-04-01_end_2021-11-01_monitoring_cla...,,


In [24]:
df_results_input['miss_geopackage'].nunique()

14309

In [25]:
df_results_input['error_processed'].value_counts()

error_processed
EOPatch not found    4
Name: count, dtype: int64

## 4. Check processed eopatch

In [26]:
# Base name reused in the report file names below (evaluates to
# 'SOYBEAN_14309' for this frame).
# NOTE(review): the naming rule lives in processing.get_geopackage_name —
# presumably "<monitoring class>_<row count>"; confirm against the helper.
string_name = processing.get_geopackage_name(df)
string_name

'SOYBEAN_14309'

#### A) PROCESSED

In [27]:
# Inspect the relocated "processed" EOPatches for every geopackage row.
# The helper returns a per-EOPatch table and a one-column overview of totals.
df_processed, df_overview_processed = processing.check_processed_eopatches_features_based_geopackage(
    df,
    eopatch_path=NEW_EOPATCH_PATH_PROCESSED,
    label_eopatch_path=LABEL_EOPATCH_LOCATION,
)

2025-04-19 01:36:32,145 - INFO - Iniciando a execução da função 'check_processed_eopatches_features_based_geopackage'


  0%|          | 0/14309 [00:00<?, ?it/s]

2025-04-19 02:06:39,610 - INFO - Tempo de execução da função 'check_processed_eopatches_features_based_geopackage': 1807.47 segundos


In [28]:
df_overview_processed

Unnamed: 0,0
total_eopatches,14309.0
eopatch_processed_exists,14309.0
eopatches_not_exist,0.0
percentage_eopatches_not_exist,0.0
len_data_exist,14309.0
percentage_len_data_exist,100.0
len_bands_exist,14309.0
percentage_len_bands_exist,100.0
len_time_interval_exist,14309.0
percentage_len_time_interval_exist,100.0


In [29]:
# Persist the processed-EOPatch overview as a CSV report.
# The reports folder is created on first use; exist_ok makes re-runs harmless.
os.makedirs(REPORT_PATH, exist_ok=True)
final_processed_report_name = f"{REPORT_PATH}/overview_{string_name}_processed.csv"
df_overview_processed.to_csv(final_processed_report_name)
# Show the path of the file that was just written.
final_processed_report_name

'/agrilearn_app/datasets/SOYBEAN/2020_2021/reports/overview_SOYBEAN_14309_processed.csv'

In [30]:
df_processed[df_processed['eopatch_processed_exists']==False]

Unnamed: 0,eopatch_location_id,eopatch_processed_exists,len_data,len_bands,len_time_interval,len_timestamp,error_processed


In [31]:
df_processed[df_processed['eopatch_processed_exists']==True]

Unnamed: 0,eopatch_location_id,eopatch_processed_exists,len_data,len_bands,len_time_interval,len_timestamp,error_processed
0,start_2020-09-01_end_2021-05-01_monitoring_cla...,True,2,10,2,30,
1,start_2020-09-01_end_2021-05-01_monitoring_cla...,True,2,10,2,34,
2,start_2020-09-01_end_2021-05-01_monitoring_cla...,True,2,10,2,28,
3,start_2020-09-01_end_2021-05-01_monitoring_cla...,True,2,10,2,28,
4,start_2020-09-01_end_2021-05-01_monitoring_cla...,True,2,10,2,31,
...,...,...,...,...,...,...,...
14304,start_2021-04-01_end_2021-11-01_monitoring_cla...,True,2,10,2,29,
14305,start_2021-04-01_end_2021-11-01_monitoring_cla...,True,2,10,2,51,
14306,start_2021-04-01_end_2021-11-01_monitoring_cla...,True,2,10,2,59,
14307,start_2021-04-01_end_2021-11-01_monitoring_cla...,True,2,10,2,57,


#### B) INPUT_MODEL

In [32]:
# Inspect the relocated "input_model" EOPatches for every geopackage row.
# The helper returns a per-EOPatch table and a one-column overview of totals.
df_input_model, df_overview_input = processing.check_input_model_eopatches_features_based_geopackage(
    df,
    eopatch_path=NEW_EOPATCH_PATH_INPUT_MODEL,
    label_eopatch_path=LABEL_EOPATCH_LOCATION,
)

2025-04-19 02:06:39,693 - INFO - Iniciando a execução da função 'check_input_model_eopatches_features_based_geopackage'


  0%|          | 0/14309 [00:00<?, ?it/s]

2025-04-19 02:08:07,539 - INFO - Tempo de execução da função 'check_input_model_eopatches_features_based_geopackage': 87.85 segundos


In [33]:
df_overview_input

Unnamed: 0,0
total_eopatches,14309.0
eopatch_input_model_exists,13741.0
eopatches_not_exist,568.0
percentage_eopatches_not_exist,3.96953
classes_equivalent,13741.0
classes_not_equivalent,568.0
percentage_classes_equivalent,96.03047
percentage_classes_not_equivalent,3.96953


In [34]:
# Rows that do have a label in the EOPatch but whose label disagrees with the
# class recorded in the geopackage.
has_label = df_input_model['label_unique'].notna()
class_differs = df_input_model['geopackage_class'] != df_input_model['label_unique']
df_input_model[has_label & class_differs]

Unnamed: 0,eopatch_location_id,geopackage_class,eopatch_input_model_exists,shape_X_data,label_unique,shape_labels,shape_timestamp,error_input_model


In [35]:
# Persist the input_model overview as a CSV report.
# The reports folder is created on first use; exist_ok makes re-runs harmless.
os.makedirs(REPORT_PATH, exist_ok=True)
# NOTE(review): the variable name still says "processed" although this path
# points at the input_model report.
final_processed_report_name = f"{REPORT_PATH}/overview_{string_name}_input_model.csv"
df_overview_input.to_csv(final_processed_report_name)
# Show the path of the file that was just written.
final_processed_report_name

'/agrilearn_app/datasets/SOYBEAN/2020_2021/reports/overview_SOYBEAN_14309_input_model.csv'

## 5. Merge Data

In [36]:
df_overview_input

Unnamed: 0,0
total_eopatches,14309.0
eopatch_input_model_exists,13741.0
eopatches_not_exist,568.0
percentage_eopatches_not_exist,3.96953
classes_equivalent,13741.0
classes_not_equivalent,568.0
percentage_classes_equivalent,96.03047
percentage_classes_not_equivalent,3.96953


In [37]:
df_processed.shape

(14309, 7)

In [38]:
df_input_model.shape

(14309, 8)

In [39]:
# Usando pd.merge para juntar os DataFrames
df_merged = pd.merge(df, df_processed, left_on='eopath_location', right_on='eopatch_location_id', how='left')
df_merged.shape

(14309, 65)

In [40]:
# Left-join the input_model check results as well. Both sides now carry an
# `eopatch_location_id` column, so pandas disambiguates them with its default
# `_x` / `_y` suffixes.
df_full = df_merged.merge(
    df_input_model,
    how='left',
    left_on='eopath_location',
    right_on='eopatch_location_id',
)
df_full.shape

(14309, 73)

## 6. Save data

In [41]:
# Build the output path: a "processed" folder beside the raw geopackage's
# "raw" folder, with a file name derived from the merged frame.
result_string = processing.get_geopackage_name(df_full, label_monitoring_class='monitoring_class')
# Everything before the first '/raw/' segment, with the separator restored.
geopackage_root = GEOPACKAGE_RAW_PATH.partition('/raw/')[0] + "/"
GEOPACKAGE_PROCESSED_PATH = os.path.join(geopackage_root, "processed", result_string + ".gpkg")
GEOPACKAGE_PROCESSED_PATH

'/agrilearn_app/datasets/SOYBEAN/2020_2021/geopackage/processed/SOYBEAN_14309.gpkg'

In [42]:
# Write the merged frame as a GeoPackage.
# The target folder is created first: unlike the reports folder, nothing
# earlier in the notebook creates it, and the file driver does not create
# missing parent directories, so the write would fail on a fresh dataset tree.
os.makedirs(os.path.dirname(GEOPACKAGE_PROCESSED_PATH), exist_ok=True)
df_full.to_file(
    GEOPACKAGE_PROCESSED_PATH,
    driver='GPKG',
    engine='fiona',
)

