In [2]:
import boto3
import geopandas as gpd
from io import BytesIO
import pandas as pd
from folium import GeoJson
import folium
import numpy as np
from agrilearn.crop_classification import s3_utils, str_utils, eopatch_utils

ModuleNotFoundError: No module named 'agrilearn.commons'

In [None]:
import sys

# Add the agrilearn source directory to the module search path, presumably so
# the agrilearn package imported in the first cell can be resolved.
# The membership check keeps this cell idempotent: re-running it does not
# stack duplicate entries on sys.path.
# NOTE(review): the first cell imports agrilearn before this path is added and
# its recorded output is a ModuleNotFoundError -- this cell likely needs to run
# before that import; confirm and reorder.
if "/agrilearn_app/agrilearn/" not in sys.path:
    sys.path.append("/agrilearn_app/agrilearn/")

## 1. Define Dataset Paths

In [5]:
# Input GeoPackages. Each literal is split into directory + file name with
# implicit string concatenation purely for readability; the values are unchanged.
DATASET_PATH_1 = (
    '/agrilearn_app/datasets/teste_pre_safra_2024_2025/geopackage/processed/'
    '250111_data_merged_crop_120_06_08_2024_input_mvp.gpkg'
)
DATASET_PATH_2 = (
    "/agrilearn_app/datasets/base/geopackage/processed/"
    "CORN_73080_SOYBEAN_29670_COTTON_1632_RICE_1172.gpkg"
)

## 2. Read and Concatenate Datasets

In [6]:
# Load the GeoPackage at DATASET_PATH_1 into a GeoDataFrame.
df_1 = gpd.read_file(DATASET_PATH_1)


In [9]:
# Count rows per 'dataset_part' value in df_1.
df_1['dataset_part'].value_counts()

dataset_part
test    8347
Name: count, dtype: int64

In [11]:
# Count rows per 'gt_class' value in df_1.
df_1['gt_class'].value_counts()

gt_class
SOYBEAN       7270
CORN           579
COTTON         267
RICE            81
PASTURE         50
SUGAR_CANE      50
WHEAT           50
Name: count, dtype: int64

In [13]:
# Copy 'gt_class' into 'monitoring_class' (mutates df_1 in place). The later
# cells filter df_2 and build the output filename from 'monitoring_class'.
df_1['monitoring_class'] = df_1['gt_class']

In [15]:
# List the column labels of df_1.
df_1.columns

Index(['interest_area_id', 'period', 'start_season', 'emergence_date',
       'gt_class', 'end_season', 'monitoring_class', 'state', 'peak_start',
       'peak_end', 'sentinel_eopatch_current', 'obs_contour',
       'obs_contour_score', 'agricultural_area', 'obs_farm_plot',
       'obs_emergence', 'obs_senescence', 'obs_harvest', 'obs_extra',
       'created_by', 'area_id', 'last_date_crop_rnn',
       'crop_distance_score_sits', 'crop_confidence_maha_sits', 'set',
       'eopath_location', 'dataset_part', 'start_of_season', 'peak_of_season',
       'end_of_season', 'length_of_season', 'is_valid_metrics', 'sos_valid',
       'pos_valid', 'eos_valid', 'los_valid', 'geometry'],
      dtype='object')

In [20]:
# Load the GeoPackage at DATASET_PATH_2 into a GeoDataFrame.
df_2 = gpd.read_file(DATASET_PATH_2)

In [22]:
# Keep only the rows of df_2 that are CORN, belong to the 'test' part and fall
# in the '2023/2023' period. The name df_2 is rebound to the filtered frame.
df_2 = df_2.loc[
    df_2['monitoring_class'].eq('CORN')
    & df_2['dataset_part'].eq('test')
    & df_2['period'].eq('2023/2023')
]

In [23]:
# Concatenate the two GeoDataFrames row-wise with a fresh 0..n-1 index.
# Columns present in only one of the inputs are filled with NaN for the rows
# coming from the other.
df = pd.concat([df_1, df_2], ignore_index=True)

In [24]:
# Display the combined frame (the notebook shows a truncated preview).
df

Unnamed: 0,interest_area_id,period,start_season,emergence_date,gt_class,end_season,monitoring_class,state,peak_start,peak_end,...,compac_index,key_bucket,local_eopatch_path,total_imagens_do_intervalo,total_de_imagem,status_missing_images,days_gap,dates_diff,mean_dates_diff,status_series_missing
0,0.0,2023/2024,2023-10-01,2024-01-12,SOYBEAN,2024-05-01,SOYBEAN,GO,2023-12-01,2024-02-01,...,,,,,,,,,,
1,1.0,2023/2024,2023-10-01,2024-01-12,SOYBEAN,2024-05-01,SOYBEAN,GO,2023-12-01,2024-02-01,...,,,,,,,,,,
2,2.0,2023/2024,2023-10-01,2023-12-06,SOYBEAN,2024-09-01,SOYBEAN,PA,2023-12-01,2024-06-01,...,,,,,,,,,,
3,3.0,2023/2024,2023-10-01,2024-01-17,SOYBEAN,2024-05-01,SOYBEAN,GO,2023-12-01,2024-02-01,...,,,,,,,,,,
4,4.0,2023/2024,2023-10-01,2024-01-17,SOYBEAN,2024-05-01,SOYBEAN,GO,2023-12-01,2024-02-01,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13299,,2023/2023,2023-02-01,,,2023-10-01,CORN,DF,2023-04-01,2023-07-01,...,,,,,,,,,,
13300,,2023/2023,2023-02-01,,,2023-10-01,CORN,DF,2023-04-01,2023-07-01,...,,,,,,,,,,
13301,,2023/2023,2023-02-01,,,2023-10-01,CORN,DF,2023-04-01,2023-07-01,...,,,,,,,,,,
13302,,2023/2023,2023-02-01,,,2023-10-01,CORN,DF,2023-04-01,2023-07-01,...,,,,,,,,,,


In [25]:
# Build the output file name from the per-class row counts of the combined
# frame, in the order value_counts() returns them: "<CLASS>_<count>" tokens
# joined with "_" and suffixed with ".gpkg".
value_counts = df['monitoring_class'].value_counts()
filename = "_".join(
    f"{label}_{n_rows}" for label, n_rows in value_counts.items()
) + ".gpkg"
filename

'SOYBEAN_7270_CORN_5536_COTTON_267_RICE_81_PASTURE_50_SUGAR_CANE_50_WHEAT_50.gpkg'

In [28]:
# Full output path: the generated file name inside the processed GeoPackage
# directory of the teste_pre_safra_2024_2025 dataset.
OUTPUT_DATASET_FINAL = f"/agrilearn_app/datasets/teste_pre_safra_2024_2025/geopackage/processed/{filename}"

In [29]:
# Write the combined frame to OUTPUT_DATASET_FINAL as a GeoPackage, explicitly
# selecting the fiona I/O engine.
df.to_file(OUTPUT_DATASET_FINAL, driver='GPKG', engine='fiona')

## 3. Adjusting features

In [10]:
# NOTE: disabled cell, kept for reference only. When enabled it derives a
# 'monitoring_class_path' column by replacing the listed monitoring_class
# labels with the names in replacement_dict.
# replacement_dict = {
#     'SUGAR_CANE': 'cana',
#     'SOYBEAN': 'soybean',
#     'CORN': 'corn',
#     'COTTON': 'cotton',
#     'RICE': 'rice'
# }

# df['monitoring_class_path'] = df['monitoring_class'].replace(replacement_dict)

In [10]:
# NOTE: disabled cell, kept for reference only. The call reads the
# 'monitoring_class_path' column, which is only created by the (also disabled)
# replacement_dict cell, so both would have to be re-enabled together.
# eopatch_utils.create_local_eopatch_path(df, 
#                           path_local="/agrilearn_app/datasets/eopatchs/processed",
#                           label_monitoring_class='monitoring_class_path',
#                           label_dataset_part='dataset_part',
#                           label_eopatch_location='eopath_location',
#                           )

In [11]:
# Count missing values per column of df.
# NOTE(review): the recorded output below looks stale -- the execution counts
# restart at 10-11 after cell 29, and null counts such as 38192 exceed the
# ~13.3k rows of the frame concatenated above. Re-run on a fresh kernel to
# confirm which frame this describes.
df.isna().sum()

monitoring_class                  0
period                            0
fonte                             0
cultura_2                     38192
state                             0
area                              0
meso                              0
obs_extra                         0
eopath_location                   0
start_season                      0
end_season                        0
peak_start                        0
peak_end                          0
sentinel_eopatch_current          0
contour_score                  2997
contour_selected_timestamp        0
field_id                          0
compac_index                      0
planting_start                35276
planting_end                  35276
cycle_start                   35276
cycle_end                     35276
LOS                           35276
is_valid                      35276
is_valid_cvt                  35276
start_of_season               35276
end_of_season                 35276
peaks                       