In [1]:
import boto3
import geopandas as gpd
import pandas as pd
import folium
import numpy as np
from folium import GeoJson
from io import BytesIO

In [2]:
import sys
sys.path.append("/agrilearn_app/agrilearn/")
from agrilearn.utils import s3_utils, str_utils
from agrilearn.crop_classification import evaluate_utils as eval_util
from agrilearn.crop_classification import yaml_utils

2025-01-08 12:32:05.985159: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-01-08 12:32:06.001997: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1736339526.021121   11420 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1736339526.026966   11420 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-08 12:32:06.046964: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

### Global Variables

In [3]:
config = yaml_utils.load_config("/agrilearn_app/output/experiment/experiment_08.yaml")

In [4]:
GEOPACKAGE_PATH = config['data']['geopackage_train_data_path']
GEOPACKAGE_PATH

'/agrilearn_app/datasets/base/geopackage/processed/SOYBEAN_29670_CORN_21919_COTTON_1619_RICE_1172.gpkg'

# 1. Read Datasets

In [10]:
df = gpd.read_file(GEOPACKAGE_PATH)

In [11]:
df['monitoring_class'].value_counts()

monitoring_class
SOYBEAN    29670
CORN       21919
COTTON      1619
RICE        1172
Name: count, dtype: int64

In [12]:
df['state'].value_counts()

state
PR    8579
MG    7472
MT    7290
SP    6698
RS    4814
GO    4658
MS    3036
MA    2096
TO    2082
SC    1544
RO    1514
PA    1389
BA    1330
PI    1307
RR     275
DF     245
AP      39
AC      12
Name: count, dtype: int64

In [14]:
print(f"There are {df.shape[0]} rows and {df.shape[1]} columns")

There are 54380 rows and 54 columns


In [15]:
print(f"Dados de {df['start_season'].min()} a {df['end_season'].max()}")

Dados de 2017-09-01 00:00:00 a 2024-09-01 00:00:00


In [16]:
list(df.columns)

['monitoring_class',
 'period',
 'fonte',
 'state',
 'area',
 'meso',
 'eopath_location',
 'start_season',
 'end_season',
 'peak_start',
 'peak_end',
 'field_id',
 'planting_start',
 'planting_end',
 'cycle_start',
 'cycle_end',
 'LOS',
 'is_valid',
 'is_valid_cvt',
 'start_of_season',
 'end_of_season',
 'peaks',
 'length_of_season',
 'set_type',
 'sampled_date',
 'micro',
 'peak_of_season',
 'is_valid_metrics',
 'sos_valid',
 'pos_valid',
 'eos_valid',
 'los_valid',
 'start_of_cycle',
 'end_of_cycle',
 'length_of_cycle',
 'is_valid_POS',
 'is_valid_LOS',
 'dataset_part',
 'cultura_2',
 'obs_extra',
 'sentinel_eopatch_current',
 'contour_score',
 'contour_selected_timestamp',
 'compac_index',
 'key_bucket',
 'local_eopatch_path',
 'total_imagens_do_intervalo',
 'total_de_imagem',
 'status_missing_images',
 'days_gap',
 'dates_diff',
 'mean_dates_diff',
 'status_series_missing',
 'geometry']

In [17]:
df['dataset_part'].value_counts(normalize=True)*100

dataset_part
train    69.98345
test     15.01655
val      15.00000
Name: proportion, dtype: float64

# 2. Data Understading

### 2.1 Data description
- monitoring_class: é a cultura de interesse dp cliente ou da amostra de treinamento (SOYBEAN, CORN)
- period:  a safra agricola (2023/2024, 2024/2025)
- state: é o estado do polígono
- field_id: id do talhão
- fonte: a origem dos dados (mapas temáticos, banco de dados)
- area:
- micro:
- start_season: é a data que inicia a safra pra determinada cultura (monitoring_class) e safra (period)
- end_season: é a data que termina a safra pra determinada cultura (monitoring_class) e safra (period)
- peak_start: é a data que inicia o período em que pode ocorrer o pico pra determinada cultura (monitoring_class) e safra (period)
- peak_end: é a data que termina o período em que pode ocorrer o pico pra determinada cultura (monitoring_class) e safra (period) (Obs: temos um calendário pra isso, um arquivo yaml)
- start_of_cycle: data do inio do cultivo segundo o calendário agrícola
- end_of_cycle: data do fim do caledário de cuiltivo segundo calendário agrícola
- length_of_cycle:
- start_of_season: data da emergencia da cultura
- end_of_season: data da colheita da cultura
- peak_of_season: data do pico vegetativo da cultura
- length_of_season: Duração do cultivo
- eopath_location: é caminho onde o eopatch está salvo (imagens p/ inferência)
- geometry: é a geometria do polígono]

# 3. Data Processing

### 3.1 Check NaN Values

In [18]:
df.isna().sum()

monitoring_class                  0
period                            0
fonte                             0
state                             0
area                              0
meso                          18576
eopath_location                   0
start_season                      0
end_season                        0
peak_start                        0
peak_end                          0
field_id                      18576
planting_start                    0
planting_end                      0
cycle_start                   18576
cycle_end                     18576
LOS                           18576
is_valid                          0
is_valid_cvt                  18576
start_of_season                   0
end_of_season                     0
peaks                         18576
length_of_season                  0
set_type                      34020
sampled_date                  34020
micro                         35804
peak_of_season                35804
is_valid_metrics            

### 3.2 Check Duplicated

In [20]:
shape_before = df.shape[0]
idx_drop_duplicated = df[df.duplicated(subset=['period', 'geometry'])].index
df.drop(idx_drop_duplicated, inplace=True)
shape_after = df.shape[0]
print(f"{shape_before-shape_after} registros duplicados foram encontrados")

0 registros duplicados foram encontrados


### 3.3 Data Formatation

In [21]:
df.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 54380 entries, 0 to 54379
Data columns (total 54 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   monitoring_class            54380 non-null  object        
 1   period                      54380 non-null  object        
 2   fonte                       54380 non-null  object        
 3   state                       54380 non-null  object        
 4   area                        54380 non-null  float64       
 5   meso                        35804 non-null  float64       
 6   eopath_location             54380 non-null  object        
 7   start_season                54380 non-null  datetime64[ms]
 8   end_season                  54380 non-null  datetime64[ms]
 9   peak_start                  54380 non-null  datetime64[ms]
 10  peak_end                    54380 non-null  datetime64[ms]
 11  field_id                    35804 non-null  ob

In [25]:
datetime_columns = ['start_season', 
                    'end_season', 
                    'peak_start', 
                    'peak_end']

for col in datetime_columns:
    df[col] = pd.to_datetime(df[col])

df[datetime_columns].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54380 entries, 0 to 54379
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   start_season  54380 non-null  datetime64[ms]
 1   end_season    54380 non-null  datetime64[ms]
 2   peak_start    54380 non-null  datetime64[ms]
 3   peak_end      54380 non-null  datetime64[ms]
dtypes: datetime64[ms](4)
memory usage: 1.7 MB


### 3.4 Vamos manter os dados nulos no campo ``contour_score``

In [26]:
df[df['contour_score'].isna()]

Unnamed: 0,monitoring_class,period,fonte,state,area,meso,eopath_location,start_season,end_season,peak_start,...,key_bucket,local_eopatch_path,total_imagens_do_intervalo,total_de_imagem,status_missing_images,days_gap,dates_diff,mean_dates_diff,status_series_missing,geometry
0,CORN,2022/2023,Agrosatélite - Grãos Amazonia e Cerrado 2022/2023,AC,1.527073e+06,,start_2022-10-01_end_2023-06-01_monitoring_cla...,2022-10-01,2023-06-01,2022-12-01,...,,,,,,,,,,"MULTIPOLYGON (((-67.78402 -10.10449, -67.78391..."
1,CORN,2022/2023,Agrosatélite - Grãos Amazonia e Cerrado 2022/2023,AC,7.104539e+05,,start_2022-10-01_end_2023-06-01_monitoring_cla...,2022-10-01,2023-06-01,2022-12-01,...,,,,,,,,,,"MULTIPOLYGON (((-68.37328 -10.75255, -68.37328..."
2,CORN,2022/2023,Agrosatélite - Grãos Amazonia e Cerrado 2022/2023,AC,6.305511e+05,,start_2022-10-01_end_2023-06-01_monitoring_cla...,2022-10-01,2023-06-01,2022-12-01,...,,,,,,,,,,"MULTIPOLYGON (((-67.61672 -10.05572, -67.61681..."
3,CORN,2022/2023,Agrosatélite - Grãos Amazonia e Cerrado 2022/2023,AC,1.507290e+05,,start_2022-10-01_end_2023-06-01_monitoring_cla...,2022-10-01,2023-06-01,2022-12-01,...,,,,,,,,,,"MULTIPOLYGON (((-67.60459 -10.10615, -67.60459..."
4,CORN,2022/2023,Agrosatélite - Grãos Amazonia e Cerrado 2022/2023,AC,1.688786e+05,,start_2022-10-01_end_2023-06-01_monitoring_cla...,2022-10-01,2023-06-01,2022-12-01,...,,,,,,,,,,"MULTIPOLYGON (((-67.70848 -10.3767, -67.70844 ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54330,RICE,2018/2019,Conab - Arroz Irrigado 2018/2019,SC,1.918965e+06,469.0,start_2018-09-01_end_2019-06-01_monitoring_cla...,2018-09-01,2019-06-01,2018-11-01,...,datasets/culture/culture_v02/rice_test_v2.gpkg,/agrilearn_app/datasets/eopatchs/processed/SOY...,55,35,OK,,,5.44,OK,"POLYGON ((-49.5122 -28.69859, -49.51208 -28.69..."
54353,RICE,2019/2020,Conab - Arroz Irrigado 2019/2020,RS,7.142620e+05,505.0,start_2019-09-01_end_2020-06-01_monitoring_cla...,2019-09-01,2020-06-01,2019-11-01,...,datasets/culture/culture_v02/rice_test_v2.gpkg,/agrilearn_app/datasets/eopatchs/processed/SOY...,68,34,OK,,,5.15,OK,"POLYGON ((-52.55275 -32.29614, -52.55275 -32.2..."
54369,RICE,2018/2019,Conab - Arroz Irrigado 2018/2019,SC,1.918965e+06,469.0,start_2018-09-01_end_2019-06-01_monitoring_cla...,2018-09-01,2019-06-01,2018-11-01,...,datasets/culture/culture_v02/rice_test_v2.gpkg,/agrilearn_app/datasets/eopatchs/processed/SOY...,43,30,OK,,,5.69,OK,"POLYGON ((-49.51818 -28.68918, -49.51812 -28.6..."
54370,RICE,2018/2019,Conab - Arroz Irrigado 2018/2019,SC,1.918965e+06,469.0,start_2018-09-01_end_2019-06-01_monitoring_cla...,2018-09-01,2019-06-01,2018-11-01,...,datasets/culture/culture_v02/rice_test_v2.gpkg,/agrilearn_app/datasets/eopatchs/processed/SOY...,43,36,OK,,,5.57,OK,"POLYGON ((-49.52119 -28.70226, -49.52169 -28.7..."


In [27]:
df[df['contour_score'].isna()].shape[0] / gdf.shape[0]

NameError: name 'gdf' is not defined

In [21]:
gdf[gdf['contour_score'].isna()]['monitoring_class'].value_counts()

monitoring_class
CORN       67810
SOYBEAN      825
RICE          79
COTTON        44
Name: count, dtype: int64

# 4. Resample data

In [33]:
pd.concat([gdf['monitoring_class'].value_counts(),
          gdf['monitoring_class'].value_counts(normalize=True)*100], axis=1)

Unnamed: 0_level_0,count,proportion
monitoring_class,Unnamed: 1_level_1,Unnamed: 2_level_1
CORN,73080,69.234705
SOYBEAN,29670,28.108835
COTTON,1632,1.546128
RICE,1172,1.110332


In [24]:
# gdf[gdf['monitoring_class'].isin(['CORN'])].groupby(['state', 'period']).agg(count=('monitoring_class', 'count')).sort_values('count')

In [44]:
df_result = eval_util.undersample(gdf, 
                                  label_class='monitoring_class',
                                  classe_value='CORN',
                                  percentual_drop=0.80, 
                                  stratify=['state', 'period'])

In [45]:
pd.concat([df_result['monitoring_class'].value_counts(),
          df_result['monitoring_class'].value_counts(normalize=True)*100], axis=1)

Unnamed: 0_level_0,count,proportion
monitoring_class,Unnamed: 1_level_1,Unnamed: 2_level_1
SOYBEAN,29670,62.973575
CORN,14641,31.075029
COTTON,1632,3.463865
RICE,1172,2.487531


In [101]:
# df_result[df_result['monitoring_class'].isin(['CORN'])].groupby(['state', 'period']).agg(count=('monitoring_class', 'count')).sort_values('count')

# 5. Split Train, Validation and Test

In [102]:
gdf=df_result

In [103]:
gdf.shape

(47115, 54)

In [104]:
gdf_filter = gdf.groupby(['state', 'monitoring_class', 'period']).filter(lambda x: len(x) > 10)
gdf_filter.shape[0]

47061

In [105]:
train_, test_and_val = eval_util.split_dataset_by_state_and_strafity(gdf_filter, 
                                                                     label_state='state', 
                                                                     stratify_labels=['monitoring_class', 'period'],
                                                                     train_size=0.7)

val_, test_ = eval_util.split_dataset_by_state_and_strafity(test_and_val, 
                                                            label_state='state', 
                                                            stratify_labels=['monitoring_class', 'period'],
                                                            train_size=0.5)
gdf['dataset_part'] = None
gdf.loc[train_.index, 'dataset_part'] = 'train'
gdf.loc[val_.index, 'dataset_part'] = 'val'
gdf.loc[test_.index, 'dataset_part'] = 'test'

In [106]:
shape_before = gdf.shape[0]
gdf.dropna(subset=['dataset_part'], inplace=True)
shape_after = gdf.shape[0]
print(f"{shape_before-shape_after} registros foram deletados pois o grupo possuem poucos registros para train, val and test")

54 registros foram deletados pois o grupo possuem poucos registros para train, val and test


# 6. Data Analysis

### 6.1 Analysing Target y (monitoring_class)

In [107]:
pd.concat([gdf['monitoring_class'].value_counts(),
          gdf['monitoring_class'].value_counts(normalize=True)*100], axis=1)

Unnamed: 0_level_0,count,proportion
monitoring_class,Unnamed: 1_level_1,Unnamed: 2_level_1
SOYBEAN,29670,63.045834
CORN,14600,31.023565
COTTON,1619,3.440216
RICE,1172,2.490385


### 5.2 Analysing fonte de dados

In [108]:
pd.concat([gdf['fonte'].value_counts(), 
           gdf['fonte'].value_counts(normalize=True)*100], axis=1)

Unnamed: 0_level_0,count,proportion
fonte,Unnamed: 1_level_1,Unnamed: 2_level_1
Agrosatélite - Grãos Brasil 2021/2022,17259,36.673679
Agrosatélite - Grãos Brasil 2020/2021,16252,34.533903
Agrosatélite - Grãos 2022/2023,6611,14.047725
Agrosatélite - Grãos Sul 2022/2023,2960,6.289709
Agrosatélite - Grãos Amazonia e Cerrado 2022/2023,2807,5.964599
Conab - Arroz Irrigado 2019/2020,595,1.264317
Conab - Arroz Irrigado 2018/2019,340,0.722467
Conab - Arroz Irrigado 2017/2018,206,0.43773
Conab - Arroz Irrigado 2021/2022,31,0.065872


### 5.3 Analysing dados por estado e classes

In [109]:
pd.concat([gdf['state'].value_counts(),
          gdf['state'].value_counts(normalize=True)*100], axis=1)

Unnamed: 0_level_0,count,proportion
state,Unnamed: 1_level_1,Unnamed: 2_level_1
PR,7035,14.948684
MT,6641,14.111472
MG,6110,12.98315
SP,5871,12.475298
RS,4154,8.826842
GO,4051,8.607977
MS,2801,5.95185
MA,1887,4.00969
TO,1867,3.967192
RO,1357,2.883492


In [110]:
gdf.groupby(['state', 'monitoring_class']).agg({'period':'count'})

Unnamed: 0_level_0,Unnamed: 1_level_0,period
state,monitoring_class,Unnamed: 2_level_1
AP,SOYBEAN,39
BA,CORN,258
BA,COTTON,383
BA,SOYBEAN,561
DF,CORN,90
DF,SOYBEAN,95
GO,CORN,1213
GO,COTTON,108
GO,RICE,86
GO,SOYBEAN,2644


### 5.4 Analysis field_id (existem IDs duplicados para os talhões)

In [111]:
gdf['field_id'].nunique()

33049

In [112]:
gdf[gdf.duplicated(['field_id','period'])]

Unnamed: 0,monitoring_class,period,fonte,state,area,meso,eopath_location,start_season,end_season,peak_start,...,key_bucket,local_eopatch_path,total_imagens_do_intervalo,total_de_imagem,status_missing_images,days_gap,dates_diff,mean_dates_diff,status_series_missing,geometry
25,CORN,2020/2021,Agrosatélite - Grãos Brasil 2020/2021,BA,953564.0,220.0,start_2020-10-01_end_2021-07-01_monitoring_cla...,2020-10-01,2021-07-01,2021-01-01,...,datasets/culture/culture_v02/corn_train_v2.gpkg,/agrilearn_app/datasets/eopatchs/processed/SOY...,55,24,OK,,,5.00,OK,"POLYGON ((-46.23854 -11.65442, -46.23862 -11.6..."
48,CORN,2020/2021,Agrosatélite - Grãos Brasil 2020/2021,BA,12082522.0,222.0,start_2020-10-01_end_2021-07-01_monitoring_cla...,2020-10-01,2021-07-01,2021-01-01,...,datasets/culture/culture_v02/corn_train_v2.gpkg,/agrilearn_app/datasets/eopatchs/processed/SOY...,55,28,OK,,,5.00,OK,"POLYGON ((-46.15444 -13.83715, -46.15445 -13.8..."
53,CORN,2020/2021,Agrosatélite - Grãos Brasil 2020/2021,BA,14934182.0,63.0,start_2020-10-01_end_2021-07-01_monitoring_cla...,2020-10-01,2021-07-01,2021-01-01,...,,,,,,,,,,"MULTIPOLYGON (((-46.57307 -11.36588, -46.57307..."
58,CORN,2020/2021,Agrosatélite - Grãos Brasil 2020/2021,BA,4422121.0,222.0,start_2020-10-01_end_2021-07-01_monitoring_cla...,2020-10-01,2021-07-01,2021-01-01,...,datasets/culture/culture_v02/corn_train_v2.gpkg,/agrilearn_app/datasets/eopatchs/processed/SOY...,55,33,OK,,,5.00,OK,"POLYGON ((-45.7407 -14.02816, -45.72816 -14.01..."
74,CORN,2020/2021,Agrosatélite - Grãos Brasil 2020/2021,BA,1096067.0,220.0,start_2020-10-01_end_2021-07-01_monitoring_cla...,2020-10-01,2021-07-01,2021-01-01,...,,,,,,,,,,"MULTIPOLYGON (((-45.92617 -11.53204, -45.92617..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47070,RICE,2019/2020,Conab - Arroz Irrigado 2019/2020,RS,108551.0,488.0,start_2019-09-01_end_2020-06-01_monitoring_cla...,2019-09-01,2020-06-01,2019-11-01,...,datasets/culture/culture_v02/rice_test_v2.gpkg,/agrilearn_app/datasets/eopatchs/processed/SOY...,55,28,OK,,,5.19,OK,"POLYGON ((-54.15517 -29.92234, -54.15512 -29.9..."
47074,RICE,2019/2020,Conab - Arroz Irrigado 2019/2020,RS,384161.0,493.0,start_2019-09-01_end_2020-06-01_monitoring_cla...,2019-09-01,2020-06-01,2019-11-01,...,datasets/culture/culture_v02/rice_test_v2.gpkg,/agrilearn_app/datasets/eopatchs/processed/SOY...,55,28,OK,,,5.19,OK,"POLYGON ((-51.41222 -29.70816, -51.41241 -29.7..."
47088,RICE,2019/2020,Conab - Arroz Irrigado 2019/2020,RS,714262.0,505.0,start_2019-09-01_end_2020-06-01_monitoring_cla...,2019-09-01,2020-06-01,2019-11-01,...,datasets/culture/culture_v02/rice_test_v2.gpkg,/agrilearn_app/datasets/eopatchs/processed/SOY...,68,34,OK,,,5.15,OK,"POLYGON ((-52.55275 -32.29614, -52.55275 -32.2..."
47092,RICE,2021/2022,Conab - Arroz Irrigado 2021/2022,GO,125860.0,551.0,start_2021-10-01_end_2022-06-01_monitoring_cla...,2021-10-01,2022-06-01,2021-12-01,...,datasets/culture/culture_v02/rice_test_v2.gpkg,/agrilearn_app/datasets/eopatchs/processed/SOY...,68,28,OK,,,5.00,OK,"POLYGON ((-47.03756 -14.50845, -47.03755 -14.5..."


In [113]:
gdf[gdf.duplicated(['field_id','period'])]['field_id'].unique()

array(['2143_1', '2229_2', '538_16', '2259_0', '2164_3', '2145_2',
       '2192_5', '2209_5', '2136_0', '11054_4', '537_14', None, '11506_5',
       '11502_20', '11502_38', '1305_0', '1279_3', '11203_0', '11219_3',
       '1255_2', '11404_1', '11013_4', '11353_0', '10760_2', '11399_2',
       '11205_0', '1276_2', '11327_9', '11463_1', '1274_2', '11452_2',
       '11313_9', '10783_1', '11480_0', '557_2', '568_2', '1887_3',
       '1650_0', '1800_0', '1584_3', '1765_5', '593_3', '1667_1', '563_4',
       '1733_1', '2292_2', '2779_1', '2549_3', '598_4', '2477_0',
       '3622_2', '2312_17', '653_2', '2644_2', '3938_2', '2414_1',
       '4038_5', '645_1', '2809_5', '3728_3', '3964_0', '2449_3',
       '4166_1', '3617_2', '3285_1', '2846_0', '2287_5', '3835_4',
       '2544_10', '2404_10', '2344_4', '3043_4', '3982_1', '2937_5',
       '3210_0', '2355_3', '3729_3', '3602_0', '674_5', '2471_2',
       '3038_0', '2448_8', '3937_2', '659_2', '3763_1', '10311_4',
       '10345_1', '10205_2', '1

In [114]:
#df[df['field_id'].isin(['0_6'])].to_csv('/agrilearn_app/datasets/cana-v1/csvs/duplicated_lines_CANA-sample-02.csv')

In [115]:
gdf[['cultura_2', 'monitoring_class']]

Unnamed: 0,cultura_2,monitoring_class
19,,CORN
20,,CORN
21,,CORN
22,,CORN
23,,CORN
...,...,...
47110,,RICE
47111,,RICE
47112,,RICE
47113,,RICE


### 5.5 Analysis of period

In [116]:
gdf.groupby(['dataset_part', 'period']).agg({'monitoring_class':'count'})

Unnamed: 0_level_0,Unnamed: 1_level_0,monitoring_class
dataset_part,period,Unnamed: 2_level_1
test,2017/2018,31
test,2018/2019,51
test,2019/2020,89
test,2020/2021,2442
test,2021/2022,2597
test,2022/2023,867
test,2023/2023,992
train,2017/2018,144
train,2018/2019,237
train,2019/2020,416


In [117]:
gdf['monitoring_class'].value_counts()

monitoring_class
SOYBEAN    29670
CORN       14600
COTTON      1619
RICE        1172
Name: count, dtype: int64

In [118]:
gdf['monitoring_class'].value_counts().idxmax()

'SOYBEAN'

## 6. Save processed file

In [147]:
value_counts = gdf['monitoring_class'].value_counts()
filename = '_'.join([f"{cls}_{count}" for cls, count in value_counts.items()]) + ".gpkg"
filename

'SUGAR_CANE_35276_SOYBEAN_29670_CORN_5710_COTTON_1639_RICE_1173.gpkg'

In [148]:
gdf.to_file(f"/agrilearn_app/datasets/base/geopackage/processed/{filename}", driver='GPKG', engine='fiona')

## 7. Generating Html Report

In [None]:
# from ydata_profiling import ProfileReport
# from ydata_profiling.config import Settings
# %matplotlib inline

In [None]:
# df_copy = gdf.drop(columns=['cultura_2', 'geometry'])  # Supondo que 'cultura_2' seja a coluna problemática

In [None]:
# df_e_object = df_copy.select_dtypes(exclude=['object'])
# df_e_object

In [None]:
# df_i_object = df_copy.select_dtypes(include=['object'])
# df_i_object

In [None]:
# df_e_object_colums = df_copy.select_dtypes(exclude=['object']).columns

In [None]:
# title = "data_report_06_12_2024_cana_data"
# config = Settings()
# config.vars.cat.words = False  # Desativa a nuvem de palavras
# report  = ProfileReport(df=df_copy[list(df_e_object_colums) + ['monitoring_class', 'state', 'period', 'fonte']], 
#                         title=title,
#                         minimal=False,
#                         config=config)

# report.to_file(f'{title}.html')

In [None]:
# df_number = gdf.select_dtypes(exclude=['float','int', 'datetime'])
# df_number