In [1]:
import boto3
import geopandas as gpd
import pandas as pd
import folium
import numpy as np
from folium import GeoJson
from io import BytesIO

In [2]:
from agrilearn.crop_classification import s3_utils, str_utils
import sys
sys.path.append("/agrilearn_app/agrilearn/")
from agrilearn.crop_classification import evaluate_utils as eval_util
from agrilearn.crop_classification import yaml_utils, processing

2025-01-16 16:35:19.641895: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-01-16 16:35:19.660233: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1737045319.680203    9880 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1737045319.686300    9880 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-16 16:35:19.706681: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

### Global Variables

In [3]:
# Experiment definition that drives this notebook; later cells read its `evaluate` section.
EXPERIMENT_CONFIG_PATH = "/agrilearn_app/output/experiment/experiment_10.yaml"
config = yaml_utils.load_config(EXPERIMENT_CONFIG_PATH)

In [4]:
# Display the loaded configuration so the run's settings are visible in the notebook.
config

{'data': {'geopackage_train_data_path': '/agrilearn_app/datasets/base/geopackage/processed/SOYBEAN_29670_CORN_5710_COTTON_1639_RICE_1173.gpkg',
  'label_dataset_part': 'dataset_part',
  'eopatch_folder': '/agrilearn_app/datasets/base/eopatch/input_model',
  'experiment_path': '/agrilearn_app/output/experiment_02/'},
 'processing': {'interp_day_range': 12,
  'ts_sample_n': 3,
  'labels_to_use': ['CORN', 'SOYBEAN', 'COTTON', 'RICE'],
  'translate_labels': True,
  'score_mean': None},
 'model': {'name': 'crop_classification',
  'version': 'v1.4.0',
  'parameters': {'conv_filters': 32,
   'kernel_size': 3,
   'lstm_layers': 2,
   'lstm_units': 64}},
 'train': {'learning_rate': 0.001,
  'batch_size': 32,
  'epochs': 300,
  'monitor': 'val_loss',
  'patience_early_stop': 15,
  'decay_denom': 100,
  'train_batch_size': 8,
  'val_batch_size': 1,
  'save_model_files': True,
  'verbose': 1,
  'shuffle': True,
  'target': 'monitoring_class'},
 'evaluate': {'dataset_name': 'SOYBEAN_7270_CORN_5536_

In [None]:
# Path of the test GeoPackage, read from the `evaluate` section of the experiment config.
GEOPACKAGE_PATH = config['evaluate']['geopackage_test_data_path']
# Column names referenced by the cells below (see the data description for their meaning).
label_monitoring_class = 'monitoring_class'  # crop of interest for each field
label_pred = 'crop_class_rnn'  # NOTE(review): presumably the model prediction column — confirm
label_eopatch_path = 'eopath_location'  # path where the field's eopatch is stored
label_los = 'length_of_season'  # duration of the crop season

# 1. Read Datasets

### Data description
- monitoring_class: é a cultura de interesse do cliente ou da amostra de treinamento (SOYBEAN, CORN)
- period: é a safra agrícola (2023/2024, 2024/2025)
- state: é o estado do polígono
- field_id: id do talhão
- fonte: a origem dos dados (mapas temáticos, banco de dados)
- area:
- micro:
- start_season: é a data que inicia a safra pra determinada cultura (monitoring_class) e safra (period)
- end_season: é a data que termina a safra pra determinada cultura (monitoring_class) e safra (period)
- peak_start: é a data que inicia o período em que pode ocorrer o pico pra determinada cultura (monitoring_class) e safra (period)
- peak_end: é a data que termina o período em que pode ocorrer o pico pra determinada cultura (monitoring_class) e safra (period) (Obs: temos um calendário pra isso, um arquivo yaml)
- start_of_cycle: data do início do cultivo segundo o calendário agrícola
- end_of_cycle: data do fim do calendário de cultivo segundo o calendário agrícola
- length_of_cycle:
- start_of_season: data da emergência da cultura
- end_of_season: data da colheita da cultura
- peak_of_season: data do pico vegetativo da cultura
- length_of_season: Duração do cultivo
- eopath_location: é o caminho onde o eopatch está salvo (imagens p/ inferência)
- geometry: é a geometria do polígono

In [None]:
# Load the test GeoPackage configured above into a GeoDataFrame.
df = gpd.read_file(GEOPACKAGE_PATH)

In [None]:
# (rows, columns) of the dataset as loaded.
df.shape

In [None]:
# Preview the first rows to check the columns against the data description.
df.head()

In [None]:
# Frequency of each crop class: absolute counts next to the percentage of rows.
class_counts = df[label_monitoring_class].value_counts()
class_pct = df[label_monitoring_class].value_counts(normalize=True) * 100
pd.concat([class_counts, class_pct], axis=1)

In [None]:
# Number of rows per state.
df['state'].value_counts()

In [None]:
# Rows per dataset partition: absolute counts next to the share of rows.
# Note: unlike the other frequency tables in this notebook, the share is a fraction in [0, 1], not a percentage.
part_counts = df['dataset_part'].value_counts()
part_share = df['dataset_part'].value_counts(normalize=True)
pd.concat([part_counts, part_share], axis=1)

In [None]:
# Human-readable summary of the dataset size.
print(f"There are {df.shape[0]} rows and {df.shape[1]} columns")

# 2. Data Processing

### 2.1 Check NaN Values

In [None]:
# Shape before any column is removed, for comparison with the shape shown after the drop.
df.shape

In [None]:
# Missing-value count per column, highest first.
nan_counts = df.isna().sum().sort_values(ascending=False)
# Candidates for removal: at most the 51 columns with the most NaNs (the 51 cap is hard-coded).
# Columns with no NaN at all are excluded, so complete columns (e.g. `geometry` or `period`,
# which later cells require) are never picked when fewer than 51 columns have missing data.
colums_to_drop = nan_counts[nan_counts > 0].iloc[0:51]
colums_to_drop

### 2.2 Drop Columns with NaN Values

In [None]:
# Only candidates still present in the frame are removed, so re-running the cell is harmless.
columns_present = [c for c in colums_to_drop.keys() if c in df.columns]
for c in columns_present:
    print(f"drop colum: {c}")
# Remove them all with one in-place operation.
df.drop(columns=columns_present, inplace=True)

In [None]:
# Shape after removing the selected columns.
df.shape

### 2.3 Check Duplicates

In [None]:
# A row is a duplicate when an earlier row has the same (period, geometry) pair;
# the first occurrence is kept and the later ones are removed in place.
shape_before = len(df)
duplicated_mask = df.duplicated(subset=['period', 'geometry'])
idx_drop_duplicated = df.index[duplicated_mask]
df.drop(idx_drop_duplicated, inplace=True)
shape_after = len(df)
print(f"{shape_before-shape_after} registros duplicados foram encontrados")

### 2.4 Data Formatting

In [None]:
# Columns that remain after the cleaning steps above.
list(df.columns)

In [None]:
# Calendar columns to convert to pandas datetimes.
datetime_columns = ['start_season', 'end_season', 'peak_start', 'peak_end']

# Convert every listed column in one step.
df[datetime_columns] = df[datetime_columns].apply(pd.to_datetime)

# Confirm the resulting dtypes and non-null counts.
df[datetime_columns].info()

### 2.5 Drop data outside the crop cycle

In [None]:
# Per-crop pair of bounds handed to the helper as `culture_cycles`.
# NOTE(review): the pairs look like (min, max) season length in days — confirm against
# processing.create_check_culture_cycle.
culture_cycles = {
    'COTTON': (140, 220),
    'CORN': (105, 160),
    'SOYBEAN': (90, 160),
    'WHEAT': (100, 160),
    'RICE': (100, 150),
    'BEAN': (60, 100),
    'SUGAR_CANE': (300, 570),
}

# NOTE(review): the return value is not assigned and the next cell reads df['check_los'],
# so the helper presumably adds that column to `df` in place — confirm.
processing.create_check_culture_cycle(
    df,
    label_monitoring_class=label_monitoring_class,
    label_los=label_los,
    culture_cycles=culture_cycles,
)

In [None]:
# Distribution of the `check_los` values: absolute counts next to the percentage of rows.
check_los_counts = df['check_los'].value_counts()
check_los_pct = df['check_los'].value_counts(normalize=True) * 100
# Use the integer axis (1 = columns), as the other frequency tables in this notebook do;
# a boolean here would only work because True == 1.
pd.concat([check_los_counts, check_los_pct], axis=1)

# 3. Data Analysis

### 3.1 Analysing Target y (monitoring_class)

In [None]:
# Class balance of the target after cleaning: counts next to the percentage of rows.
target_counts = df[label_monitoring_class].value_counts()
target_pct = df[label_monitoring_class].value_counts(normalize=True) * 100
pd.concat([target_counts, target_pct], axis=1)

### 3.2 Analysing data by state and class

In [None]:
# Rows per state after cleaning: counts next to the percentage of rows.
state_counts = df['state'].value_counts()
state_pct = df['state'].value_counts(normalize=True) * 100
pd.concat([state_counts, state_pct], axis=1)

In [None]:
# Rows per (state, crop class) pair, counted through the non-null `period` values.
# The class column goes through `label_monitoring_class`, like the other cells, so renaming it needs one change only.
df.groupby(['state', label_monitoring_class]).agg({'period': 'count'})

### 3.3 Analysis of length_of_season (LOS)

In [None]:
# Number of rows (non-null geometries) for each distinct season length, most frequent first.
# NOTE(review): the column is named `omission_count` although it is a plain row count —
# the name may be carried over from another analysis; confirm before renaming.
los_groups = df.groupby(label_los)
los_counts = los_groups.agg(omission_count=('geometry', 'count'))
los_counts.sort_values('omission_count', ascending=False)

In [None]:
# Build the output file name from the per-class row counts, e.g. "<CLASS>_<count>_<CLASS>_<count>.gpkg".
# NOTE(review): `gt_class` is not listed in the data description and is not created in the visible
# cells, which otherwise use `label_monitoring_class` — confirm that the column exists.
value_counts = df['gt_class'].value_counts()
filename_stem = '_'.join(f"{cls}_{count}" for cls, count in value_counts.items())
filename = filename_stem + ".gpkg"

# Destination of the processed dataset.
OUTPUT_PATH = f"/agrilearn_app/datasets/teste_pre_safra_2024_2025/geopackage/processed/{filename}"
OUTPUT_PATH

In [None]:
# Write the processed frame to OUTPUT_PATH as a GeoPackage, using the fiona engine.
# NOTE(review): the target directory is not created in the visible cells — confirm it exists before running.
df.to_file(OUTPUT_PATH, driver='GPKG', engine='fiona')