In [1]:
# Load the autoreload extension so that modules edited on disk are picked up
# automatically, without the need to restart the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys
# Extend the module search path with the project directories (relative to the
# notebook's working directory). Each entry is appended only when missing, so
# re-running this cell does not keep growing sys.path with duplicates.
for extra_path in (
    "../agrilearn/submodules/commons/",
    "../agrilearn/submodules/crop_rnn/",
    "../agrilearn/",
):
    if extra_path not in sys.path:
        sys.path.append(extra_path)

In [3]:
import boto3
import os
import geopandas as gpd
import pandas as pd
import folium
import numpy as np
from folium import GeoJson
from io import BytesIO
from tqdm import tqdm
from eolearn.core import EOPatch
from glob import glob

In [4]:
from agrilearn.crop_classification import yaml_utils, processing

ModuleNotFoundError: No module named 's2cloudless'

### Global Variables

In [None]:
# Location of the experiment configuration. The default is the path this
# notebook has always used; set the AGRILEARN_CONFIG_PATH environment variable
# to point at another file without editing the cell.
CONFIG_PATH = os.environ.get(
    "AGRILEARN_CONFIG_PATH",
    "/agrilearn_app/output/config/experiment_12.yaml",
)
config = yaml_utils.load_config(CONFIG_PATH)

In [None]:
# Position, inside the config's 'evaluate' list, of the entry analysed here.
index = 0
evaluate_cfg = config['evaluate'][index]

DATASET_NAME = evaluate_cfg['dataset_name']
GEOPACKAGE_PATH = evaluate_cfg['geopackage_test_data_path']
EOPATCH_PATH = evaluate_cfg['eopatch_folder']
LABEL_MONITORING_CLASS = evaluate_cfg['label_true']
LABEL_EOPATCH_LOCATION = evaluate_cfg['label_eopatch_location']

# Later cells refer to the true-label column by this lowercase name; bind it
# here so the notebook runs from a fresh kernel instead of leftover state.
label_monitoring_class = LABEL_MONITORING_CLASS
# NOTE(review): `label_pred` is also used by later cells but is not assigned in
# any visible cell — TODO define it here (name of the prediction column).

# Show the dataset name as the cell output.
DATASET_NAME

# 1. Read Datasets

In [None]:
# Read the file at GEOPACKAGE_PATH with geopandas.
df = gpd.read_file(GEOPACKAGE_PATH)

In [None]:
# Number of rows for each value of the 'dataset_part' column.
df['dataset_part'].value_counts()

In [None]:
# NOTE(review): `eval_util` is not imported in any visible cell, so this call
# fails on a fresh kernel — TODO add the import to the imports cell.
eval_util.get_dataset_distribution(df)

In [None]:
# Report the dataset dimensions (rows and columns).
print(f"There are {df.shape[0]} rows and {df.shape[1]} columns")

# 2. Create Omission df and processing

In [None]:
# Compute the omission/inclusion report from the true and predicted labels.
# The true-label column name is taken from LABEL_MONITORING_CLASS, the name
# bound in the "Global Variables" cell.
# NOTE(review): `label_pred` and `eval_util` are not defined/imported in any
# visible cell — TODO confirm where they are expected to come from.
report_inclusion_and_omission = eval_util.calculate_omission_inclusion(
    df,
    label_true=LABEL_MONITORING_CLASS,
    label_pred=label_pred,
)

In [None]:
# Display the report computed in the previous cell.
report_inclusion_and_omission

## 2.1 Check the data formatting

### 2.1.1 Datetime variables

In [None]:
# Columns that are converted to datetime in the next cell; inspect their
# current dtypes and non-null counts first.
datetime_columns = ['start_season', 'end_season', 'peak_start', 'peak_end']

df[datetime_columns].info()

In [None]:
# Convert every listed column to datetime, echoing each name as it is handled.
for date_col in datetime_columns:
    print(date_col)
    df[date_col] = pd.to_datetime(df[date_col])

### 2.1.2 Integer variables

In [None]:
# Inspect the dtype and non-null count of every column.
df.info()

# 3. Análise das OMISSÕES de Soja

In [None]:
# Re-display the omission/inclusion report at the start of this section.
report_inclusion_and_omission

In [None]:
cls = 'SOYBEAN'
omission_soybean = f'omission_{cls}'

# Rows whose true label is `cls`; only these can be omissions of `cls`.
is_true_cls = df[label_monitoring_class] == cls
predicted_other = df[label_pred] != cls

# Three-state flag: True = omitted (true label is `cls`, prediction is not),
# False = correctly predicted, missing = true label is not `cls`.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0, and the
# column is created with object dtype so that writing booleans into it does not
# trigger pandas' incompatible-dtype warning.
df[omission_soybean] = pd.Series(np.nan, index=df.index, dtype=object)
df.loc[is_true_cls & predicted_other, omission_soybean] = True
df.loc[is_true_cls & ~predicted_other, omission_soybean] = False

In [None]:
# Keep only the rows that carry a soybean omission flag (True or False).
# .copy() makes df_soja an independent frame, so adding columns to it later
# does not raise SettingWithCopyWarning.
df_soja = df.dropna(subset=[omission_soybean]).copy()
df_soja.shape

In [None]:
# Absolute and percentage breakdown of the soybean omission flag, side by side.
soy_flag = df_soja[omission_soybean]
pd.concat(
    [soy_flag.value_counts(), soy_flag.value_counts(normalize=True) * 100],
    axis=1,
)

In [None]:
# For the omitted soybean rows, count which class was predicted instead.
soy_omitted = df_soja[omission_soybean]
df_soja[soy_omitted][label_pred].value_counts()

### Questão 01: Por que todas as omissões de soja são em períodos de transição anual, por exemplo, 2000/2001?

R: Todos os dados de soja são safras

In [None]:
# Omission count, total count and omission percentage of soybean rows per
# 'period'.
soy_omitted_rows = df_soja[df_soja[omission_soybean] == True]
period_df = pd.concat([
    soy_omitted_rows.groupby('period').agg(omission_count=('geometry', 'count')),
    df_soja.groupby('period').agg(total=('geometry', 'count')),
], axis=1)

period_df['omission_perc'] = period_df['omission_count'] / period_df['total'] * 100

# Periods without any omission are absent from the first frame and come out of
# the concat as NaN; show them as 0, like the other summary tables do.
period_df.fillna(0).sort_values('total', ascending=False)

### Questão 02: Existem diferenças entre o plantio de SOJA da região NORTE do Brasil em relação às demais regiões?

R: A Região Norte está acima da linha do equador, logo o período de plantio é diferente. A duração do período das safras é diferente entre as regiões.

In [None]:
# Distribution of the 'monitoring_class' column within the soybean subset.
df_soja['monitoring_class'].value_counts()

In [None]:
# Omission count, total count and omission percentage of soybean rows per
# 'state'.
soy_omitted_rows = df_soja[df_soja[omission_soybean] == True]
state_df = pd.concat([
    soy_omitted_rows.groupby('state').agg(omission_count=('geometry', 'count')),
    df_soja.groupby('state').agg(total=('geometry', 'count')),
], axis=1)

state_df['omission_perc'] = state_df['omission_count'] / state_df['total'] * 100

# Fill and sort in one expression so the displayed table is actually sorted
# (a bare sort_values call that is not the last expression is discarded).
state_df.fillna(0).sort_values('total', ascending=False)

### Questão 03: Existem erros que são influenciados por ``emergence_sensor``?

In [None]:
# Count the omitted soybean rows per 'emergence_sensor' value.
omitted_mask = df_soja[omission_soybean] == True
df_soja[omitted_mask]['emergence_sensor'].value_counts()

In [None]:
# Omission count, total count and omission percentage of soybean rows per
# 'emergence_sensor'.
soy_omitted_rows = df_soja[df_soja[omission_soybean]]
emergence_sensor_df = pd.concat([
    soy_omitted_rows.groupby('emergence_sensor').agg(
        omission_count=('geometry', 'count')),
    df_soja.groupby('emergence_sensor').agg(total=('geometry', 'count')),
], axis=1)

emergence_sensor_df['omission_perc'] = (
    emergence_sensor_df['omission_count'] / emergence_sensor_df['total'] * 100
)

# Fill and sort in one expression so the displayed table is actually sorted
# (a bare sort_values call that is not the last expression is discarded).
emergence_sensor_df.fillna(0).sort_values('total', ascending=False)

### Questão 04: Existem erros que são influenciados pelo tamanho do ciclo completo em dias ``LOS``?
```python
culture_cycles = {
    'COTTON': (140, 220),
    'CORN': (105, 160),
    'SOYBEAN': (90, 160),
    'WHEAT': (100, 160),
    'RICE': (100, 150),
    'BEAN': (60, 100),
    'SUGAR_CANE': (300, 570)
}
```

In [None]:
# Omission count, total count and omission percentage of soybean rows per
# 'LOS' value.
soy_omitted_rows = df_soja[df_soja[omission_soybean] == True]
LOS_df = pd.concat([
    soy_omitted_rows.groupby('LOS').agg(omission_count=('geometry', 'count')),
    df_soja.groupby('LOS').agg(total=('geometry', 'count')),
], axis=1)

LOS_df['omission_perc'] = LOS_df['omission_count'] / LOS_df['total'] * 100

# Fill and sort in one expression so the displayed table is actually sorted
# (a bare sort_values call that is not the last expression is discarded).
LOS_df.fillna(0).sort_values('omission_count', ascending=False)

### Questão 05: O tamanho do talhão influencia os erros do modelo?

In [None]:
# Discretize 'area_ha' into three size classes using the bin edges
# 0 / 10 / 100 / inf, labelled 'pequena', 'média' and 'grande'.
# .assign builds a new frame instead of writing a column into df_soja, which
# was derived from `df` via dropna and would otherwise trigger
# SettingWithCopyWarning; the cell also stays safe to re-run.
df_soja = df_soja.assign(
    area_ha_cat=pd.cut(
        df_soja['area_ha'],
        bins=[0, 10, 100, np.inf],
        labels=['pequena', "média", "grande"],
    )
)

In [None]:
# Omission count, total count and omission percentage of soybean rows per
# area class.
# 'area_ha_cat' is categorical; observed=False is passed explicitly because
# pandas >= 2.1 warns when the argument is left to its default. It keeps every
# size class in the table, including classes with no rows.
soy_omitted_rows = df_soja[df_soja[omission_soybean] == True]
area_ha_df = pd.concat([
    soy_omitted_rows.groupby('area_ha_cat', observed=False).agg(
        omission_count=('geometry', 'count')),
    df_soja.groupby('area_ha_cat', observed=False).agg(
        total=('geometry', 'count')),
], axis=1)

area_ha_df['omission_perc'] = area_ha_df['omission_count'] / area_ha_df['total'] * 100
area_ha_df.sort_values('omission_count', ascending=False)

# 4. Análise das omissões de Milho

In [None]:
cls = 'CORN'
omission_corn = f'omission_{cls}'

# Rows whose true label is `cls`; only these can be omissions of `cls`.
is_true_cls = df[label_monitoring_class] == cls
predicted_other = df[label_pred] != cls

# Three-state flag: True = omitted (true label is `cls`, prediction is not),
# False = correctly predicted, missing = true label is not `cls`.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0, and the
# column is created with object dtype so that writing booleans into it does not
# trigger pandas' incompatible-dtype warning.
df[omission_corn] = pd.Series(np.nan, index=df.index, dtype=object)
df.loc[is_true_cls & predicted_other, omission_corn] = True
df.loc[is_true_cls & ~predicted_other, omission_corn] = False

In [None]:
# Keep only the rows that carry a corn omission flag (True or False).
# .copy() makes df_corn an independent frame, so adding columns to it later
# does not raise SettingWithCopyWarning.
df_corn = df.dropna(subset=[omission_corn]).copy()

In [None]:
# Absolute and percentage breakdown of the corn omission flag, side by side.
corn_flag = df_corn[omission_corn]
pd.concat(
    [corn_flag.value_counts(), corn_flag.value_counts(normalize=True) * 100],
    axis=1,
)

### Questão 01: Avaliar as omissões de milho

In [None]:
# Omission count, total count and omission percentage of corn rows per
# 'period'.
corn_omitted_rows = df_corn[df_corn[omission_corn] == True]
period_df = pd.concat([
    corn_omitted_rows.groupby('period').agg(omission_count=('geometry', 'count')),
    df_corn.groupby('period').agg(total=('geometry', 'count')),
], axis=1)

period_df['omission_perc'] = period_df['omission_count'] / period_df['total'] * 100

# Fill and sort in one expression so the displayed table is actually sorted
# (a bare sort_values call that is not the last expression is discarded).
period_df.fillna(0).sort_values('total', ascending=False)

### Questão 02: Existem diferenças entre o plantio de MILHO em diferentes regiões do Brasil?

In [None]:
# Omission count, total count and omission percentage of corn rows per
# 'state'.
corn_omitted_rows = df_corn[df_corn[omission_corn] == True]
state_df = pd.concat([
    corn_omitted_rows.groupby('state').agg(omission_count=('geometry', 'count')),
    df_corn.groupby('state').agg(total=('geometry', 'count')),
], axis=1)

state_df['omission_perc'] = state_df['omission_count'] / state_df['total'] * 100

# Fill and sort in one expression so the displayed table is actually sorted
# (a bare sort_values call that is not the last expression is discarded).
state_df.fillna(0).sort_values('omission_count', ascending=False)

### Questão 03: Existem erros que são influenciados por ``emergence_sensor``?

In [None]:
# Shape of the corn subset restricted to rows with a non-null
# 'emergence_sensor'.
has_sensor = df_corn['emergence_sensor'].notna()
df_corn[has_sensor].shape

In [None]:
# Omission count, total count and omission percentage of corn rows per
# 'emergence_sensor'.
corn_omitted_rows = df_corn[df_corn[omission_corn] == True]
emergence_sensor_df = pd.concat([
    corn_omitted_rows.groupby('emergence_sensor').agg(
        omission_count=('geometry', 'count')),
    df_corn.groupby('emergence_sensor').agg(total=('geometry', 'count')),
], axis=1)

emergence_sensor_df['omission_perc'] = (
    emergence_sensor_df['omission_count'] / emergence_sensor_df['total'] * 100
)

# Fill and sort in one expression so the displayed table is actually sorted
# (a bare sort_values call that is not the last expression is discarded).
emergence_sensor_df.fillna(0).sort_values('total', ascending=False)

### Questão 04: Existem erros de MILHO que são influenciados pelo tamanho do ciclo completo em dias ``LOS``?
```python
culture_cycles = {
    'COTTON': (140, 220),
    'CORN': (105, 160),
    'SOYBEAN': (90, 160),
    'WHEAT': (100, 160),
    'RICE': (100, 150),
    'BEAN': (60, 100),
    'SUGAR_CANE': (300, 570)
}
```

In [None]:
# Omission count, total count and omission percentage of corn rows per
# 'LOS' value.
corn_omitted_rows = df_corn[df_corn[omission_corn] == True]
LOS_df = pd.concat([
    corn_omitted_rows.groupby('LOS').agg(omission_count=('geometry', 'count')),
    df_corn.groupby('LOS').agg(total=('geometry', 'count')),
], axis=1)

LOS_df['omission_perc'] = LOS_df['omission_count'] / LOS_df['total'] * 100

# Fill and sort in one expression so the displayed table is actually sorted
# (a bare sort_values call that is not the last expression is discarded).
LOS_df.fillna(0).sort_values('omission_count', ascending=False)

### Questão 05: O tamanho do talhão influencia os erros do modelo?

In [None]:
# Discretize 'area_ha' into three size classes using the bin edges
# 0 / 10 / 100 / inf, labelled 'pequena', 'média' and 'grande'.
# .assign builds a new frame instead of writing a column into df_corn, which
# was derived from `df` via dropna and would otherwise trigger
# SettingWithCopyWarning; the cell also stays safe to re-run.
df_corn = df_corn.assign(
    area_ha_cat=pd.cut(
        df_corn['area_ha'],
        bins=[0, 10, 100, np.inf],
        labels=['pequena', "média", "grande"],
    )
)

In [None]:
# Omission count, total count and omission percentage of corn rows per
# area class.
# 'area_ha_cat' is categorical; observed=False is passed explicitly because
# pandas >= 2.1 warns when the argument is left to its default. It keeps every
# size class in the table, including classes with no rows.
corn_omitted_rows = df_corn[df_corn[omission_corn]]
area_ha_df = pd.concat([
    corn_omitted_rows.groupby('area_ha_cat', observed=False).agg(
        omission_count=('geometry', 'count')),
    df_corn.groupby('area_ha_cat', observed=False).agg(
        total=('geometry', 'count')),
], axis=1)

area_ha_df['omission_perc'] = area_ha_df['omission_count'] / area_ha_df['total'] * 100
area_ha_df.sort_values('omission_count', ascending=False)