In [26]:
import boto3
import geopandas as gpd
import pandas as pd
import folium
import numpy as np
from folium import GeoJson
from io import BytesIO

In [27]:
import sys

# Make the shared "commons" submodule importable for the project imports below.
# Guarded so that re-running this cell does not append the same entry again.
COMMONS_PATH = "/agrilearn_app/agrilearn/submodules/commons/"
if COMMONS_PATH not in sys.path:
    sys.path.append(COMMONS_PATH)

In [28]:
from agrilearn.crop_classification import evalutate_utils
from agrilearn.crop_classification import yaml_utils
from agrilearn.commons.s3 import s3_utils

## 1. Read EDF datasets

In [29]:
# GeoPackage files to load. Despite the singular name this is a list: every entry
# is read and the results are concatenated into a single GeoDataFrame.
# NOTE(review): the file name contains a space ("ref_edf_soja 1.gpkg") - confirm it is intentional.
GEOPACKAGE_PATH = ["/agrilearn_app/datasets/meso-soja/ref_edf_soja 1.gpkg"]

In [30]:
gdfs = []

# Read each GeoPackage and tag every row with the file it came from
for path in GEOPACKAGE_PATH:
    gdf = gpd.read_file(path)
    gdf['dataset_source'] = path  # provenance column
    gdfs.append(gdf)

# Concatenate all loaded GeoDataFrames into a single GeoDataFrame
if gdfs:
    df = gpd.GeoDataFrame(pd.concat(gdfs, ignore_index=True))
    print(f"Dataset final possui {df.shape[0]} linhas")
else:
    # `df` is not assigned on this branch, so cells that use it will raise NameError
    print("Nenhum GeoDataFrame válido encontrado.")

Dataset final possui 5570 linhas


In [31]:
# Preview the first rows of the combined GeoDataFrame
df.head()

Unnamed: 0,name,uf,macro_edf,edf,geometry,dataset_source
0,Alta Floresta D'Oeste,RO,4,402,"MULTIPOLYGON (((-62.19465 -11.82746, -62.19332...",/agrilearn_app/datasets/meso-soja/ref_edf_soja...
1,Ariquemes,RO,4,402,"MULTIPOLYGON (((-62.53648 -9.73222, -62.52765 ...",/agrilearn_app/datasets/meso-soja/ref_edf_soja...
2,Cabixi,RO,4,402,"MULTIPOLYGON (((-60.37119 -13.36655, -60.37134...",/agrilearn_app/datasets/meso-soja/ref_edf_soja...
3,Cacoal,RO,4,402,"MULTIPOLYGON (((-61.0008 -11.29737, -61.00103 ...",/agrilearn_app/datasets/meso-soja/ref_edf_soja...
4,Cerejeiras,RO,4,402,"MULTIPOLYGON (((-61.49976 -13.00525, -61.49809...",/agrilearn_app/datasets/meso-soja/ref_edf_soja...


In [32]:
# Column dtypes, non-null counts and memory usage
df.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 5570 entries, 0 to 5569
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   name            5570 non-null   object  
 1   uf              5570 non-null   object  
 2   macro_edf       5570 non-null   object  
 3   edf             5570 non-null   object  
 4   geometry        5570 non-null   geometry
 5   dataset_source  5570 non-null   object  
dtypes: geometry(1), object(5)
memory usage: 261.2+ KB


## 2. Improving the Data Quality

### 2.1 Check and Drop NaN Columns

In [33]:
# Drop columns that contain no data at all and report how many were removed
shape_before = df.shape[1]
df.dropna(axis='columns', how='all', inplace=True)
removed_columns = shape_before - df.shape[1]
removed_pct = removed_columns / shape_before * 100
print(f"Removed Columns: {removed_columns}, Percentage: {removed_pct:.2f}%")

Removed Columns: 0, Percentage: 0.00%


### 2.2 Check NaN Values

In [34]:
# Count the null records per column and their percentage of all rows.
# NOTE(review): isna() only detects real missing values; placeholder strings
# (e.g. 'NULL') are not counted here - confirm this is acceptable.
null_counts = df.isna().sum()
df_check_NaN = pd.concat([null_counts, null_counts / df.shape[0] * 100], axis=1)
df_check_NaN.columns = ['Null Count', 'NaN percentage']
# sort_values returns a new frame, so the result must be assigned back;
# a stable sort keeps the original column order among ties.
df_check_NaN = df_check_NaN.sort_values('NaN percentage', ascending=False, kind='stable')
df_check_NaN

Unnamed: 0,Null Count,NaN percentage
name,0,0.0
uf,0,0.0
macro_edf,0,0.0
edf,0,0.0
geometry,0,0.0
dataset_source,0,0.0


### 2.3 Check duplicated Values

In [35]:
# Rows that share the same (name, uf, edf, macro_edf) combination with another row
subset = ['name', 'uf', 'edf', 'macro_edf']
is_duplicated = df.duplicated(subset=subset, keep=False)
df_check_duplicated = df.loc[is_duplicated].copy()

# Give every group of identical rows a 1-based id and list the groups together
duplicate_group_id = df_check_duplicated.groupby(subset).ngroup() + 1
df_check_duplicated['id_duplicado'] = duplicate_group_id
df_check_duplicated = df_check_duplicated.sort_values('id_duplicado')
df_check_duplicated

Unnamed: 0,name,uf,macro_edf,edf,geometry,dataset_source,id_duplicado


## 3. Univariate Data Analysis

### 3.1 ``edf``

In [36]:
# Null statistics recorded for `edf` in the NaN summary table.
# NOTE(review): isna() does not count placeholder strings; a later cell replaces
# 'NULL' in this column, so a zero here may understate missing data - confirm.
df_check_NaN.loc['edf']

Null Count        0.0
NaN percentage    0.0
Name: edf, dtype: float64

In [37]:
# Number of distinct `edf` values (a non-NaN placeholder string counts as its own value)
df['edf'].nunique()

21

In [38]:
# Absolute and relative (%) frequency of each `edf` value, side by side
edf_counts = df['edf'].value_counts()
edf_share_pct = df['edf'].value_counts(normalize=True) * 100
pd.concat([edf_counts, edf_share_pct], axis=1)

Unnamed: 0_level_0,count,proportion
edf,Unnamed: 1_level_1,Unnamed: 2_level_1
,1378,24.739677
501.0,622,11.166966
303.0,536,9.62298
203.0,408,7.324955
102.0,402,7.217235
302.0,353,6.337522
502.0,245,4.398564
304.0,236,4.236984
103.0,214,3.842011
101.0,203,3.644524


In [39]:
# Cast `edf` to integers; rows holding the 'NULL' placeholder string get the sentinel 9999
edf_codes = df['edf'].replace('NULL', 9999)
df['edf'] = edf_codes.astype(int)

### 3.2 ``macro_edf``

In [40]:
# Null statistics recorded for `macro_edf` in the NaN summary table.
# NOTE(review): isna() does not count placeholder strings; a later cell replaces
# 'NULL' in this column, so a zero here may understate missing data - confirm.
df_check_NaN.loc['macro_edf']

Null Count        0.0
NaN percentage    0.0
Name: macro_edf, dtype: float64

In [41]:
# Number of distinct `macro_edf` values (a non-NaN placeholder string counts as its own value)
df['macro_edf'].nunique()

6

In [42]:
# Absolute and relative (%) frequency of each `macro_edf` value, side by side
macro_edf_counts = df['macro_edf'].value_counts()
macro_edf_share_pct = df['macro_edf'].value_counts(normalize=True) * 100
pd.concat([macro_edf_counts, macro_edf_share_pct], axis=1)

Unnamed: 0_level_0,count,proportion
macro_edf,Unnamed: 1_level_1,Unnamed: 2_level_1
,1378,24.739677
3.0,1166,20.933573
1.0,970,17.414722
5.0,898,16.122083
2.0,751,13.482944
4.0,407,7.307002


In [43]:
# Cast `macro_edf` to integers; rows holding the 'NULL' placeholder string get the sentinel 9999
macro_edf_codes = df['macro_edf'].replace('NULL', 9999)
df['macro_edf'] = macro_edf_codes.astype(int)

### 3.3 ``name``

In [44]:
# Null statistics recorded for `name` in the NaN summary table
df_check_NaN.loc['name']

Null Count        0.0
NaN percentage    0.0
Name: name, dtype: float64

In [45]:
# Number of distinct values in `name`
df['name'].nunique()

5297

In [46]:
# Absolute and relative (%) frequency of each `name` value, side by side
name_counts = df['name'].value_counts()
name_share_pct = df['name'].value_counts(normalize=True) * 100
pd.concat([name_counts, name_share_pct], axis=1)

Unnamed: 0_level_0,count,proportion
name,Unnamed: 1_level_1,Unnamed: 2_level_1
São Domingos,5,0.089767
Bom Jesus,5,0.089767
Santa Terezinha,4,0.071813
Vera Cruz,4,0.071813
Santa Luzia,4,0.071813
...,...,...
Conceição da Feira,1,0.017953
Cocos,1,0.017953
Coaraci,1,0.017953
Cipó,1,0.017953


### 3.4 ``uf``

In [47]:
# Null statistics recorded for `uf` in the NaN summary table
df_check_NaN.loc['uf']

Null Count        0.0
NaN percentage    0.0
Name: uf, dtype: float64

In [48]:
# Number of distinct values in `uf`
df['uf'].nunique()

27

In [49]:
# Absolute and relative (%) frequency of each `uf` value, side by side
uf_counts = df['uf'].value_counts()
uf_share_pct = df['uf'].value_counts(normalize=True) * 100
pd.concat([uf_counts, uf_share_pct], axis=1)

Unnamed: 0_level_0,count,proportion
uf,Unnamed: 1_level_1,Unnamed: 2_level_1
MG,853,15.314183
SP,645,11.579892
RS,497,8.922801
BA,417,7.486535
PR,399,7.163375
SC,295,5.29623
GO,246,4.416517
PI,224,4.021544
PB,223,4.003591
MA,217,3.895871


## 4. Save New Processed File

In [50]:
# Write the processed GeoDataFrame to a GeoPackage in the meso-soja datasets directory
df.to_file("/agrilearn_app/datasets/meso-soja/ref_edf_soja_processed.gpkg", driver='GPKG', engine='fiona')