In [29]:
import boto3
import geopandas as gpd
import pandas as pd
import folium
import numpy as np
from folium import GeoJson
from io import BytesIO

In [30]:
import sys
sys.path.append("/agrilearn_app/agrilearn/")
from agrilearn.utils import s3_utils, str_utils

### Global Variables

In [31]:
DATASET_PATH_TRAIN = "/agrilearn_app/datasets/v1/geopackage/cana_train_fields_filtered.gpkg"
DATASET_PATH_VAL = "/agrilearn_app/datasets/v1/geopackage/cana_val_fields_filtered.gpkg"
DATASET_PATH_TEST = "/agrilearn_app/datasets/v1/geopackage/cana_test_fields_filtered.gpkg"

# 1. Read Datasets

In [32]:
df_train = gpd.read_file(DATASET_PATH_TRAIN)
df_val = gpd.read_file(DATASET_PATH_VAL)
df_test = gpd.read_file(DATASET_PATH_TEST)

df_train['dataset_part'] = 'train'
df_val['dataset_part'] = 'val'
df_test['dataset_part'] =  'test'

df = pd.concat([df_train, df_val, df_test])

In [33]:
df['dataset_part'].value_counts()

dataset_part
train    32229
test      1591
val       1456
Name: count, dtype: int64

In [34]:
df['monitoring_class'].value_counts()

monitoring_class
SUGAR_CANE    35276
Name: count, dtype: int64

In [35]:
df['cultura_2'].value_counts()

cultura_2
cana_soca         30377
cana_reformada     4019
cana_expansao       880
Name: count, dtype: int64

In [36]:
print(f"There are {df.shape[0]} rows and {df.shape[1]} columns")

There are 35276 rows and 20 columns


In [37]:
print(f"Dados de {df['start_season'].min()} a {df['end_season'].max()}")

Dados de 2020-04-01 a 2023-04-01


In [38]:
list(df.columns)

['monitoring_class',
 'period',
 'fonte',
 'cultura_2',
 'state',
 'area',
 'meso',
 'obs_extra',
 'eopath_location',
 'start_season',
 'end_season',
 'peak_start',
 'peak_end',
 'sentinel_eopatch_current',
 'contour_score',
 'contour_selected_timestamp',
 'field_id',
 'compac_index',
 'geometry',
 'dataset_part']

In [39]:
df['dataset_part'].value_counts(normalize=True)*100

dataset_part
train    91.362399
test      4.510149
val       4.127452
Name: proportion, dtype: float64

In [40]:
df.to_file("/agrilearn_app/datasets/v1/geopackage/cana_fields_filtered.gpkg", 
           driver='GPKG', 
           engine='fiona')

# 2. Data Understading

### 2.1 Data description
- monitoring_class:** é a cultura de interesse dp cliente ou da amostra de treinamento (SOYBEAN, CORN)
- period:  a safra agricola (2023/2024, 2024/2025)
- state: é o estado do polígono
- field_id: id do talhão
- fonte: a origem dos dados (mapas temáticos, banco de dados)
- area:
- micro:
- start_season: é a data que inicia a safra pra determinada cultura (monitoring_class) e safra (period)
- end_season: é a data que termina a safra pra determinada cultura (monitoring_class) e safra (period)
- peak_start: é a data que inicia o período em que pode ocorrer o pico pra determinada cultura (monitoring_class) e safra (period)
- peak_end: é a data que termina o período em que pode ocorrer o pico pra determinada cultura (monitoring_class) e safra (period) (Obs: temos um calendário pra isso, um arquivo yaml)
- start_of_cycle: data do inio do cultivo segundo o calendário agrícola
- end_of_cycle: data do fim do caledário de cuiltivo segundo calendário agrícola
- length_of_cycle:
- start_of_season: data da emergencia da cultura
- end_of_season: data da colheita da cultura
- peak_of_season: data do pico vegetativo da cultura
- length_of_season: Duração do cultivo
- eopath_location: é caminho onde o eopatch está salvo (imagens p/ inferência)
- geometry: é a geometria do polígono]

# 3. Data Processing

### 3.1 Check the Data formatation

In [41]:
df.isna().sum()

monitoring_class                 0
period                           0
fonte                            0
cultura_2                        0
state                            0
area                             0
meso                             0
obs_extra                        0
eopath_location                  0
start_season                     0
end_season                       0
peak_start                       0
peak_end                         0
sentinel_eopatch_current         0
contour_score                 1823
contour_selected_timestamp       0
field_id                         0
compac_index                     0
geometry                         0
dataset_part                     0
dtype: int64

### 3.2 Vamos manter os dados nulos no campo ``contour_score``

In [42]:
df[df['contour_score'].isna()]

Unnamed: 0,monitoring_class,period,fonte,cultura_2,state,area,meso,obs_extra,eopath_location,start_season,end_season,peak_start,peak_end,sentinel_eopatch_current,contour_score,contour_selected_timestamp,field_id,compac_index,geometry,dataset_part
317,SUGAR_CANE,2021/2022,Agrosatélite - Canasat 2022/23,cana_reformada,TO,4247067.0,61,,start_2021-04-01_end_2022-04-01_monitoring_cla...,2021-04-01,2022-04-01,2021-04-01,2022-04-01,start_2021-04-01_end_2022-04-01_monitoring_cla...,,2022-03-13,33_3,1.422140,"POLYGON ((-48.02433 -9.30124, -48.0243 -9.3012...",train
318,SUGAR_CANE,2021/2022,Agrosatélite - Canasat 2022/23,cana_reformada,TO,4247067.0,61,,start_2021-04-01_end_2022-04-01_monitoring_cla...,2021-04-01,2022-04-01,2021-04-01,2022-04-01,start_2021-04-01_end_2022-04-01_monitoring_cla...,,2022-03-13,33_7,1.633626,"POLYGON ((-48.03278 -9.30478, -48.03278 -9.304...",train
319,SUGAR_CANE,2021/2022,Agrosatélite - Canasat 2022/23,cana_reformada,TO,4247067.0,61,,start_2021-04-01_end_2022-04-01_monitoring_cla...,2021-04-01,2022-04-01,2021-04-01,2022-04-01,start_2021-04-01_end_2022-04-01_monitoring_cla...,,2022-03-13,33_8,1.524014,"POLYGON ((-48.01997 -9.30964, -48.02001 -9.309...",train
320,SUGAR_CANE,2021/2022,Agrosatélite - Canasat 2022/23,cana_reformada,TO,4247067.0,61,,start_2021-04-01_end_2022-04-01_monitoring_cla...,2021-04-01,2022-04-01,2021-04-01,2022-04-01,start_2021-04-01_end_2022-04-01_monitoring_cla...,,2022-03-13,33_9,1.683029,"POLYGON ((-48.03207 -9.30769, -48.03215 -9.307...",train
321,SUGAR_CANE,2021/2022,Agrosatélite - Canasat 2022/23,cana_reformada,TO,4247067.0,61,,start_2021-04-01_end_2022-04-01_monitoring_cla...,2021-04-01,2022-04-01,2021-04-01,2022-04-01,start_2021-04-01_end_2022-04-01_monitoring_cla...,,2022-03-13,33_10,1.527210,"POLYGON ((-48.02722 -9.30705, -48.02712 -9.307...",train
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1412,SUGAR_CANE,2020/2021,Agrosatélite - Canasat 2021/2022,cana_soca,GO,1567914.0,547,,start_2020-04-01_end_2021-04-01_monitoring_cla...,2020-04-01,2021-04-01,2020-04-01,2021-04-01,start_2020-04-01_end_2021-04-01_monitoring_cla...,,2021-02-01,520_7,1.445176,"POLYGON ((-49.55284 -15.48442, -49.55274 -15.4...",test
1413,SUGAR_CANE,2020/2021,Agrosatélite - Canasat 2021/2022,cana_soca,GO,1567914.0,547,,start_2020-04-01_end_2021-04-01_monitoring_cla...,2020-04-01,2021-04-01,2020-04-01,2021-04-01,start_2020-04-01_end_2021-04-01_monitoring_cla...,,2021-02-01,520_8,1.453617,"POLYGON ((-49.54446 -15.4856, -49.54436 -15.48...",test
1414,SUGAR_CANE,2020/2021,Agrosatélite - Canasat 2021/2022,cana_soca,GO,1567914.0,547,,start_2020-04-01_end_2021-04-01_monitoring_cla...,2020-04-01,2021-04-01,2020-04-01,2021-04-01,start_2020-04-01_end_2021-04-01_monitoring_cla...,,2021-02-01,520_11,1.710181,"POLYGON ((-49.55261 -15.49029, -49.55302 -15.4...",test
1415,SUGAR_CANE,2020/2021,Agrosatélite - Canasat 2021/2022,cana_soca,GO,1567914.0,547,,start_2020-04-01_end_2021-04-01_monitoring_cla...,2020-04-01,2021-04-01,2020-04-01,2021-04-01,start_2020-04-01_end_2021-04-01_monitoring_cla...,,2021-02-01,520_13,1.484750,"POLYGON ((-49.54809 -15.48912, -49.54799 -15.4...",test


In [43]:
df[df['contour_score'].isna()].shape[0] / df.shape[0]

0.051678194806667424

In [44]:
df[df['contour_score'].isna()]['monitoring_class'].value_counts()

monitoring_class
SUGAR_CANE    1823
Name: count, dtype: int64

### 3.3 Data Formatation

In [45]:
df.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 35276 entries, 0 to 1590
Data columns (total 20 columns):
 #   Column                      Non-Null Count  Dtype   
---  ------                      --------------  -----   
 0   monitoring_class            35276 non-null  object  
 1   period                      35276 non-null  object  
 2   fonte                       35276 non-null  object  
 3   cultura_2                   35276 non-null  object  
 4   state                       35276 non-null  object  
 5   area                        35276 non-null  float64 
 6   meso                        35276 non-null  int64   
 7   obs_extra                   35276 non-null  object  
 8   eopath_location             35276 non-null  object  
 9   start_season                35276 non-null  object  
 10  end_season                  35276 non-null  object  
 11  peak_start                  35276 non-null  object  
 12  peak_end                    35276 non-null  object  
 13  sentinel_eopat

### 3.3.1 Datetime variables

In [46]:
datetime_columns = ['start_season', 
                    'end_season', 
                    'peak_start', 
                    'peak_end']

df[datetime_columns].info()

<class 'pandas.core.frame.DataFrame'>
Index: 35276 entries, 0 to 1590
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   start_season  35276 non-null  object
 1   end_season    35276 non-null  object
 2   peak_start    35276 non-null  object
 3   peak_end      35276 non-null  object
dtypes: object(4)
memory usage: 1.3+ MB


In [47]:
for col in datetime_columns:
    print(col)
    df[col] = pd.to_datetime(df[col])

start_season
end_season
peak_start
peak_end


#### 3.3.2 Integer variables

In [48]:
df.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 35276 entries, 0 to 1590
Data columns (total 20 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   monitoring_class            35276 non-null  object        
 1   period                      35276 non-null  object        
 2   fonte                       35276 non-null  object        
 3   cultura_2                   35276 non-null  object        
 4   state                       35276 non-null  object        
 5   area                        35276 non-null  float64       
 6   meso                        35276 non-null  int64         
 7   obs_extra                   35276 non-null  object        
 8   eopath_location             35276 non-null  object        
 9   start_season                35276 non-null  datetime64[ns]
 10  end_season                  35276 non-null  datetime64[ns]
 11  peak_start                  35276 non-null  datetime

# 4. Data Analysis

### 4.1 Analysing Target y (monitoring_class)

In [49]:
pd.concat([df['monitoring_class'].value_counts(), df['monitoring_class'].value_counts(normalize=True)*100], axis=1)

Unnamed: 0_level_0,count,proportion
monitoring_class,Unnamed: 1_level_1,Unnamed: 2_level_1
SUGAR_CANE,35276,100.0


### 4.2 Analysing fonte de dados

In [50]:
pd.concat([df['fonte'].value_counts(), df['fonte'].value_counts(normalize=True)*100], axis=1)

Unnamed: 0_level_0,count,proportion
fonte,Unnamed: 1_level_1,Unnamed: 2_level_1
Agrosatélite - Canasat 2022/23,12061,34.190384
Agrosatélite - Canasat 2021/2022,11903,33.742488
Agrosatélite - Canasat 2023/2024,11312,32.067128


### 4.3 Analysing dados por estado

In [51]:
pd.concat([df['state'].value_counts(), df['state'].value_counts(normalize=True)*100], axis=1)

Unnamed: 0_level_0,count,proportion
state,Unnamed: 1_level_1,Unnamed: 2_level_1
SP,11447,32.449824
MG,5054,14.327021
PR,4778,13.54462
MS,4732,13.414219
GO,4407,12.492913
MT,3795,10.758022
TO,1063,3.01338


### 4.6 Analysis field_id (existem IDs duplicados para os talhões)

In [52]:
df['field_id'].nunique()

34726

In [53]:
df[df.duplicated(['field_id','period'])]

Unnamed: 0,monitoring_class,period,fonte,cultura_2,state,area,meso,obs_extra,eopath_location,start_season,end_season,peak_start,peak_end,sentinel_eopatch_current,contour_score,contour_selected_timestamp,field_id,compac_index,geometry,dataset_part
1,SUGAR_CANE,2020/2021,Agrosatélite - Canasat 2021/2022,cana_soca,TO,6375372.0,61,,start_2020-04-01_end_2021-04-01_monitoring_cla...,2020-04-01,2021-04-01,2020-04-01,2021-04-01,start_2020-04-01_end_2021-04-01_monitoring_cla...,1.000000,2020-04-27,0_5,1.463128,"POLYGON ((-48.1934 -9.13491, -48.1933 -9.135, ...",val
2,SUGAR_CANE,2020/2021,Agrosatélite - Canasat 2021/2022,cana_soca,TO,6375372.0,61,,start_2020-04-01_end_2021-04-01_monitoring_cla...,2020-04-01,2021-04-01,2020-04-01,2021-04-01,start_2020-04-01_end_2021-04-01_monitoring_cla...,1.000000,2020-04-27,0_6,1.353872,"POLYGON ((-48.20115 -9.136, -48.20115 -9.13609...",val
3,SUGAR_CANE,2020/2021,Agrosatélite - Canasat 2021/2022,cana_soca,TO,6375372.0,61,,start_2020-04-01_end_2021-04-01_monitoring_cla...,2020-04-01,2021-04-01,2020-04-01,2021-04-01,start_2020-04-01_end_2021-04-01_monitoring_cla...,1.000000,2020-04-27,0_7,1.387676,"POLYGON ((-48.19504 -9.13663, -48.19504 -9.136...",val
4,SUGAR_CANE,2020/2021,Agrosatélite - Canasat 2021/2022,cana_soca,TO,6375372.0,61,,start_2020-04-01_end_2021-04-01_monitoring_cla...,2020-04-01,2021-04-01,2020-04-01,2021-04-01,start_2020-04-01_end_2021-04-01_monitoring_cla...,1.000000,2020-04-27,0_8,1.471264,"POLYGON ((-48.18773 -9.13856, -48.18735 -9.138...",val
6,SUGAR_CANE,2020/2021,Agrosatélite - Canasat 2021/2022,cana_soca,TO,6375372.0,61,,start_2020-04-01_end_2021-04-01_monitoring_cla...,2020-04-01,2021-04-01,2020-04-01,2021-04-01,start_2020-04-01_end_2021-04-01_monitoring_cla...,1.000000,2020-04-27,0_10,1.427358,"POLYGON ((-48.19905 -9.13935, -48.19905 -9.139...",val
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1580,SUGAR_CANE,2020/2021,Agrosatélite - Canasat 2021/2022,cana_soca,GO,520114.0,558,,start_2020-04-01_end_2021-04-01_monitoring_cla...,2020-04-01,2021-04-01,2020-04-01,2021-04-01,start_2020-04-01_end_2021-04-01_monitoring_cla...,0.993173,2021-02-01,558_2,1.393636,"POLYGON ((-50.97791 -19.31829, -50.9795 -19.31...",test
1581,SUGAR_CANE,2021/2022,Agrosatélite - Canasat 2022/23,cana_soca,GO,7609666.0,558,,start_2021-04-01_end_2022-04-01_monitoring_cla...,2021-04-01,2022-04-01,2021-04-01,2022-04-01,start_2021-04-01_end_2022-04-01_monitoring_cla...,1.000000,2022-03-23,559_2,2.512049,"POLYGON ((-50.95173 -19.29924, -50.95077 -19.2...",test
1582,SUGAR_CANE,2021/2022,Agrosatélite - Canasat 2022/23,cana_soca,GO,7609666.0,558,,start_2021-04-01_end_2022-04-01_monitoring_cla...,2021-04-01,2022-04-01,2021-04-01,2022-04-01,start_2021-04-01_end_2022-04-01_monitoring_cla...,1.000000,2022-03-23,559_3,1.519327,"POLYGON ((-50.96747 -19.30071, -50.96747 -19.3...",test
1587,SUGAR_CANE,2021/2022,Agrosatélite - Canasat 2022/23,cana_soca,GO,7609666.0,558,,start_2021-04-01_end_2022-04-01_monitoring_cla...,2021-04-01,2022-04-01,2021-04-01,2022-04-01,start_2021-04-01_end_2022-04-01_monitoring_cla...,1.000000,2022-03-23,559_9,1.762405,"POLYGON ((-50.96251 -19.31623, -50.96498 -19.3...",test


In [54]:
df[df.duplicated(['field_id','period'])]['field_id'].unique()

array(['0_5', '0_6', '0_7', '0_8', '0_10', '0_11', '0_12', '0_13', '0_14',
       '1_6', '1_7', '1_8', '1_9', '1_11', '1_12', '1_13', '1_14', '1_15',
       '1_17', '4_6', '4_8', '4_10', '4_11', '4_12', '4_13', '4_17',
       '4_20', '4_22', '4_26', '4_34', '5_3', '33_9', '54_5', '79_6',
       '79_9', '115_2', '119_6', '119_11', '155_1', '187_1', '200_7',
       '214_3', '230_2', '231_3', '252_4', '268_1', '268_5', '295_3',
       '305_0', '333_5', '335_5', '340_3', '366_1', '386_5', '417_2',
       '419_2', '419_7', '425_2', '428_2', '433_1', '444_5', '444_7',
       '444_8', '444_9', '444_12', '444_15', '444_20', '444_24', '444_25',
       '444_28', '444_29', '444_30', '470_1', '481_2', '481_4', '491_1',
       '533_1', '6_2', '6_3', '18_2', '20_2', '22_2', '23_1', '27_12',
       '29_1', '29_8', '40_1', '41_1', '41_3', '44_2', '46_2', '54_3',
       '55_6', '72_6', '72_7', '72_15', '72_25', '72_27', '72_28',
       '72_36', '72_39', '72_41', '72_47', '84_1', '89_5', '90_4', '91_4',

In [55]:
#df[df['field_id'].isin(['0_6'])].to_csv('/agrilearn_app/datasets/cana-v1/csvs/duplicated_lines_CANA-sample-02.csv')

In [56]:
df[['cultura_2', 'monitoring_class']]

Unnamed: 0,cultura_2,monitoring_class
0,cana_soca,SUGAR_CANE
1,cana_soca,SUGAR_CANE
2,cana_soca,SUGAR_CANE
3,cana_soca,SUGAR_CANE
4,cana_soca,SUGAR_CANE
...,...,...
1586,cana_soca,SUGAR_CANE
1587,cana_soca,SUGAR_CANE
1588,cana_soca,SUGAR_CANE
1589,cana_soca,SUGAR_CANE


## 5. Generating Html Report

In [57]:
from ydata_profiling import ProfileReport
from ydata_profiling.config import Settings
%matplotlib inline

In [58]:
df_copy = df.drop(columns=['cultura_2', 'geometry'])  # Supondo que 'cultura_2' seja a coluna problemática

In [59]:
df_e_object = df_copy.select_dtypes(exclude=['object'])
df_e_object

Unnamed: 0,area,meso,start_season,end_season,peak_start,peak_end,contour_score,compac_index
0,3603383.0,58,2020-04-01,2021-04-01,2020-04-01,2021-04-01,1.0,1.433508
1,3603383.0,58,2020-04-01,2021-04-01,2020-04-01,2021-04-01,1.0,1.705125
2,3603383.0,58,2020-04-01,2021-04-01,2020-04-01,2021-04-01,1.0,1.501758
3,3603383.0,58,2020-04-01,2021-04-01,2020-04-01,2021-04-01,1.0,1.493624
4,3603383.0,58,2020-04-01,2021-04-01,2020-04-01,2021-04-01,1.0,1.746982
...,...,...,...,...,...,...,...,...
1586,7609666.0,558,2021-04-01,2022-04-01,2021-04-01,2022-04-01,1.0,1.758666
1587,7609666.0,558,2021-04-01,2022-04-01,2021-04-01,2022-04-01,1.0,1.762405
1588,7609666.0,558,2021-04-01,2022-04-01,2021-04-01,2022-04-01,1.0,1.455945
1589,7609666.0,558,2021-04-01,2022-04-01,2021-04-01,2022-04-01,1.0,2.231472


In [60]:
df_i_object = df_copy.select_dtypes(include=['object'])
df_i_object

Unnamed: 0,monitoring_class,period,fonte,state,obs_extra,eopath_location,sentinel_eopatch_current,contour_selected_timestamp,field_id,dataset_part
0,SUGAR_CANE,2020/2021,Agrosatélite - Canasat 2021/2022,TO,,start_2020-04-01_end_2021-04-01_monitoring_cla...,start_2020-04-01_end_2021-04-01_monitoring_cla...,2020-04-27,0_2,train
1,SUGAR_CANE,2020/2021,Agrosatélite - Canasat 2021/2022,TO,,start_2020-04-01_end_2021-04-01_monitoring_cla...,start_2020-04-01_end_2021-04-01_monitoring_cla...,2020-04-27,0_3,train
2,SUGAR_CANE,2020/2021,Agrosatélite - Canasat 2021/2022,TO,,start_2020-04-01_end_2021-04-01_monitoring_cla...,start_2020-04-01_end_2021-04-01_monitoring_cla...,2020-04-27,0_5,train
3,SUGAR_CANE,2020/2021,Agrosatélite - Canasat 2021/2022,TO,,start_2020-04-01_end_2021-04-01_monitoring_cla...,start_2020-04-01_end_2021-04-01_monitoring_cla...,2020-04-27,0_6,train
4,SUGAR_CANE,2020/2021,Agrosatélite - Canasat 2021/2022,TO,,start_2020-04-01_end_2021-04-01_monitoring_cla...,start_2020-04-01_end_2021-04-01_monitoring_cla...,2020-04-27,0_7,train
...,...,...,...,...,...,...,...,...,...,...
1586,SUGAR_CANE,2021/2022,Agrosatélite - Canasat 2022/23,GO,,start_2021-04-01_end_2022-04-01_monitoring_cla...,start_2021-04-01_end_2022-04-01_monitoring_cla...,2022-03-23,559_8,test
1587,SUGAR_CANE,2021/2022,Agrosatélite - Canasat 2022/23,GO,,start_2021-04-01_end_2022-04-01_monitoring_cla...,start_2021-04-01_end_2022-04-01_monitoring_cla...,2022-03-23,559_9,test
1588,SUGAR_CANE,2021/2022,Agrosatélite - Canasat 2022/23,GO,,start_2021-04-01_end_2022-04-01_monitoring_cla...,start_2021-04-01_end_2022-04-01_monitoring_cla...,2022-03-23,559_11,test
1589,SUGAR_CANE,2021/2022,Agrosatélite - Canasat 2022/23,GO,,start_2021-04-01_end_2022-04-01_monitoring_cla...,start_2021-04-01_end_2022-04-01_monitoring_cla...,2022-03-23,559_12,test


In [61]:
df_e_object_colums = df_copy.select_dtypes(exclude=['object']).columns

In [62]:
title = "data_report_14_11_2024_cana_data"
config = Settings()
config.vars.cat.words = False  # Desativa a nuvem de palavras
report  = ProfileReport(df=df_copy[list(df_e_object_colums) + ['monitoring_class', 'state', 'period', 'fonte']], 
                        title=title,
                        minimal=False,
                        config=config)

report.to_file(f'{title}.html')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename(columns={"index": "df_index"}, inplace=True)


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

(using `df.profile_report(correlations={"auto": {"calculate": False}})`
If this is problematic for your use case, please report this as an issue:
https://github.com/ydataai/ydata-profiling/issues
(include the error message: 'cannot reindex on an axis with duplicate labels')


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [63]:
df_number = df.select_dtypes(exclude=['float','int', 'datetime'])
df_number

Unnamed: 0,monitoring_class,period,fonte,cultura_2,state,obs_extra,eopath_location,sentinel_eopatch_current,contour_selected_timestamp,field_id,geometry,dataset_part
0,SUGAR_CANE,2020/2021,Agrosatélite - Canasat 2021/2022,cana_soca,TO,,start_2020-04-01_end_2021-04-01_monitoring_cla...,start_2020-04-01_end_2021-04-01_monitoring_cla...,2020-04-27,0_2,"POLYGON ((-48.34483 -8.91348, -48.34412 -8.913...",train
1,SUGAR_CANE,2020/2021,Agrosatélite - Canasat 2021/2022,cana_soca,TO,,start_2020-04-01_end_2021-04-01_monitoring_cla...,start_2020-04-01_end_2021-04-01_monitoring_cla...,2020-04-27,0_3,"POLYGON ((-48.35083 -8.91933, -48.35008 -8.917...",train
2,SUGAR_CANE,2020/2021,Agrosatélite - Canasat 2021/2022,cana_soca,TO,,start_2020-04-01_end_2021-04-01_monitoring_cla...,start_2020-04-01_end_2021-04-01_monitoring_cla...,2020-04-27,0_5,"POLYGON ((-48.343 -8.91709, -48.34291 -8.91718...",train
3,SUGAR_CANE,2020/2021,Agrosatélite - Canasat 2021/2022,cana_soca,TO,,start_2020-04-01_end_2021-04-01_monitoring_cla...,start_2020-04-01_end_2021-04-01_monitoring_cla...,2020-04-27,0_6,"POLYGON ((-48.34644 -8.9189, -48.34626 -8.9190...",train
4,SUGAR_CANE,2020/2021,Agrosatélite - Canasat 2021/2022,cana_soca,TO,,start_2020-04-01_end_2021-04-01_monitoring_cla...,start_2020-04-01_end_2021-04-01_monitoring_cla...,2020-04-27,0_7,"POLYGON ((-48.34119 -8.91971, -48.34119 -8.919...",train
...,...,...,...,...,...,...,...,...,...,...,...,...
1586,SUGAR_CANE,2021/2022,Agrosatélite - Canasat 2022/23,cana_soca,GO,,start_2021-04-01_end_2022-04-01_monitoring_cla...,start_2021-04-01_end_2022-04-01_monitoring_cla...,2022-03-23,559_8,"POLYGON ((-50.96946 -19.31377, -50.97019 -19.3...",test
1587,SUGAR_CANE,2021/2022,Agrosatélite - Canasat 2022/23,cana_soca,GO,,start_2021-04-01_end_2022-04-01_monitoring_cla...,start_2021-04-01_end_2022-04-01_monitoring_cla...,2022-03-23,559_9,"POLYGON ((-50.96251 -19.31623, -50.96498 -19.3...",test
1588,SUGAR_CANE,2021/2022,Agrosatélite - Canasat 2022/23,cana_soca,GO,,start_2021-04-01_end_2022-04-01_monitoring_cla...,start_2021-04-01_end_2022-04-01_monitoring_cla...,2022-03-23,559_11,"POLYGON ((-50.9802 -19.31351, -50.9801 -19.313...",test
1589,SUGAR_CANE,2021/2022,Agrosatélite - Canasat 2022/23,cana_soca,GO,,start_2021-04-01_end_2022-04-01_monitoring_cla...,start_2021-04-01_end_2022-04-01_monitoring_cla...,2022-03-23,559_12,"MULTIPOLYGON (((-50.95141 -19.31792, -50.95255...",test


# 6. Analysis culture per state

In [64]:
df[df['state'].isin(['BA', 'RS', 'MT'])].groupby('state').agg({'monitoring_class':'value_counts'})

Unnamed: 0_level_0,Unnamed: 1_level_0,monitoring_class
state,monitoring_class,Unnamed: 2_level_1
MT,SUGAR_CANE,3795


In [None]:
# df[df['state'].isin(['BA'])].to_file("/agrilearn_app/datasets/geopackages/crop-classification-v3-bahia", driver='GPKG', engine='fiona')
# df[df['state'].isin(['RS'])].to_file("/agrilearn_app/datasets/geopackages/crop-classification-v3-rio-grande-do-sul", driver='GPKG', engine='fiona')
# df[df['state'].isin(['MT'])].to_file("/agrilearn_app/datasets/geopackages/crop-classification-v3-mato-grosso", driver='GPKG', engine='fiona')