In [1]:
import boto3
import geopandas as gpd
from io import BytesIO
import pandas as pd
from folium import GeoJson
import folium
import numpy as np

In [2]:
import sys
sys.path.append("/agrilearn_app/agrilearn/")
from agrilearn.utils import s3_utils, str_utils

### Global Variables

In [3]:
DATASET_PATH_TRAIN = "/agrilearn_app/datasets/geopackages/train_all_cultures_v3.gpkg"
DATASET_PATH_VAL = "/agrilearn_app/datasets/geopackages/val_all_cultures_v3.gpkg"
DATASET_PATH_TEST = "/agrilearn_app/datasets/geopackages/test_all_cultures_v3.gpkg"

# 1. Read Datasets

In [4]:
df_train = gpd.read_file(DATASET_PATH_TRAIN)
df_val = gpd.read_file(DATASET_PATH_VAL)
df_test = gpd.read_file(DATASET_PATH_TEST)
df_train['dataset_part'] = 'train'
df_val['dataset_part'] = 'val'
df_test['dataset_part'] =  'test'
df = pd.concat([df_train, df_val, df_test])

In [5]:
df.head()

Unnamed: 0,monitoring_class,period,state,field_id,fonte,area,micro,start_season,end_season,peak_start,...,start_of_cycle,end_of_cycle,length_of_cycle,start_of_season,end_of_season,peak_of_season,length_of_season,eopath_location,geometry,dataset_part
0,CORN,2021/2022,RO,1231_0,Agrosatélite - Grãos Brasil 2021/2022,399481.0,0,2021-09-01,2022-06-01,2021-11-01,...,2021-11-16 00:00:00,2022-04-10 00:00:00,145,2021-10-18 00:00:00,2022-03-23 00:00:00,2022-01-15T00:00:00,156,start_2021-09-01_end_2022-06-01_monitoring_cla...,"MULTIPOLYGON (((-63.61826 -10.32938, -63.61796...",train
1,CORN,2021/2022,RO,1231_1,Agrosatélite - Grãos Brasil 2021/2022,399481.0,0,2021-09-01,2022-06-01,2021-11-01,...,2021-10-22 00:00:00,2022-03-11 00:00:00,140,2021-11-07 00:00:00,2022-03-22 00:00:00,2022-01-15T00:00:00,135,start_2021-09-01_end_2022-06-01_monitoring_cla...,"MULTIPOLYGON (((-63.61728 -10.34225, -63.61837...",train
2,CORN,2020/2021,RO,1232_0,Agrosatélite - Grãos Brasil 2020/2021,254481.0,2,2020-09-01,2021-06-01,2020-11-01,...,2020-11-11 00:00:00,2021-03-16 00:00:00,125,2020-11-19 00:00:00,2021-03-19 00:00:00,2021-01-25T00:00:00,120,start_2020-09-01_end_2021-06-01_monitoring_cla...,"MULTIPOLYGON (((-62.23076 -9.81551, -62.23116 ...",train
3,CORN,2021/2022,RO,1233_1,Agrosatélite - Grãos Brasil 2021/2022,266350.0,2,2021-09-01,2022-06-01,2021-11-01,...,2021-11-01 00:00:00,2022-03-26 00:00:00,145,2021-11-02 00:00:00,2022-03-10 00:00:00,2022-01-15T00:00:00,128,start_2021-09-01_end_2022-06-01_monitoring_cla...,"MULTIPOLYGON (((-63.09362 -9.90218, -63.09386 ...",train
4,CORN,2020/2021,RO,1235_1,Agrosatélite - Grãos Brasil 2020/2021,393922.0,5,2020-09-01,2021-06-01,2020-11-01,...,2020-10-24 00:00:00,2021-03-13 00:00:00,140,2020-11-06 00:00:00,2021-03-09 00:00:00,2020-12-13T00:00:00,123,start_2020-09-01_end_2021-06-01_monitoring_cla...,"MULTIPOLYGON (((-61.8818 -12.05653, -61.8818 -...",train


In [6]:
print(f"There are {df.shape[0]} rows and {df.shape[1]} columns")

There are 46642 rows and 21 columns


In [7]:
print(f"Dados de {df['start_season'].min()} a {df['end_season'].max()}")

Dados de 2017-09-01 a 2024-01-01


In [8]:
list(df.columns)

['monitoring_class',
 'period',
 'state',
 'field_id',
 'fonte',
 'area',
 'micro',
 'start_season',
 'end_season',
 'peak_start',
 'peak_end',
 'start_of_cycle',
 'end_of_cycle',
 'length_of_cycle',
 'start_of_season',
 'end_of_season',
 'peak_of_season',
 'length_of_season',
 'eopath_location',
 'geometry',
 'dataset_part']

# 2. Data Understading

### 2.1 Data description
- monitoring_class:** é a cultura de interesse dp cliente ou da amostra de treinamento (SOYBEAN, CORN)
- period:  a safra agricola (2023/2024, 2024/2025)
- state: é o estado do polígono
- field_id: id do talhão
- fonte: a origem dos dados (mapas temáticos, banco de dados)
- area:
- micro:
- start_season: é a data que inicia a safra pra determinada cultura (monitoring_class) e safra (period)
- end_season: é a data que termina a safra pra determinada cultura (monitoring_class) e safra (period)
- peak_start: é a data que inicia o período em que pode ocorrer o pico pra determinada cultura (monitoring_class) e safra (period)
- peak_end: é a data que termina o período em que pode ocorrer o pico pra determinada cultura (monitoring_class) e safra (period) (Obs: temos um calendário pra isso, um arquivo yaml)
- start_of_cycle: data do inio do cultivo segundo o calendário agrícola
- end_of_cycle: data do fim do caledário de cuiltivo segundo calendário agrícola
- length_of_cycle:
- start_of_season: data da emergencia da cultura
- end_of_season: data da colheita da cultura
- peak_of_season: data do pico vegetativo da cultura
- length_of_season: Duração do cultivo
- eopath_location: é caminho onde o eopatch está salvo (imagens p/ inferência)
- geometry: é a geometria do polígono]

# 3. Data Processing

### 3.1 Check the Data formatation

In [15]:
df.isna().sum()

monitoring_class       0
period                 0
state                  0
field_id            8450
fonte                  0
area                   0
micro                  0
start_season           0
end_season             0
peak_start             0
peak_end               0
start_of_cycle         0
end_of_cycle           0
length_of_cycle        0
start_of_season        0
end_of_season          0
peak_of_season         0
length_of_season       0
eopath_location        0
geometry               0
dataset_part           0
dtype: int64

### 3.2 Vamos manter os dados nulos no campo ``field_id`` pois são os dados de trigo e não temos um padrão para gerar o field_id

In [22]:
df[df['field_id'].isna()].shape[0] / df.shape[0]

0.1811671883709961

In [23]:
df[df['field_id'].isna()]['monitoring_class'].value_counts()

monitoring_class
WHEAT    8450
Name: count, dtype: int64

### 3.3 Data Formatation

In [24]:
df.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 46642 entries, 0 to 4809
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   monitoring_class  46642 non-null  object  
 1   period            46642 non-null  object  
 2   state             46642 non-null  object  
 3   field_id          38192 non-null  object  
 4   fonte             46642 non-null  object  
 5   area              46642 non-null  float64 
 6   micro             46642 non-null  int64   
 7   start_season      46642 non-null  object  
 8   end_season        46642 non-null  object  
 9   peak_start        46642 non-null  object  
 10  peak_end          46642 non-null  object  
 11  start_of_cycle    46642 non-null  object  
 12  end_of_cycle      46642 non-null  object  
 13  length_of_cycle   46642 non-null  object  
 14  start_of_season   46642 non-null  object  
 15  end_of_season     46642 non-null  object  
 16  peak_of_season    46

### 3.3.1 Datetime variables

In [41]:
datetime_columns = ['start_season', 
                    'end_season', 
                    'peak_start', 
                    'peak_end']
                    #'start_of_cycle', 
                    #'end_of_cycle',
                    #'start_of_season',
                    #'end_of_season',
                    #'peak_of_season']

df[datetime_columns].info()

<class 'pandas.core.frame.DataFrame'>
Index: 46642 entries, 0 to 4809
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   start_season  46642 non-null  datetime64[ns]
 1   end_season    46642 non-null  datetime64[ns]
 2   peak_start    46642 non-null  datetime64[ns]
 3   peak_end      46642 non-null  datetime64[ns]
dtypes: datetime64[ns](4)
memory usage: 1.8 MB


In [42]:
df['start_of_cycle']

0       2021-11-16 00:00:00
1       2021-10-22 00:00:00
2       2020-11-11 00:00:00
3       2021-11-01 00:00:00
4       2020-10-24 00:00:00
               ...         
4805             2023-04-04
4806             2023-04-04
4807             2023-05-04
4808             2023-04-04
4809             2023-04-04
Name: start_of_cycle, Length: 46642, dtype: object

In [43]:
for col in datetime_columns:
    print(col)
    df[col] = pd.to_datetime(df[col])

start_season
end_season
peak_start
peak_end


#### 3.3.2 Integer variables

In [44]:
df['length_of_season'] = df['length_of_season'].astype(int)

In [46]:
df.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 46642 entries, 0 to 4809
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   monitoring_class  46642 non-null  object        
 1   period            46642 non-null  object        
 2   state             46642 non-null  object        
 3   field_id          38192 non-null  object        
 4   fonte             46642 non-null  object        
 5   area              46642 non-null  float64       
 6   micro             46642 non-null  int64         
 7   start_season      46642 non-null  datetime64[ns]
 8   end_season        46642 non-null  datetime64[ns]
 9   peak_start        46642 non-null  datetime64[ns]
 10  peak_end          46642 non-null  datetime64[ns]
 11  start_of_cycle    46642 non-null  object        
 12  end_of_cycle      46642 non-null  object        
 13  length_of_cycle   46642 non-null  object        
 14  start_of_season   46

# 4. Data Analysis

### 4.1 Analysing Target y (monitoring_class)

In [47]:
pd.concat([df['monitoring_class'].value_counts(), df['monitoring_class'].value_counts(normalize=True)*100], axis=1)

Unnamed: 0_level_0,count,proportion
monitoring_class,Unnamed: 1_level_1,Unnamed: 2_level_1
SOYBEAN,29670,63.612195
WHEAT,8450,18.116719
CORN,5710,12.242185
COTTON,1639,3.514
RICE,1173,2.514901


### 4.2 Analysing fonte de dados

In [49]:
pd.concat([df['fonte'].value_counts(), df['fonte'].value_counts(normalize=True)*100], axis=1)

Unnamed: 0_level_0,count,proportion
fonte,Unnamed: 1_level_1,Unnamed: 2_level_1
Agrosatélite - Grãos Brasil 2021/2022,19185,41.132456
Agrosatélite - Grãos Brasil 2020/2021,17834,38.235925
Conab - Culturas de Inverno 2023/2023,8450,18.116719
Conab - Arroz Irrigado 2019/2020,595,1.275674
Conab - Arroz Irrigado 2018/2019,341,0.731101
Conab - Arroz Irrigado 2017/2018,206,0.441662
Conab - Arroz Irrigado 2021/2022,31,0.066464


### 4.3 Analysing dados por estado

In [50]:
pd.concat([df['state'].value_counts(), df['state'].value_counts(normalize=True)*100], axis=1)

Unnamed: 0_level_0,count,proportion
state,Unnamed: 1_level_1,Unnamed: 2_level_1
PR,13360,28.643712
MT,5709,12.240041
SP,5079,10.889327
MG,4380,9.390678
RS,3338,7.15664
GO,3322,7.122336
MS,2542,5.450024
MA,1781,3.818447
TO,1571,3.368209
BA,1193,2.557781


### 4.6 Analysis field_id (existem IDs duplicados para os talhões)

In [52]:
df['field_id'].nunique()

36234

In [53]:
df[df.duplicated('field_id')]

Unnamed: 0,monitoring_class,period,state,field_id,fonte,area,micro,start_season,end_season,peak_start,...,start_of_cycle,end_of_cycle,length_of_cycle,start_of_season,end_of_season,peak_of_season,length_of_season,eopath_location,geometry,dataset_part
6131,RICE,2017/2018,TO,41_2,Conab - Arroz Irrigado 2017/2018,3.053780e+05,59,2017-10-01,2018-06-01,2017-12-01,...,2018-01-16 00:00:00,2018-05-21 00:00:00,125,2018-01-16 00:00:00,2018-05-13 00:00:00,2018-02-15T00:00:00,117,start_2017-10-01_end_2018-06-01_monitoring_cla...,"MULTIPOLYGON (((-49.88425 -10.47874, -49.88308...",train
6135,RICE,2017/2018,TO,53_1,Conab - Arroz Irrigado 2017/2018,5.749160e+05,60,2017-10-01,2018-06-01,2017-12-01,...,2017-11-17 00:00:00,2018-03-02 00:00:00,105,2017-10-24 00:00:00,2018-04-11 00:00:00,2018-01-21T00:00:00,169,start_2017-10-01_end_2018-06-01_monitoring_cla...,"MULTIPOLYGON (((-49.55478 -10.83641, -49.55464...",train
6137,RICE,2017/2018,TO,62_3,Conab - Arroz Irrigado 2017/2018,3.274483e+06,60,2017-10-01,2018-06-01,2017-12-01,...,2017-11-12 00:00:00,2018-02-20 00:00:00,100,2017-10-14 00:00:00,2018-02-13 00:00:00,2018-01-01T00:00:00,122,start_2017-10-01_end_2018-06-01_monitoring_cla...,"MULTIPOLYGON (((-49.50023 -11.15293, -49.50023...",train
6151,RICE,2017/2018,TO,84_3,Conab - Arroz Irrigado 2017/2018,5.458330e+05,60,2017-10-01,2018-06-01,2017-12-01,...,2017-11-17 00:00:00,2018-04-11 00:00:00,145,2017-11-24 00:00:00,2018-04-12 00:00:00,2018-02-10T00:00:00,139,start_2017-10-01_end_2018-06-01_monitoring_cla...,"MULTIPOLYGON (((-49.57375 -10.87316, -49.57378...",train
6152,RICE,2017/2018,TO,92_2,Conab - Arroz Irrigado 2017/2018,1.406380e+06,60,2017-10-01,2018-06-01,2017-12-01,...,2017-10-23 00:00:00,2018-02-07 00:00:00,107,2017-10-12 00:00:00,2018-04-06 00:00:00,2018-01-16T00:00:00,176,start_2017-10-01_end_2018-06-01_monitoring_cla...,"MULTIPOLYGON (((-49.32316 -11.17751, -49.32314...",train
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4805,WHEAT,2023/2023,PR,,Conab - Culturas de Inverno 2023/2023,6.909008e+04,450,2023-04-01,2024-01-01,2023-06-01,...,2023-04-04,2023-10-06,185,2023-06-19,2023-10-01,2023-08-22 00:00:00,104,start_2023-04-01_end_2024-01-01_monitoring_cla...,"MULTIPOLYGON (((-49.20935 -25.92179, -49.20935...",test
4806,WHEAT,2023/2023,PR,,Conab - Culturas de Inverno 2023/2023,1.002700e+05,450,2023-04-01,2024-01-01,2023-06-01,...,2023-04-04,2023-09-26,175,2023-05-21,2023-09-15,2023-08-07 00:00:00,117,start_2023-04-01_end_2024-01-01_monitoring_cla...,"MULTIPOLYGON (((-49.66929 -26.08043, -49.66929...",test
4807,WHEAT,2023/2023,PR,,Conab - Culturas de Inverno 2023/2023,3.415436e+05,450,2023-04-01,2024-01-01,2023-06-01,...,2023-05-04,2023-11-10,190,2023-06-02,2023-10-29,2023-08-02 00:00:00,149,start_2023-04-01_end_2024-01-01_monitoring_cla...,"MULTIPOLYGON (((-49.65025 -25.95826, -49.65007...",test
4808,WHEAT,2023/2023,PR,,Conab - Culturas de Inverno 2023/2023,7.230476e+04,450,2023-04-01,2024-01-01,2023-06-01,...,2023-04-04,2023-09-26,175,2023-05-09,2023-08-07,2023-06-08 00:00:00,90,start_2023-04-01_end_2024-01-01_monitoring_cla...,"MULTIPOLYGON (((-49.47238 -25.94766, -49.47256...",test


In [54]:
df[df.duplicated('field_id')]['field_id'].unique()

array(['41_2', '53_1', '62_3', ..., '880_1', '884_1', '914_2'],
      dtype=object)

In [57]:
df[df['field_id'].isin(['41_2'])]

Unnamed: 0,monitoring_class,period,state,field_id,fonte,area,micro,start_season,end_season,peak_start,...,start_of_cycle,end_of_cycle,length_of_cycle,start_of_season,end_of_season,peak_of_season,length_of_season,eopath_location,geometry,dataset_part
5449,COTTON,2021/2022,BA,41_2,Agrosatélite - Grãos Brasil 2021/2022,1963743.0,220,2021-11-01,2022-10-01,2022-02-01,...,2022-01-09 00:00:00,2022-10-01 00:00:00,265,2022-01-20 00:00:00,2022-07-23 00:00:00,2022-04-24T00:00:00,184,start_2021-11-01_end_2022-10-01_monitoring_cla...,"MULTIPOLYGON (((-46.16128 -12.77912, -46.16128...",train
6131,RICE,2017/2018,TO,41_2,Conab - Arroz Irrigado 2017/2018,305378.0,59,2017-10-01,2018-06-01,2017-12-01,...,2018-01-16 00:00:00,2018-05-21 00:00:00,125,2018-01-16 00:00:00,2018-05-13 00:00:00,2018-02-15T00:00:00,117,start_2017-10-01_end_2018-06-01_monitoring_cla...,"MULTIPOLYGON (((-49.88425 -10.47874, -49.88308...",train
28896,SOYBEAN,2021/2022,RO,41_2,Agrosatélite - Grãos Brasil 2021/2022,20267233.0,0,2021-10-01,2022-05-01,2021-12-01,...,2021-10-12 00:00:00,2022-01-30 00:00:00,110,2021-10-19 00:00:00,2022-03-01 00:00:00,2021-12-01T00:00:00,133,start_2021-10-01_end_2022-05-01_monitoring_cla...,"MULTIPOLYGON (((-62.82534 -9.44642, -62.82463 ...",train


### 4.10 Analysis ``LOS`` and ``length_of_season``

In [62]:
df[['length_of_season', 'monitoring_class']]

Unnamed: 0,length_of_season,monitoring_class
0,156,CORN
1,135,CORN
2,120,CORN
3,128,CORN
4,123,CORN
...,...,...
4805,104,WHEAT
4806,117,WHEAT
4807,149,WHEAT
4808,90,WHEAT


In [64]:
df.groupby('monitoring_class').agg({'length_of_season':['max', 'mean','min']})

Unnamed: 0_level_0,length_of_season,length_of_season,length_of_season
Unnamed: 0_level_1,max,mean,min
monitoring_class,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
CORN,160,125.735552,60
COTTON,240,190.315436,120
RICE,180,136.597613,78
SOYBEAN,160,117.017122,61
WHEAT,150,117.731243,90


## 5. Generating Html Report

In [193]:
from ydata_profiling import ProfileReport
from ydata_profiling.config import Settings
%matplotlib inline

In [231]:
df_copy = df.drop(columns=['cultura_2', 'geometry'])  # Supondo que 'cultura_2' seja a coluna problemática

In [237]:
df_e_object = df_copy.select_dtypes(exclude=['object'])
df_e_object

Unnamed: 0,area,meso,start_season,end_season,peak_start,peak_end,contour_score,contour_selected_timestamp,compac_index,planting_start,planting_end,cycle_start,cycle_end,LOS,is_valid_cvt,start_of_season,end_of_season,peaks,length_of_season
0,10269314.0,505,2021-10-01,2022-06-01,2021-12-01,2022-03-01,0.995002,2022-03-03,2.559326,2021-10-01,2022-02-01,2021-12-03,2022-05-12,160,True,2021-12-14,2022-05-03,2022-02-26,140
1,10269314.0,505,2021-10-01,2022-06-01,2021-12-01,2022-03-01,0.995002,2022-03-03,2.030404,2021-10-01,2022-02-01,2021-11-23,2022-04-17,145,True,2021-12-10,2022-05-03,2022-02-26,144
2,10269314.0,505,2021-10-01,2022-06-01,2021-12-01,2022-03-01,0.995002,2022-03-03,1.498424,2021-10-01,2022-02-01,2021-12-03,2022-04-22,140,True,2021-12-12,2022-05-03,2022-02-26,142
3,10269314.0,505,2021-10-01,2022-06-01,2021-12-01,2022-03-01,0.995002,2022-03-03,1.929409,2021-10-01,2022-02-01,2021-12-03,2022-05-02,150,True,2021-12-11,2022-04-15,2022-02-26,125
4,10269314.0,505,2021-10-01,2022-06-01,2021-12-01,2022-03-01,0.995002,2022-03-03,1.476064,2021-10-01,2022-02-01,2021-12-03,2022-04-17,135,True,2021-12-16,2022-04-14,2022-02-26,119
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38187,681005.0,494,2019-09-01,2020-06-01,2019-11-01,2020-03-01,0.990099,2020-02-04,2.202441,2019-09-01,2020-01-01,2019-09-27,2020-05-29,245,True,2019-10-25,2020-03-26,2020-01-25,153
38188,681005.0,494,2019-09-01,2020-06-01,2019-11-01,2020-03-01,0.990099,2020-02-04,1.593055,2019-09-01,2020-01-01,2019-10-07,2020-05-19,225,True,2019-10-16,2020-03-12,2020-01-05,148
38189,139290.0,505,2019-09-01,2020-06-01,2019-11-01,2020-03-01,,2020-01-18,1.903089,2019-09-01,2020-01-01,2019-11-04,2020-05-27,205,True,2019-11-11,2020-03-26,2020-01-18,136
38190,176041.0,509,2018-08-01,2019-05-01,2018-10-01,2019-02-01,0.989781,2019-01-22,1.452775,2018-08-01,2019-01-01,2018-08-20,2019-03-08,200,True,2018-10-29,2019-03-01,2019-01-22,123


In [238]:
df_i_object = df_copy.select_dtypes(include=['object'])
df_i_object

Unnamed: 0,monitoring_class,period,fonte,state,obs_extra,eopath_location,sentinel_eopatch_current,field_id,is_valid
0,SOYBEAN,2021/2022,Agrosatélite - Grãos Brasil 2021/2022,RS,,start_2021-10-01_end_2022-06-01_monitoring_cla...,start_2021-10-01_end_2022-06-01_monitoring_cla...,13415_0,1
1,SOYBEAN,2021/2022,Agrosatélite - Grãos Brasil 2021/2022,RS,,start_2021-10-01_end_2022-06-01_monitoring_cla...,start_2021-10-01_end_2022-06-01_monitoring_cla...,13415_1,1
2,SOYBEAN,2021/2022,Agrosatélite - Grãos Brasil 2021/2022,RS,,start_2021-10-01_end_2022-06-01_monitoring_cla...,start_2021-10-01_end_2022-06-01_monitoring_cla...,13415_2,1
3,SOYBEAN,2021/2022,Agrosatélite - Grãos Brasil 2021/2022,RS,,start_2021-10-01_end_2022-06-01_monitoring_cla...,start_2021-10-01_end_2022-06-01_monitoring_cla...,13415_6,1
4,SOYBEAN,2021/2022,Agrosatélite - Grãos Brasil 2021/2022,RS,,start_2021-10-01_end_2022-06-01_monitoring_cla...,start_2021-10-01_end_2022-06-01_monitoring_cla...,13415_8,1
...,...,...,...,...,...,...,...,...,...
38187,RICE,2019/2020,Conab - Arroz Irrigado 2019/2020,RS,,start_2019-09-01_end_2020-06-01_monitoring_cla...,start_2019-09-01_end_2020-06-01_monitoring_cla...,52_0,0
38188,RICE,2019/2020,Conab - Arroz Irrigado 2019/2020,RS,,start_2019-09-01_end_2020-06-01_monitoring_cla...,start_2019-09-01_end_2020-06-01_monitoring_cla...,52_2,0
38189,RICE,2019/2020,Conab - Arroz Irrigado 2019/2020,RS,,start_2019-09-01_end_2020-06-01_monitoring_cla...,start_2019-09-01_end_2020-06-01_monitoring_cla...,80_2,0
38190,RICE,2018/2019,Conab - Arroz Irrigado 2018/2019,MS,,start_2018-08-01_end_2019-06-01_monitoring_cla...,start_2018-08-01_end_2019-06-01_monitoring_cla...,81_0,0


In [240]:
df_e_object_colums = df_copy.select_dtypes(exclude=['object']).columns

In [248]:
title = "data_report_07_10_2024_full_copy_v3"
config = Settings()
config.vars.cat.words = False  # Desativa a nuvem de palavras
report  = ProfileReport(df=df_copy[list(df_e_object_colums) + ['monitoring_class', 'state', 'period', 'fonte']], 
                        title=title,
                        minimal=False,
                        config=config)

report.to_file(f'{title}.html')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename(columns={"index": "df_index"}, inplace=True)


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [206]:
df_number = df.select_dtypes(exclude=['float','int', 'datetime'])
df_number

Unnamed: 0,monitoring_class,period,fonte,cultura_2,state,obs_extra,eopath_location,sentinel_eopatch_current,field_id,is_valid,is_valid_cvt,geometry
0,SOYBEAN,2021/2022,Agrosatélite - Grãos Brasil 2021/2022,,RS,,start_2021-10-01_end_2022-06-01_monitoring_cla...,start_2021-10-01_end_2022-06-01_monitoring_cla...,13415_0,1,True,"POLYGON ((-52.44675 -32.21676, -52.44679 -32.2..."
1,SOYBEAN,2021/2022,Agrosatélite - Grãos Brasil 2021/2022,,RS,,start_2021-10-01_end_2022-06-01_monitoring_cla...,start_2021-10-01_end_2022-06-01_monitoring_cla...,13415_1,1,True,"POLYGON ((-52.44533 -32.21992, -52.44368 -32.2..."
2,SOYBEAN,2021/2022,Agrosatélite - Grãos Brasil 2021/2022,,RS,,start_2021-10-01_end_2022-06-01_monitoring_cla...,start_2021-10-01_end_2022-06-01_monitoring_cla...,13415_2,1,True,"POLYGON ((-52.45559 -32.21991, -52.45526 -32.2..."
3,SOYBEAN,2021/2022,Agrosatélite - Grãos Brasil 2021/2022,,RS,,start_2021-10-01_end_2022-06-01_monitoring_cla...,start_2021-10-01_end_2022-06-01_monitoring_cla...,13415_6,1,True,"POLYGON ((-52.46623 -32.22236, -52.46621 -32.2..."
4,SOYBEAN,2021/2022,Agrosatélite - Grãos Brasil 2021/2022,,RS,,start_2021-10-01_end_2022-06-01_monitoring_cla...,start_2021-10-01_end_2022-06-01_monitoring_cla...,13415_8,1,True,"POLYGON ((-52.45816 -32.22394, -52.45805 -32.2..."
...,...,...,...,...,...,...,...,...,...,...,...,...
38187,RICE,2019/2020,Conab - Arroz Irrigado 2019/2020,,RS,,start_2019-09-01_end_2020-06-01_monitoring_cla...,start_2019-09-01_end_2020-06-01_monitoring_cla...,52_0,0,True,"POLYGON ((-50.64755 -29.66027, -50.64755 -29.6..."
38188,RICE,2019/2020,Conab - Arroz Irrigado 2019/2020,,RS,,start_2019-09-01_end_2020-06-01_monitoring_cla...,start_2019-09-01_end_2020-06-01_monitoring_cla...,52_2,0,True,"POLYGON ((-50.64708 -29.66136, -50.64704 -29.6..."
38189,RICE,2019/2020,Conab - Arroz Irrigado 2019/2020,,RS,,start_2019-09-01_end_2020-06-01_monitoring_cla...,start_2019-09-01_end_2020-06-01_monitoring_cla...,80_2,0,True,"POLYGON ((-52.75066 -32.89058, -52.75065 -32.8..."
38190,RICE,2018/2019,Conab - Arroz Irrigado 2018/2019,,MS,,start_2018-08-01_end_2019-06-01_monitoring_cla...,start_2018-08-01_end_2019-06-01_monitoring_cla...,81_0,0,True,"POLYGON ((-56.70471 -20.11537, -56.70492 -20.1..."


# 5. Analysis culture per state

In [67]:
df[df['state'].isin(['BA', 'RS', 'MT'])].groupby('state').agg({'monitoring_class':'value_counts'})

Unnamed: 0_level_0,Unnamed: 1_level_0,monitoring_class
state,monitoring_class,Unnamed: 2_level_1
BA,SOYBEAN,561
BA,COTTON,383
BA,CORN,249
MT,SOYBEAN,4546
MT,COTTON,798
MT,CORN,365
RS,SOYBEAN,2239
RS,RICE,595
RS,CORN,504


In [68]:
df[df['state'].isin(['BA'])].to_file("/agrilearn_app/datasets/geopackages/crop-classification-v3-bahia", driver='GPKG', engine='fiona')
df[df['state'].isin(['RS'])].to_file("/agrilearn_app/datasets/geopackages/crop-classification-v3-rio-grande-do-sul", driver='GPKG', engine='fiona')
df[df['state'].isin(['MT'])].to_file("/agrilearn_app/datasets/geopackages/crop-classification-v3-mato-grosso", driver='GPKG', engine='fiona')