In [3]:
import boto3
import geopandas as gpd
from io import BytesIO
import pandas as pd
from folium import GeoJson
import folium
import numpy as np

In [4]:
import sys
sys.path.append("/agrilearn_app/agrilearn/")
from agrilearn.utils import s3_utils, str_utils

### Global Variables

In [5]:
BUCKET_NAME = "brain-alternative-data-scientists-development"
OUTPUT_DATASETS_PATH_LOCAL ="/agrilearn_app/datasets/eopatchs/processed/**"
DATASET_PATH_LOCAL = "/agrilearn_app/datasets/crop_classification_raw.gpkg"

# 1. Read Datasets

In [6]:
df = gpd.read_file(DATASET_PATH_LOCAL)

In [7]:
df.head()

Unnamed: 0,monitoring_class,period,fonte,cultura_2,state,area,meso,obs_extra,eopath_location,start_season,...,cycle_end,LOS,is_valid,is_valid_cvt,start_of_season,end_of_season,peaks,length_of_season,key_bucket,geometry
0,SOYBEAN,2021/2022,Agrosatélite - Grãos Brasil 2021/2022,,RS,10269314.0,505,,start_2021-10-01_end_2022-06-01_monitoring_cla...,2021-10-01,...,2022-05-12,160,1,True,2021-12-14,2022-05-03,2022-02-26T00:00:00,140,datasets/culture/culture_v02/soybean_train_v2....,"POLYGON ((-52.44675 -32.21676, -52.44679 -32.2..."
1,SOYBEAN,2021/2022,Agrosatélite - Grãos Brasil 2021/2022,,RS,10269314.0,505,,start_2021-10-01_end_2022-06-01_monitoring_cla...,2021-10-01,...,2022-04-17,145,1,True,2021-12-10,2022-05-03,2022-02-26T00:00:00,144,datasets/culture/culture_v02/soybean_train_v2....,"POLYGON ((-52.44533 -32.21992, -52.44368 -32.2..."
2,SOYBEAN,2021/2022,Agrosatélite - Grãos Brasil 2021/2022,,RS,10269314.0,505,,start_2021-10-01_end_2022-06-01_monitoring_cla...,2021-10-01,...,2022-04-22,140,1,True,2021-12-12,2022-05-03,2022-02-26T00:00:00,142,datasets/culture/culture_v02/soybean_train_v2....,"POLYGON ((-52.45559 -32.21991, -52.45526 -32.2..."
3,SOYBEAN,2021/2022,Agrosatélite - Grãos Brasil 2021/2022,,RS,10269314.0,505,,start_2021-10-01_end_2022-06-01_monitoring_cla...,2021-10-01,...,2022-05-02,150,1,True,2021-12-11,2022-04-15,2022-02-26T00:00:00,125,datasets/culture/culture_v02/soybean_train_v2....,"POLYGON ((-52.46623 -32.22236, -52.46621 -32.2..."
4,SOYBEAN,2021/2022,Agrosatélite - Grãos Brasil 2021/2022,,RS,10269314.0,505,,start_2021-10-01_end_2022-06-01_monitoring_cla...,2021-10-01,...,2022-04-17,135,1,True,2021-12-16,2022-04-14,2022-02-26T00:00:00,119,datasets/culture/culture_v02/soybean_train_v2....,"POLYGON ((-52.45816 -32.22394, -52.45805 -32.2..."


In [8]:
print(f"There are {df.shape[0]} rows and {df.shape[1]} columns")

There are 38192 rows and 31 columns


In [9]:
df['start_season']

0        2021-10-01
1        2021-10-01
2        2021-10-01
3        2021-10-01
4        2021-10-01
            ...    
38187    2019-09-01
38188    2019-09-01
38189    2019-09-01
38190    2018-08-01
38191    2018-10-01
Name: start_season, Length: 38192, dtype: object

# 2. Data Understading

### 2.1 Data description
- **(target) monitoring_class:** é a cultura de interesse dp cliente ou da amostra de treinamento (SOYBEAN, CORN)
- period:  a safra agricola (2023/2024, 2024/2025)
- fonte: a origem dos dados (mapas temáticos, banco de dados)
- cultura_2: é para ser mais detalhado sobre o monitoring_class. Se o monitoring_class é SUGAR_CANE, cultura_2 pode ser CANA SOCA ou CANA PLANTA
- state: é o estado do polígono
- area: é o tamanho do polígono (em metros quadrados)
- meso: é a mesoregião ou microregião do IBGE. Acho que o título está meso, mas estamos usando a micro
- obs_extra:  é quando precisa add alguma informação importante
- eopath_location: é caminho onde o eopatch está salvo (imagens p/ inferência)
- start_season: é a data que inicia a safra pra determinada cultura (monitoring_class) e safra (period)
- end_season: é a data que termina a safra pra determinada cultura (monitoring_class) e safra (period)
- peak_start: é a data que inicia o período em que pode ocorrer o pico pra determinada cultura (monitoring_class) e safra (period)
- peak_end: é a data que termina o período em que pode ocorrer o pico pra determinada cultura (monitoring_class) e safra (period) (Obs: temos um calendário pra isso, um arquivo yaml)
- sentinel_eopatch_current -> nome único do eopatch
- contour_score -> score do modelo q talhona
- contour_selected_timestamp -> acho q é a data do timestamp q foi selecionado p/ talhonar
- field_id: id do talhão
- compac_index:    
- planting_start: é a data que inicia o período em que pode ocorrer o  plantio pra determinada cultura (monitoring_class) e safra (period)
- planting_end: é a data que termina o período em que pode ocorrer o plantio pra determinada cultura (monitoring_class) e safra (period)
- cycle_start: informações sobre o inicio do ciclo de cultivo
- cycle_end: informações sobre o fim do ciclo de cultivo
- LOS (length_of_season): Calcula o ciclo (em dias) da cultura agrícola)
- is_valid:
- is_valid_cvt:
- start_of_season: data do inio do cultivo segundo o calendário agrícola
- end_of_season: data do fim do caledário de cuiltivo segundo calendário agrícola
- peaks: A data do NVDI máximo
- length_of_season: Duração do cultivo
- geometry: é a geometria do polígono

# 3. Data Processing

### 3.1 Check the Data formatation

In [10]:
df.isna().sum()

monitoring_class                  0
period                            0
fonte                             0
cultura_2                     38192
state                             0
area                              0
meso                              0
obs_extra                         0
eopath_location                   0
start_season                      0
end_season                        0
peak_start                        0
peak_end                          0
sentinel_eopatch_current          0
contour_score                  1174
contour_selected_timestamp        0
field_id                          0
compac_index                      0
planting_start                    0
planting_end                      0
cycle_start                       0
cycle_end                         0
LOS                               0
is_valid                          0
is_valid_cvt                      0
start_of_season                   0
end_of_season                     0
peaks                       

### 3.2 Deletar toda a coluna cultura_2 não é NaN, portanto, será deletada.

In [11]:
df.drop(['cultura_2'], axis=1, inplace=True)

### 3.3 Existem dados nulo na coluna `contour_score`

Por que exis

In [12]:
df['contour_score'].describe()

count    37018.000000
mean         0.993785
std          0.010461
min          0.537894
25%          0.991081
50%          0.997696
75%          1.000000
max          1.000000
Name: contour_score, dtype: float64

In [13]:
df[df['contour_score'].isna()].to_csv('contour_score_NaN.csv')

In [14]:
shape_before_drop = df.shape[0]
df.dropna(subset=['contour_score'], inplace=True)

In [15]:
print(f"Total de dados removidos: {shape_before_drop-df.shape[0]:.2f}")
print(f"Percentual de dados removidos: {shape_before_drop/df.shape[0]:.2f}%")
print(f"Dados remanescentes após a remoção de nulos: {df.shape[0]}")

Total de dados removidos: 1174.00
Percentual de dados removidos: 1.03%
Dados remanescentes após a remoção de nulos: 37018


### 3.4 Data Formatation
#### 3.4.1 Datetime variables

In [16]:
df[['start_season', 
    'end_season', 
    'peak_start', 
    'peak_end', 
    'planting_start', 
    'planting_end',
    'contour_selected_timestamp',
    'peaks'
   ]].info()

<class 'pandas.core.frame.DataFrame'>
Index: 37018 entries, 0 to 38191
Data columns (total 8 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   start_season                37018 non-null  object
 1   end_season                  37018 non-null  object
 2   peak_start                  37018 non-null  object
 3   peak_end                    37018 non-null  object
 4   planting_start              37018 non-null  object
 5   planting_end                37018 non-null  object
 6   contour_selected_timestamp  37018 non-null  object
 7   peaks                       37018 non-null  object
dtypes: object(8)
memory usage: 2.5+ MB


In [17]:
df['start_season'] = pd.to_datetime(df['start_season'])
df['end_season'] = pd.to_datetime(df['end_season'])
df['peak_start'] = pd.to_datetime(df['peak_start'])
df['peak_end'] = pd.to_datetime(df['peak_end'])
df['planting_start'] = pd.to_datetime(df['planting_start'])
df['planting_end'] = pd.to_datetime(df['planting_end'])
df['contour_selected_timestamp'] = pd.to_datetime(df['contour_selected_timestamp'])
df['peaks'] = pd.to_datetime(df['peaks'])

In [18]:
df[['start_season', 
    'end_season', 
    'peak_start', 
    'peak_end', 
    'planting_start', 
    'planting_end',
    'contour_selected_timestamp']].info()

<class 'pandas.core.frame.DataFrame'>
Index: 37018 entries, 0 to 38191
Data columns (total 7 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   start_season                37018 non-null  datetime64[ns]
 1   end_season                  37018 non-null  datetime64[ns]
 2   peak_start                  37018 non-null  datetime64[ns]
 3   peak_end                    37018 non-null  datetime64[ns]
 4   planting_start              37018 non-null  datetime64[ns]
 5   planting_end                37018 non-null  datetime64[ns]
 6   contour_selected_timestamp  37018 non-null  datetime64[ns]
dtypes: datetime64[ns](7)
memory usage: 2.3 MB


In [19]:
df.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 37018 entries, 0 to 38191
Data columns (total 30 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   monitoring_class            37018 non-null  object        
 1   period                      37018 non-null  object        
 2   fonte                       37018 non-null  object        
 3   state                       37018 non-null  object        
 4   area                        37018 non-null  float64       
 5   meso                        37018 non-null  int64         
 6   obs_extra                   37018 non-null  object        
 7   eopath_location             37018 non-null  object        
 8   start_season                37018 non-null  datetime64[ns]
 9   end_season                  37018 non-null  datetime64[ns]
 10  peak_start                  37018 non-null  datetime64[ns]
 11  peak_end                    37018 non-null  datetim

#### 3.4.2 Integer variables

In [20]:
df['LOS'] = df['LOS'].astype(int)
df['length_of_season'] = df['length_of_season'].astype(int)

# 4. Data Analysis

### 4.1 Analysing Target y (monitoring_class)

In [21]:
pd.concat([df['monitoring_class'].value_counts(), df['monitoring_class'].value_counts(normalize=True)*100], axis=1)

Unnamed: 0_level_0,count,proportion
monitoring_class,Unnamed: 1_level_1,Unnamed: 2_level_1
SOYBEAN,28845,77.921552
CORN,5484,14.814415
COTTON,1595,4.308715
RICE,1094,2.955319


### 4.2 OneHotEncoding

In [22]:
df_dummy = pd.get_dummies(df, columns=['monitoring_class'], drop_first=True)

df.info()

### 4.3 Analysing fonte de dados

In [23]:
pd.concat([df['fonte'].value_counts(), df['fonte'].value_counts(normalize=True)*100], axis=1)

Unnamed: 0_level_0,count,proportion
fonte,Unnamed: 1_level_1,Unnamed: 2_level_1
Agrosatélite - Grãos Brasil 2021/2022,18577,50.183694
Agrosatélite - Grãos Brasil 2020/2021,17347,46.860987
Conab - Arroz Irrigado 2019/2020,549,1.483062
Conab - Arroz Irrigado 2018/2019,325,0.877951
Conab - Arroz Irrigado 2017/2018,192,0.518667
Conab - Arroz Irrigado 2021/2022,28,0.075639


### 4.4 Analysing dados por estado

In [110]:
pd.concat([df['state'].value_counts(), df['state'].value_counts(normalize=True)*100], axis=1)

Unnamed: 0_level_0,count,proportion
state,Unnamed: 1_level_1,Unnamed: 2_level_1
MT,5709,14.948157
SP,5079,13.298597
PR,4911,12.858714
MG,4380,11.46837
RS,3338,8.74005
GO,3322,8.698157
MS,2542,6.655844
MA,1781,4.66328
TO,1571,4.113427
BA,1193,3.123691


### 4.5 Analysing Meso

In [116]:
df['meso'].describe()

count    38192.000000
mean       368.849628
std        171.460486
min          0.000000
25%        270.000000
50%        421.000000
75%        517.000000
max        559.000000
Name: meso, dtype: float64

In [127]:
df['field_id'].nunique()

36234

### 4.6 Analysis field_id (id do talhão)

In [130]:
df['field_id'].nunique()

36234

In [133]:
df[df.duplicated('field_id')]

Unnamed: 0,monitoring_class,period,fonte,cultura_2,state,area,meso,obs_extra,eopath_location,start_season,...,cycle_start,cycle_end,LOS,is_valid,is_valid_cvt,start_of_season,end_of_season,peaks,length_of_season,geometry
27045,SOYBEAN,2021/2022,Agrosatélite - Grãos Brasil 2021/2022,,RO,3778727.0,0,,start_2021-10-01_end_2022-05-01_monitoring_cla...,2021-10-01,...,2021-10-22,2022-03-16,145,1,True,2021-11-06,2022-03-02,2022-01-15T00:00:00,116,"POLYGON ((-63.66987 -9.14331, -63.66985 -9.143..."
27054,SOYBEAN,2021/2022,Agrosatélite - Grãos Brasil 2021/2022,,RO,551422.0,3,,start_2021-10-01_end_2022-05-01_monitoring_cla...,2021-10-01,...,2021-12-16,2022-04-10,115,1,True,2021-11-20,2022-03-24,2022-01-15T00:00:00,124,"POLYGON ((-62.38324 -10.12717, -62.38324 -10.1..."
27056,SOYBEAN,2020/2021,Agrosatélite - Grãos Brasil 2020/2021,,RO,1562629.0,4,,start_2020-10-01_end_2021-05-01_monitoring_cla...,2020-10-01,...,2020-11-21,2021-03-31,130,1,True,2020-12-12,2021-03-19,2021-01-25T00:00:00,97,"POLYGON ((-62.70898 -11.63535, -62.70806 -11.6..."
27058,SOYBEAN,2021/2022,Agrosatélite - Grãos Brasil 2021/2022,,RO,1562629.0,4,,start_2021-10-01_end_2022-05-01_monitoring_cla...,2021-10-01,...,2021-10-22,2022-02-24,125,1,True,2021-10-22,2022-03-25,2022-01-15T00:00:00,154,"POLYGON ((-62.70898 -11.63535, -62.70797 -11.6..."
27060,SOYBEAN,2021/2022,Agrosatélite - Grãos Brasil 2021/2022,,RO,3148591.0,4,,start_2021-10-01_end_2022-05-01_monitoring_cla...,2021-10-01,...,2021-11-11,2022-03-03,112,1,True,2021-11-01,2022-02-23,2022-01-22T00:00:00,114,"POLYGON ((-62.01629 -11.33566, -62.01628 -11.3..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38185,RICE,2019/2020,Conab - Arroz Irrigado 2019/2020,,RS,119645.0,490,,start_2019-09-01_end_2020-06-01_monitoring_cla...,2019-09-01,...,2019-09-30,2020-06-01,245,0,True,2019-10-30,2020-04-11,2020-02-02T00:00:00,164,"POLYGON ((-52.63034 -29.73281, -52.63034 -29.7..."
38186,RICE,2019/2020,Conab - Arroz Irrigado 2019/2020,,RS,148128.0,491,,start_2019-09-01_end_2020-06-01_monitoring_cla...,2019-09-01,...,2019-09-30,2020-06-01,245,0,True,2019-11-10,2020-04-06,2020-02-02T00:00:00,148,"POLYGON ((-51.97064 -29.61299, -51.9706 -29.61..."
38187,RICE,2019/2020,Conab - Arroz Irrigado 2019/2020,,RS,681005.0,494,,start_2019-09-01_end_2020-06-01_monitoring_cla...,2019-09-01,...,2019-09-27,2020-05-29,245,0,True,2019-10-25,2020-03-26,2020-01-25T00:00:00,153,"POLYGON ((-50.64755 -29.66027, -50.64755 -29.6..."
38188,RICE,2019/2020,Conab - Arroz Irrigado 2019/2020,,RS,681005.0,494,,start_2019-09-01_end_2020-06-01_monitoring_cla...,2019-09-01,...,2019-10-07,2020-05-19,225,0,True,2019-10-16,2020-03-12,2020-01-05T00:00:00,148,"POLYGON ((-50.64708 -29.66136, -50.64704 -29.6..."


In [137]:
df[df.duplicated('field_id')]['field_id'].unique()

array(['1_2', '9_2', '12_1', ..., '52_0', '52_2', '80_2'], dtype=object)

In [138]:
df[df['field_id'].isin(['1_2'])]

Unnamed: 0,monitoring_class,period,fonte,cultura_2,state,area,meso,obs_extra,eopath_location,start_season,...,cycle_start,cycle_end,LOS,is_valid,is_valid_cvt,start_of_season,end_of_season,peaks,length_of_season,geometry
12921,SOYBEAN,2020/2021,Agrosatélite - Grãos Brasil 2020/2021,,RO,874435.0,0,,start_2020-10-01_end_2021-05-01_monitoring_cla...,2020-10-01,...,2020-10-22,2021-03-11,140,1,True,2020-10-06,2021-02-26,2020-12-16T00:00:00,143,"POLYGON ((-63.54999 -9.28001, -63.5498 -9.2799..."
27045,SOYBEAN,2021/2022,Agrosatélite - Grãos Brasil 2021/2022,,RO,3778727.0,0,,start_2021-10-01_end_2022-05-01_monitoring_cla...,2021-10-01,...,2021-10-22,2022-03-16,145,1,True,2021-11-06,2022-03-02,2022-01-15T00:00:00,116,"POLYGON ((-63.66987 -9.14331, -63.66985 -9.143..."
28308,SOYBEAN,2020/2021,Agrosatélite - Grãos Brasil 2020/2021,,RO,5311119.0,0,,start_2020-10-01_end_2021-05-01_monitoring_cla...,2020-10-01,...,2020-10-12,2021-02-24,135,1,True,2020-10-05,2021-02-06,2020-12-16T00:00:00,124,"POLYGON ((-63.2211 -9.20751, -63.22096 -9.2075..."
35923,COTTON,2020/2021,Agrosatélite - Grãos Brasil 2020/2021,,TO,10649118.0,62,,start_2020-12-01_end_2021-09-01_monitoring_cla...,2020-12-01,...,2020-12-05,2021-09-01,270,0,True,2021-01-14,2021-07-08,2021-04-09T00:00:00,175,"POLYGON ((-45.79706 -10.30369, -45.79706 -10.3..."


### 4.7 Analysis compac_index

In [140]:
df['compac_index'].unique()

array([2.55932559, 2.03040427, 1.49842405, ..., 1.90308879, 1.45277458,
       1.7436045 ])

In [141]:
df['compac_index'].describe()

count    38192.000000
mean         2.137347
std          0.788510
min          1.001286
25%          1.569456
50%          1.890289
75%          2.471879
max          6.731624
Name: compac_index, dtype: float64

### 4.8 Analysis LOS (ciclo em dias da cultura agrícola

In [165]:
df['LOS'].describe(percentiles=np.arange(0.1, 1.1, 0.1))

count    38192.000000
mean       141.802262
std         33.268121
min         90.000000
10%        110.000000
20%        120.000000
30%        125.000000
40%        130.000000
50%        135.000000
60%        140.000000
70%        150.000000
80%        160.000000
90%        180.000000
100%       330.000000
max        330.000000
Name: LOS, dtype: float64

### 4.9 Analysis ``is_valid`` and ``is_valid_cvt``

In [171]:
pd.concat([df['is_valid'].value_counts(), df['is_valid'].value_counts(normalize=True)*100], axis=1)

Unnamed: 0_level_0,count,proportion
is_valid,Unnamed: 1_level_1,Unnamed: 2_level_1
1,32640,85.462924
0,5552,14.537076


In [172]:
pd.concat([df['is_valid_cvt'].value_counts(), df['is_valid_cvt'].value_counts(normalize=True)*100], axis=1)

Unnamed: 0_level_0,count,proportion
is_valid_cvt,Unnamed: 1_level_1,Unnamed: 2_level_1
True,36684,96.051529
False,1508,3.948471


### 4.10 Analysis ``LOS`` and ``length_of_season``

In [187]:
df[['LOS', 'length_of_season']]

Unnamed: 0,LOS,length_of_season
0,160,140
1,145,144
2,140,142
3,150,125
4,135,119
...,...,...
38187,245,153
38188,225,148
38189,205,136
38190,200,123


## 5. Generating Html Report

In [193]:
from ydata_profiling import ProfileReport
from ydata_profiling.config import Settings
%matplotlib inline

In [231]:
df_copy = df.drop(columns=['cultura_2', 'geometry'])  # Supondo que 'cultura_2' seja a coluna problemática

In [237]:
df_e_object = df_copy.select_dtypes(exclude=['object'])
df_e_object

Unnamed: 0,area,meso,start_season,end_season,peak_start,peak_end,contour_score,contour_selected_timestamp,compac_index,planting_start,planting_end,cycle_start,cycle_end,LOS,is_valid_cvt,start_of_season,end_of_season,peaks,length_of_season
0,10269314.0,505,2021-10-01,2022-06-01,2021-12-01,2022-03-01,0.995002,2022-03-03,2.559326,2021-10-01,2022-02-01,2021-12-03,2022-05-12,160,True,2021-12-14,2022-05-03,2022-02-26,140
1,10269314.0,505,2021-10-01,2022-06-01,2021-12-01,2022-03-01,0.995002,2022-03-03,2.030404,2021-10-01,2022-02-01,2021-11-23,2022-04-17,145,True,2021-12-10,2022-05-03,2022-02-26,144
2,10269314.0,505,2021-10-01,2022-06-01,2021-12-01,2022-03-01,0.995002,2022-03-03,1.498424,2021-10-01,2022-02-01,2021-12-03,2022-04-22,140,True,2021-12-12,2022-05-03,2022-02-26,142
3,10269314.0,505,2021-10-01,2022-06-01,2021-12-01,2022-03-01,0.995002,2022-03-03,1.929409,2021-10-01,2022-02-01,2021-12-03,2022-05-02,150,True,2021-12-11,2022-04-15,2022-02-26,125
4,10269314.0,505,2021-10-01,2022-06-01,2021-12-01,2022-03-01,0.995002,2022-03-03,1.476064,2021-10-01,2022-02-01,2021-12-03,2022-04-17,135,True,2021-12-16,2022-04-14,2022-02-26,119
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38187,681005.0,494,2019-09-01,2020-06-01,2019-11-01,2020-03-01,0.990099,2020-02-04,2.202441,2019-09-01,2020-01-01,2019-09-27,2020-05-29,245,True,2019-10-25,2020-03-26,2020-01-25,153
38188,681005.0,494,2019-09-01,2020-06-01,2019-11-01,2020-03-01,0.990099,2020-02-04,1.593055,2019-09-01,2020-01-01,2019-10-07,2020-05-19,225,True,2019-10-16,2020-03-12,2020-01-05,148
38189,139290.0,505,2019-09-01,2020-06-01,2019-11-01,2020-03-01,,2020-01-18,1.903089,2019-09-01,2020-01-01,2019-11-04,2020-05-27,205,True,2019-11-11,2020-03-26,2020-01-18,136
38190,176041.0,509,2018-08-01,2019-05-01,2018-10-01,2019-02-01,0.989781,2019-01-22,1.452775,2018-08-01,2019-01-01,2018-08-20,2019-03-08,200,True,2018-10-29,2019-03-01,2019-01-22,123


In [238]:
df_i_object = df_copy.select_dtypes(include=['object'])
df_i_object

Unnamed: 0,monitoring_class,period,fonte,state,obs_extra,eopath_location,sentinel_eopatch_current,field_id,is_valid
0,SOYBEAN,2021/2022,Agrosatélite - Grãos Brasil 2021/2022,RS,,start_2021-10-01_end_2022-06-01_monitoring_cla...,start_2021-10-01_end_2022-06-01_monitoring_cla...,13415_0,1
1,SOYBEAN,2021/2022,Agrosatélite - Grãos Brasil 2021/2022,RS,,start_2021-10-01_end_2022-06-01_monitoring_cla...,start_2021-10-01_end_2022-06-01_monitoring_cla...,13415_1,1
2,SOYBEAN,2021/2022,Agrosatélite - Grãos Brasil 2021/2022,RS,,start_2021-10-01_end_2022-06-01_monitoring_cla...,start_2021-10-01_end_2022-06-01_monitoring_cla...,13415_2,1
3,SOYBEAN,2021/2022,Agrosatélite - Grãos Brasil 2021/2022,RS,,start_2021-10-01_end_2022-06-01_monitoring_cla...,start_2021-10-01_end_2022-06-01_monitoring_cla...,13415_6,1
4,SOYBEAN,2021/2022,Agrosatélite - Grãos Brasil 2021/2022,RS,,start_2021-10-01_end_2022-06-01_monitoring_cla...,start_2021-10-01_end_2022-06-01_monitoring_cla...,13415_8,1
...,...,...,...,...,...,...,...,...,...
38187,RICE,2019/2020,Conab - Arroz Irrigado 2019/2020,RS,,start_2019-09-01_end_2020-06-01_monitoring_cla...,start_2019-09-01_end_2020-06-01_monitoring_cla...,52_0,0
38188,RICE,2019/2020,Conab - Arroz Irrigado 2019/2020,RS,,start_2019-09-01_end_2020-06-01_monitoring_cla...,start_2019-09-01_end_2020-06-01_monitoring_cla...,52_2,0
38189,RICE,2019/2020,Conab - Arroz Irrigado 2019/2020,RS,,start_2019-09-01_end_2020-06-01_monitoring_cla...,start_2019-09-01_end_2020-06-01_monitoring_cla...,80_2,0
38190,RICE,2018/2019,Conab - Arroz Irrigado 2018/2019,MS,,start_2018-08-01_end_2019-06-01_monitoring_cla...,start_2018-08-01_end_2019-06-01_monitoring_cla...,81_0,0


In [240]:
df_e_object_colums = df_copy.select_dtypes(exclude=['object']).columns

In [248]:
title = "data_report_07_10_2024_full_copy_v3"
config = Settings()
config.vars.cat.words = False  # Desativa a nuvem de palavras
report  = ProfileReport(df=df_copy[list(df_e_object_colums) + ['monitoring_class', 'state', 'period', 'fonte']], 
                        title=title,
                        minimal=False,
                        config=config)

report.to_file(f'{title}.html')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename(columns={"index": "df_index"}, inplace=True)


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [206]:
df_number = df.select_dtypes(exclude=['float','int', 'datetime'])
df_number

Unnamed: 0,monitoring_class,period,fonte,cultura_2,state,obs_extra,eopath_location,sentinel_eopatch_current,field_id,is_valid,is_valid_cvt,geometry
0,SOYBEAN,2021/2022,Agrosatélite - Grãos Brasil 2021/2022,,RS,,start_2021-10-01_end_2022-06-01_monitoring_cla...,start_2021-10-01_end_2022-06-01_monitoring_cla...,13415_0,1,True,"POLYGON ((-52.44675 -32.21676, -52.44679 -32.2..."
1,SOYBEAN,2021/2022,Agrosatélite - Grãos Brasil 2021/2022,,RS,,start_2021-10-01_end_2022-06-01_monitoring_cla...,start_2021-10-01_end_2022-06-01_monitoring_cla...,13415_1,1,True,"POLYGON ((-52.44533 -32.21992, -52.44368 -32.2..."
2,SOYBEAN,2021/2022,Agrosatélite - Grãos Brasil 2021/2022,,RS,,start_2021-10-01_end_2022-06-01_monitoring_cla...,start_2021-10-01_end_2022-06-01_monitoring_cla...,13415_2,1,True,"POLYGON ((-52.45559 -32.21991, -52.45526 -32.2..."
3,SOYBEAN,2021/2022,Agrosatélite - Grãos Brasil 2021/2022,,RS,,start_2021-10-01_end_2022-06-01_monitoring_cla...,start_2021-10-01_end_2022-06-01_monitoring_cla...,13415_6,1,True,"POLYGON ((-52.46623 -32.22236, -52.46621 -32.2..."
4,SOYBEAN,2021/2022,Agrosatélite - Grãos Brasil 2021/2022,,RS,,start_2021-10-01_end_2022-06-01_monitoring_cla...,start_2021-10-01_end_2022-06-01_monitoring_cla...,13415_8,1,True,"POLYGON ((-52.45816 -32.22394, -52.45805 -32.2..."
...,...,...,...,...,...,...,...,...,...,...,...,...
38187,RICE,2019/2020,Conab - Arroz Irrigado 2019/2020,,RS,,start_2019-09-01_end_2020-06-01_monitoring_cla...,start_2019-09-01_end_2020-06-01_monitoring_cla...,52_0,0,True,"POLYGON ((-50.64755 -29.66027, -50.64755 -29.6..."
38188,RICE,2019/2020,Conab - Arroz Irrigado 2019/2020,,RS,,start_2019-09-01_end_2020-06-01_monitoring_cla...,start_2019-09-01_end_2020-06-01_monitoring_cla...,52_2,0,True,"POLYGON ((-50.64708 -29.66136, -50.64704 -29.6..."
38189,RICE,2019/2020,Conab - Arroz Irrigado 2019/2020,,RS,,start_2019-09-01_end_2020-06-01_monitoring_cla...,start_2019-09-01_end_2020-06-01_monitoring_cla...,80_2,0,True,"POLYGON ((-52.75066 -32.89058, -52.75065 -32.8..."
38190,RICE,2018/2019,Conab - Arroz Irrigado 2018/2019,,MS,,start_2018-08-01_end_2019-06-01_monitoring_cla...,start_2018-08-01_end_2019-06-01_monitoring_cla...,81_0,0,True,"POLYGON ((-56.70471 -20.11537, -56.70492 -20.1..."


## Considerações da análise

<img src="alert-image.png">

### Dúvidas sobre os dados
- a) Por que existem dados nulos na coluna `contour_score`?
- b) O que seria a meso? Essa informação é importante para o modelo?
- c) Pode existir o mesmo ID de talhão para monitoring_class diferentes?
- d) O que é o ``compac_index``?
- e) O que é a coluna ``ìs_valid``?
- f) Qual a diferença dos atributos ``LOS`` and	``length_of_season``