In [1]:
import boto3
import geopandas as gpd
import pandas as pd
import folium
import numpy as np
from folium import GeoJson
from io import BytesIO

In [2]:
import sys
# Add the agrilearn source directory to the module search path so that the
# `agrilearn` import below can be resolved.
sys.path.append("/agrilearn_app/agrilearn/")
# NOTE(review): s3_utils and str_utils are not referenced in the cells visible here — confirm they are still needed.
from agrilearn.utils import s3_utils, str_utils

### Global Variables

In [3]:
# Location of the GeoPackage (.gpkg) file that the notebook reads with gpd.read_file.
DATASET_PATH = "/agrilearn_app/datasets/v2/geopackage/crop_classification_raw-filtered-v2-with_cana.gpkg"

# 1. Read Datasets

In [4]:
# Read the file at DATASET_PATH into `df`, the frame every later cell works on.
df = gpd.read_file(DATASET_PATH)

In [5]:
# Number of rows assigned to each value of `dataset_part`.
split_labels = df['dataset_part']
split_labels.value_counts()

dataset_part
train    66407
test      3636
val       3425
Name: count, dtype: int64

In [6]:
# Number of rows per value of the target column `monitoring_class`.
crop_labels = df['monitoring_class']
crop_labels.value_counts()

monitoring_class
SUGAR_CANE    35276
SOYBEAN       29670
CORN           5710
COTTON         1639
RICE           1173
Name: count, dtype: int64

In [7]:
# Number of rows per value of `cultura_2`; value_counts skips missing values by default.
secondary_labels = df['cultura_2']
secondary_labels.value_counts()

cultura_2
cana_soca         30377
cana_reformada     4019
cana_expansao       880
Name: count, dtype: int64

In [8]:
# Report the overall size of the loaded frame.
n_rows, n_cols = df.shape
print(f"There are {n_rows} rows and {n_cols} columns")

There are 73468 rows and 41 columns


In [9]:
# Time span covered by the data: smallest `start_season` to largest `end_season`.
earliest_start = df['start_season'].min()
latest_end = df['end_season'].max()
print(f"Dados de {earliest_start} a {latest_end}")

Dados de 2017-09-01 a 2023-04-01


In [10]:
# Show every column name as a plain Python list.
[column_name for column_name in df.columns]

['monitoring_class',
 'period',
 'fonte',
 'cultura_2',
 'state',
 'area',
 'meso',
 'obs_extra',
 'eopath_location',
 'start_season',
 'end_season',
 'peak_start',
 'peak_end',
 'sentinel_eopatch_current',
 'contour_score',
 'contour_selected_timestamp',
 'field_id',
 'compac_index',
 'planting_start',
 'planting_end',
 'cycle_start',
 'cycle_end',
 'LOS',
 'is_valid',
 'is_valid_cvt',
 'start_of_season',
 'end_of_season',
 'peaks',
 'length_of_season',
 'key_bucket',
 'dataset_part',
 'local_eopatch_path',
 'total_imagens_do_intervalo',
 'total_de_imagem',
 'status_missing_images',
 'days_gap',
 'dates_diff',
 'mean_dates_diff',
 'status_series_missing',
 'monitoring_class_path',
 'geometry']

In [11]:
# Share of rows in each `dataset_part` value, scaled from a fraction to a percentage.
split_fraction = df['dataset_part'].value_counts(normalize=True)
split_fraction * 100

dataset_part
train    90.389013
test      4.949093
val       4.661894
Name: proportion, dtype: float64

In [12]:
# Print the frame summary: index range, per-column non-null counts and dtypes.
df.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 73468 entries, 0 to 73467
Data columns (total 41 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   monitoring_class            73468 non-null  object        
 1   period                      73468 non-null  object        
 2   fonte                       73468 non-null  object        
 3   cultura_2                   35276 non-null  object        
 4   state                       73468 non-null  object        
 5   area                        73468 non-null  float64       
 6   meso                        73468 non-null  int64         
 7   obs_extra                   73468 non-null  object        
 8   eopath_location             73468 non-null  object        
 9   start_season                73468 non-null  object        
 10  end_season                  73468 non-null  object        
 11  peak_start                  73468 non-null  ob

# 2. Data Analysis

### 2.1 Analysing Target y (monitoring_class)

In [13]:
# Side-by-side table of absolute counts and percentages per target class.
# The second column keeps the name `proportion` but holds values multiplied by 100.
class_counts = df['monitoring_class'].value_counts()
class_percent = df['monitoring_class'].value_counts(normalize=True) * 100
pd.concat([class_counts, class_percent], axis=1)

Unnamed: 0_level_0,count,proportion
monitoring_class,Unnamed: 1_level_1,Unnamed: 2_level_1
SUGAR_CANE,35276,48.015463
SOYBEAN,29670,40.384929
CORN,5710,7.772091
COTTON,1639,2.230903
RICE,1173,1.596613


### 2.2 Analysing the data source (`fonte`)

In [14]:
# Side-by-side table of absolute counts and percentages per data source.
# The second column keeps the name `proportion` but holds values multiplied by 100.
source_counts = df['fonte'].value_counts()
source_percent = df['fonte'].value_counts(normalize=True) * 100
pd.concat([source_counts, source_percent], axis=1)

Unnamed: 0_level_0,count,proportion
fonte,Unnamed: 1_level_1,Unnamed: 2_level_1
Agrosatélite - Grãos Brasil 2021/2022,19185,26.11341
Agrosatélite - Grãos Brasil 2020/2021,17834,24.274514
Agrosatélite - Canasat 2022/23,12061,16.416671
Agrosatélite - Canasat 2021/2022,11903,16.201612
Agrosatélite - Canasat 2023/2024,11312,15.39718
Conab - Arroz Irrigado 2019/2020,595,0.809876
Conab - Arroz Irrigado 2018/2019,341,0.464148
Conab - Arroz Irrigado 2017/2018,206,0.280394
Conab - Arroz Irrigado 2021/2022,31,0.042195


### 2.3 Analysing data per state

In [15]:
# Side-by-side table of absolute counts and percentages per state.
# The second column keeps the name `proportion` but holds values multiplied by 100.
state_counts = df['state'].value_counts()
state_percent = df['state'].value_counts(normalize=True) * 100
pd.concat([state_counts, state_percent], axis=1)

Unnamed: 0_level_0,count,proportion
state,Unnamed: 1_level_1,Unnamed: 2_level_1
SP,16526,22.494147
PR,9689,13.188055
MT,9504,12.936244
MG,9434,12.840965
GO,7729,10.520226
MS,7274,9.900909
RS,3338,4.543475
TO,2634,3.585234
MA,1781,2.424185
BA,1193,1.623836


### 2.4 Analysing `field_id` (duplicated field IDs exist)

In [16]:
# Number of distinct field identifiers; compare with the row count to spot repeats.
field_ids = df['field_id']
field_ids.nunique()

67829

In [17]:
# Rows whose (field_id, period) pair already appeared earlier in the frame.
# With the default keep='first', the first occurrence of each pair is NOT flagged.
is_repeated_pair = df.duplicated(subset=['field_id', 'period'])
df.loc[is_repeated_pair]

Unnamed: 0,monitoring_class,period,fonte,cultura_2,state,area,meso,obs_extra,eopath_location,start_season,...,local_eopatch_path,total_imagens_do_intervalo,total_de_imagem,status_missing_images,days_gap,dates_diff,mean_dates_diff,status_series_missing,monitoring_class_path,geometry
27054,SOYBEAN,2021/2022,Agrosatélite - Grãos Brasil 2021/2022,,RO,551422.0,3,,start_2021-10-01_end_2022-05-01_monitoring_cla...,2021-10-01,...,/agrilearn_app/datasets/eopatchs/processed/soy...,49,31,OK,,,5.17,OK,soybean,"POLYGON ((-62.38324 -10.12717, -62.38324 -10.1..."
27060,SOYBEAN,2021/2022,Agrosatélite - Grãos Brasil 2021/2022,,RO,3148591.0,4,,start_2021-10-01_end_2022-05-01_monitoring_cla...,2021-10-01,...,/agrilearn_app/datasets/eopatchs/processed/soy...,49,23,OK,,,5.23,OK,soybean,"POLYGON ((-62.01629 -11.33566, -62.01628 -11.3..."
27071,SOYBEAN,2021/2022,Agrosatélite - Grãos Brasil 2021/2022,,RO,2135605.0,7,,start_2021-10-01_end_2022-05-01_monitoring_cla...,2021-10-01,...,/agrilearn_app/datasets/eopatchs/processed/soy...,49,25,OK,,,5.00,OK,soybean,"POLYGON ((-60.58 -13.51871, -60.58 -13.51845, ..."
27077,SOYBEAN,2021/2022,Agrosatélite - Grãos Brasil 2021/2022,,PA,568770.0,31,,start_2021-10-01_end_2022-09-01_monitoring_cla...,2021-10-01,...,/agrilearn_app/datasets/eopatchs/processed/soy...,49,48,OK,,,2.51,OK,soybean,"POLYGON ((-54.4106 -2.77062, -54.41037 -2.7716..."
27095,SOYBEAN,2020/2021,Agrosatélite - Grãos Brasil 2020/2021,,PA,1404261.0,42,,start_2020-10-01_end_2021-09-01_monitoring_cla...,2020-10-01,...,/agrilearn_app/datasets/eopatchs/processed/soy...,49,26,OK,,,5.20,OK,soybean,"MULTIPOLYGON (((-48.91709 -3.4195, -48.91695 -..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73457,SUGAR_CANE,2020/2021,Agrosatélite - Canasat 2021/2022,cana_soca,GO,520114.0,558,,start_2020-04-01_end_2021-04-01_monitoring_cla...,2020-04-01,...,/agrilearn_app/datasets/eopatchs/processed/can...,,,,,,,,cana,"POLYGON ((-50.97791 -19.31829, -50.9795 -19.31..."
73458,SUGAR_CANE,2021/2022,Agrosatélite - Canasat 2022/23,cana_soca,GO,7609666.0,558,,start_2021-04-01_end_2022-04-01_monitoring_cla...,2021-04-01,...,/agrilearn_app/datasets/eopatchs/processed/can...,,,,,,,,cana,"POLYGON ((-50.95173 -19.29924, -50.95077 -19.2..."
73459,SUGAR_CANE,2021/2022,Agrosatélite - Canasat 2022/23,cana_soca,GO,7609666.0,558,,start_2021-04-01_end_2022-04-01_monitoring_cla...,2021-04-01,...,/agrilearn_app/datasets/eopatchs/processed/can...,,,,,,,,cana,"POLYGON ((-50.96747 -19.30071, -50.96747 -19.3..."
73464,SUGAR_CANE,2021/2022,Agrosatélite - Canasat 2022/23,cana_soca,GO,7609666.0,558,,start_2021-04-01_end_2022-04-01_monitoring_cla...,2021-04-01,...,/agrilearn_app/datasets/eopatchs/processed/can...,,,,,,,,cana,"POLYGON ((-50.96251 -19.31623, -50.96498 -19.3..."


In [18]:
# Distinct field ids that occur more than once within the same period.
is_repeated_pair = df.duplicated(subset=['field_id', 'period'])
df.loc[is_repeated_pair, 'field_id'].unique()

array(['9_2', '14_3', '23_1', ..., '559_3', '559_9', '559_12'],
      dtype=object)

In [19]:
#df[df['field_id'].isin(['0_6'])].to_csv('/agrilearn_app/datasets/cana-v1/csvs/duplicated_lines_CANA-sample-02.csv')

In [20]:
# Show `cultura_2` next to `monitoring_class` to compare the two label columns.
label_columns = ['cultura_2', 'monitoring_class']
df.loc[:, label_columns]

Unnamed: 0,cultura_2,monitoring_class
0,,SOYBEAN
1,,SOYBEAN
2,,SOYBEAN
3,,SOYBEAN
4,,SOYBEAN
...,...,...
73463,cana_soca,SUGAR_CANE
73464,cana_soca,SUGAR_CANE
73465,cana_soca,SUGAR_CANE
73466,cana_soca,SUGAR_CANE


# 3. Crop analysis per state

In [21]:
# Count the target classes inside every (state, dataset_part) group,
# restricted to a handful of states.
selected_states = ['BA', 'RS', 'MT', 'SP']
in_selected_states = df['state'].isin(selected_states)
(
    df[in_selected_states]
    .groupby(['state', 'dataset_part'])
    .agg({'monitoring_class': 'value_counts'})
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,monitoring_class
state,dataset_part,monitoring_class,Unnamed: 3_level_1
BA,test,SOYBEAN,38
BA,test,COTTON,15
BA,test,CORN,14
BA,train,SOYBEAN,489
BA,train,COTTON,338
BA,train,CORN,212
BA,val,SOYBEAN,34
BA,val,COTTON,30
BA,val,CORN,23
MT,test,SUGAR_CANE,198


In [22]:
# Export the rows of each selected state to its own GeoPackage file.
#
# Only 'SP' is exported by default, as before. To regenerate the other subsets
# (previously kept as commented-out copies of this line), add 'BA', 'RS' or
# 'MT' to STATES_TO_EXPORT.
#
# The state code is interpolated into the file name, so the name always matches
# the filter. The old commented-out BA export wrote to "...-BH.gpkg"; with this
# template the BA subset is written to "...-BA.gpkg".
STATES_TO_EXPORT = ['SP']
EXPORT_PATH_TEMPLATE = "/agrilearn_app/datasets/v2/geopackage/crop-classification-with-cana-{state}.gpkg"

for state_code in STATES_TO_EXPORT:
    df[df['state'].isin([state_code])].to_file(
        EXPORT_PATH_TEMPLATE.format(state=state_code),
        driver='GPKG',
        engine='fiona',
    )