In [7]:
# Load the auto reload extension to automatically reload modules when files on disk are updated
%load_ext autoreload
# it will automatically be reloaded without the need to restart the kernel.
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Imports

In [8]:
import boto3
import geopandas as gpd
from io import BytesIO
import pandas as pd
from folium import GeoJson
import folium
import numpy as np
from tqdm import tqdm
from glob import glob
#import local modules
import sys
sys.path.append("/agrilearn_app/agrilearn/")
from agrilearn.utils import s3_utils, str_utils, eopatch_utils

### Global Variables

In [9]:
DATASET_PATH_LOCAL = "/agrilearn_app/datasets/v1/geopackage/cana_fields_filtered.gpkg"

# 1. Read Geopackage from Local Disck

## 1.1 Open Dataset

In [10]:
df = gpd.read_file(DATASET_PATH_LOCAL)

In [11]:
df.head()

Unnamed: 0,monitoring_class,period,fonte,cultura_2,state,area,meso,obs_extra,eopath_location,start_season,end_season,peak_start,peak_end,sentinel_eopatch_current,contour_score,contour_selected_timestamp,field_id,compac_index,dataset_part,geometry
0,SUGAR_CANE,2020/2021,Agrosatélite - Canasat 2021/2022,cana_soca,TO,3603383.0,58,,start_2020-04-01_end_2021-04-01_monitoring_cla...,2020-04-01,2021-04-01,2020-04-01,2021-04-01,start_2020-04-01_end_2021-04-01_monitoring_cla...,1.0,2020-04-27,0_2,1.433508,train,"POLYGON ((-48.34483 -8.91348, -48.34412 -8.913..."
1,SUGAR_CANE,2020/2021,Agrosatélite - Canasat 2021/2022,cana_soca,TO,3603383.0,58,,start_2020-04-01_end_2021-04-01_monitoring_cla...,2020-04-01,2021-04-01,2020-04-01,2021-04-01,start_2020-04-01_end_2021-04-01_monitoring_cla...,1.0,2020-04-27,0_3,1.705125,train,"POLYGON ((-48.35083 -8.91933, -48.35008 -8.917..."
2,SUGAR_CANE,2020/2021,Agrosatélite - Canasat 2021/2022,cana_soca,TO,3603383.0,58,,start_2020-04-01_end_2021-04-01_monitoring_cla...,2020-04-01,2021-04-01,2020-04-01,2021-04-01,start_2020-04-01_end_2021-04-01_monitoring_cla...,1.0,2020-04-27,0_5,1.501758,train,"POLYGON ((-48.343 -8.91709, -48.34291 -8.91718..."
3,SUGAR_CANE,2020/2021,Agrosatélite - Canasat 2021/2022,cana_soca,TO,3603383.0,58,,start_2020-04-01_end_2021-04-01_monitoring_cla...,2020-04-01,2021-04-01,2020-04-01,2021-04-01,start_2020-04-01_end_2021-04-01_monitoring_cla...,1.0,2020-04-27,0_6,1.493624,train,"POLYGON ((-48.34644 -8.9189, -48.34626 -8.9190..."
4,SUGAR_CANE,2020/2021,Agrosatélite - Canasat 2021/2022,cana_soca,TO,3603383.0,58,,start_2020-04-01_end_2021-04-01_monitoring_cla...,2020-04-01,2021-04-01,2020-04-01,2021-04-01,start_2020-04-01_end_2021-04-01_monitoring_cla...,1.0,2020-04-27,0_7,1.746982,train,"POLYGON ((-48.34119 -8.91971, -48.34119 -8.919..."


In [12]:
df['dataset_part'].value_counts(normalize=True)*100

dataset_part
train    91.362399
test      4.510149
val       4.127452
Name: proportion, dtype: float64

In [13]:
report_geopackage = df.groupby(['monitoring_class', 'dataset_part'], as_index=False)\
                            .agg(count=('state','count'))\
                            .sort_values(['monitoring_class', 'count'], ascending=[False, False]).reset_index(drop=True)
report_geopackage

Unnamed: 0,monitoring_class,dataset_part,count
0,SUGAR_CANE,train,32229
1,SUGAR_CANE,test,1591
2,SUGAR_CANE,val,1456


# 2. Read eopatch files from Local disk

## 2.1 Open files

In [14]:
OUTPUT_DATASETS_PATH_LOCAL ="/agrilearn_app/datasets/eopatchs/processed/cana/**"
df_eopatch_files = eopatch_utils.get_local_path_from_eopatch(OUTPUT_DATASETS_PATH_LOCAL,
                                                             new_label='path')

In [15]:
df_eopatch_files.head()

Unnamed: 0,path
0,/agrilearn_app/datasets/eopatchs/processed/can...
1,/agrilearn_app/datasets/eopatchs/processed/can...
2,/agrilearn_app/datasets/eopatchs/processed/can...
3,/agrilearn_app/datasets/eopatchs/processed/can...
4,/agrilearn_app/datasets/eopatchs/processed/can...


## 2.2 Create new Features

In [16]:
df_eopatch_files['dataset_part'] = str_utils.get_series_from_string_list(df_eopatch_files['path'], categories=['train', 'val', 'test'])
df_eopatch_files['monitoring_class'] = str_utils.get_series_from_string_list(df_eopatch_files['path'], categories=['cana'])

In [17]:
df_eopatch_files['dataset_part'].value_counts()

dataset_part
train    32229
test      1591
val       1456
Name: count, dtype: int64

## 2.3 Report Generation

In [18]:
report_eopatch = df_eopatch_files.groupby(['monitoring_class', 'dataset_part'], as_index=False)\
                                        .agg(count=('path','count'))\
                                        .sort_values(['monitoring_class', 'count'], ascending=[False, False]).reset_index(drop=True)
report_eopatch

Unnamed: 0,monitoring_class,dataset_part,count
0,cana,train,32229
1,cana,test,1591
2,cana,val,1456


# 3. Compare files between Geopackage and Eopatch

In [19]:
pd.concat([report_eopatch, report_geopackage])

Unnamed: 0,monitoring_class,dataset_part,count
0,cana,train,32229
1,cana,test,1591
2,cana,val,1456
0,SUGAR_CANE,train,32229
1,SUGAR_CANE,test,1591
2,SUGAR_CANE,val,1456


In [20]:
print('Se True então os reports são iguais e todas as imagens foram baixadas')

Se True então os reports são iguais e todas as imagens foram baixadas


In [21]:
report_eopatch.equals(report_geopackage)

False