In [2]:
# Load the auto reload extension to automatically reload modules when files on disk are updated
%load_ext autoreload
# it will automatically be reloaded without the need to restart the kernel.
%autoreload 2

### Imports

In [3]:
import boto3
import geopandas as gpd
from io import BytesIO
import pandas as pd
from folium import GeoJson
import folium
import numpy as np
from tqdm import tqdm
from glob import glob
#import local modules
import sys
sys.path.append("/agrilearn_app/agrilearn/")
from agrilearn.utils import s3_utils, str_utils, eopatch_utils

### Global Variables

In [4]:
DATASET_PATH = "/agrilearn_app/datasets/v2/geopackage/crop_classification_raw-v2.gpkg"

# 1. Read Datasets

In [5]:
df = gpd.read_file(DATASET_PATH)

In [6]:
df.head()

Unnamed: 0,monitoring_class,period,fonte,cultura_2,state,area,meso,obs_extra,eopath_location,start_season,...,cycle_end,LOS,is_valid,is_valid_cvt,start_of_season,end_of_season,peaks,length_of_season,key_bucket,geometry
0,SOYBEAN,2021/2022,Agrosatélite - Grãos Brasil 2021/2022,,RS,10269314.0,505,,start_2021-10-01_end_2022-06-01_monitoring_cla...,2021-10-01,...,2022-05-12,160,1,True,2021-12-14,2022-05-03,2022-02-26T00:00:00,140,datasets/culture/culture_v02/soybean_train_v2....,"POLYGON ((-52.44675 -32.21676, -52.44679 -32.2..."
1,SOYBEAN,2021/2022,Agrosatélite - Grãos Brasil 2021/2022,,RS,10269314.0,505,,start_2021-10-01_end_2022-06-01_monitoring_cla...,2021-10-01,...,2022-04-17,145,1,True,2021-12-10,2022-05-03,2022-02-26T00:00:00,144,datasets/culture/culture_v02/soybean_train_v2....,"POLYGON ((-52.44533 -32.21992, -52.44368 -32.2..."
2,SOYBEAN,2021/2022,Agrosatélite - Grãos Brasil 2021/2022,,RS,10269314.0,505,,start_2021-10-01_end_2022-06-01_monitoring_cla...,2021-10-01,...,2022-04-22,140,1,True,2021-12-12,2022-05-03,2022-02-26T00:00:00,142,datasets/culture/culture_v02/soybean_train_v2....,"POLYGON ((-52.45559 -32.21991, -52.45526 -32.2..."
3,SOYBEAN,2021/2022,Agrosatélite - Grãos Brasil 2021/2022,,RS,10269314.0,505,,start_2021-10-01_end_2022-06-01_monitoring_cla...,2021-10-01,...,2022-05-02,150,1,True,2021-12-11,2022-04-15,2022-02-26T00:00:00,125,datasets/culture/culture_v02/soybean_train_v2....,"POLYGON ((-52.46623 -32.22236, -52.46621 -32.2..."
4,SOYBEAN,2021/2022,Agrosatélite - Grãos Brasil 2021/2022,,RS,10269314.0,505,,start_2021-10-01_end_2022-06-01_monitoring_cla...,2021-10-01,...,2022-04-17,135,1,True,2021-12-16,2022-04-14,2022-02-26T00:00:00,119,datasets/culture/culture_v02/soybean_train_v2....,"POLYGON ((-52.45816 -32.22394, -52.45805 -32.2..."


In [7]:
df['dataset_part'] = str_utils.get_series_from_string_list(df['key_bucket'], categories=['train', 'val', 'test'])

In [8]:
df['dataset_part'].value_counts(normalize=True)*100

dataset_part
train    89.489946
test      5.354525
val       5.155530
Name: proportion, dtype: float64

In [9]:
report_train_val_test = df.groupby(['monitoring_class', 'dataset_part'], as_index=False)\
                            .agg({'length_of_season':'count'})\
                            .sort_values(['monitoring_class', 'length_of_season'], ascending=[False, False])
report_train_val_test

Unnamed: 0,monitoring_class,dataset_part,length_of_season
10,SOYBEAN,train,27043
9,SOYBEAN,test,1367
11,SOYBEAN,val,1260
7,RICE,train,1042
8,RICE,val,69
6,RICE,test,62
4,COTTON,train,1204
5,COTTON,val,219
3,COTTON,test,216
1,CORN,train,4889


## 2. Check Eopatches from Local Path

In [10]:
EOPATCH_LOCAL_PATH="/agrilearn_app/datasets/eopatchs/processed/**"
eopatches_path = [f for f in glob(EOPATCH_LOCAL_PATH, recursive=True) if f.endswith('/eopatch_0_col-0_row-0')]

In [11]:
df['local_eopatch_path'] = eopatches_path

ValueError: Length of values (118845) does not match length of index (38192)

In [None]:
df['local_eopatch_path'].str.split('/start_').str[0].value_counts()

## 2.1 Check total of missing images based on frequence time

In [12]:
dic_missing_images = check_eopatch.get_number_of_missing_images(EOPATCH_IDS=df['local_eopatch_path'].values, freq='5D')

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 38192/38192 [10:10<00:00, 62.57it/s]


In [13]:
df_missing_images = pd.DataFrame(dic_missing_images).T.reset_index(names='path')

In [15]:
df_missing_images['status_missing_images'].value_counts()

status_missing_images
OK    38192
Name: count, dtype: int64

In [16]:
df_missing_images.to_csv('/agrilearn_app/datasets/csvs/get_number_of_missing_images_based_freq5D.csv', 
                 sep=';', 
                 float_format='%.2f',
                 index=False)

In [17]:
df_missing_images.head()

Unnamed: 0,path,total_imagens_do_intervalo,total_de_imagem,status_missing_images
0,/agrilearn_app/datasets/eopatchs/processed/COR...,61,34,OK
1,/agrilearn_app/datasets/eopatchs/processed/COR...,61,31,OK
2,/agrilearn_app/datasets/eopatchs/processed/COR...,67,28,OK
3,/agrilearn_app/datasets/eopatchs/processed/COR...,67,41,OK
4,/agrilearn_app/datasets/eopatchs/processed/COR...,67,40,OK


In [18]:
df['total_imagens_do_intervalo'] = df_missing_images['total_imagens_do_intervalo']
df['total_de_imagem'] = df_missing_images['total_de_imagem']
df['status_missing_images'] = df_missing_images['status_missing_images']

## 2.2 Check time series missing

In [20]:
dic_get_time_series_missing = check_eopatch.get_time_series_missing(EOPATCH_IDS=df['local_eopatch_path'].values)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 38192/38192 [02:22<00:00, 267.52it/s]


In [21]:
df_time_series_missing = pd.DataFrame(dic_get_time_series_missing).T.reset_index(names='path')
df_time_series_missing['mean_dates_diff'] = df_time_series_missing['mean_dates_diff'].astype("float")

In [22]:
df_time_series_missing['status_series_missing'].value_counts()

status_series_missing
OK    38192
Name: count, dtype: int64

In [23]:
df_time_series_missing.to_csv('/agrilearn_app/datasets/csvs/get_time_series_missing.csv', 
                      sep=';', 
                      index=False,
                      float_format='%.2f')

In [24]:
df['days_gap'] = df_time_series_missing['days_gap']
df['dates_diff'] = df_time_series_missing['dates_diff']
df['mean_dates_diff'] = df_time_series_missing['mean_dates_diff'] 
df['status_series_missing'] = df_time_series_missing['status_series_missing']

## 3. concat files and Save new GPKG

In [25]:
OUTPUT_FILE='/agrilearn_app/datasets/crop_classification_raw-checked.gpkg'
df.to_file(OUTPUT_FILE, driver='GPKG', engine='fiona')

In [26]:
df

Unnamed: 0,monitoring_class,period,fonte,cultura_2,state,area,meso,obs_extra,eopath_location,start_season,...,geometry,dataset_part,local_eopatch_path,total_imagens_do_intervalo,total_de_imagem,status_missing_images,days_gap,dates_diff,mean_dates_diff,status_series_missing
0,SOYBEAN,2021/2022,Agrosatélite - Grãos Brasil 2021/2022,,RS,10269314.0,505,,start_2021-10-01_end_2022-06-01_monitoring_cla...,2021-10-01,...,"POLYGON ((-52.44675 -32.21676, -52.44679 -32.2...",train,/agrilearn_app/datasets/eopatchs/processed/COR...,61,34,OK,"[2020-10-21 00:00:00, 2020-10-26 00:00:00, 202...","[5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 10, 5, 5,...",5.15,OK
1,SOYBEAN,2021/2022,Agrosatélite - Grãos Brasil 2021/2022,,RS,10269314.0,505,,start_2021-10-01_end_2022-06-01_monitoring_cla...,2021-10-01,...,"POLYGON ((-52.44533 -32.21992, -52.44368 -32.2...",train,/agrilearn_app/datasets/eopatchs/processed/COR...,61,31,OK,"[2020-10-24 00:00:00, 2020-11-08 00:00:00, 202...","[15, 5, 10, 5, 10, 5, 5, 5, 5, 5, 5, 5, 5, 5, ...",5.67,OK
2,SOYBEAN,2021/2022,Agrosatélite - Grãos Brasil 2021/2022,,RS,10269314.0,505,,start_2021-10-01_end_2022-06-01_monitoring_cla...,2021-10-01,...,"POLYGON ((-52.45559 -32.21991, -52.45526 -32.2...",train,/agrilearn_app/datasets/eopatchs/processed/COR...,67,28,OK,"[2020-10-28 00:00:00, 2020-11-02 00:00:00, 202...","[5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, ...",5.00,OK
3,SOYBEAN,2021/2022,Agrosatélite - Grãos Brasil 2021/2022,,RS,10269314.0,505,,start_2021-10-01_end_2022-06-01_monitoring_cla...,2021-10-01,...,"POLYGON ((-52.46623 -32.22236, -52.46621 -32.2...",train,/agrilearn_app/datasets/eopatchs/processed/COR...,67,41,OK,"[2020-12-07 00:00:00, 2020-12-12 00:00:00, 202...","[5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 10, 5,...",5.12,OK
4,SOYBEAN,2021/2022,Agrosatélite - Grãos Brasil 2021/2022,,RS,10269314.0,505,,start_2021-10-01_end_2022-06-01_monitoring_cla...,2021-10-01,...,"POLYGON ((-52.45816 -32.22394, -52.45805 -32.2...",train,/agrilearn_app/datasets/eopatchs/processed/COR...,67,40,OK,"[2020-10-13 00:00:00, 2020-10-18 00:00:00, 202...","[5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, ...",5.13,OK
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38187,RICE,2019/2020,Conab - Arroz Irrigado 2019/2020,,RS,681005.0,494,,start_2019-09-01_end_2020-06-01_monitoring_cla...,2019-09-01,...,"POLYGON ((-50.64755 -29.66027, -50.64755 -29.6...",test,/agrilearn_app/datasets/eopatchs/processed/SOY...,43,21,OK,"[2022-05-08 00:00:00, 2022-05-13 00:00:00, 202...","[5, 10, 5, 10, 5, 5, 5, 5, 5, 5, 5, 5, 5, 10, ...",5.75,OK
38188,RICE,2019/2020,Conab - Arroz Irrigado 2019/2020,,RS,681005.0,494,,start_2019-09-01_end_2020-06-01_monitoring_cla...,2019-09-01,...,"POLYGON ((-50.64708 -29.66136, -50.64704 -29.6...",test,/agrilearn_app/datasets/eopatchs/processed/SOY...,43,30,OK,"[2022-04-05 00:00:00, 2022-04-10 00:00:00, 202...","[5, 10, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,...",5.17,OK
38189,RICE,2019/2020,Conab - Arroz Irrigado 2019/2020,,RS,139290.0,505,,start_2019-09-01_end_2020-06-01_monitoring_cla...,2019-09-01,...,"POLYGON ((-52.75066 -32.89058, -52.75065 -32.8...",test,/agrilearn_app/datasets/eopatchs/processed/SOY...,43,28,OK,"[2022-05-10 00:00:00, 2022-05-15 00:00:00, 202...","[5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, ...",5.00,OK
38190,RICE,2018/2019,Conab - Arroz Irrigado 2018/2019,,MS,176041.0,509,,start_2018-08-01_end_2019-06-01_monitoring_cla...,2018-08-01,...,"POLYGON ((-56.70471 -20.11537, -56.70492 -20.1...",test,/agrilearn_app/datasets/eopatchs/processed/SOY...,43,37,OK,"[2022-04-30 00:00:00, 2022-05-05 00:00:00, 202...","[5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, ...",5.00,OK
