In [None]:
# Load the autoreload extension so that modules edited on disk are
# automatically reloaded without the need to restart the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import os
import sys
from glob import glob
from io import BytesIO

import boto3
import folium
import geopandas as gpd
import numpy as np
import pandas as pd
from folium import GeoJson
from sklearn.metrics import balanced_accuracy_score, classification_report
from tqdm import tqdm

# Import local modules (the package directory must be on sys.path first)
sys.path.append("/agrilearn_app/agrilearn/")
from agrilearn.utils import s3_utils, str_utils, eopatch_utils

### Define PATHs

In [None]:
# Directories read by `read_multiplies_files_from_path`; judging by the names
# they hold the prediction files of the validation and test splits.
# NOTE(review): absolute, environment-specific paths — confirm they exist where this runs.
OUTPUT_PREDICTIONS_VAL = "/agrilearn_app/output/experiment_01/predictions/val/"
OUTPUT_PREDICTIONS_TEST = "/agrilearn_app/output/experiment_01/predictions/test/"

In [None]:
def read_multiplies_files_from_path(path, sep=','):
    """Read every file directly inside ``path`` as CSV and stack them into one DataFrame.

    Args:
        path: Directory whose entries are read.
        sep: Field delimiter forwarded to ``pandas.read_csv``.

    Returns:
        One DataFrame holding the rows of all files, concatenated in sorted
        path order and re-indexed from 0.

    Raises:
        ValueError: If ``path`` contains no regular files.
    """
    # Without ``recursive=True`` the "**" pattern only matches direct children,
    # and it matches sub-directories as well as files. Keep regular files only
    # so ``read_csv`` is never handed a directory, and sort so the row order is
    # reproducible across runs and file systems.
    files = sorted(
        entry for entry in glob(os.path.join(path, "**")) if os.path.isfile(entry)
    )
    print(f'There are {len(files)} files to read')

    if not files:
        # Fail with an actionable message instead of pandas' generic
        # "No objects to concatenate".
        raise ValueError(f"No files found in {path!r}")

    frames = [pd.read_csv(file_path, sep=sep)
              for file_path in tqdm(files, desc='Reading files:')]
    return pd.concat(frames, ignore_index=True)

In [None]:
# Load the test-split predictions: this frame feeds `report_test`, so it must
# come from the test directory rather than the validation one.
df_prediction_test = read_multiplies_files_from_path(OUTPUT_PREDICTIONS_TEST)

In [None]:
# Keep only the rows where both the RNN prediction and the monitoring label are present.
filter_ = (
    df_prediction_test['crop_class_rnn'].notna()
    & df_prediction_test['monitoring_class'].notna()
)
df_prediction_test = df_prediction_test.loc[filter_]

In [None]:
# Distinct `monitoring_class` values, ordered from most to least frequent.
df_prediction_test['monitoring_class'].value_counts().index

In [None]:
# Per-class metrics of the RNN predictions against the monitoring labels,
# returned as a nested dict (output_dict=True).
# `target_names` is intentionally not passed: scikit-learn pairs it positionally
# with the *sorted* set of labels, whereas `value_counts().index` is ordered by
# frequency, so it attached the metrics to the wrong class names (and raised
# whenever the predictions contained a class absent from the labels). Without
# it every entry is keyed by the actual label value.
report_test = classification_report(y_true=df_prediction_test['monitoring_class'],
                                    y_pred=df_prediction_test['crop_class_rnn'],
                                    output_dict=True
                                   )

In [None]:
# Show the report dict as a table: one column per class/aggregate entry, one row per metric.
pd.DataFrame(report_test)

## 1. Read Datasets

In [None]:
# Load the validation and test datasets with geopandas.
# NOTE(review): DATASET_PATH_VAL / DATASET_PATH_TEST are not assigned in any
# cell visible in this notebook, and `df_train` (used in the following cells)
# is never loaded here — confirm where they come from; a fresh kernel run
# would stop with a NameError.
df_val = gpd.read_file(DATASET_PATH_VAL)
df_test = gpd.read_file(DATASET_PATH_TEST)

In [None]:
# Tag every row with the split it belongs to, so the frames can be stacked
# and still be told apart afterwards.
for split_frame, split_name in ((df_train, 'train'), (df_val, 'val'), (df_test, 'test')):
    split_frame['dataset_part'] = split_name

In [None]:
# Stack the three splits into a single frame. `ignore_index` is not set, so
# each split keeps its own index labels and labels may repeat across splits.
df = pd.concat([df_train, df_val, df_test])

In [None]:
df.shape

In [None]:
df.info()

## 2. Data Analysis

In [None]:
# Print the period covered by the data: earliest `start_season` to latest
# `end_season` (the message reads "Data from <min> to <max>").
print(f"Dados de {df['start_season'].min()} a {df['end_season'].max()}")

In [None]:
# Rows where the RNN prediction or the monitoring label is missing.
# (The original, truncated note mentioned only 16 such rows — not re-verified here.)
df.loc[
    df['crop_class_rnn'].isna() | df['monitoring_class'].isna(),
    ['crop_class_rnn', 'monitoring_class', 'obs_crop_rnn', 'dataset_part'],
]

In [None]:
# Rebuild each split from the combined frame, discarding rows in which either
# the RNN prediction or the monitoring label is missing.
df_train = df[df['dataset_part'] == 'train'].dropna(subset=['crop_class_rnn', 'monitoring_class'])
df_val = df[df['dataset_part'] == 'val'].dropna(subset=['crop_class_rnn', 'monitoring_class'])
df_test = df[df['dataset_part'] == 'test'].dropna(subset=['crop_class_rnn', 'monitoring_class'])

In [None]:
df_train.shape

In [None]:
df_val.shape

In [None]:
# Test split restricted to rows that have both a prediction and a label.
df_test = df[df['dataset_part'] == 'test'].dropna(subset=['crop_class_rnn', 'monitoring_class'])

## 3. Evaluate classification report

In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import balanced_accuracy_score

### 3.1 Test Set evaluation

In [None]:
# Percentage of test rows assigned to each predicted class.
df_test['crop_class_rnn'].value_counts(normalize=True).mul(100)

In [None]:
# Text report of per-class precision/recall/F1 on the test split.
# `target_names` is intentionally not passed: scikit-learn pairs it positionally
# with the *sorted* set of labels, whereas `value_counts().index` is ordered by
# frequency, so it printed the metrics next to the wrong class names. Without
# it each row is named after the actual label value.
report_test = classification_report(df_test['monitoring_class'],
                                    df_test['crop_class_rnn'])

In [None]:
print(report_test)

In [None]:
# Balanced accuracy of the RNN predictions on the test split.
balanced_accuracy_score(
    y_true=df_test['monitoring_class'],
    y_pred=df_test['crop_class_rnn'],
)


### 3.2 Validation Set evaluation

In [None]:
# Percentage of validation rows assigned to each predicted class.
df_val['crop_class_rnn'].value_counts(normalize=True).mul(100)

In [None]:
# Text report of per-class precision/recall/F1 on the validation split.
# `target_names` is intentionally not passed: scikit-learn pairs it positionally
# with the *sorted* set of labels, whereas `value_counts().index` is ordered by
# frequency, so it printed the metrics next to the wrong class names. Without
# it each row is named after the actual label value.
report_val = classification_report(df_val['monitoring_class'],
                                   df_val['crop_class_rnn'])

In [None]:
print(report_val)

### 3.3 Train Set evaluation

In [None]:
# Percentage of training rows assigned to each predicted class.
df_train['crop_class_rnn'].value_counts(normalize=True).mul(100)

In [None]:
# Text report of per-class precision/recall/F1 on the training split.
# `target_names` is intentionally not passed: scikit-learn pairs it positionally
# with the *sorted* set of labels, whereas `value_counts().index` is ordered by
# frequency, so it printed the metrics next to the wrong class names. Without
# it each row is named after the actual label value.
report_train = classification_report(df_train['monitoring_class'],
                                     df_train['crop_class_rnn'])

In [None]:
print(report_train)

## 4. Evaluation errors

In [None]:
# Cast `total_de_imagem` to float so it can be averaged in the cells below.
# NOTE(review): presumably the number of images per sample — confirm.
df['total_de_imagem'] = df['total_de_imagem'].astype(float)

### 4.1 Day difference (gap)

In [None]:
# Mean `mean_dates_diff` over misclassified rows. Rows lacking a prediction or
# a label are dropped first: NaN compares unequal to everything, so they would
# otherwise be counted as prediction errors.
(
    df.dropna(subset=['crop_class_rnn', 'monitoring_class'])
      .loc[lambda d: d['monitoring_class'] != d['crop_class_rnn'], 'mean_dates_diff']
      .mean()
)

In [None]:
# Mean `mean_dates_diff` over the rows whose prediction matches the label.
df.loc[df['monitoring_class'] == df['crop_class_rnn'], 'mean_dates_diff'].mean()

### 4.2 Missing values

In [None]:
# Mean `total_de_imagem` over misclassified rows. Rows lacking a prediction or
# a label are dropped first: NaN compares unequal to everything, so they would
# otherwise be counted as prediction errors.
(
    df.dropna(subset=['crop_class_rnn', 'monitoring_class'])
      .loc[lambda d: d['monitoring_class'] != d['crop_class_rnn'], 'total_de_imagem']
      .mean()
)

In [None]:
# Mean `total_de_imagem` over the rows whose prediction matches the label.
df.loc[df['monitoring_class'] == df['crop_class_rnn'], 'total_de_imagem'].mean()

In [None]:
# Last 12 columns of the misclassified rows. Rows lacking a prediction or a
# label are dropped first: NaN compares unequal to everything, so they would
# otherwise show up here as prediction errors.
(
    df.dropna(subset=['crop_class_rnn', 'monitoring_class'])
      .loc[lambda d: d['monitoring_class'] != d['crop_class_rnn']]
      .iloc[:, -12:]
)

In [None]:
# Last 13 columns of the full combined frame (all rows, all splits).
df.iloc[:,-13:]