# Earth Observation Validation Report Notebook

This notebook reports on the progress of validations performed using the Teams option.
The second part of the notebook generates the overall aggregated validation database.

In [None]:
import os
import csv
import pandas as pd
import geopandas as gpd

from glob import glob
from pathlib import Path
from datetime import date
from tabulate import tabulate

## Validation Report

In [None]:
# specify the data_dir option used in the ValidationDashboard notebook;
# every entry in this directory (other than ".ipynb_checkpoints" and
# "original_points") is treated as one user's folder of *.gpkg files
data_dir = '/home/jovyan/efs/BinPeng_Colombia/validation'

In [None]:
# Build a per-file progress report for every user's validation GeoPackage,
# print it as a table, and save it to a dated CSV in the working directory.
report_list = []
fields = [
    "Number of Points",
    "Verified Points",
    "Percentage",
    "Seconds Per Point (Mean)",
    "Username-Filename",
]

# entries under data_dir that are not user folders
skipped_entries = {".ipynb_checkpoints", "original_points"}

# os.listdir / glob return entries in arbitrary order; sort so the report
# is reproducible from run to run
for username in sorted(os.listdir(data_dir)):

    if username in skipped_entries:
        continue

    filenames = sorted(glob(os.path.join(data_dir, username, '*.gpkg')))

    for filename in filenames:

        # a user may own several files, so the label carries both parts
        # (matches the "Username-Filename" column header)
        label = f"{username}-{Path(filename).stem}"

        try:
            gdf = gpd.read_file(filename)
            total_points = gdf.shape[0]
            verified_points = gdf['verified'].sum()
            # an empty layer has no progress to report; avoid dividing by zero
            if total_points:
                percentage = round((verified_points / total_points) * 100, 2)
            else:
                percentage = 0.0
            seconds_per_point = pd.to_numeric(gdf['seconds_taken'], errors='coerce').mean()
        except Exception as error:
            # best effort: one unreadable or malformed file must not stop the
            # report, but say why it was skipped and keep the row aligned
            # with the five report columns
            print(f"Could not summarize {filename}: {error}")
            report_list.append(["broken file", 0, 0.0, None, label])
        else:
            report_list.append(
                [
                    total_points,  # total points
                    verified_points,  # verified points
                    percentage,  # percentage done
                    seconds_per_point,  # seconds per point
                    label,
                ])

print(tabulate(report_list, headers=fields))

# newline='' lets the csv module control line endings (otherwise blank rows
# appear between records on platforms that translate '\n')
with open(f'validation-database-report-{date.today()}.csv', 'w', newline='') as f:

    write = csv.writer(f)
    write.writerow(fields)
    write.writerows(report_list)

## Validation Database Generation

In [None]:
# specify the database_filename option used to name the Validation Database, ends with .gpkg
# (the default embeds today's date; the aggregated database is written to this
# path with the GPKG driver, relative to the current working directory)
database_filename = f'validation-database-{date.today()}.gpkg'

In [None]:
# Aggregate every user's validation GeoPackage into one database: print a
# per-file progress report, then write all rows (tagged with their username
# and source file name) to the 'validation' layer of database_filename.
report_list = []
database_list = []
report_headers = [
    "Number of Points",
    "Verified Points",
    "Percentage",
    "Seconds Per Point (Mean)",
    "Username-Filename",
]

# entries under data_dir that are not user folders
skipped_entries = {".ipynb_checkpoints", "original_points"}

# os.listdir / glob return entries in arbitrary order; sort so the report
# and the aggregated row order are reproducible from run to run
for username in sorted(os.listdir(data_dir)):

    if username in skipped_entries:
        continue

    filenames = sorted(glob(os.path.join(data_dir, username, '*.gpkg')))

    for filename in filenames:

        short_filename = Path(filename).stem
        # a user may own several files, so the label carries both parts
        # (matches the "Username-Filename" column header)
        label = f"{username}-{short_filename}"

        try:
            gdf = gpd.read_file(filename)
            total_points = gdf.shape[0]
            verified_points = gdf['verified'].sum()
            # an empty layer has no progress to report; avoid dividing by zero
            if total_points:
                percentage = round((verified_points / total_points) * 100, 2)
            else:
                percentage = 0.0
            seconds_per_point = pd.to_numeric(gdf['seconds_taken'], errors='coerce').mean()
        except Exception as error:
            # best effort: one unreadable or malformed file must not stop the
            # aggregation, but say why it was skipped and keep the row aligned
            # with the five report columns
            print(f"Could not summarize {filename}: {error}")
            report_list.append(["broken file", 0, 0.0, None, label])
        else:
            report_list.append(
                [
                    total_points,  # total points
                    verified_points,  # verified points
                    percentage,  # percentage done
                    seconds_per_point,  # seconds per point
                    label,
                ])

            # record provenance so rows stay traceable after concatenation
            gdf['username'] = username
            gdf['short_filename'] = short_filename
            database_list.append(gdf)

print(tabulate(report_list, headers=report_headers))

# pd.concat raises an unhelpful "No objects to concatenate" on an empty list;
# fail with the same exception type but say what actually went wrong
if not database_list:
    raise ValueError(
        f"No readable .gpkg files found under {data_dir}; nothing to aggregate.")

# ignore_index gives the combined table one continuous index instead of
# repeating each file's own 0..n-1 index
full_database = pd.concat(database_list, ignore_index=True)
full_database.to_file(database_filename, driver='GPKG', layer='validation')
full_database.head()