# Maestria en Explotacion de datos y Descubrimiento de conocimiento
### Sistemas de información geografica
### Trabajo Practico N°2

# Clasificación por pixel

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append('../lib')

In [3]:
import plot
import sql
import pandas as pd
import numpy  as np
from sklearn.model_selection import train_test_split

## Constantes globales

In [4]:
RASTERS_PATH = "../datasets/images"
DATA_PATH    = "../datasets/data"
RESULT_PATH  = "../results"

In [5]:
DATES = [
    '2020-10-01', 
    '2020-11-01', 
    '2020-12-01',
    '2021-01-01', 
    '2021-02-20', 
    '2021-03-17'
]
RASTER_FILES = [
    '0000000000-0000000000', 
    '0000000000-0000012544'
]

# VERDAD_CAMPO = 'verdad_campo'
VERDAD_CAMPO = 'verdad_campo_aumentada_7'

JOIN_MAPS = False

## Preparar verddad de campo aumentada

In [6]:
!cp ../verdad_campo_augmented.tar.gz {DATA_PATH}
!cd {DATA_PATH}; tar zxfv verdad_campo_augmented.tar.gz

verdad_campo_aumentada_2.cpg
verdad_campo_aumentada_2.dbf
verdad_campo_aumentada_2.gpkg
verdad_campo_aumentada_2.prj
verdad_campo_aumentada_2.qmd
verdad_campo_aumentada_2.shp
verdad_campo_aumentada_2.shx
verdad_campo_aumentada_3.cpg
verdad_campo_aumentada_3.dbf
verdad_campo_aumentada_3.prj
verdad_campo_aumentada_3.qmd
verdad_campo_aumentada_3.shp
verdad_campo_aumentada_3.shx
verdad_campo_aumentada_4.cpg
verdad_campo_aumentada_4.dbf
verdad_campo_aumentada_4.prj
verdad_campo_aumentada_4.qmd
verdad_campo_aumentada_4.shp
verdad_campo_aumentada_4.shx
verdad_campo_aumentada_5.cpg
verdad_campo_aumentada_5.dbf
verdad_campo_aumentada_5.prj
verdad_campo_aumentada_5.qmd
verdad_campo_aumentada_5.shp
verdad_campo_aumentada_5.shx
verdad_campo_aumentada_6.cpg
verdad_campo_aumentada_6.dbf
verdad_campo_aumentada_6.prj
verdad_campo_aumentada_6.qmd
verdad_campo_aumentada_6.shp
verdad_campo_aumentada_6.shx
verdad_campo_aumentada_7.cpg
verdad_campo_aumentada_7.dbf
verdad_campo_aumentada_7.prj
verdad_campo_

## Funciones helper

In [7]:
flatten = lambda list: np.ndarray.flatten(np.array(list))
raster_path      = lambda path: f'{RASTERS_PATH}/{path}.tif'
raster_date_path = lambda date, file: raster_path(f'{date}/{file}')

data_path        = lambda file: f'{DATA_PATH}/{file}.shp'
result_path      = lambda file: f'{RESULT_PATH}/{file}'


def print_title(title): print(f'\n\n{title}...\n')

def class_statistics(
    raster_paths,
    labels_file  = VERDAD_CAMPO, 
    label_column = 'id', 
    out_file      = 'class_statistics.xml',
    verbose      = 0
):
    
    in_paths = " ".join(raster_paths)
    vec_path = data_path(labels_file)
    out_path = result_path(out_file)
    
    if verbose > 0:
        print_title('Generate class statistics')
        print(f'- In Paths: {in_paths}')
        print(f'- Vec Path: {vec_path}')
        print(f'- Field...: {label_column}')
        print(f'- Out Path: {out_path}\n\n')
    
    !time otbcli_PolygonClassStatistics -in {in_paths} -vec {vec_path} -field {label_column} -out {out_path} > /dev/null
    !head {out_path}
    return out_path


def merge_rasters(source_paths, target_path, verbose = 1, plot=0):
    print(" ".join(source_paths))
    
    !time gdal_merge.py -o {target_path} -of gtiff {" ".join(source_paths)}

    if verbose > 0:
        print_title('Merge Rasters')
        for idx, path in enumerate(source_paths):
            print(f'Source path {idx+1}: {path}')
            if plot > 0:
                plot.plot_raster(path)

        print(f'Target path:\n  - {target_path}\n\n')
        if plot > 0:
            plot.plot_raster(target_path)
        

def layer_info(file):
    !ogrinfo -so {data_path(file)}


def layer_table_info(file, table):
    !ogrinfo -so {data_path(file)} {table}


def layer_query(file, query):
    !ogrinfo -dialect sqlite -sql "{query}" {data_path(file)}

Sampling de observaciones:

In [8]:
def sampling(
    raster_paths,
    class_stat_path,
    out_rates_path, 
    out_sql_path,
    labels_file      = VERDAD_CAMPO,
    label_column     = 'id',
    strategy         = 'smallest',
    strategy_percent = 50,
    verbose          = 0
):
    raster_paths = " ".join(raster_paths)

    if verbose > 0:
        print_title('Sampling')
        print(f'- In Paths......: {raster_paths}')
        print(f'- Vec Path......: {data_path(labels_file)}')
        print(f'- Field.........: {label_column}')
        print(f'- Instats Path..: {class_stat_path}')
        print(f'- Strategy......: {strategy}')
        print(f'- Out Rates Path: {result_path(out_rates_path)}')
        print(f'- Out SQL Path..: {result_path(out_sql_path)}\n\n')

    !time otbcli_SampleSelection \
        -in       {raster_paths} \
        -vec      {data_path(labels_file)} \
        -instats  {class_stat_path} \
        -field    {label_column} \
        -strategy {strategy} \
        -strategy.percent.p {strategy_percent} \
        -outrates {result_path(out_rates_path)} \
        -out      {result_path(out_sql_path)}

def sample_extraction(
    raster_paths,
    out_sql_file,
    label_column = 'id',
    verbose      = 0
):
    if verbose > 0:
        print_title('Sample extraction')
        print(f'- In Paths......: {" ".join(raster_paths)}')
        print(f'- Vec SQL Path..: {result_path(out_sql_file)}')
        print(f'- Field.........: {label_column}\n\n')

    !time otbcli_SampleExtraction \
        -in                   {" ".join(raster_paths)} \
        -vec                  {result_path(out_sql_file)} \
        -field                {label_column} \
        -outfield             prefix \
        -outfield.prefix.name band_

Funciones de visualización:

Funciones para la clasificación:

In [9]:
def compute_raster_statistics(
    raster_paths,
    stat_file,
    verbose = 0
):
    output_path = result_path(stat_file)

    if verbose > 0:
        print_title('Compute rasters statistics')
        print(f'- Raster Paths: {" ".join(raster_paths)}')
        print(f'- Stats Path : {output_path}\n\n')

    !time otbcli_ComputeImagesStatistics -il {" ".join(raster_paths)} -out.xml {output_path}

    return output_path

In [10]:
def train_clasifier(
    sql_file,
    stat_file,
    features         = ['band_0', 'band_1', 'band_2', 'band_3', 'band_4', 'band_5', 'band_6'],
    label_column     = 'id',
    out_model_file   = 'dt_model.txt',
    out_cm_file      = 'dt_cm_model.csv',

    clasifier_config = {
        'classifier': 'dt',
        'classifier.dt.max': '10'
    },
    verbose          = 0
):
    features_param = " ".join(features)

    clasifier_params = ''
    for k, v in clasifier_config.items():
        clasifier_params += f' -{k} {v}' 
        
    if verbose > 0:
        print_title(f'Training {clasifier_config["classifier"]} classifier')
        print(f'- SQL Path.............: {result_path(sql_file)}')
        print(f'- Stats Path...........: {result_path(stat_file)}')
        print(f'- Target...............: {label_column}')
        print(f'- Features.............: {features}')
        print(f'- Model Path...........: {result_path(out_model_file)}')
        print(f'- Confusion Matrix Path: {result_path(out_cm_file)}')
        print(f'- Clasifier config.....: {clasifier_params}\n\n')

    !time otbcli_TrainVectorClassifier \
        -io.vd             {result_path(sql_file)} \
        -io.stats          {result_path(stat_file)} \
        -feat              {features_param} \
        -io.out            {result_path(out_model_file)} \
        -io.confmatout     {result_path(out_cm_file)} \
        -cfield            {label_column} \
        {clasifier_params}

In [11]:
def band_math(
    raster_paths,
    name           = 'ndvi',
    formula        = '(im1b7-im1b3)/(im1b7+im1b3)',
    extension      = '.tif', 
    verbose        = 0
):
    out_path = lambda path: f'{path.split(extension)[0]}_{name}.tif'
    
    if verbose > 0:
        print_title(f'Calculate "{name} = {formula}" formula')

    outputs = []
    for in_raster_path in raster_paths:
        out_raster_path = out_path(in_raster_path)

        if verbose > 0:
            print(f'\n\n- In: {in_raster_path}\n- Out: {out_raster_path}')

        !time otbcli_BandMath \
            -il  {in_raster_path} \
            -out {out_raster_path} \
            -exp "{formula}"
        outputs.append(out_raster_path)
    return outputs

In [12]:
ALL_VEGETATION_INDEXES = [
    'Vegetation:NDVI',
    'Vegetation:TNDVI', 
    'Vegetation:RVI',
    'Vegetation:SAVI',
    'Vegetation:TSAVI', 
    'Vegetation:MSAVI', 
    'Vegetation:MSAVI2',
    'Vegetation:IPVI',
    'Vegetation:LAIFromNDVILog',
    'Vegetation:LAIFromReflLinear',
    'Vegetation:LAIFromNDVIFormo'
]

def compute_index(
    raster_paths,
    indexes     = ALL_VEGETATION_INDEXES,
    out_postfix = 'indexes',
    blue_band   = 1,
    green_band  = 1,
    red_band    = 1,
    nir_band    = 1,
    mir_band    = 1,
    extension   = '.tif', 
    verbose     = 0
):  
    out_path = lambda path: f'{path.split(extension)[0]}_{out_postfix}.tif'

    if verbose > 0:
        print_title(f'Calculate indexes: "{indexes}"')

    for in_raster_path in raster_paths:
        out_raster_path = out_path(in_raster_path)

        if verbose > 0:
            print(f'\n\n- In: {in_raster_path}\n- Out: {out_raster_path}')

        !time otbcli_RadiometricIndices      \
            -channels.blue  {blue_band}      \
            -channels.green {green_band}     \
            -channels.red   {red_band}       \
            -channels.nir   {nir_band}       \
            -channels.mir   {mir_band}       \
            -in             {in_raster_path} \
            -list           {" ".join(indexes)}        \
            -out            {out_raster_path}

In [13]:
def join_rasters(raster_paths, out_file, verbose = 0):
    if verbose > 0:
        print_title(f'Join rasters')
        print(f'- Input Rasters: {" ".join(raster_paths)}')
        print(f'- Output Raster: {" ".join(out_file)}')

    !time otbcli_ConcatenateImages \
        -il {' '.join(raster_paths)} \
        -out {out_file}

## Analisis

In [15]:
!mkdir -p {RESULT_PATH}

Listamos los archivos de datos:

In [15]:
!ls -la {DATA_PATH}/*shp 

-rw-rw-r-- 1 adrian adrian  79252 abr 24  2021 ../datasets/data/departamentos.shp
-rw-r--r-- 1 adrian adrian 331892 jul  3 12:35 ../datasets/data/verdad_campo_aumentada_2.shp
-rw-r--r-- 1 adrian adrian 182772 jul  3 12:38 ../datasets/data/verdad_campo_aumentada_3.shp
-rw-r--r-- 1 adrian adrian 182772 jul  3 12:40 ../datasets/data/verdad_campo_aumentada_4.shp
-rw-r--r-- 1 adrian adrian 182772 jul  3 12:46 ../datasets/data/verdad_campo_aumentada_5.shp
-rw-r--r-- 1 adrian adrian 182772 jul  3 15:15 ../datasets/data/verdad_campo_aumentada_6.shp
-rw-r--r-- 1 adrian adrian 182772 jul  3 15:16 ../datasets/data/verdad_campo_aumentada_7.shp
-rw-r--r-- 1 adrian adrian 182772 jul  2 12:20 ../datasets/data/verdad_campo_aumentada.shp
-rw-rw-r-- 1 adrian adrian  13148 abr 24  2021 ../datasets/data/verdad_campo.shp


Tenemos dos archivos, departamentes de buenos aires y la verdad de campo o poligonos labels. estos poligonos representan un label y cubro una parte de la superficie de las imagenes donde se encuentra esa misma clase.

Veamos que tablas contienen:

In [16]:
layer_info('departamentos')

INFO: Open of `../datasets/data/departamentos.shp'
      using driver `ESRI Shapefile' successful.
1: departamentos (3D Polygon)


In [17]:
layer_info(VERDAD_CAMPO)

INFO: Open of `../datasets/data/verdad_campo_aumentada_7.shp'
      using driver `ESRI Shapefile' successful.
1: verdad_campo_aumentada_7 (Polygon)


In [18]:
layer_table_info(VERDAD_CAMPO, VERDAD_CAMPO)

INFO: Open of `../datasets/data/verdad_campo_aumentada_7.shp'
      using driver `ESRI Shapefile' successful.

Layer name: verdad_campo_aumentada_7
Metadata:
  DBF_DATE_LAST_UPDATE=2022-07-03
Geometry: Polygon
Feature Count: 466
Extent: (-65.072536, -35.287469) - (-62.299875, -33.833209)
Layer SRS WKT:
GEOGCS["WGS 84",
    DATUM["WGS_1984",
        SPHEROID["WGS 84",6378137,298.257223563,
            AUTHORITY["EPSG","7030"]],
        AUTHORITY["EPSG","6326"]],
    PRIMEM["Greenwich",0,
        AUTHORITY["EPSG","8901"]],
    UNIT["degree",0.0174532925199433,
        AUTHORITY["EPSG","9122"]],
    AXIS["Latitude",NORTH],
    AXIS["Longitude",EAST],
    AUTHORITY["EPSG","4326"]]
Data axis to CRS axis mapping: 2,1
in1: String (6.0)
id: Integer64 (10.0)
cultivo: String (10.0)


La capa o archivo de verdad de campo tiene la columna **cultivo** la cual es una columna categorica que tiene las clases en formato string.

In [19]:
layer_query(
    VERDAD_CAMPO, 
    f'SELECT * FROM {VERDAD_CAMPO} LIMIT 2'
)

INFO: Open of `../datasets/data/verdad_campo_aumentada_7.shp'
      using driver `ESRI Shapefile' successful.

Layer name: SELECT
Geometry: Polygon
Feature Count: 2
Extent: (-62.887100, -34.024758) - (-62.877781, -34.021651)
Layer SRS WKT:
GEOGCS["WGS 84",
    DATUM["WGS_1984",
        SPHEROID["WGS 84",6378137,298.257223563,
            AUTHORITY["EPSG","7030"]],
        AUTHORITY["EPSG","6326"]],
    PRIMEM["Greenwich",0,
        AUTHORITY["EPSG","8901"]],
    UNIT["degree",0.0174532925199433,
        AUTHORITY["EPSG","9122"]],
    AXIS["Latitude",NORTH],
    AXIS["Longitude",EAST],
    AUTHORITY["EPSG","4326"]]
Data axis to CRS axis mapping: 2,1
Geometry Column = GEOMETRY
in1: String (0.0)
id: Integer64 (0.0)
cultivo: String (0.0)
OGRFeature(SELECT):0
  in1 (String) = 014084
  id (Integer64) = 2
  cultivo (String) = MAIZ
  POLYGON ((-62.8860996133436 -34.0221508898355,-62.8861240850855 -34.0223053983326,-62.8861951048464 -34.0224447824616,-62.8863057207175 -34.0225553983326,-62.8864

In [20]:
layer_query(
    VERDAD_CAMPO,
    f"""
    SELECT 
        cultivo  AS Cultivo,
        COUNT(*) AS Cantidad
    FROM
        {VERDAD_CAMPO}
    GROUP BY
        cultivo
    """
)

INFO: Open of `../datasets/data/verdad_campo_aumentada_7.shp'
      using driver `ESRI Shapefile' successful.

Layer name: SELECT
Geometry: None
Feature Count: 5
Layer SRS WKT:
(unknown)
Cultivo: String (0.0)
Cantidad: Integer (0.0)
OGRFeature(SELECT):0
  Cultivo (String) = ALFALFA
  Cantidad (Integer) = 2

OGRFeature(SELECT):1
  Cultivo (String) = CAMPONATUR
  Cantidad (Integer) = 18

OGRFeature(SELECT):2
  Cultivo (String) = GIRASOL
  Cantidad (Integer) = 32

OGRFeature(SELECT):3
  Cultivo (String) = MAIZ
  Cantidad (Integer) = 246

OGRFeature(SELECT):4
  Cultivo (String) = SOJA
  Cantidad (Integer) = 168



In [21]:
layer_query(
    VERDAD_CAMPO, 
    f"""
    SELECT 
        id       AS 'Codigo de cultivo',
        COUNT(*) AS Cantidad
    FROM
        {VERDAD_CAMPO}
    GROUP BY
        cultivo
    """
)

INFO: Open of `../datasets/data/verdad_campo_aumentada_7.shp'
      using driver `ESRI Shapefile' successful.

Layer name: SELECT
Geometry: None
Feature Count: 5
Layer SRS WKT:
(unknown)
Codigo de cultivo: Integer64 (0.0)
Cantidad: Integer (0.0)
OGRFeature(SELECT):0
  Codigo de cultivo (Integer64) = 10
  Cantidad (Integer) = 2

OGRFeature(SELECT):1
  Codigo de cultivo (Integer64) = 20
  Cantidad (Integer) = 18

OGRFeature(SELECT):2
  Codigo de cultivo (Integer64) = 5
  Cantidad (Integer) = 32

OGRFeature(SELECT):3
  Codigo de cultivo (Integer64) = 2
  Cantidad (Integer) = 246

OGRFeature(SELECT):4
  Codigo de cultivo (Integer64) = 1
  Cantidad (Integer) = 168



Las columnas **id** y **cultivo** sin intercambiables. Es decir que el id representa a cada tipo de cultivo.

### Estadisticas y merge de rasters

A continuacion veamos la cantidad de pixels en la imagen por cada clase. Seria la distribición de probabilidad discreta de la variable categorica **cultivo** para una imagen dada.

Ver: [PolygonClassStatistics](https://www.orfeo-toolbox.org/CookBook/Applications/app_PolygonClassStatistics.html)

In [22]:
class_statistics([raster_date_path('2020-10-01', '0000000000-0000000000')], verbose=1)

otbcli_PolygonClassStatistics -in  -vec  -field id -out  > /dev/null  0.07s user 0.05s system 52% cpu 0.227 total
<?xml version="1.0" ?>
<GeneralStatistics>
    <Statistic name="samplesPerClass">
        <StatisticMap key="1" value="1873" />
        <StatisticMap key="10" value="48" />
        <StatisticMap key="2" value="1578" />
        <StatisticMap key="20" value="431" />
        <StatisticMap key="3" value="3112" />
        <StatisticMap key="4" value="816" />
        <StatisticMap key="5" value="752" />


Vemos una frecuencia muy baja x clase. Esto se debe a que la imagen completa esta compuesta por las dos imagenes dentro de cada directorio de fecha. Por esta cuestión, primero debemos hacer un merge de ambas imagenes para luego calcular estadisticas, clasificar, etc... 

A continuación se hace mer de todos las imagenes(rasters) por fecha: 

In [23]:
def merge_rasters_by_date(date):
    source_files = [raster_date_path(date, f) for f in RASTER_FILES]
    target_file  =  raster_date_path(date, 'complete_raster')
    merge_rasters(source_files, target_file)

In [24]:
if JOIN_MAPS:
    merge_rasters_by_date(DATES[0])

In [25]:
if JOIN_MAPS:
    merge_rasters_by_date(DATES[1])

In [26]:
if JOIN_MAPS:
    merge_rasters_by_date(DATES[2])

In [27]:
if JOIN_MAPS:
    merge_rasters_by_date(DATES[3])

In [28]:
if JOIN_MAPS:
    merge_rasters_by_date(DATES[4])

In [29]:
if JOIN_MAPS:
    merge_rasters_by_date(DATES[5])

Ahora validemos si vemos diferencia en las frecuencias:

In [None]:
# class_statistics([raster_date_path('2020-10-01', '0000000000-0000000000')], verbose=1)
# class_statistics([raster_date_path('2020-10-01', '0000000000-0000012544')], verbose=1)
# class_statistics([raster_date_path('2020-10-01', 'complete_raster')], verbose=1)

In [None]:
# class_statistics([raster_date_path('2021-03-17', '0000000000-0000000000')], verbose=1)
# class_statistics([raster_date_path('2021-03-17', '0000000000-0000012544')], verbose=1)
# class_statistics([raster_date_path('2021-03-17', 'complete_raster')], verbose=1)

**Por que todas las imagenes tiene la misma cantidad de pixeles por clase?**

### Sampling

A continacion sampleamos una cantidad de pixels por clase. De esta forma podemos estratificar las obsercaciones por clase, y asi evitar el desbalanceo de las clases.

Ver: [SampleSelection](https://www.orfeo-toolbox.org/CookBook/Applications/app_SampleSelection.html)

In [None]:
for date in DATES:
    raster_paths = [raster_date_path(date, 'complete_raster')]

    class_stat_path = class_statistics(raster_paths, out_file = f'{date}_class_stat.xml', verbose = 1)

    sampling(
        raster_paths,
        class_stat_path  = class_stat_path,
        out_rates_path   = f'{date}_rates.csv',
        out_sql_path     = f'{date}_samples.sqlite',
        strategy         = 'percent', # [byclass|constant|percent|total|smallest|all]
        strategy_percent = 50,
        verbose          = 1
    )

In [33]:
sql.SQLiteClient.inline_tables_definition(
    path  = result_path('2020-12-01_samples.sqlite'),
    table = 'output'
)

In [34]:
sql.SQLiteClient.inline_query(
    path  = result_path('2020-12-01_samples.sqlite'),
    query = """
        SELECT 
            cultivo,
            COUNT(*) AS Cantidad
        FROM
            output
        GROUP BY
            cultivo
        ORDER BY
            Cantidad desc
    """
)

Unnamed: 0,cultivo,Cantidad
0,MAIZ,5893
1,SOJA,4005
2,GIRASOL,777
3,CAMPONATUR,431
4,ALFALFA,48


### Extracción de observaciones


En este paso, en base a una capa vectorial (sqlite) y un raster, se genera la tabla **output** dentro del archivos de base de datos sqlite, donde cada fila es un pixel del raster y cada columna es el valor del pixel en cada banda que contenida en el mismo (En nuestro caso, como el raster es una imagen generada con el satelite SENTINEL).

Ver: [SampleExtraction](https://www.orfeo-toolbox.org/CookBook/Applications/app_SampleExtraction.html)

In [None]:
for date in DATES:
    sample_extraction(
        raster_paths = [raster_date_path(date, 'complete_raster')],
        out_sql_file = f'{date}_samples.sqlite',
        verbose      = 1
    )

In [36]:
sql.SQLiteClient.inline_query(
    path  = result_path('2020-12-01_samples.sqlite'),
    query = """
    SELECT
        id, cultivo, band_0, band_1, band_2, band_3, band_4, band_5, band_6
    FROM
        output
    """
)

Unnamed: 0,id,cultivo,band_0,band_1,band_2,band_3,band_4,band_5,band_6
0,1,SOJA,0.1083,0.1460,0.2057,0.2961,0.31315,0.44065,0.35915
1,1,SOJA,0.1099,0.1478,0.2048,0.2959,0.31170,0.44015,0.36025
2,1,SOJA,0.1096,0.1486,0.2081,0.2996,0.31345,0.44045,0.36075
3,1,SOJA,0.1076,0.1459,0.2049,0.2955,0.31420,0.44490,0.35980
4,1,SOJA,0.1084,0.1431,0.2010,0.2937,0.31060,0.44155,0.35890
...,...,...,...,...,...,...,...,...,...
11149,2,MAIZ,0.1154,0.1554,0.2130,0.2832,0.29750,0.40950,0.32330
11150,2,MAIZ,0.1174,0.1544,0.2088,0.2920,0.31490,0.41400,0.31600
11151,2,MAIZ,0.1144,0.1538,0.2160,0.2894,0.31980,0.42000,0.32110
11152,2,MAIZ,0.1302,0.1722,0.2364,0.3088,0.31790,0.41640,0.32170


### ComputeImageStatistics


Ver: [ComputeImageStatistics](https://www.orfeo-toolbox.org/CookBook/Applications/app_ComputeImagesStatistics.html)

In [37]:
for date in DATES:
    compute_raster_statistics(
        raster_paths = [raster_date_path(date, 'complete_raster')],
        stat_file    = f'{date}_norm_raster_stat.xml',
        verbose      = 1
    )

2022-07-03 20:22:49 (INFO) ComputeImagesStatistics: Default RAM limit for OTB is 1024 MB
2022-07-03 20:22:49 (INFO) ComputeImagesStatistics: GDAL maximum cache size is 1600 MB
2022-07-03 20:22:49 (INFO) ComputeImagesStatistics: OTB will use at most 128 threads
2022-07-03 20:22:49 (INFO): Loading metadata from official product
2022-07-03 20:22:49 (INFO): Estimated memory for full processing: 10993.6MB (avail.: 1024 MB), optimal image partitioning: 11 blocks
2022-07-03 20:22:49 (INFO): Estimation will be performed in 12 blocks of 15860x722 pixels
Processing Image (1/1): 100% [**************************************************] (5s)
Output parameters value:
out.mean: [0.079486, 0.105772, 0.134994, 0.251863, 0.259304, 0.321531, 0.25522]
out.min: [0.0001, 0.0063, 0.0009, 0.0001, 0.0001, 0.0042, 0.0029]
out.max: [1.0088, 1.1228, 1.2356, 1.2884, 0.88735, 0.90225, 0.9506]
out.std: [0.0272121, 0.0306367, 0.0473926, 0.0502634, 0.0507456, 0.0778327, 0.0759153]

otbcli_ComputeImagesStatistics -il 

2022-07-03 20:22:55 (INFO) ComputeImagesStatistics: Default RAM limit for OTB is 1024 MB
2022-07-03 20:22:55 (INFO) ComputeImagesStatistics: GDAL maximum cache size is 1600 MB
2022-07-03 20:22:55 (INFO) ComputeImagesStatistics: OTB will use at most 128 threads
2022-07-03 20:22:55 (INFO): Loading metadata from official product
2022-07-03 20:22:55 (INFO): Estimated memory for full processing: 10993.6MB (avail.: 1024 MB), optimal image partitioning: 11 blocks
2022-07-03 20:22:55 (INFO): Estimation will be performed in 12 blocks of 15860x722 pixels
Processing Image (1/1): 100% [**************************************************] (5s)
Output parameters value:
out.mean: [0.077316, 0.104465, 0.127616, 0.253876, 0.265029, 0.318965, 0.255314]
out.min: [0.0001, 0.0001, 0.0001, 0.0014, 0.0001, 0.0019, 0.0013]
out.max: [1.848, 1.8208, 1.72, 1.6248, 1.5872, 1.3226, 1.5124]
out.std: [0.025387, 0.0274923, 0.0468042, 0.050141, 0.0507984, 0.0813999, 0.0845822]

otbcli_ComputeImagesStatistics -il  -out.

2022-07-03 20:23:01 (INFO) ComputeImagesStatistics: Default RAM limit for OTB is 1024 MB
2022-07-03 20:23:01 (INFO) ComputeImagesStatistics: GDAL maximum cache size is 1600 MB
2022-07-03 20:23:01 (INFO) ComputeImagesStatistics: OTB will use at most 128 threads
2022-07-03 20:23:01 (INFO): Loading metadata from official product
2022-07-03 20:23:01 (INFO): Estimated memory for full processing: 10993.6MB (avail.: 1024 MB), optimal image partitioning: 11 blocks
2022-07-03 20:23:01 (INFO): Estimation will be performed in 12 blocks of 15860x722 pixels
Processing Image (1/1): 100% [**************************************************] (5s)
Output parameters value:
out.mean: [0.0819938, 0.114135, 0.143706, 0.284331, 0.295263, 0.339276, 0.266309]
out.min: [0.0001, 0.0001, 0.0001, 0.0001, 0.0001, 0.00265, 0.0021]
out.max: [1.9408, 1.8096, 1.7176, 1.6052, 1.5974, 1.29595, 1.485]
out.std: [0.0269167, 0.0304293, 0.0510517, 0.0510848, 0.0513198, 0.0730549, 0.0757679]

otbcli_ComputeImagesStatistics -il

2022-07-03 20:23:06 (INFO) ComputeImagesStatistics: Default RAM limit for OTB is 1024 MB
2022-07-03 20:23:06 (INFO) ComputeImagesStatistics: GDAL maximum cache size is 1600 MB
2022-07-03 20:23:06 (INFO) ComputeImagesStatistics: OTB will use at most 128 threads
2022-07-03 20:23:06 (INFO): Loading metadata from official product
2022-07-03 20:23:06 (INFO): Estimated memory for full processing: 10993.6MB (avail.: 1024 MB), optimal image partitioning: 11 blocks
2022-07-03 20:23:06 (INFO): Estimation will be performed in 12 blocks of 15860x722 pixels
Processing Image (1/1): 100% [**************************************************] (5s)
Output parameters value:
out.mean: [0.0527389, 0.0828316, 0.080859, 0.32603, 0.340943, 0.261331, 0.174473]
out.min: [0.0001, 0.0001, 0.0007, 0.0027, 0.001, 0.00165, 0.00155]
out.max: [1.8528, 1.7408, 1.6672, 1.568, 1.5194, 1.312, 1.5121]
out.std: [0.0234102, 0.0258292, 0.0431831, 0.0906783, 0.0911687, 0.0595638, 0.0656748]

otbcli_ComputeImagesStatistics -il  

2022-07-03 20:23:12 (INFO) ComputeImagesStatistics: Default RAM limit for OTB is 1024 MB
2022-07-03 20:23:12 (INFO) ComputeImagesStatistics: GDAL maximum cache size is 1600 MB
2022-07-03 20:23:12 (INFO) ComputeImagesStatistics: OTB will use at most 128 threads
2022-07-03 20:23:12 (INFO): Loading metadata from official product
2022-07-03 20:23:12 (INFO): Estimated memory for full processing: 10993.6MB (avail.: 1024 MB), optimal image partitioning: 11 blocks
2022-07-03 20:23:12 (INFO): Estimation will be performed in 12 blocks of 15860x722 pixels
Processing Image (1/1): 100% [**************************************************] (5s)
Output parameters value:
out.mean: [0.0427842, 0.0683798, 0.0630044, 0.325853, 0.342392, 0.227265, 0.140894]
out.min: [0.0001, 0.0001, 0.0006, 0.0001, 0.0001, 0.0015, 0.0012]
out.max: [1.0528, 1.1312, 1.2112, 1.2312, 0.9017, 0.9353, 0.9853]
out.std: [0.022535, 0.025289, 0.0390887, 0.10535, 0.106354, 0.0540611, 0.0593879]

otbcli_ComputeImagesStatistics -il  -o

2022-07-03 20:23:18 (INFO) ComputeImagesStatistics: Default RAM limit for OTB is 1024 MB
2022-07-03 20:23:18 (INFO) ComputeImagesStatistics: GDAL maximum cache size is 1600 MB
2022-07-03 20:23:18 (INFO) ComputeImagesStatistics: OTB will use at most 128 threads
2022-07-03 20:23:18 (INFO): Loading metadata from official product
2022-07-03 20:23:18 (INFO): Estimated memory for full processing: 10993.6MB (avail.: 1024 MB), optimal image partitioning: 11 blocks
2022-07-03 20:23:18 (INFO): Estimation will be performed in 12 blocks of 15860x722 pixels
Processing Image (1/1): 100% [**************************************************] (5s)
Output parameters value:
out.mean: [0.0394666, 0.0660235, 0.0673456, 0.257657, 0.271254, 0.222137, 0.148719]
out.min: [0.0001, 0.0001, 0.0002, 0.003, 0.00095, 0.0014, 0.0012]
out.max: [0.888, 0.8864, 0.9392, 0.932, 0.6501, 1.0793, 1.2956]
out.std: [0.0163246, 0.0185411, 0.0304052, 0.0886013, 0.0900108, 0.0484511, 0.0544197]

otbcli_ComputeImagesStatistics -il 

## Clasificacion por pixels


Ver: [TrainVectorClassifier](https://www.orfeo-toolbox.org/CookBook/Applications/app_TrainVectorClassifier.html)

In [None]:
for date in DATES:
    train_clasifier(
        sql_file         = f'{date}_samples.sqlite',
        stat_file        = f'{date}_norm_raster_stat.xml',
        out_model_file   = f'{date}_dt_model.txt',
        out_cm_file      = f'{date}_dt_cm_model.csv',
        features         = ['band_0', 'band_1', 'band_2', 'band_3', 'band_4', 'band_5', 'band_6'],
        verbose          = 1,
        clasifier_config = {
            'classifier': 'dt',
            'classifier.dt.min': 1,
            'classifier.dt.max': 10
        }
    )

## Join de campañas de cultivo y calculo de indices

Partiendo de los rasters(.tif) pertenecientes a cada campaña de cultivo (6 en total), calculamos el indice NDVI para cada uno de estos. Luego realizamos la acción **Concat**, la cual realiza un join de todos los rasters. Esto significa que, como resultado tendremos un unico raster con las columnas de los 6 rasters iniciales. Como filas tendremos el mismo numero de pixels que los rasters iniciales. Por ejemplo: dados dos rasters 1 y 2 los cuales tiene 10 filas(pixels) y 1 columna cada uno, al concatenarlos tenemos un unico raster de 10 filas y 2 columna. **Concat** es el análogo al aplicar JOIN en sql.

### 1. Primero calculamos lo indice que creamos necesarios. Estos seran utilizados como features en el paso de clasificación.

In [39]:
raster_paths = [raster_date_path(date, 'complete_raster') for date in DATES]
raster_paths

<img src="../images/lansat2_bands.png" alt="LANSAT2 Bands" width="800">

**Nota**: Las bandas en los rasters comienzan desde 0 no desde 1.

Ver: [RadiometricIndices](https://www.orfeo-toolbox.org/CookBook/Applications/app_RadiometricIndices.html)

In [40]:
ALL_VEGETATION_INDEXES

In [41]:
BEST_INDEXES = [
 'Vegetation:NDVI', 'Vegetation:SAVI' # , 'Vegetation:TSAVI'
]

In [42]:
compute_index(
    raster_paths,
    indexes     = BEST_INDEXES,
    out_postfix = 'indexes',
    blue_band   = 1,
    green_band  = 2,
    red_band    = 3,
    nir_band    = 7,
    verbose     = 1
)

2022-07-03 20:23:26 (INFO) RadiometricIndices: Default RAM limit for OTB is 1024 MB
2022-07-03 20:23:26 (INFO) RadiometricIndices: GDAL maximum cache size is 1600 MB
2022-07-03 20:23:26 (INFO) RadiometricIndices: OTB will use at most 128 threads
2022-07-03 20:23:26 (INFO): Loading metadata from official product
2022-07-03 20:23:26 (INFO): Estimated memory for full processing: 5758.6MB (avail.: 1024 MB), optimal image partitioning: 6 blocks
2022-07-03 20:23:26 (INFO): File ../datasets/images/2020-10-01/complete_raster_indexes.tif will be written in 7 blocks of 15860x1237 pixels
Writing ../datasets/images/2020-10-01/complete_raster_indexes.tif...: 100% [**************************************************] (5s)
otbcli_RadiometricIndices -channels.blue 1 -channels.green 2 -channels.red 3   8.08s user 2.60s system 178% cpu 5.968 total


2022-07-03 20:23:32 (INFO) RadiometricIndices: Default RAM limit for OTB is 1024 MB
2022-07-03 20:23:32 (INFO) RadiometricIndices: GDAL maximum cache size is 1600 MB
2022-07-03 20:23:32 (INFO) RadiometricIndices: OTB will use at most 128 threads
2022-07-03 20:23:32 (INFO): Loading metadata from official product
2022-07-03 20:23:32 (INFO): Estimated memory for full processing: 5758.6MB (avail.: 1024 MB), optimal image partitioning: 6 blocks
2022-07-03 20:23:32 (INFO): File ../datasets/images/2020-11-01/complete_raster_indexes.tif will be written in 7 blocks of 15860x1237 pixels
Writing ../datasets/images/2020-11-01/complete_raster_indexes.tif...: 100% [**************************************************] (5s)
otbcli_RadiometricIndices -channels.blue 1 -channels.green 2 -channels.red 3   5.82s user 2.49s system 138% cpu 6.015 total


2022-07-03 20:23:38 (INFO) RadiometricIndices: Default RAM limit for OTB is 1024 MB
2022-07-03 20:23:38 (INFO) RadiometricIndices: GDAL maximum cache size is 1600 MB
2022-07-03 20:23:38 (INFO) RadiometricIndices: OTB will use at most 128 threads
2022-07-03 20:23:38 (INFO): Loading metadata from official product
2022-07-03 20:23:38 (INFO): Estimated memory for full processing: 5758.6MB (avail.: 1024 MB), optimal image partitioning: 6 blocks
2022-07-03 20:23:38 (INFO): File ../datasets/images/2020-12-01/complete_raster_indexes.tif will be written in 7 blocks of 15860x1237 pixels
Writing ../datasets/images/2020-12-01/complete_raster_indexes.tif...: 100% [**************************************************] (5s)
otbcli_RadiometricIndices -channels.blue 1 -channels.green 2 -channels.red 3   6.30s user 2.63s system 145% cpu 6.126 total


2022-07-03 20:23:44 (INFO) RadiometricIndices: Default RAM limit for OTB is 1024 MB
2022-07-03 20:23:44 (INFO) RadiometricIndices: GDAL maximum cache size is 1600 MB
2022-07-03 20:23:44 (INFO) RadiometricIndices: OTB will use at most 128 threads
2022-07-03 20:23:44 (INFO): Loading metadata from official product
2022-07-03 20:23:44 (INFO): Estimated memory for full processing: 5758.6MB (avail.: 1024 MB), optimal image partitioning: 6 blocks
2022-07-03 20:23:44 (INFO): File ../datasets/images/2021-01-01/complete_raster_indexes.tif will be written in 7 blocks of 15860x1237 pixels
Writing ../datasets/images/2021-01-01/complete_raster_indexes.tif...: 100% [**************************************************] (6s)
otbcli_RadiometricIndices -channels.blue 1 -channels.green 2 -channels.red 3   6.82s user 2.87s system 150% cpu 6.435 total


2022-07-03 20:23:51 (INFO) RadiometricIndices: Default RAM limit for OTB is 1024 MB
2022-07-03 20:23:51 (INFO) RadiometricIndices: GDAL maximum cache size is 1600 MB
2022-07-03 20:23:51 (INFO) RadiometricIndices: OTB will use at most 128 threads
2022-07-03 20:23:51 (INFO): Loading metadata from official product
2022-07-03 20:23:51 (INFO): Estimated memory for full processing: 5758.6MB (avail.: 1024 MB), optimal image partitioning: 6 blocks
2022-07-03 20:23:51 (INFO): File ../datasets/images/2021-02-20/complete_raster_indexes.tif will be written in 7 blocks of 15860x1237 pixels
Writing ../datasets/images/2021-02-20/complete_raster_indexes.tif...: 100% [**************************************************] (5s)
otbcli_RadiometricIndices -channels.blue 1 -channels.green 2 -channels.red 3   6.48s user 2.70s system 154% cpu 5.953 total


2022-07-03 20:23:57 (INFO) RadiometricIndices: Default RAM limit for OTB is 1024 MB
2022-07-03 20:23:57 (INFO) RadiometricIndices: GDAL maximum cache size is 1600 MB
2022-07-03 20:23:57 (INFO) RadiometricIndices: OTB will use at most 128 threads
2022-07-03 20:23:57 (INFO): Loading metadata from official product
2022-07-03 20:23:57 (INFO): Estimated memory for full processing: 5758.6MB (avail.: 1024 MB), optimal image partitioning: 6 blocks
2022-07-03 20:23:57 (INFO): File ../datasets/images/2021-03-17/complete_raster_indexes.tif will be written in 7 blocks of 15860x1237 pixels
Writing ../datasets/images/2021-03-17/complete_raster_indexes.tif...: 100% [**************************************************] (6s)
otbcli_RadiometricIndices -channels.blue 1 -channels.green 2 -channels.red 3   6.89s user 2.68s system 145% cpu 6.585 total


Como resultado tenemos un nuevo raster por cada campaña de cultivo, cada uno contiene una columna por cada indice calculado.

In [43]:
!gdalinfo {raster_date_path('2020-10-01', 'complete_raster_indexes')}

Driver: GTiff/GeoTIFF
Files: ../datasets/images/2020-10-01/complete_raster_indexes.tif
Size is 15860, 8653
Origin = (-65.117617304426346,-33.815102577054326)
Pixel Size = (0.000179663056824,-0.000179663056824)
Metadata:
  DataType=9
  METADATATYPE=OTB
  OTB_VERSION=8.0.1
  TileHintX=15860
  TileHintY=1
Image Structure Metadata:
  INTERLEAVE=PIXEL
Corner Coordinates:
Upper Left  ( -65.1176173, -33.8151026) 
Lower Left  ( -65.1176173, -35.3697270) 
Upper Right ( -62.2681612, -33.8151026) 
Lower Right ( -62.2681612, -35.3697270) 
Center      ( -63.6928893, -34.5924148) 
Band 1 Block=15860x1 Type=Float32, ColorInterp=Gray
Band 2 Block=15860x1 Type=Float32, ColorInterp=Undefined


### 2. Join de rasters

En este paso vamos a realizar un join de todos los rasters del paso anterior.

In [44]:
join_rasters(
    raster_paths = [raster_date_path(date, 'complete_raster_indexes') for date in DATES],
    out_file     = result_path('complete_rasters_join'),
    verbose      = 1
)

2022-07-03 20:24:04 (INFO) ConcatenateImages: Default RAM limit for OTB is 1024 MB
2022-07-03 20:24:04 (INFO) ConcatenateImages: GDAL maximum cache size is 1600 MB
2022-07-03 20:24:04 (INFO) ConcatenateImages: OTB will use at most 128 threads
2022-07-03 20:24:04 (INFO): Loading metadata from official product
2022-07-03 20:24:04 (INFO): Loading metadata from official product
2022-07-03 20:24:04 (INFO): Loading metadata from official product
2022-07-03 20:24:04 (INFO): Loading metadata from official product
2022-07-03 20:24:04 (INFO): Loading metadata from official product
2022-07-03 20:24:04 (INFO): Loading metadata from official product
2022-07-03 20:24:04 (INFO): Estimated memory for full processing: 25128.3MB (avail.: 1024 MB), optimal image partitioning: 25 blocks
2022-07-03 20:24:04 (INFO): File ../results/complete_rasters_join.tif will be written in 26 blocks of 15860x333 pixels
Writing ../results/complete_rasters_join.tif...: 100% [************************************************

A continuación podemos observar que el raster consolidado tiene 66 Bandas. El formato raster agregar el nombre **Band** a sus columnas, ya que es la información mas común para este tipo de datos. En nuestro caso estas columnas representan los indices de cada raster de entrada calculado para por cada pixel. Como resultado tendremos un único raster o mapa con todos los indices calculado para cada raster inicial y los pixels del mapa como filas.

In [45]:
!gdalinfo  {result_path('complete_rasters_join.tif')}

Driver: GTiff/GeoTIFF
Files: ../results/complete_rasters_join.tif
Size is 15860, 8653
Origin = (-65.117617304426346,-33.815102577054326)
Pixel Size = (0.000179663056824,-0.000179663056824)
Metadata:
  DataType=9
  METADATATYPE=OTB
  OTB_VERSION=8.0.1
  TileHintX=15860
  TileHintY=1
Image Structure Metadata:
  INTERLEAVE=PIXEL
Corner Coordinates:
Upper Left  ( -65.1176173, -33.8151026) 
Lower Left  ( -65.1176173, -35.3697270) 
Upper Right ( -62.2681612, -33.8151026) 
Lower Right ( -62.2681612, -35.3697270) 
Center      ( -63.6928893, -34.5924148) 
Band 1 Block=15860x1 Type=Float32, ColorInterp=Gray
Band 2 Block=15860x1 Type=Float32, ColorInterp=Undefined
Band 3 Block=15860x1 Type=Float32, ColorInterp=Undefined
Band 4 Block=15860x1 Type=Float32, ColorInterp=Undefined
Band 5 Block=15860x1 Type=Float32, ColorInterp=Undefined
Band 6 Block=15860x1 Type=Float32, ColorInterp=Undefined
Band 7 Block=15860x1 Type=Float32, ColorInterp=Undefined
Band 8 Block=15860x1 Type=Float32, ColorInterp=Undefi

Cuanto pesa el mapa o raster consolidado? 

In [46]:
!du -h {result_path('complete_rasters_join.tif')}

6.2G	../results/complete_rasters_join.tif


### 3. Clasificacion por pixes usando el raster de indices consolidado

In [47]:
class_stat_path = class_statistics(
    [result_path('complete_rasters_join.tif')], 
    out_file = 'complete_rasters_join_class_stat.xml', 
    verbose = 1
)

otbcli_PolygonClassStatistics -in ../results/complete_rasters_join.tif -vec    0.07s user 0.05s system 97% cpu 0.122 total
<?xml version="1.0" ?>
<GeneralStatistics>
    <Statistic name="samplesPerClass">
        <StatisticMap key="1" value="3189" />
        <StatisticMap key="10" value="48" />
        <StatisticMap key="2" value="2570" />
        <StatisticMap key="20" value="431" />
        <StatisticMap key="3" value="3323" />
        <StatisticMap key="4" value="816" />
        <StatisticMap key="5" value="777" />


In [48]:
sampling(
    [result_path('complete_rasters_join.tif')],
    class_stat_path  = class_stat_path,
    out_rates_path   = f'complete_rasters_join_rates.csv',
    out_sql_path     = f'complete_rasters_join_samples.sqlite',
    verbose          = 1,  
    strategy         = 'percent', # [byclass|constant|percent|total|smallest|all]
    strategy_percent = 100,
)

2022-07-03 20:24:19 (INFO) SampleSelection: Default RAM limit for OTB is 1024 MB
2022-07-03 20:24:19 (INFO) SampleSelection: GDAL maximum cache size is 1600 MB
2022-07-03 20:24:19 (INFO) SampleSelection: OTB will use at most 128 threads
2022-07-03 20:24:19 (INFO) SampleSelection: Elevation management: setting default height above ellipsoid to 0 meters
2022-07-03 20:24:19 (INFO) SampleSelection: Sampling strategy: set a percentage of samples for each class.
2022-07-03 20:24:19 (INFO) SampleSelection: Sampling rates :  className  requiredSamples  totalSamples  rate
1	3189	3189	1
10	48	48	1
2	2570	2570	1
20	431	431	1
3	3323	3323	1
4	816	816	1
5	777	777	1

2022-07-03 20:24:19 (INFO): Loading metadata from official product
2022-07-03 20:24:19 (INFO): Estimated memory for full processing: 12563.9MB (avail.: 1024 MB), optimal image partitioning: 13 blocks
2022-07-03 20:24:19 (INFO): Estimation will be performed in 15 blocks of 3264x3264 pixels
Selecting positions with periodic sampler...: 100

In [49]:
sample_extraction(
    raster_paths = [result_path('complete_rasters_join.tif')],
    out_sql_file = f'complete_rasters_join_samples.sqlite',
    verbose      = 1
)

2022-07-03 20:24:20 (INFO) SampleExtraction: Default RAM limit for OTB is 1024 MB
2022-07-03 20:24:20 (INFO) SampleExtraction: GDAL maximum cache size is 1600 MB
2022-07-03 20:24:20 (INFO) SampleExtraction: OTB will use at most 128 threads
2022-07-03 20:24:20 (INFO): Loading metadata from official product
2022-07-03 20:24:20 (INFO): Estimated memory for full processing: 18846.1MB (avail.: 1024 MB), optimal image partitioning: 19 blocks
2022-07-03 20:24:20 (INFO): Estimation will be performed in 20 blocks of 15860x433 pixels
Extracting sample values...: 100% [**************************************************] (5s)
otbcli_SampleExtraction -in ../results/complete_rasters_join.tif -vec  -field  3.71s user 1.75s system 95% cpu 5.741 total


In [50]:
sql.SQLiteClient.inline_query(
    path  = result_path('complete_rasters_join_samples.sqlite'),
    query = """
        SELECT 
            cultivo  AS 'Codigo de cultivo',
            COUNT(*) AS 'Cantidad de pixels (Solo aquellos que tiene verdad de campo)'
        FROM
            output
        GROUP BY
            cultivo
        ORDER BY
            'Cantidad de pixels (Solo aquellos que tiene verdad de campo)' desc
    """
)

Unnamed: 0,Codigo de cultivo,Cantidad de pixels (Solo aquellos que tiene verdad de campo)
0,SOJA,4005
1,MAIZ,5893
2,GIRASOL,777
3,CAMPONATUR,431
4,ALFALFA,48


In [51]:
compute_raster_statistics(
    raster_paths = [result_path('complete_rasters_join.tif')],
    stat_file    = f'complete_rasters_join_norm_raster_stat.xml',
    verbose      = 1
)

2022-07-03 20:24:26 (INFO) ComputeImagesStatistics: Default RAM limit for OTB is 1024 MB
2022-07-03 20:24:26 (INFO) ComputeImagesStatistics: GDAL maximum cache size is 1600 MB
2022-07-03 20:24:26 (INFO) ComputeImagesStatistics: OTB will use at most 128 threads
2022-07-03 20:24:26 (INFO): Loading metadata from official product
2022-07-03 20:24:26 (INFO): Estimated memory for full processing: 18846.1MB (avail.: 1024 MB), optimal image partitioning: 19 blocks
2022-07-03 20:24:26 (INFO): Estimation will be performed in 20 blocks of 15860x433 pixels
Processing Image (1/1): 100% [**************************************************] (5s)
Output parameters value:
out.mean: [0.306923, 0.198543, 0.330888, 0.211161, 0.30659, 0.199373, 0.391082, 0.183084, 0.414047, 0.163147, 0.381713, 0.166763]
out.min: [-0.902098, -0.692431, -0.935125, -0.614061, -0.887691, -0.608959, -0.935157, -0.650943, -0.945238, -0.619271, -0.961889, -0.573433]
out.max: [0.914894, 0.544045, 0.994595, 0.583584, 0.994709, 0.699

In [52]:
train_clasifier(
    sql_file         = f'complete_rasters_join_samples.sqlite',
    stat_file        = f'complete_rasters_join_norm_raster_stat.xml',
    out_model_file   = f'complete_rasters_join_rf_model.txt',
    out_cm_file      = f'complete_rasters_join_rf_cm_model.csv',
    verbose          = 1,
    features         = [f'band_{idx}' for idx in range(12)],
    clasifier_config = {
        'classifier': 'rf',
        'classifier.rf.min': 1,
        'classifier.rf.max': 5
    }
)

2022-07-03 20:24:32 (INFO) TrainVectorClassifier: Default RAM limit for OTB is 1024 MB
2022-07-03 20:24:32 (INFO) TrainVectorClassifier: GDAL maximum cache size is 1600 MB
2022-07-03 20:24:32 (INFO) TrainVectorClassifier: OTB will use at most 128 threads
2022-07-03 20:24:32 (INFO) TrainVectorClassifier: Reading vector file 1/1
2022-07-03 20:24:32 (INFO) TrainVectorClassifier: Computing model file : ../results/complete_rasters_join_rf_model.txt
Training model...: 100% [**************************************************] (1s)
Validation...: 100% [**************************************************] (0s)
2022-07-03 20:24:33 (INFO) TrainVectorClassifier: Predicted list size : 11154
2022-07-03 20:24:33 (INFO) TrainVectorClassifier: ValidationLabeledListSample size : 11154
2022-07-03 20:24:33 (INFO) TrainVectorClassifier: Training performances:
2022-07-03 20:24:33 (INFO) TrainVectorClassifier: Confusion matrix (rows = reference labels, columns = produced labels):
     [1] [2] [3] [4] [5] [10]

## Construcción de dataset CSV

A partir de el conjunto completo de pixels que tienen verdad de campo construmimos un dataset en csv, renombrando las columnas para identificar a que indice y campaña de cultivo pertenecen.

In [53]:
df = sql.SQLiteClient.inline_query(
    path  = result_path('complete_rasters_join_samples.sqlite'), 
    query = 'SELECT * FROM output'
)

In [54]:
indexes_columns = flatten([[f'{value.replace("Vegetation:", "")}_{i+1}' for value in BEST_INDEXES] for i in range(len(DATES))])

band_columns = np.array(list(filter(lambda v: 'band_' in v, df.columns)))

indexes_columns, band_columns

In [55]:
rename_def = {k:v for k,v in zip(band_columns, indexes_columns) }

### Target combinado

Concatenamos las columnas id y colvibo apra creat la columna target_combined.

In [56]:
df = df.rename(columns =rename_def)

df['target_combined'] = df["cultivo"] + '_' + df["id"].astype(str)

In [57]:
df.groupby(['target_combined']).size()

### Target combinado solo maiz, soja  y others

Hacemso merge de todas catagorias expecto mais y soja en una nueva categoria OTHERS.

In [58]:
others = [ 'ALFALFA_10', 'CAMPONATUR_20', 'GIRASOL_5']
df['target_compined_maiz_soja_others'] = df['target_combined'].apply(lambda v: 'OTHER' if v in others else v)
df.groupby(['target_compined_maiz_soja_others']).size()

### Target maiz vs others

In [59]:
df['target_maiz_others'] = df['target_combined'].apply(lambda v: 1 if v in ['MAIZ_3', 'MAIZ_2'] else 0)
df.groupby(['target_maiz_others']).size()

### Target maiz vs others

In [60]:
df['target_soja_others'] = df['target_combined'].apply(lambda v: 1 if v in ['SOJA_1', 'SOJA_4'] else 0)
df.groupby(['target_soja_others']).size()

## Agredamos columnas como min, max, mean, median, var

### Gardamos todos los datasets

In [61]:
df.to_csv( result_path('dataset_augmented.csv'), encoding='utf-8')

In [62]:
for col in indexes_columns:
    df[col] = (df[col] - df[col].mean()) / df[col].std() 

In [63]:
df.to_csv( result_path('dataset_norm_augmented.csv'), encoding='utf-8')

## Predicción

1. Tomamos los rasters de cada campaña y calculasmos lo mismo indices que en el paso de entrenamiento. esta ves calculamos lo indices para todos los pixels y no solo para aquellos qeu tiene verdad de campo.

In [67]:
for date in DATES:
    compute_index(
        raster_paths = [raster_date_path(date, 'complete_raster')],
        indexes      = BEST_INDEXES,
        out_postfix  = 'indexes_without_labels',
        blue_band    = 1,
        green_band   = 2,
        red_band     = 3,
        nir_band     = 7,
        verbose      = 1
    )   

2022-07-03 20:32:59 (INFO) RadiometricIndices: Default RAM limit for OTB is 1024 MB
2022-07-03 20:32:59 (INFO) RadiometricIndices: GDAL maximum cache size is 1600 MB
2022-07-03 20:32:59 (INFO) RadiometricIndices: OTB will use at most 128 threads
2022-07-03 20:32:59 (INFO): Loading metadata from official product
2022-07-03 20:32:59 (INFO): Estimated memory for full processing: 5758.6MB (avail.: 1024 MB), optimal image partitioning: 6 blocks
2022-07-03 20:32:59 (INFO): File ../datasets/images/2020-10-01/complete_raster_indexes_without_labels.tif will be written in 7 blocks of 15860x1237 pixels
Writing ../datasets/images/2020-10-01/complete_raster_indexes_without_labels.tif...: 100% [**************************************************] (5s)
otbcli_RadiometricIndices -channels.blue 1 -channels.green 2 -channels.red 3   7.26s user 2.62s system 162% cpu 6.082 total


2022-07-03 20:33:05 (INFO) RadiometricIndices: Default RAM limit for OTB is 1024 MB
2022-07-03 20:33:05 (INFO) RadiometricIndices: GDAL maximum cache size is 1600 MB
2022-07-03 20:33:05 (INFO) RadiometricIndices: OTB will use at most 128 threads
2022-07-03 20:33:05 (INFO): Loading metadata from official product
2022-07-03 20:33:05 (INFO): Estimated memory for full processing: 5758.6MB (avail.: 1024 MB), optimal image partitioning: 6 blocks
2022-07-03 20:33:05 (INFO): File ../datasets/images/2020-11-01/complete_raster_indexes_without_labels.tif will be written in 7 blocks of 15860x1237 pixels
Writing ../datasets/images/2020-11-01/complete_raster_indexes_without_labels.tif...: 100% [**************************************************] (6s)
otbcli_RadiometricIndices -channels.blue 1 -channels.green 2 -channels.red 3   6.86s user 2.69s system 153% cpu 6.220 total


2022-07-03 20:33:11 (INFO) RadiometricIndices: Default RAM limit for OTB is 1024 MB
2022-07-03 20:33:11 (INFO) RadiometricIndices: GDAL maximum cache size is 1600 MB
2022-07-03 20:33:11 (INFO) RadiometricIndices: OTB will use at most 128 threads
2022-07-03 20:33:11 (INFO): Loading metadata from official product
2022-07-03 20:33:11 (INFO): Estimated memory for full processing: 5758.6MB (avail.: 1024 MB), optimal image partitioning: 6 blocks
2022-07-03 20:33:11 (INFO): File ../datasets/images/2020-12-01/complete_raster_indexes_without_labels.tif will be written in 7 blocks of 15860x1237 pixels
Writing ../datasets/images/2020-12-01/complete_raster_indexes_without_labels.tif...: 100% [**************************************************] (6s)
otbcli_RadiometricIndices -channels.blue 1 -channels.green 2 -channels.red 3   6.71s user 2.66s system 132% cpu 7.052 total


2022-07-03 20:33:18 (INFO) RadiometricIndices: Default RAM limit for OTB is 1024 MB
2022-07-03 20:33:18 (INFO) RadiometricIndices: GDAL maximum cache size is 1600 MB
2022-07-03 20:33:18 (INFO) RadiometricIndices: OTB will use at most 128 threads
2022-07-03 20:33:18 (INFO): Loading metadata from official product
2022-07-03 20:33:18 (INFO): Estimated memory for full processing: 5758.6MB (avail.: 1024 MB), optimal image partitioning: 6 blocks
2022-07-03 20:33:18 (INFO): File ../datasets/images/2021-01-01/complete_raster_indexes_without_labels.tif will be written in 7 blocks of 15860x1237 pixels
Writing ../datasets/images/2021-01-01/complete_raster_indexes_without_labels.tif...: 100% [**************************************************] (6s)
otbcli_RadiometricIndices -channels.blue 1 -channels.green 2 -channels.red 3   6.42s user 2.54s system 144% cpu 6.219 total


2022-07-03 20:33:25 (INFO) RadiometricIndices: Default RAM limit for OTB is 1024 MB
2022-07-03 20:33:25 (INFO) RadiometricIndices: GDAL maximum cache size is 1600 MB
2022-07-03 20:33:25 (INFO) RadiometricIndices: OTB will use at most 128 threads
2022-07-03 20:33:25 (INFO): Loading metadata from official product
2022-07-03 20:33:25 (INFO): Estimated memory for full processing: 5758.6MB (avail.: 1024 MB), optimal image partitioning: 6 blocks
2022-07-03 20:33:25 (INFO): File ../datasets/images/2021-02-20/complete_raster_indexes_without_labels.tif will be written in 7 blocks of 15860x1237 pixels
Writing ../datasets/images/2021-02-20/complete_raster_indexes_without_labels.tif...: 100% [**************************************************] (6s)
otbcli_RadiometricIndices -channels.blue 1 -channels.green 2 -channels.red 3   5.59s user 2.67s system 130% cpu 6.322 total


2022-07-03 20:33:31 (INFO) RadiometricIndices: Default RAM limit for OTB is 1024 MB
2022-07-03 20:33:31 (INFO) RadiometricIndices: GDAL maximum cache size is 1600 MB
2022-07-03 20:33:31 (INFO) RadiometricIndices: OTB will use at most 128 threads
2022-07-03 20:33:31 (INFO): Loading metadata from official product
2022-07-03 20:33:31 (INFO): Estimated memory for full processing: 5758.6MB (avail.: 1024 MB), optimal image partitioning: 6 blocks
2022-07-03 20:33:31 (INFO): File ../datasets/images/2021-03-17/complete_raster_indexes_without_labels.tif will be written in 7 blocks of 15860x1237 pixels
Writing ../datasets/images/2021-03-17/complete_raster_indexes_without_labels.tif...: 100% [**************************************************] (6s)
otbcli_RadiometricIndices -channels.blue 1 -channels.green 2 -channels.red 3   6.97s user 2.83s system 155% cpu 6.288 total


2. Hacemos un join de todas las columnas enn un unico file, deonde tenemos pixesl coli fulas y toda las coolumnas del cada campaña.

In [68]:
join_rasters(
    raster_paths = [raster_date_path(date, 'complete_raster_indexes_without_labels') for date in DATES],
    out_file     = result_path('complete_rasters_join_without_labels'),
    verbose      = 1
)

2022-07-03 20:35:12 (INFO) ConcatenateImages: Default RAM limit for OTB is 1024 MB
2022-07-03 20:35:12 (INFO) ConcatenateImages: GDAL maximum cache size is 1600 MB
2022-07-03 20:35:12 (INFO) ConcatenateImages: OTB will use at most 128 threads
2022-07-03 20:35:12 (INFO): Loading metadata from official product
2022-07-03 20:35:12 (INFO): Loading metadata from official product
2022-07-03 20:35:12 (INFO): Loading metadata from official product
2022-07-03 20:35:12 (INFO): Loading metadata from official product
2022-07-03 20:35:12 (INFO): Loading metadata from official product
2022-07-03 20:35:12 (INFO): Loading metadata from official product
2022-07-03 20:35:12 (INFO): Estimated memory for full processing: 25128.3MB (avail.: 1024 MB), optimal image partitioning: 25 blocks
2022-07-03 20:35:12 (INFO): File ../results/complete_rasters_join_without_labels.tif will be written in 26 blocks of 15860x333 pixels
Writing ../results/complete_rasters_join_without_labels.tif...: 100% [******************

3. Ya armado el file con todos los pixels del dataset y preprocesados para que contenga los mismo indices calculado en el entrenamiento, predecimos el tipo del cultivo de cada pixel.

In [71]:
!time otbcli_ImageClassifier \
    -in     {result_path('complete_rasters_join_without_labels.tif')}  \
    -imstat {result_path('complete_rasters_join_norm_raster_stat.xml')} \
    -model  {result_path('complete_rasters_join_rf_model.txt')} \
    -out    {result_path('predictions.tif')}

2022-07-03 20:36:49 (INFO) ImageClassifier: Default RAM limit for OTB is 1024 MB
2022-07-03 20:36:49 (INFO) ImageClassifier: GDAL maximum cache size is 1600 MB
2022-07-03 20:36:49 (INFO) ImageClassifier: OTB will use at most 128 threads
2022-07-03 20:36:49 (INFO): Loading metadata from official product
2022-07-03 20:36:49 (INFO) ImageClassifier: Loading model
2022-07-03 20:36:49 (INFO) ImageClassifier: Model loaded
2022-07-03 20:36:49 (INFO) ImageClassifier: Input image normalization activated.
2022-07-03 20:36:49 (INFO) ImageClassifier: mean used: [0.306923, 0.198543, 0.330888, 0.211161, 0.30659, 0.199373, 0.391082, 0.183084, 0.414047, 0.163147, 0.381713, 0.166763]
2022-07-03 20:36:49 (INFO) ImageClassifier: standard deviation used: [0.128868, 0.0613974, 0.117285, 0.0615362, 0.113647, 0.0596905, 0.145405, 0.0510545, 0.154498, 0.047732, 0.131504, 0.054056]
2022-07-03 20:36:49 (INFO): Estimated memory for full processing: 15443.7MB (avail.: 1024 MB), optimal image partitioning: 16 block

4. Revsamos el resutado y tenemos un uno file con una unica banda, la cual tiene los tags predichos.

In [73]:
!du -h {result_path('predictions.tif')}

131M	../results/predictions.tif


In [74]:
!gdalinfo c

Driver: GTiff/GeoTIFF
Files: ../results/predictions.tif
Size is 15860, 8653
Origin = (-65.117617304426346,-33.815102577054326)
Pixel Size = (0.000179663056824,-0.000179663056824)
Metadata:
  DataType=9
  METADATATYPE=OTB
  OTB_VERSION=8.0.1
  TileHintX=15860
  TileHintY=1
Image Structure Metadata:
  INTERLEAVE=BAND
Corner Coordinates:
Upper Left  ( -65.1176173, -33.8151026) 
Lower Left  ( -65.1176173, -35.3697270) 
Upper Right ( -62.2681612, -33.8151026) 
Lower Right ( -62.2681612, -35.3697270) 
Center      ( -63.6928893, -34.5924148) 
Band 1 Block=15860x1 Type=Byte, ColorInterp=Gray


5. Reproyectamos la mascara para alinearla.

In [56]:
!time otbcli_Superimpose \
    -inr {result_path('predictions.tif')} \
    -inm {raster_path('mask_agri_aoi')} \
    -out {raster_path('mask_agri_aoi_aligned')}

2022-07-04 14:34:54 (INFO): Loading metadata from official product
2022-07-04 14:34:54 (INFO): Loading metadata from official product
2022-07-04 14:34:54 (INFO) Superimpose: Default RAM limit for OTB is 1024 MB
2022-07-04 14:34:54 (INFO) Superimpose: GDAL maximum cache size is 1600 MB
2022-07-04 14:34:54 (INFO) Superimpose: OTB will use at most 128 threads
2022-07-04 14:34:54 (INFO) Superimpose: Elevation management: setting default height above ellipsoid to 0 meters
2022-07-04 14:34:54 (INFO): Estimated memory for full processing: 1799.55MB (avail.: 1024 MB), optimal image partitioning: 2 blocks
2022-07-04 14:34:54 (INFO): File ../datasets/images/mask_agri_aoi_aligned.tif will be written in 3 blocks of 15860x2885 pixels
Writing ../datasets/images/mask_agri_aoi_aligned.tif...: 100% [**************************************************] (2s)


6. Aplicamos la mascara a lapredicción.

In [57]:
!time gdal_calc.py  \
    -A  {result_path('predictions.tif')}   \
    --A_band=1 \
    -B  {raster_path('mask_agri_aoi_aligned')} \
    --B_band=1 \
    --calc='((B==1)*A)+((B==0)*0)' \
    --outfile {result_path('predictions_masked.tif')}

0.. 0.. 0.. 0.. 0.. 0.. 0.. 0.. 0.. 0.. 0.. 0.. 0.. 0.. 0.. 0.. 0.. 0.. 0.. 0.. 0.. 0.. 0.. 0.. 0.. 0.. 0.. 0.. 0.. 0.. 0.. 0.. 0.. 0.. 0.. 0.. 0.. 0.. 0.. 0.. 0.. 0.. 0.. 0.. 0.. 0.. 0.. 0.. 0.. 0.. 0.. 0.. 0.. 0.. 0.. 0.. 0.. 0.. 0.. 0.. 0.. 0.. 0.. 0.. 0.. 0.. 0.. 0.. 0.. 0.. 0.. 0.. 0.. 0.. 0.. 0.. 0.. 0.. 0.. 0.. 0.. 0.. 0.. 0.. 0.. 0.. 0.. 1.. 1.. 1.. 1.. 1.. 1.. 1.. 1.. 1.. 1.. 1.. 1.. 1.. 1.. 1.. 1.. 1.. 1.. 1.. 1.. 1.. 1.. 1.. 1.. 1.. 1.. 1.. 1.. 1.. 1.. 1.. 1.. 1.. 1.. 1.. 1.. 1.. 1.. 1.. 1.. 1.. 1.. 1.. 1.. 1.. 1.. 1.. 1.. 1.. 1.. 1.. 1.. 1.. 1.. 1.. 1.. 1.. 1.. 1.. 1.. 1.. 1.. 1.. 1.. 1.. 1.. 1.. 1.. 1.. 1.. 1.. 1.. 1.. 1.. 1.. 1.. 1.. 1.. 1.. 1.. 1.. 1.. 1.. 1.. 1.. 1.. 1.. 2.. 2.. 2.. 2.. 2.. 2.. 2.. 2.. 2.. 2.. 2.. 2.. 2.. 2.. 2.. 2.. 2.. 2.. 2.. 2.. 2.. 2.. 2.. 2.. 2.. 2.. 2.. 2.. 2.. 2.. 2.. 2.. 2.. 2.. 2.. 2.. 2.. 2.. 2.. 2.. 2.. 2.. 2.. 2.. 2.. 2.. 2.. 2.. 2.. 2.. 2.. 2.. 2.. 2.. 2.. 2.. 2.. 2.. 2.. 2.. 2.. 2.. 2.. 2.. 2.. 2.. 2.. 2.. 2.. 2.. 2.. 2.. 2.. 2.. 2.. 2.. 

7. Generamos archivos csv con ambas predicciones.

In [58]:
!time gdal2xyz.py \
    -band 1 \
    {result_path('predictions.tif')}  \
    {result_path('predictions.csv')}

gdal2xyz.py -band 1 ../results/predictions.tif ../results/predictions.csv  174.57s user 3.28s system 100% cpu 2:56.77 total


In [66]:
!du -h {result_path('predictions.csv')}

3.6G	../results/predictions.csv


In [59]:
!time gdal2xyz.py \
    -band 1 \
    {result_path('predictions_masked.tif')}  \
    {result_path('predictions_masked.csv')}

gdal2xyz.py -band 1 ../results/predictions_masked.tif   195.29s user 3.83s system 100% cpu 3:18.03 total


In [67]:
!du -h {result_path('predictions_masked.csv')}

4.1G	../results/predictions_masked.csv


8. Calculo de hectareas pare Maíz y Soja.

In [60]:
class Mapper:
    labels = {
        1:  'SOJA',
        2:  'MAIZ',
        3:  'MAIZ',
        4:  'SOJA',
        5:  'OTHER',
        10: 'OTHER',
        20: 'OTHER'
    }
    def map(self, value): 
        return 'NOTHING' if value <=0 or value >= 30 else self.labels[value]

def load_predictions(path):
    predictions = pd.read_csv(path, header=None, delimiter=r"\s+")
    predictions = predictions.rename(columns={0: 'X', 1: 'Y', 2: 'LABEL_NUM'})
    predictions['LABEL'] = predictions['LABEL_NUM'].apply(Mapper().map)
    return predictions[predictions['LABEL'] != 'NOTHING']
    
def hectareas(df):
    df2 = df.groupby(['LABEL'])[['LABEL_NUM']] \
            .count() * 0.04
    return df2.sort_values('LABEL_NUM', ascending=False) \
              .reset_index() \
              .rename(columns={'LABEL': 'Tipo de Cultivo', 'LABEL_NUM':'Hectareas'})

In [61]:
predictions = load_predictions(result_path('predictions.csv'))

In [62]:
hectareas(predictions)

Unnamed: 0,Tipo de Cultivo,Hectareas
0,SOJA,3031001.36
1,MAIZ,2244241.32
2,OTHER,214220.52


In [63]:
predictions_masked = load_predictions(result_path('predictions_masked.csv'))

In [64]:
hectareas(predictions_masked)

Unnamed: 0,Tipo de Cultivo,Hectareas
0,MAIZ,861836.32
1,SOJA,719000.64
2,OTHER,22795.8
