# Entrenamiento iterativo

In [1]:
import os
import json
import joblib
import numpy as np
import pandas as pd
from glob import glob
from sqlite3 import connect
from sklearn.ensemble import RandomForestClassifier

## Conjunto de entrenamiento

In [2]:
train_sqlite_files = glob('../data/selection_verdad_campo/*.sqlite')

train_data = pd.DataFrame()

for sf in train_sqlite_files:
    cnx = connect(sf)
    df = pd.read_sql_query("SELECT * FROM output", cnx)
    train_data = pd.concat([train_data, df], ignore_index=True)

In [3]:
train_data.shape

(466, 34)

In [4]:
train_data.head()

Unnamed: 0,ogc_fid,GEOMETRY,in1,id,cultivo,originfid,band_0,band_1,band_2,band_3,...,band_18,band_19,band_20,band_21,band_22,band_23,band_24,band_25,band_26,band_27
0,1,b'\x01\x01\x00\x00\x00(I\x80\xe0\xf6fO\xc0\xbe...,14084,1,SOJA,7,0.42816,0.692352,0.36905,0.730617,...,0.222903,4.816959,0.822,0.410423,0.831812,0.543103,0.868833,2.9545,0.109526,0.861515
1,2,"b'\x01\x01\x00\x00\x00\x1f|\x14""lkO\xc0^\x14\x...",14084,1,SOJA,8,0.41152,0.671952,0.4294,0.711504,...,0.271554,4.774915,1.363,0.44164,0.81709,0.587002,0.862946,3.15975,0.148569,0.85983
2,3,b'\x01\x01\x00\x00\x00L).\x93\x08eO\xc0fz\xd9|...,14084,1,SOJA,11,0.49848,0.794412,0.4227,0.805747,...,0.253338,5.247081,1.27025,0.423015,0.84994,0.563715,0.884401,3.213,0.130448,0.87977
3,4,b'\x01\x01\x00\x00\x00\x03X\x12\xb7\xb0iO\xc0[...,14084,1,SOJA,12,0.52872,0.834804,0.798,0.835724,...,0.568978,5.583365,2.861,0.432937,0.861866,0.588915,0.89253,2.7585,0.430294,0.891874
4,5,b'\x01\x01\x00\x00\x00\xa5\x87\x83F\xd6\\O\xc0...,14084,1,SOJA,28,0.5056,0.799404,0.32025,0.812116,...,0.192942,5.668369,0.0745,0.453263,0.858904,0.58945,0.899895,2.889,0.079681,0.89436


## Conjunto de predicción

In [5]:
pred_sqlite_files = glob('../data/selection_mask_agri_aoi/*.sqlite')

pred_data = pd.DataFrame()

for sf in pred_sqlite_files:
    cnx = connect(sf)
    df = pd.read_sql_query("SELECT * FROM output", cnx)
    df = df
    pred_data = pd.concat([pred_data, df], ignore_index=True)

In [6]:
pred_data.shape

(2000, 33)

In [7]:
pred_data.head()

Unnamed: 0,ogc_fid,GEOMETRY,dn,originfid,band_0,band_1,band_2,band_3,band_4,band_5,...,band_19,band_20,band_21,band_22,band_23,band_24,band_25,band_26,band_27,fid
0,1,b'\x01\x01\x00\x00\x00\xd9d\xed\xadtnO\xc0c\xc...,0,1700,0.2494,0.46014,0.5252,0.528496,0.402478,17.191881,...,2.475016,1.63725,0.147789,0.73226,0.206949,0.678206,2.088,0.248439,0.648655,
1,2,b'\x01\x01\x00\x00\x00\xac\xa8\xb6x<kO\xc0)\x0...,0,2725,0.26908,0.505404,0.35965,0.553477,0.444814,24.925385,...,2.609456,2.10075,0.254138,0.697226,0.345695,0.692849,3.222,0.285051,0.670396,
2,3,b'\x01\x01\x00\x00\x00\xec\x08\x8e\x96\x8d]O\x...,0,3064,0.00596,0.110892,0.1662,0.188773,0.087656,10.408563,...,0.79853,0.12125,-0.160827,0.550313,-0.212589,0.274238,1.6695,0.019799,0.185629,
3,4,b'\x01\x01\x00\x00\x00\xb6\xb2\xb1`d]O\xc0\x81...,0,3064,0.45856,0.744036,0.5723,0.765991,0.704343,21.82736,...,4.507212,3.22075,0.362193,0.831067,0.481766,0.857324,1.801,0.405955,0.845724,
4,5,"b'\x01\x01\x00\x00\x00P\x00\x00bkiO\xc05\xdd""=...",0,3160,0.14972,0.286284,0.27095,0.360262,0.255082,45.364269,...,2.378919,1.4995,0.190551,0.696629,0.275626,0.647059,1.6775,0.167357,0.635977,


## Entrenamiento del Modelo

In [8]:
with open('../model/randomforest_parameters.json','r') as f:
    parameters = json.load(f)

In [9]:
thresholds = [0.4, 0.56]

In [10]:
for threshold in thresholds:
    
    print(f'+++++ PREDICCIONES PARA THRESHOLD {threshold}')
    
    threshold_folder = os.path.join('..','model',f'threshhold_{threshold}')
    os.makedirs(threshold_folder, exist_ok=True)
    
    i = 0
    while True:
        
        # arma carpeta para el output (i aumenta con las iteraciones)
        n_iter = '{0:03d}'.format(i)
        output_folder = os.path.join(threshold_folder,f'randomforest_iterations_{n_iter}')
        os.makedirs(output_folder, exist_ok=True)
        
        # toma los datasets
        columns = train_data.filter(regex='band_').columns.to_list()
        X_train = train_data.filter(regex='band_').fillna(-99999).to_numpy()
        y_train = train_data.id.to_numpy()
        X_pred = pred_data.filter(regex='band_').fillna(-99999).to_numpy()
        
        # instancia y entrena el modelo
        model = RandomForestClassifier(**parameters)
        model.fit(X_train, y_train)
        output_model_file = os.path.join(output_folder, f'model_{n_iter}.joblib')
        _ = joblib.dump(model, output_model_file)
        
        # predice
        probas = model.predict_proba(X_pred)
        output_proba_file = os.path.join(output_folder, f'probas_{n_iter}.npy')
        np.save(output_proba_file, probas)
        predictions = pred_data.assign(pred_class=probas.argmax(axis=1), pred_score=probas.max(axis=1))
        
        # separa entre nuevo train y nuevo pred
        add_to_train = predictions.query(f'pred_score >= {threshold}').copy()
        continue_pred = predictions.query(f'pred_score < {threshold}').copy()
        train_data_len, add_to_train_len , continue_pred_len = train_data.shape[0], add_to_train.shape[0] , continue_pred.shape[0]
        output_pixels_file = os.path.join(output_folder, f'pixels_{n_iter}.csv')
        (
            pd.DataFrame(
                [
                    [f'De entrenamiento', train_data_len],
                    [f'Con proba>={threshold}', add_to_train_len],
                    [f'Con proba<{threshold}', continue_pred_len]
                ],
                columns=['Pyxels_type','Pixels']
            )
            .to_csv(output_pixels_file, index=False)
        )
        
        # pasa predicción a las columna id (target)
        # y lo agrega al train original
        add_to_train['id'] = add_to_train['pred_class']
        train_data = train_data.append(add_to_train, ignore_index=True)
        pred_data = continue_pred
        
        # imprime información
        print('''\n*** ITERACIÓN #{0:03d}
        - Modelo guardado en {1}
        - Probabilidades guardadas en {2}
        - Pixeles de entrenamiento: {3}
        - Pixeles con proba>={4}: {5}
        - Pixeles con proba<{4}: {6}'''.format(i, output_model_file, output_proba_file, train_data_len, threshold, add_to_train_len , continue_pred_len))
        i += 1
        if add_to_train_len == 0:
            break

    # reemplaza na en columna pred_class con 'vc_original'
    # (los pixeles que no tiene pred_class son los pieles de verdad de campo originales)
    # y guarda la predicción final
    final_prediction = os.path.join(threshold_folder,f'randomforest_iterations_final_prediction.csv')
    train_data['pred_class'] = train_data.pred_class.fillna('vc_original')
    train_data.to_csv(final_prediction)

+++++ PREDICCIONES PARA THRESHOLD 0.4


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:    1.0s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed:    0.2s finished



*** ITERACIÓN #000
        - Modelo guardado en ../model/threshhold_0.4/randomforest_iterations_000/model_000.joblib
        - Probabilidades guardadas en ../model/threshhold_0.4/randomforest_iterations_000/probas_000.npy
        - Pixeles de entrenamiento: 466
        - Pixeles con proba>=0.4: 1846
        - Pixeles con proba<0.4: 154


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:    2.2s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed:    0.1s finished



*** ITERACIÓN #001
        - Modelo guardado en ../model/threshhold_0.4/randomforest_iterations_001/model_001.joblib
        - Probabilidades guardadas en ../model/threshhold_0.4/randomforest_iterations_001/probas_001.npy
        - Pixeles de entrenamiento: 2312
        - Pixeles con proba>=0.4: 132
        - Pixeles con proba<0.4: 22


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:    2.3s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed:    0.1s finished



*** ITERACIÓN #002
        - Modelo guardado en ../model/threshhold_0.4/randomforest_iterations_002/model_002.joblib
        - Probabilidades guardadas en ../model/threshhold_0.4/randomforest_iterations_002/probas_002.npy
        - Pixeles de entrenamiento: 2444
        - Pixeles con proba>=0.4: 16
        - Pixeles con proba<0.4: 6


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:    2.2s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed:    0.1s finished



*** ITERACIÓN #003
        - Modelo guardado en ../model/threshhold_0.4/randomforest_iterations_003/model_003.joblib
        - Probabilidades guardadas en ../model/threshhold_0.4/randomforest_iterations_003/probas_003.npy
        - Pixeles de entrenamiento: 2460
        - Pixeles con proba>=0.4: 3
        - Pixeles con proba<0.4: 3


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:    2.2s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed:    0.1s finished



*** ITERACIÓN #004
        - Modelo guardado en ../model/threshhold_0.4/randomforest_iterations_004/model_004.joblib
        - Probabilidades guardadas en ../model/threshhold_0.4/randomforest_iterations_004/probas_004.npy
        - Pixeles de entrenamiento: 2463
        - Pixeles con proba>=0.4: 0
        - Pixeles con proba<0.4: 3
+++++ PREDICCIONES PARA THRESHOLD 0.56


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:    2.4s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed:    0.1s finished



*** ITERACIÓN #000
        - Modelo guardado en ../model/threshhold_0.56/randomforest_iterations_000/model_000.joblib
        - Probabilidades guardadas en ../model/threshhold_0.56/randomforest_iterations_000/probas_000.npy
        - Pixeles de entrenamiento: 2463
        - Pixeles con proba>=0.56: 0
        - Pixeles con proba<0.56: 3
