# Entrenamiento iterativo

In [None]:
import os
import json
import joblib
import numpy as np
import pandas as pd
from glob import glob
from sqlite3 import connect
from sklearn.ensemble import RandomForestClassifier

## Conjunto de entrenamiento

In [None]:
# Load every field-truth SQLite export and stack the rows of their `output`
# tables into a single training DataFrame.
train_sqlite_files = glob('../data/selection_verdad_campo/*.sqlite')

train_frames = []
for sf in train_sqlite_files:
    cnx = connect(sf)
    try:
        train_frames.append(pd.read_sql_query("SELECT * FROM output", cnx))
    finally:
        # release the file handle even if the query fails
        cnx.close()

# Concatenate once instead of growing the frame inside the loop (which copies
# all accumulated rows on every pass); keep the empty-frame result when no
# file matched the pattern.
if train_frames:
    train_data = pd.concat(train_frames, ignore_index=True)
else:
    train_data = pd.DataFrame()

## Conjunto de predicción

In [None]:
# Load the SQLite exports whose file name starts with "12" and stack the rows
# of their `output` tables into a single DataFrame of pixels to be predicted.
pred_sqlite_files = glob('../data/selection_mask_agri_aoi/12*.sqlite')

pred_frames = []
for sf in pred_sqlite_files:
    cnx = connect(sf)
    try:
        pred_frames.append(pd.read_sql_query("SELECT * FROM output", cnx))
    finally:
        # release the file handle even if the query fails
        cnx.close()

# Concatenate once instead of growing the frame inside the loop (which copies
# all accumulated rows on every pass); keep the empty-frame result when no
# file matched the pattern.
if pred_frames:
    pred_data = pd.concat(pred_frames, ignore_index=True)
else:
    pred_data = pd.DataFrame()

In [None]:
# Read the keyword arguments later passed to RandomForestClassifier from JSON.
parameters_path = '../model/randomforest_parameters.json'
with open(parameters_path, 'r') as parameters_file:
    parameters = json.load(parameters_file)

In [None]:
# Minimum predicted-class probability (pred_score) a pixel needs in order to be
# moved from the prediction set into the training set during the iterations.
threshold = 0.95

In [None]:
# Iterative self-training: fit a random forest on the labelled pixels, score the
# unlabelled ones, move those with pred_score >= threshold into the training set
# (using the predicted class as their label) and repeat. The loop ends when no
# pixel crosses the threshold or when there is nothing left to predict.
i = 0
while True:
    # build the output folder for this iteration (i grows with the iterations)
    n_iter = '{0:03d}'.format(i)
    output_folder = os.path.join('..', 'model', f'randomforest_iterations_{n_iter}')
    os.makedirs(output_folder, exist_ok=True)
    # take the datasets; the same column list selects the features of both
    # tables so they reach the model in the order it was fitted on
    columns = train_data.filter(regex='band_').columns.to_list()
    X_train = train_data[columns].to_numpy()
    y_train = train_data['id'].to_numpy()
    X_pred = pred_data[columns].to_numpy()
    # instantiate and train the model
    model = RandomForestClassifier(**parameters)
    model.fit(X_train, y_train)
    output_model_file = os.path.join(output_folder, f'model_{n_iter}.joblib')
    joblib.dump(model, output_model_file)
    # predict
    probas = model.predict_proba(X_pred)
    output_proba_file = os.path.join(output_folder, f'probas_{n_iter}.npy')
    np.save(output_proba_file, probas)
    # predict_proba columns follow model.classes_, so the argmax column index is
    # mapped back to the real class label; storing the bare index would corrupt
    # the `id` target whenever the labels are not exactly 0..n_classes-1
    predictions = pred_data.assign(
        pred_class=model.classes_[probas.argmax(axis=1)],
        pred_score=probas.max(axis=1),
    )
    # split into new train rows and rows that still need a prediction
    add_to_train = predictions[predictions['pred_score'] >= threshold].copy()
    continue_pred = predictions[predictions['pred_score'] < threshold].copy()
    train_data_len = train_data.shape[0]
    add_to_train_len = add_to_train.shape[0]
    continue_pred_len = continue_pred.shape[0]
    # per-iteration pixel counts
    output_pixels_file = os.path.join(output_folder, f'pixels_{n_iter}.csv')
    pixel_counts = pd.DataFrame(
        [
            ['De entrenamiento', train_data_len],
            [f'Con proba>={threshold}', add_to_train_len],
            [f'Con proba<{threshold}', continue_pred_len],
        ],
        columns=['Pyxels_type', 'Pixels'],
    )
    pixel_counts.to_csv(output_pixels_file, index=False)
    # copy the prediction into the id column (target) and add the rows to the
    # training set; pd.concat replaces DataFrame.append, removed in pandas 2.0
    add_to_train['id'] = add_to_train['pred_class']
    train_data = pd.concat([train_data, add_to_train], ignore_index=True)
    pred_data = continue_pred
    # print information
    print('''\n*** ITERACIÓN #{0:03d}
    - Modelo guardado en {1}
    - Probabilidades guardadas en {2}
    - Pixeles de entrenamiento: {3}
    - Pixeles con proba>={4}: {5}
    - Pixeles con proba<{4}: {6}'''.format(i, output_model_file, output_proba_file, train_data_len, threshold, add_to_train_len, continue_pred_len))
    i += 1
    # stop when no pixel crossed the threshold, or when every remaining pixel
    # did: the next predict_proba call would receive zero rows and fail
    if add_to_train_len == 0 or continue_pred_len == 0:
        break

# Save the final prediction (the accumulated training table).
# Rows without a pred_class value are presumably the original field-truth pixels.
# NOTE(review): an earlier comment described replacing those NaN values with
# 'vc_original'; no such replacement is performed here — confirm whether it is wanted.
final_prediction = os.path.join('..', 'model', 'randomforest_iterations_final_prediction.csv')
train_data.to_csv(final_prediction)