# Ethiopia Random Forest Model

## 1. Import Libraries

In [None]:
import os
import gc
import sys
import glob
import random
import logging
import argparse
from tqdm import tqdm

import joblib
import numpy as np
import xarray as xr
import pandas as pd
from osgeo import gdal, osr
import rasterio as rio
import rasterio.features as riofeat

import cupy
import cudf
import cuml

from cuml.ensemble import RandomForestClassifier as cumlRFC
from cuml.model_selection import train_test_split
from cuml.metrics import accuracy_score
from cupyx.scipy.ndimage import median_filter

# Seed CuPy's global random generator. The module is imported above as
# `cupy` (no `cp` alias), so it must be referenced by its full name.
cupy.random.seed(seed=24)

## Define Parameters

In addition to the number of examples, random forest fitting performance depends heavily on the number of columns in a dataset and (especially) on the maximum depth to which trees are allowed to grow. Lower max_depth values can greatly speed up fitting, though going too low may reduce accuracy.

In [None]:
# Names of the pipeline stages associated with this notebook.
pipeline_step = [
    'train',
    'predict',
]

In [None]:
# Data parameters
# Path of the CSV file holding the training table.
train_csv = '/att/pubrepo/ILAB/projects/Ethiopia/ethiopia-lcluc/data/random_forest/train_data.csv'
# Seed value intended for the random steps of the notebook.
seed = 24
# Fraction of the rows assigned to the training split.
train_size = 0.80

# Random Forest building parameters
# Each name is assigned exactly once. The cell previously set `max_feat`
# twice and set `n_trees = 20` before overwriting it with 10000; the values
# below are the ones that were actually in effect.
n_trees = 10000
max_feat = 'log2'
max_depth = 24 # 12 - bad
n_bins = 16

## Read data csv file

In [None]:
# Validate with explicit raises rather than `assert`, which is stripped when
# Python runs with -O and would let a bad path or dirty table through.
if not os.path.exists(train_csv):
    raise FileNotFoundError(f'{train_csv} not found.')

# Load the training table straight into a cuDF DataFrame.
data_df = cudf.read_csv(train_csv, sep=',')

# Refuse to continue if any cell of the table is null.
if data_df.isnull().values.any():
    raise ValueError(f'Na found: {train_csv}')

print(data_df, type(data_df))

## Shuffle and Split Dataset

In [None]:
# Shuffle all rows (frac=1) and rebuild a clean 0..n-1 index. Passing the
# notebook-level `seed` as random_state makes the shuffle repeatable.
data_df = data_df.sample(frac=1, random_state=seed).reset_index(drop=True)

In [None]:
# Every column except the last is a feature; the last column is the label.
# Features are cast to float32 and labels to int32.
# Original author's note: dask_cudf does not support iloc operations, the
# objects get converted to plain cudf.
x = (
    data_df
    .iloc[:, :-1]
    .astype(np.float32)
)
y = (
    data_df
    .iloc[:, -1]
    .astype(np.int32)
)

In [None]:
# Split into train and test partitions. `random_state=seed` makes the split
# repeatable; it was previously unseeded even though `seed` is defined in the
# parameters cell.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, train_size=train_size, random_state=seed)

# Drop the references to the unsplit data so its memory can be reclaimed.
del data_df, x, y

# Report the number of rows in each partition.
print(f'X_train: {X_train.shape[0]} elements')
print(f'X_test:  {X_test.shape[0]} elements')
print(f'y_train: {y_train.shape[0]} elements')
print(f'y_test:  {y_test.shape[0]} elements')

## Train the single-GPU cuML model

In [None]:
%%time

# Build the classifier from the hyperparameters declared in the parameters
# cell. `max_depth`, `n_bins` and `seed` were declared there but never passed
# in, so the library defaults were silently used instead.
# NOTE(review): deeper trees with this many estimators may take noticeably
# longer to fit; confirm these are the intended settings.
cuml_model = cumlRFC(
    n_estimators=n_trees,
    max_features=max_feat,
    max_depth=max_depth,
    n_bins=n_bins,
    random_state=seed,
)

# Fit on the training partition.
cuml_model.fit(X_train, y_train)

## Predict and Validate Accuracy

In [None]:
# Predict labels for the held-out test partition with the fitted forest.
cuml_y_pred = cuml_model.predict(X_test)

# The algorithm is randomized, so the reported accuracy can differ slightly
# from one run to the next.
print(
    "CuML accuracy:     ",
    accuracy_score(y_test, cuml_y_pred),
)

In [None]:
"""
        # ------------------------------------------------------------------
        # 3. Instantiate RandomForest object - FIX this area
        # ------------------------------------------------------------------
        if args.has_gpu:  # run using RAPIDS library

            # initialize cudf data and log into GPU memory
            logging.info('Training model via RAPIDS.')

            # single gpu setup
            x_train = cf.DataFrame.from_pandas(x_train)
            x_test = cf.DataFrame.from_pandas(x_test)
            y_train = cf.Series(y_train.values)
            rf_funct = cumlRFC  # RF Classifier

            # TODO: multi gpu setup
            # https://github.com/rapidsai/cuml/blob/branch-21.12/notebooks/random_forest_mnmg_demo.ipynb
            # cluster = LocalCUDACluster(
            # threads_per_worker=1, n_workers=n_workers)
            # c = Client(cluster)
            # workers = c.has_what().keys()
            # rf_funct = cumlRFC_mg


        # ------------------------------------------------------------------
        # 4. Fit Model
        # ------------------------------------------------------------------
        # fit model to training data and predict for accuracy score
        rf_model.fit(x_train, y_train)

        if args.has_gpu:
            acc_score = accuracy_score(
                y_test, rf_model.predict(x_test).to_array())
            p_score = precision_score(
                y_test, rf_model.predict(x_test).to_array(), average='macro')
            r_score = recall_score(
                y_test, rf_model.predict(x_test).to_array(), average='macro')
            f_score = f1_score(
                y_test, rf_model.predict(x_test).to_array(), average='macro')
        else:
            acc_score = accuracy_score(y_test, rf_model.predict(x_test))
            p_score = precision_score(y_test, rf_model.predict(x_test), average='macro')
            r_score = recall_score(y_test, rf_model.predict(x_test), average='macro')
            f_score = f1_score(y_test, rf_model.predict(x_test), average='macro')

        logging.info(f'Test Accuracy:  {acc_score}')
        logging.info(f'Test Precision: {p_score}')
        logging.info(f'Test Recall:    {r_score}')
        logging.info(f'Test F-Score:   {f_score}')

        # make output directory
        os.makedirs(
            os.path.dirname(os.path.realpath(args.output_pkl)), exist_ok=True)

        # export model to file
        try:
            joblib.dump(rf_model, args.output_pkl)
            logging.info(f'Model has been saved as {args.output_pkl}')
        except Exception as e:
            logging.error(f'ERROR: {e}')
"""