In [None]:
# A notebook's working directory is the folder it lives in. If this notebook
# sits in a subfolder, "Functions/" is not visible from here, so step up one
# level. Re-running the cell is harmless: once "Functions/" is visible the
# working directory is left untouched.
if (isFALSE(dir.exists("Functions/"))) {
    setwd("../")
}

# Project helper functions (loaded quietly).
source("Functions/lecospectR.R", echo = FALSE)

library(class)  # lvqinit / olvq1 / lvqtest
library(caret)  # preProcess / confusionMatrix / train
library(vegan)  # scores

## Load the Data

In [None]:
# Spectral library table.
base_path <- "./Output/C_001_SC3_Cleaned_SpectralLib.csv"
speclib <- read.csv(file = base_path)

# Vegetation-index table for the same library.
veg_index_path <- "./Data/D_002_SpecLib_Derivs.csv"
veg_indices <- read.csv(file = veg_index_path)

In [None]:
# Class labels: every non-missing Functional_group1 entry, as a factor.
targets <- as.factor(
    veg_indices$Functional_group1[!is.na(veg_indices$Functional_group1)]
)

# Weights derived from the labels by the project helper targets_to_weights().
weights_by_pft <- targets_to_weights(targets)

In [None]:
# Image-derived spectra used for validation.
uav_speclib_df <- read.csv(
    file = "Data/Ground_Validation/PFT_image_spectra/PFT_Image_SpectralLib_Clean_unsmoothed.csv",
    header = TRUE
)

# Predictors: column 16 through the second-to-last column.
image_validation <- uav_speclib_df[, 16:(ncol(uav_speclib_df) - 1)]

# Labels as a factor, with "Forb" appended to the set of levels.
validation_labels <- as.factor(uav_speclib_df$FncGrp1)
levels(validation_labels) <- append(levels(validation_labels), "Forb")



## Base transformation
This replaces infinite values and outliers with NA and then imputes the missing values.  

In [None]:
# Rows with a known functional group, columns 35 to 195, with infinite
# values turned into NA.
numeric_data <- inf_to_na(
    veg_indices[!is.na(veg_indices$Functional_group1), 35:195]
)

# Impute, blank out outliers, then impute once more to fill the new gaps.
imputed_data_1 <- impute_spectra(numeric_data)
imputed_data_no_outliers <- outliers_to_na(imputed_data_1)
imputed_data <- impute_spectra(imputed_data_no_outliers)

# Outlier flags computed from the first 95 columns, used here as a row mask.
outlier_indices <- detect_outliers_columnwise(imputed_data[, 1:95])
filtered_data <- imputed_data[!outlier_indices, ]

# Distribution of pairwise distances between the imputed rows.
hist(dist(as.matrix(imputed_data)))

min_max_scaled_data <- columnwise_min_max_scale(imputed_data)

## Transform the Image-based Data

In [None]:
veg_index_names <- read.csv("assets/vegIndicesUsed.csv")$x

# Same cleaning sequence as for the ground data: infinities to NA, impute,
# outliers to NA, impute again.
validation_indices <- get_vegetation_indices(image_validation, NULL) %>%
    inf_to_na() %>%
    impute_spectra() %>%
    outliers_to_na() %>%
    impute_spectra()

min_max_scaled_validation <- columnwise_min_max_scale(validation_indices)

#hist(as.matrix(min_max_scaled_validation))

In [None]:
# Column-wise summary of the scaled image-based indices.
min_max_scaled_validation %>%
    summary() %>%
    print()

In [None]:
# Weights for the image-based labels, from the same helper used for the
# ground labels.
image_weights <- targets_to_weights(as.factor(validation_labels))

## PCA 
This is where we calculate the PCA for the ground and image spectra.

In [None]:
# PCA on the ground data, without centering or scaling. The input is every
# column of imputed_data except the last 66 (counted from numeric_data).
pca_input <- imputed_data[, 1:(ncol(numeric_data) - 66)]
pca_fit <- stats::prcomp(pca_input, center = FALSE, scale. = FALSE)
print(summary(pca_fit))

# Project the same rows and keep the first 64 components.
pca_training_data <- predict(pca_fit, pca_input)[, 1:64]

# Second component, split by label.
boxplot(vegan::scores(pca_training_data)[, 2] ~ targets)

## Standardization
This cell standardizes the input to center at zero with standard deviation one.

In [None]:
# Ground data: fit a caret pre-processor on the first 95 columns and apply it
# to those same columns.
ground_index_columns <- imputed_data[, 1:95]
indice_standardizer <- caret::preProcess(ground_index_columns)
standardized_indices <- predict(indice_standardizer, ground_index_columns)

# Image data: a separate pre-processor fitted on the image-based indices.
val_standardizer <- caret::preProcess(validation_indices)
standardized_validation <- predict(val_standardizer, newdata = validation_indices)

In [None]:
# Compare the distribution of the standardized "Carter" column in the ground
# data (first plot) against the image-based data (second plot).
hist(standardized_indices$Carter, breaks = 20)
hist(standardized_validation$Carter, breaks = 20)

## Min-Max Scaling
This executes the min-max scaling (to put the data on the scale [0,1])

In [None]:
# Histograms of all min-max scaled values (image data first, then ground data).
hist(min_max_scaled_validation %>% as.matrix())
hist(min_max_scaled_data %>% as.matrix())

# `validation_outliers` is not created anywhere earlier in this notebook, so
# fall back to a mask that drops nothing. This also keeps the rows of
# pca_validation_data aligned one-to-one with validation_labels, which the
# grid search relies on when it indexes both with img_train_idx.
if (!exists("validation_outliers")) {
    validation_outliers <- rep(FALSE, nrow(validation_indices))
}

# Project the image-based indices onto the ground PCA; keep 64 components.
pca_validation_data <- predict(pca_fit, validation_indices[!validation_outliers, ])[, 1:64] %>%
    as.data.frame()

# Second component, split by label.
boxplot(vegan::scores(pca_validation_data)[, 2] ~ validation_labels[!validation_outliers])

## KS Tests of Transferability
These next few cells test whether the veg indices are similarly distributed (i.e. could be samples drawn from the same distribution)

The hypothesis is that columns (veg indices) that pass this test can safely be used across models and conditions (are transferable)

In [None]:
# Re-source the project helpers before running the test.
source(file = "Functions/lecospectR.R")

# Compare the scaled ground data against the scaled image data.
ks_test_results <- test_transferrability(
    min_max_scaled_data,
    min_max_scaled_validation
)
ks_test_results %>% print()

## t-SNE
Examine the clusters in the data via *t*-SNE

In [None]:
library(Rtsne)

# De-duplicate on exactly the 95 columns that get embedded: rows that differ
# only in the remaining columns would otherwise still be duplicates in the
# t-SNE input.
tsne_input <- imputed_data[, 1:95]
keep_rows <- !duplicated(tsne_input)
unique_indices <- tsne_input[keep_rows, ]

# Labels for the rows that were kept, so colours line up with the embedding.
unique_targets <- as.factor(targets)[keep_rows]

normalized_veg_indices <- Rtsne::normalize_input(
    unique_indices %>%
    as.matrix()
    )
embedding_2D <- Rtsne::Rtsne(normalized_veg_indices)
print(names(embedding_2D))

# Points are coloured by factor code, so the legend must list the levels in
# level order (not order of first appearance) to match those colours.
plot(embedding_2D$Y, col = unique_targets)
par(xpd = TRUE)
legend(
    "topright",
    legend = levels(unique_targets),
    col = seq_along(levels(unique_targets)),
    pch = 1
)

## Vector Quantization Classifier
This fits an LVQ classifier to the ground data and then evaluates it on both the ground data and the image-based validation data.

In [None]:
# Number of validation labels, then number of validation rows.
validation_labels %>% length() %>% print()
min_max_scaled_validation %>% nrow() %>% print()

In [None]:
# method 1 - transfers with 23% accuracy, which is one of the best actually
library(class)

# Ground training matrix: first 95 columns of the min-max scaled data.
lvq_train_x <- min_max_scaled_data[, 1:95]

# Initialise a codebook of size 10, then train it with OLVQ1.
codeBook <- lvqinit(lvq_train_x, targets, size = 10)
code_book_train <- class::olvq1(lvq_train_x, targets, codeBook)

# Performance on the ground data the codebook was trained on.
prediction <- class::lvqtest(code_book_train, lvq_train_x)
lvq_conf <- caret::confusionMatrix(
    data = prediction,
    reference = targets,
    mode = "everything"
)

#print(lvq_conf)

# Performance on the image-based validation data.
image_prediction <- class::lvqtest(code_book_train, min_max_scaled_validation)
lvq_validation_conf <- caret::confusionMatrix(
    data = image_prediction,
    reference = validation_labels,
    mode = "everything"
)



In [None]:
# Confusion matrix for the LVQ model on the ground data.
lvq_conf %>% print()

In [None]:
# Confusion matrix for the LVQ model on the image-based data.
lvq_validation_conf %>% print()

## Train-Test Split

Perform an 80-20 split on the data (use the split on the fly during the grid search)

In [None]:
# Split mask over the ground labels with a 0.8 split ratio.
grd_train_idx <- caTools::sample.split(
    Y = targets,
    SplitRatio = 0.8
)

In [None]:
# Split mask over the image-based labels with a 0.8 split ratio.
img_train_idx <- caTools::sample.split(
    Y = validation_labels,
    SplitRatio = 0.8
)

## Random Forest
Trains a random forest model on the image-based data.

In [None]:

# Weighted random-forest classifier (256 trees) fitted on the image-based
# indices and labels.
rf_model <- ranger::ranger(
    x = validation_indices,
    y = validation_labels,
    case.weights = image_weights,
    num.trees = 256,
    classification = TRUE
)

print(rf_model)

In [None]:
# Predict on the same data the forest was fitted on and tabulate the result
# against the labels.
predictions <- as.factor(predict(rf_model, validation_indices)$predictions)
confusion_matrix <- caret::confusionMatrix(
    data = predictions,
    reference = validation_labels,
    mode = "everything"
)
print(confusion_matrix)

# Grid Search

This next section defines all the essentials for the grid search across our different candidate models. 

## Candidates

### Models
* Random Forest
* Learning Vector Quantization (LVQ)
* k-Nearest Neighbor (kNN)

Could also consider Support Vector Machine (SVM), Gradient Boosted Trees (e.g. LightGBM, XGBoost), matched filtering, Logistic Regression, etc.

### Data/Transformations

For each of the image/training data sets, test the following:
* raw, 
* raw (no outliers)
* standardized (z-score standardization)
* standardized (z-score standardization, no outliers)
* min-max scaled
* min-max scaled (no outliers)
* PCA
* PCA no outliers

Need to also vary how many columns are included in the analysis

In [None]:
# Data sets to loop over in the grid search. The four lists are parallel:
# element i of each list belongs to the same experiment. Odd positions train
# on the ground data, even positions train on the training split of the image
# data; every experiment is tested on the held-out image rows.
#
# img_train_idx is a logical mask, so the held-out rows are selected with
# `!img_train_idx`. (Negating it arithmetically with `-` would turn it into
# 0 / -1 indices and drop only the first row.)
gs_train <- list(
    min_max_scaled_data[, 1:95],
    min_max_scaled_validation[img_train_idx, ],
    standardized_indices,
    standardized_validation[img_train_idx, ],
    pca_training_data,
    pca_validation_data[img_train_idx, ]
)

gs_test <- list(
    min_max_scaled_validation[!img_train_idx, ],
    min_max_scaled_validation[!img_train_idx, ],
    standardized_validation[!img_train_idx, ],
    standardized_validation[!img_train_idx, ],
    pca_validation_data[!img_train_idx, ],
    pca_validation_data[!img_train_idx, ]
)

gs_train_labels <- list(
    targets,
    validation_labels[img_train_idx],
    targets,
    validation_labels[img_train_idx],
    targets,
    validation_labels[img_train_idx]
)

gs_test_labels <- list(
    validation_labels[!img_train_idx],
    validation_labels[!img_train_idx],
    validation_labels[!img_train_idx],
    validation_labels[!img_train_idx],
    validation_labels[!img_train_idx],
    validation_labels[!img_train_idx]
)

In [None]:
# caret method names to try, one list element per method.
# "rmda" is currently disabled; "hda" is heteroscedastic discriminant analysis.
gs_methods <- as.list(c(
    "svmLinear",
    "rf",
    "svmRadialWeights",
    "gbm",
    "hda"
))
# add: PLS-LDA, kNN, SVM+poly Kernel, SVM+Exp Kernel, more boosting, 

In [None]:
# Labels for the weighting schemes. c() drops the NULL, so this is currently
# the single string "prior weights".
gs_weight_text <- c(
    "prior weights",
    NULL
)

# Case weights, parallel to gs_train. The image-trained experiments (even
# positions) train only on the rows selected by img_train_idx, so their
# weights are subset with the same mask to keep one weight per training row.
gs_weights <- list(
    weights_by_pft,
    image_weights[img_train_idx],
    weights_by_pft,
    image_weights[img_train_idx],
    weights_by_pft,
    image_weights[img_train_idx]
)

# Resampling scheme shared by every model: 10-fold CV repeated 3 times, with
# class probabilities and parallel execution enabled.
fit_ctrl <- caret::trainControl(
    method = "repeatedcv",
    number = 10,
    repeats = 3,
    classProbs = TRUE,
    allowParallel = TRUE
)

In [None]:
# Grid search: train every method on every data set, evaluate on the matching
# test set, append the results to a log file and save the fitted model.

# Make sure the output folders exist before anything is written to them.
dir.create("mle/models/gs", recursive = TRUE, showWarnings = FALSE)

for (i in seq_along(gs_train)) {
    for (j in seq_along(gs_methods)) {
        # train and print intermediate results to console
        df <- data.frame(gs_train[[i]])
        df$targets <- as.factor(gs_train_labels[[i]])
        print("Beginning Training")
        model <- train(
            targets ~ .,
            data = df,
            method = gs_methods[[j]],
            trControl = fit_ctrl,
            weights = gs_weights[[i]],
            verbose = TRUE
        )
        print(model)

        model_predictions <- predict(
            model,
            gs_test[[i]]
        ) %>% as.factor()

        test_labels <- gs_test_labels[[i]] %>% as.factor()
        levels(test_labels) <- c(levels(test_labels), "Forb")

        confusion_matrix <- caret::confusionMatrix(
            model_predictions,
            test_labels,
            mode = "everything"
        )

        # append performance data to the logs for later comparison
        # (tryCatch/finally guarantees console output is restored even if one
        # of the print calls fails while output is diverted)
        sink(file = "mle/logs.txt", append = TRUE)
        tryCatch(
            {
                print("-------------------------------------------------------")
                print("---------------------- Model Data ---------------------")

                print(paste0("Model Type: ", gs_methods[[j]]))
                print(paste0("Data Index: ",i))
                print("---------------------- Confusion Matrix ---------------------")
                print(confusion_matrix)
                print("\n")
                print("---------------------- Class Distribution ---------------------")
                print(model_predictions %>% as.factor() %>% table())
                print("-------------------------------------------------------")
                print("\n\n")
            },
            finally = sink(NULL)
        )

        save(model, file = paste0("mle/models/gs/model_", i, "type_", j, ".rda"))
    }
}

# Results

These are the results of the grid search on the basics.  High performing models get to go through validation

## Top performers:
* 