In [1]:
# Locate the project root (the directory that contains "Functions/") before
# sourcing the shared helpers. Tries the current directory first, then its
# parent, and finally falls back to a hard-coded machine-specific path.
# NOTE(review): setwd() changes global state for the rest of the session, and
# the fallback path only exists on one workstation -- confirm this is intended.
if(!dir.exists("Functions/")){
    setwd("../")
    if(!dir.exists("Functions")){
        setwd("M:/lecospec/lecospec/")
    }
}
# Load the project's helper functions into the global environment.
source("Functions/lecospectR.R", echo = FALSE)

Loading required package: tidyverse

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.2 ──
[32m✔[39m [34mggplot2[39m 3.3.6      [32m✔[39m [34mpurrr  [39m 0.3.5 
[32m✔[39m [34mtibble [39m 3.1.8      [32m✔[39m [34mdplyr  [39m 1.0.10
[32m✔[39m [34mtidyr  [39m 1.2.1      [32m✔[39m [34mstringr[39m 1.4.1 
[32m✔[39m [34mreadr  [39m 2.1.3      [32m✔[39m [34mforcats[39m 0.5.2 
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
Loading required package: compiler

Loading required package: raster

Loading required package: sp


Attaching package: ‘raster’


The following object is masked from ‘package:dplyr’:

    select


Loading required package: hsdar

Loading required package: rgdal

Please note that rgdal will be retired by the end 

In [2]:
# Append a human-readable summary of one trained model to a text log.
#
# Arguments:
#   model_id         identifier printed on the "Model UUID" line.
#   confusion_matrix object printed under the "Confusion Matrix" banner.
#   distribition     object printed under the "Class Distribution" banner
#                    (parameter name kept as-is so existing callers still work).
#   custom           optional value printed on the "Data Index" line.
#   logpath          file that the output is appended to.
#
# Returns NULL invisibly; the function is called for its side effect only.
log_model_results <- function(model_id, confusion_matrix, distribition, custom = NULL, logpath = "./gs.log"){
    # append performance data to the logs for later comparison
    sink(file = logpath, append = TRUE)
    # Always undo the diversion, even if one of the print() calls below fails;
    # otherwise every later console message would silently end up in the log.
    on.exit(sink(NULL), add = TRUE)

    print("-------------------------------------------------------")
    print("---------------------- Model Data ---------------------")

    print(paste0("Model Type: Ranger (Random Forest)"))
    print(paste0("Data Index: ",custom))
    print(paste0("Model UUID: ", model_id))
    print("---------------------- Confusion Matrix ---------------------")
    print(confusion_matrix)
    print("---------------------- Class Distribution ---------------------")
    print(distribition)
    print("-------------------------------------------------------")

    invisible(NULL)
}

In [3]:
# Record one model run as a comma-separated row in the manifest file.
#
# The manifest is created with a header row the first time it is written to.
# Column order in the file is: source, outliers, preprocessing, weight,
# accuracy, model_id. Values are written as-is (no quoting or escaping).
add_model_to_manifest <- function(
    model_id,
    outlier = "",
    preprocessing = "",
    source = "",
    weight = "",
    accuracy = "",
    logpath = "./gs_manifest.csv"
) {
    manifest_is_new <- !file.exists(logpath)
    if (manifest_is_new) {
        write("source,outliers,preprocessing,weight,accuracy,model_id", file = logpath)
    }

    # Treatment columns first; the model id is always the last field.
    treatment_fields <- paste(source, outlier, preprocessing, weight, accuracy, sep = ",")
    manifest_row <- paste0(treatment_fields, ",", model_id)

    write(manifest_row, file = logpath, append = TRUE)
}

In [4]:
# Fit a ranger random forest on the training data, evaluate it on the test
# data, log the results and save the fitted model to disk.
#
# Arguments:
#   train_df, test_df   predictors; each is coerced with as.data.frame().
#   train_labels        factor of training targets passed to ranger as `y`.
#   test_labels         factor of reference labels for the confusion matrix.
#   outlier_fn          optional function applied to the TRAINING predictors
#                       only. NOTE(review): assumed to keep the row count
#                       unchanged so rows still line up with train_labels.
#   preprocess_fn       optional function applied to both training and test
#                       predictors.
#   weight_fn           function of train_labels whose result is passed to
#                       ranger's `case.weights`.
#   model_id            identifier used in the log and in the saved file name.
#   ignore_cols         currently unused; kept for interface compatibility.
#   seed                optional RNG seed set before anything else happens.
#
# Returns a list with `model` (the ranger fit) and `confusion` (the caret
# confusion matrix converted with as.list()).
#
# Side effects: appends to "./gs3.log" and writes
# "mle/models/gs/<model_id>.rda".
train_model <- function(
    train_df, 
    train_labels,
    test_df, 
    test_labels,
    outlier_fn = NULL,
    preprocess_fn = NULL,
    weight_fn = targets_to_weights,
    model_id = uuid::UUIDgenerate(),
    ignore_cols = NULL,
    seed = NULL
){
    if(!is.null(seed)){
        set.seed(seed)
    }

    x_train <- train_df %>% as.data.frame()
    x_test <- test_df %>% as.data.frame()
    if(is.function(outlier_fn)){
        x_train <- outlier_fn(x_train)
    }
    if(is.function(preprocess_fn)){
        x_train <- preprocess_fn(x_train)
        x_test <- preprocess_fn(x_test)
    }

    model <- ranger::ranger(
        num.trees = 1000,
        case.weights = weight_fn(train_labels),
        classification = TRUE,
        x = x_train,
        y = train_labels
    )

    # The test labels may lack the "Forb" class that the model can predict;
    # add it as an (empty) level so the confusion matrix covers it.
    if(("Forb" %in% levels(train_labels)) && !("Forb" %in% levels(test_labels))){
        levels(test_labels) <- c(levels(test_labels), "Forb")
    }

    # create predictions (ranger)
    # Use the full element name: `$prediction` only worked through partial
    # matching of the list element `predictions`.
    model_predictions <- predict(
        model, 
        x_test
    )$predictions %>% as.factor()

    # generate the confusion matrix
    confusion_matrix <- caret::confusionMatrix(
        model_predictions, 
        test_labels,
        mode = "everything"
    )

    # append performance data to the logs for later comparison
    log_model_results(
        model_id = model_id,
        confusion_matrix = confusion_matrix,
        distribition = model_predictions %>% as.factor() %>% table(),
        logpath = "./gs3.log")

    # save the model using the model UUID; make sure the target directory
    # exists first so a finished fit is not lost to a failed save()
    model_dir <- "mle/models/gs/"
    if(!dir.exists(model_dir)){
        dir.create(model_dir, recursive = TRUE)
    }
    save(model, file = paste0(model_dir, model_id, ".rda"))

    return(
        list(
            model = model,
            confusion = confusion_matrix %>% as.list()
        )
    )
}

In [5]:
# File names of the training data sets to search over. Each name is appended
# to "Data/gs/x_train/" (predictors) and "Data/gs/y_train/" (labels) by the
# grid-search loop.
base_paths <- c(
    "grd_raw_raw.csv",
    "grd_raw_corrected.csv",
    "img_raw_raw.csv",
    "img_indices_only.csv",# include veg indices
    "grd_indices_only.csv"
)
# 

In [6]:
# Total the validation counts for each key in a validation CSV.
#
# Arguments:
#   validation_path  CSV file with (at least) the columns `key` and
#                    `validation_counts`.
#
# Returns a named list with one element per distinct `key`; each element is
# the sum of `validation_counts` over the rows with that key. Despite the
# function name these are raw totals -- get_posterior_weights_from_targets()
# is what turns them into weights by taking the reciprocal.
calculate_posterior_weights <- function(validation_path ="figures/merged_validation_s.csv" ){

    validation_df <- read.csv(validation_path, header = TRUE)

    total_by_fg1 <- aggregate(
        x = validation_df$validation_counts,
        by = list(validation_df$key),
        FUN = sum
    )

    # Name the totals by key. as.character() guards against factor-valued keys,
    # which would otherwise be used as integer positions rather than names.
    fg1_weight_list <- as.list(total_by_fg1$x)
    names(fg1_weight_list) <- as.character(total_by_fg1$Group.1)

    return(fg1_weight_list)
}

# Scale the per-sample weights from targets_to_weights() by the reciprocal of
# each sample's class total.
#
# Arguments:
#   target_factor     class label of every sample (converted with
#                     as.character() for the lookup).
#   posterior_weight  named collection of per-class totals, looked up by class
#                     name; defaults to calculate_posterior_weights().
#
# Returns a numeric vector with one weight per element of target_factor:
#   targets_to_weights(target_factor)[i] / total(class of i)
# and 0 for classes whose total is not positive.
#
# Stops with an explicit message when a label has no entry in
# posterior_weight (previously this surfaced as "argument is of length zero").
get_posterior_weights_from_targets <- function(target_factor, posterior_weight = calculate_posterior_weights()){
    unbiased_weights <- targets_to_weights(target_factor)

    target_name_char <- target_factor %>% as.character()
    n_targets <- length(target_name_char)
    if(n_targets == 0){
        return(numeric(0))
    }

    # Resolve the scaling factor once per class rather than once per sample.
    class_names <- unique(target_name_char)
    unknown <- class_names[!(class_names %in% names(posterior_weight))]
    if(length(unknown) > 0){
        stop(
            "No posterior weight available for class(es): ",
            paste(unknown, collapse = ", "),
            call. = FALSE
        )
    }

    class_totals <- vapply(
        class_names,
        function(class_name) as.numeric(posterior_weight[[class_name]]),
        numeric(1)
    )
    if(anyNA(class_totals)){
        stop("Posterior weights must not be NA", call. = FALSE)
    }

    # Reciprocal of the total for classes that were observed, 0 otherwise.
    class_scale <- numeric(length(class_totals))
    observed <- class_totals > 0
    class_scale[observed] <- 1 / class_totals[observed]

    base_weights <- as.numeric(unlist(unbiased_weights))
    if(length(base_weights) < n_targets){
        stop("targets_to_weights() returned fewer weights than targets", call. = FALSE)
    }

    output_weights <- base_weights[seq_len(n_targets)] *
        class_scale[match(target_name_char, class_names)]

    return(output_weights)
}

In [7]:
# Sanity check: compute the per-key validation totals from the default CSV
# path and display them in the notebook output.
test_p_weight <- calculate_posterior_weights()
test_p_weight 


In [8]:
# ---- Grid-search treatment tables ----
# Each `*_functions` list maps a treatment name to the function implementing
# it; the matching `*_treatments` vector fixes the order in which the names
# are visited by the grid-search loop.

# Outlier handling: a stored clipping transform, or pass the data through.
outlier_functions <- list(
    clip = load_model("./mle/clip_transform.rda"),
    no_treatment = function(x, ignore_cols = NULL) x
)
outlier_treatments <- c("no_treatment", "clip")

# Column scaling applied after outlier handling.
preprocess_functions <- list(
    no_treatment = function(x, ignore_cols = NULL) x,
    min_max = columnwise_min_max_scale,
    robust = columnwise_robust_scale,
    standard = standardize_df
)

# Case-weight schemes; "no_treatment" yields NULL, i.e. no weights at all.
weight_functions <- list(
    posterior = get_posterior_weights_from_targets,
    balanced = targets_to_weights,
    no_treatment = function(x) NULL
)
weight_treatments <- c("no_treatment", "balanced", "posterior")

preprocessing_treatments <- c("robust", "min_max", "no_treatment", "standard")

In [9]:
# Held-out evaluation set shared by every model in the grid search: predictors
# and labels both come from the "img_raw_raw.csv" split, regardless of which
# training file is being used.
# NOTE(review): the training loop drops the column `X` from its predictors but
# this frame keeps every column as read -- confirm that is intended.
test_data <- read.csv("Data/gs/x_test/img_raw_raw.csv")
# Labels are stored in the column `x` of the y_test file.
test_labels <- read.csv("Data/gs/y_test/img_raw_raw.csv")$x %>% as.factor()
#train_labels <- read.csv("Data/gs/y_train/img_raw_raw.csv")$x %>% as.factor()

In [10]:
# Grid search: for every training file and every combination of outlier,
# preprocessing and weighting treatment, train a model, record its accuracy in
# the manifest, run the validation pipeline and write the summary artefacts
# into a per-model directory under "mle/experiments/gs/".
for(filepath in base_paths){
    # Predictors without the `X` column, and the matching labels (column `x`).
    train_data <- subset(read.csv(paste0("Data/gs/x_train/", filepath)), select = -c(X))
    labels <- read.csv(paste0("Data/gs/y_train/", filepath))$x %>% as.factor()

    for(o_treatment in outlier_treatments){
        for(p_treatment in preprocessing_treatments){
            for(w_treatment in weight_treatments){

                print(p_treatment)
                model_id <- uuid::UUIDgenerate()
                save_path <- paste0("mle/experiments/gs/", model_id, "/")
                if(!dir.exists(save_path)){
                    # recursive = TRUE so the first run also creates the
                    # parent "mle/experiments/gs/" directory when it is missing
                    dir.create(save_path, recursive = TRUE)
                }

                # Same seed for every configuration so runs are comparable.
                rf_model_results <- train_model(
                    train_data, 
                    labels, 
                    test_data,
                    test_labels,
                    outlier_fn = outlier_functions[[o_treatment]],
                    preprocess_fn = preprocess_functions[[p_treatment]],
                    weight_fn = weight_functions[[w_treatment]],
                    model_id = model_id,
                    seed=61718
                )

                rf_model <- rf_model_results$model
                acc <- as.list(rf_model_results$confusion$overall)$Accuracy
                print(acc)

                add_model_to_manifest(
                    model_id = model_id,
                    outlier = o_treatment,
                    preprocessing = p_treatment,
                    source = filepath,
                    weight = w_treatment,
                    accuracy = acc,
                    logpath="./gs_manifest_4.csv"
                )

                # Validate with the same outlier and scaling treatments that
                # were used for training (stray trailing comma removed so no
                # empty argument is passed).
                results <- validate_model(
                    rf_model, 
                    save_path, 
                    outlier_processing = outlier_functions[[o_treatment]],
                    transform_type = preprocess_functions[[p_treatment]]
                )

                aggregated_results <- aggregate_results(save_path)

                plot_by_pft(
                    aggregated_results,
                    save_path = paste0(save_path, "aggregate.html"),
                    open = FALSE,
                    image_path = paste0(save_path, "aggregates.png")
                )

                write_validation_table(
                    aggregated_results,
                    save_path = paste0(save_path, "table.html"),
                    open = FALSE
                )
            }
        }
    }
}

[1] "robust"


“Levels are not in the same order for reference and data. Refactoring data to match.”


[1] 0.3583333
[1] "preprocessing raster at Data/Ground_Validation/Imagery/BisonGulchQuads.envi"
[1] "Converted to Data frame?"
[1] TRUE
[1] "Noisy columns removed"
[1] TRUE
[1] "Filtered"
[1] TRUE


Using spline to predict value at new bands...

Beware the spectra are now partially smoothed.



[1] "Imputing..."
[1] "Handling Outliers with User supplied function"
[1] "Transforming Data with user supplied functions"
       x              y          X402.593_5nm       X407.593_5nm     
 Min.   :-149   Min.   :63.81   Min.   :-1.21169   Min.   :-1.28856  
 1st Qu.:-149   1st Qu.:63.81   1st Qu.:-0.59546   1st Qu.:-0.59559  
 Median :-149   Median :63.81   Median : 0.00000   Median : 0.00000  
 Mean   :-149   Mean   :63.81   Mean   :-0.06945   Mean   :-0.07053  
 3rd Qu.:-149   3rd Qu.:63.81   3rd Qu.: 0.40454   3rd Qu.: 0.40441  
 Max.   :-149   Max.   :63.81   Max.   : 1.47839   Max.   : 1.46966  
                                                                     
  X412.593_5nm       X417.593_5nm       X422.593_5nm      X427.593_5nm    
 Min.   :-1.17964   Min.   :-1.15294   Min.   :-1.1907   Min.   :-1.1371  
 1st Qu.:-0.58230   1st Qu.:-0.56811   1st Qu.:-0.5626   1st Qu.:-0.5562  
 Median : 0.00000   Median : 0.00000   Median : 0.0000   Median : 0.0000  
 Mean   :-0.06022

[1m[22m`.cols` has been renamed and is deprecated, please use `.vars`
“[1m[22m`funs()` was deprecated in dplyr 0.8.0.
[36mℹ[39m Please use a list of either functions or lambdas:

# Simple named list: list(mean = mean, median = median)

# Auto named with `tibble::lst()`: tibble::lst(mean, median)

# Using lambdas list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
[36mℹ[39m The deprecated feature was likely used in the [34museful[39m package.
  Please report the issue at [3m[34m<https://github.com/jaredlander/useful/issues>[39m[23m.”


          x        y z
1 -148.9508 63.80701 1
2 -148.9508 63.80701 1
3 -148.9508 63.80701 1
4 -148.9508 63.80701 1
5 -148.9508 63.80701 1
6 -148.9508 63.80701 1
          x        y z
1 -148.9508 63.80701 1
2 -148.9508 63.80701 1
3 -148.9508 63.80701 1
4 -148.9508 63.80701 1
5 -148.9508 63.80701 1
6 -148.9508 63.80701 1
[1] "Attempting to save to ./validation_saved_output.grd"
[1] "Converted to Raster"
Reading layer `Bisoon_Quadrats_georeferenced' from data source 
  `/home/krbundy/GitHub/lecospec/Data/Vectors/Bisoon_Quadrats_georeferenced.shp' 
  using driver `ESRI Shapefile'
Simple feature collection with 9 features and 3 fields
Geometry type: POLYGON
Dimension:     XY
Bounding box:  xmin: -148.9524 ymin: 63.80698 xmax: -148.9508 ymax: 63.80701
Geodetic CRS:  WGS 84
Simple feature collection with 1 feature and 3 fields
Geometry type: POLYGON
Dimension:     XY
Bounding box:  xmin: -148.9508 ymin: 63.807 xmax: -148.9508 ymax: 63.80701
Geodetic CRS:  GCS_unknown
  CLASS_ID  CLASS_NAME A

Saving 6.67 x 6.67 in image



In [None]:
# Scratch cell: show the overall accuracy of the most recently trained model.
print(as.list(rf_model_results$confusion$overall)$Accuracy)

In [None]:
# Scratch cell: display the outlier function selected by the current value of
# `o_treatment` (left over from the grid-search loop).
outlier_functions[[o_treatment]]

In [None]:
# Scratch cell: display the preprocessing function selected by the current
# value of `p_treatment` (left over from the grid-search loop).
preprocess_functions[[p_treatment]]

In [None]:
                # Scratch cell: re-run the aggregate plot for the last
                # `aggregated_results` / `save_path`, with image_path set to
                # NULL instead of the PNG path used in the grid-search loop.
                plot_by_pft(
                    aggregated_results,
                    save_path = paste0(save_path, "aggregate.html"),
                    open = FALSE,
                    image_path = NULL#paste0(save_path, "aggregates.png")
                )