analysis/feature-importance.Rmd

---
title: "Feature importance analysis"
output: 
  workflowr::wflow_html:
    includes:
      in_header: header.html
editor_options:
  chunk_output_type: console
author: "Patrick Schratz"
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(
  fig.retina = 3,
  fig.align = "center",
  fig.width = 8.5,
  fig.asp = 0.66,
  out.width = "100%",
  echo = FALSE
)

options(
  scipen = 999
)

library("drake")
library("hsdar")
library("dplyr")
library("ggplot2")
library("ggpubr")
library("ggpmisc")
library("patchwork")
library("iml")

# load drake objects
loadd(
  fi_permut_hr_buffer2,
  fi_permut_vi_buffer2,

  fi_ale_hr_buffer2,
  fi_ale_vi_buffer2,
  fi_ale_hr_buffer2_gs20,
  fi_ale_vi_buffer2_gs20,

  df_wavelengths_from_indices,
  spec_sigs
)
```

Preview the ordered feature importance results for datasets "HR" and "VI".

```{r feature-importance-1 }
fi_ranked_hr <- fi_permut_hr_buffer2$res %>%
  tibble::rownames_to_column("measure") %>%
  tidyr::pivot_longer(
    cols = starts_with("B"),
    values_to = "importance", names_to = "feature"
  ) %>%
  dplyr::mutate(wavelength = seq(420, 995, 4.75)) %>%
  dplyr::mutate(numeric_id = seq(5, 126, 1)) %>%
  dplyr::arrange(desc(importance)) %>%
  dplyr::mutate(rank = row_number()) %>%
  dplyr::select(-measure)
fi_ranked_hr

fi_ranked_vi <- fi_permut_vi_buffer2$res %>%
  tibble::rownames_to_column("measure") %>%
  tidyr::pivot_longer(
    cols = starts_with("B"),
    values_to = "importance", names_to = "feature"
  ) %>%
  # mutate(wavelength = seq(420, 995, 4.75)) %>%
  dplyr::arrange(desc(importance)) %>%
  dplyr::mutate(rank = row_number()) %>%
  dplyr::select(-measure) %>%
  dplyr::mutate(feature = stringr::str_replace(feature, "bf2_", ""))
fi_ranked_vi
```

# Create a virtual Spectral Signature (mean) of vegetation using PROSAIL.

PROSAIL is a algorithm simulating Spectral Signature (mean)s of vegetation, see `?hsdar::PROSAIL`.
Reflectance is scaled to 0-10 to be able to plot it in the same plot as the feature importance rankings -> the axis limits for the y and z axis needs to match.

PROSAIL returns a Spectral Signature (mean) from 400 nm to 2500 nm -> we take the values only and subset to 400 nm - 1000 nm.
Because we order from 1 - 10 with 1 being the best rank, we have to reverse the scaling of the reflectance values.

```{r feature-importance-2 }
spectra_sim <- hsdar::PROSAIL()
spectra_df <- data.frame(
  reflectance = as.vector(spectra_sim@spectra@spectra_ma),
  wavelength = seq(400, 2500, 1)
) %>%
  dplyr::filter(wavelength < 1000) %>%
  # scale the reflectance to [0, 2] to play nicely with the y-axis later (mean dec in rmse)
  dplyr::mutate(reflectance = scale(reflectance,
    center = FALSE,
    scale = max(reflectance, na.rm = TRUE) / 2
  ))
```

Next we bind the simulated data with the feature importance rankings.
To join both data.frames we need to round the reflectance centers of the bands to integers to match with the reflectance values created by PROSAIL.

```{r feature-importance-3 }
# round the wavelengths of the HR dataset to match with the simulated ones
fi_ranked_hr$wavelength <- round(fi_ranked_hr$wavelength)

data_hr_merged <- left_join(spectra_df, fi_ranked_hr, by = c("wavelength")) %>%
  left_join(spec_sigs, by = c("wavelength")) %>%
  mutate(class = "HR") %>%
  mutate(reflectance = as.numeric(reflectance))
```

To label only a subset of the data, a custom data.frame is created.

```{r feature-importance-4 }
df_wavelengths_from_indices_imp <- df_wavelengths_from_indices %>%
  left_join(fi_ranked_vi, by = c("class" = "feature")) %>%
  na.omit()

df_wavelengths_from_indices_label <- df_wavelengths_from_indices_imp %>%
  arrange(rank) %>%
  group_by(class, rank) %>%
  dplyr::slice(1) %>%
  dplyr::ungroup() %>%
  dplyr::arrange(rank) %>%
  dplyr::slice(1:10)

df_wavelengths_from_hr <- data_hr_merged %>%
  arrange(rank) %>%
  group_by(class, rank) %>%
  dplyr::slice(1) %>%
  dplyr::ungroup() %>%
  dplyr::arrange(rank) %>%
  dplyr::slice(1:10) %>%
  dplyr::select(-class) %>%
  dplyr::rename(class = feature)

df_label_all <- df_wavelengths_from_indices_label %>%
  bind_rows(df_wavelengths_from_hr)
```

```{r feature-importance-5, echo=FALSE}
plot_reflectance_imp_hr <- function(data) {
  ggplot(data, aes(x = .data[["wavelength"]], y = .data[["importance"]])) +
    geom_line(
      data = data_hr_merged[!is.na(data_hr_merged$laukiz1), ],
      aes(
        x = wavelength, y = laukiz1,
        color = "Spectral Signature (mean) of Laukiz1"
      ),
      na.rm = TRUE,
      linetype = "solid",
      size = 0.6
    ) +
    geom_line(
      data = data_hr_merged[!is.na(data_hr_merged$laukiz2), ],
      aes(
        x = wavelength, y = laukiz2,
        color = "Spectral Signature (mean) of Laukiz2"
      ),
      na.rm = TRUE,
      linetype = "solid",
      size = 0.6
    ) +
    geom_line(
      data = data_hr_merged[!is.na(data_hr_merged$luiando), ],
      aes(
        x = wavelength, y = luiando,
        color = "Spectral Signature (mean) of Luiando"
      ),
      na.rm = TRUE,
      linetype = "solid",
      size = 0.6
    ) +
    geom_line(
      data = data_hr_merged[!is.na(data_hr_merged$oiartzun), ],
      aes(
        x = wavelength, y = oiartzun,
        color = "Spectral Signature (mean) of Oiartzun"
      ),
      na.rm = TRUE,
      linetype = "solid",
      size = 0.6
    ) +

    # HR: plot only rank 11:last
    geom_point(
      data = data_hr_merged[which(!(data_hr_merged$rank %in% c(1:10))), ],
      aes(color = "HR: Rank 11:122"),
      size = 2.5,
      shape = 1,
      na.rm = TRUE
    ) +
    # HR: plot only rank 1:10
    geom_point(
      data = data_hr_merged[which(data_hr_merged$rank %in% c(1:10)), ],
      aes(color = "HR: Rank 1:10"),
      shape = 20,
      size = 3,
      na.rm = TRUE
    ) +

    scale_color_manual(values = c(
      "HR: Rank 11:122" = "grey",
      "HR: Rank 1:10" = "black",
      "Spectral Signature (mean) of Laukiz1" = "#BC3C29",
      "Spectral Signature (mean) of Laukiz2" = "#0072B5",
      "Spectral Signature (mean) of Luiando" = "#E18727",
      "Spectral Signature (mean) of Oiartzun" = "#20854E"
    )) +
    scale_x_continuous(limits = c(400, 1000), breaks = scales::pretty_breaks()) +
    scale_y_continuous(
      sec.axis = sec_axis(~ scale(-.,
        center = FALSE,
        scale = max(., na.rm = TRUE)
      ),
      labels = c(1.0, 0.75, 0.55, 0.25, 0),
      name = "Scaled Reflectance [0, 1]"
      )
    ) +
    guides(color = guide_legend(
      title = NULL,
      override.aes = list(
        linetype = c("blank", "blank", "solid", "solid", "solid", "solid"),
        shape = c(20, 1, NA, NA, NA, NA),
        color = c("black", "grey", "#BC3C29", "#0072B5", "#E18727", "#20854E")
      )
    )) +
    labs(
      title = "Permutation-based Variable Importance for dataset 'HR'",
      subtitle = paste0(
        "The ten most important features are labeled by their band number."
      ),
      caption = "Learner: SVM; 100 Monte-Carlo Iterations",
      y = "Mean Decrease in RMSE", x = "Wavelength [nm]"
    ) +
    ggrepel::geom_label_repel(
      # only label the best 10 features
      data = df_wavelengths_from_hr,
      label = df_wavelengths_from_hr$class,
      size = 4,
      na.rm = TRUE
    ) +
    ggpubr::theme_pubclean(base_size = 14)
}

plot_reflectance_imp_vi <- function(data) {
  ggplot(data, aes(x = .data[["wavelength"]], y = .data[["importance"]])) +
    geom_line(
      data = data_hr_merged[!is.na(data_hr_merged$laukiz1), ],
      aes(
        x = wavelength, y = laukiz1,
        color = "Spectral Signature (mean) of Laukiz1"
      ),
      na.rm = TRUE,
      linetype = "solid",
      size = 0.6
    ) +
    geom_line(
      data = data_hr_merged[!is.na(data_hr_merged$laukiz2), ],
      aes(
        x = wavelength, y = laukiz2,
        color = "Spectral Signature (mean) of Laukiz2"
      ),
      na.rm = TRUE,
      linetype = "solid",
      size = 0.6
    ) +
    geom_line(
      data = data_hr_merged[!is.na(data_hr_merged$luiando), ],
      aes(
        x = wavelength, y = luiando,
        color = "Spectral Signature (mean) of Luiando"
      ),
      na.rm = TRUE,
      linetype = "solid",
      size = 0.6
    ) +
    geom_line(
      data = data_hr_merged[!is.na(data_hr_merged$oiartzun), ],
      aes(
        x = wavelength, y = oiartzun,
        color = "Spectral Signature (mean) of Oiartzun"
      ),
      na.rm = TRUE,
      linetype = "solid",
      size = 0.6
    ) +

    # VI: rank 1:10
    geom_point(
      data = df_wavelengths_from_indices_imp[which((df_wavelengths_from_indices_imp$rank %in% c(1:10))), ],
      aes(x = wavelength, y = importance, group = class),
      size = 3, shape = 20
    ) +
    # VI: rank 1:10
    geom_line(
      data = df_wavelengths_from_indices_imp[which((df_wavelengths_from_indices_imp$rank %in% c(1:10))), ],
      aes(x = wavelength, y = importance, group = class, color = "VI: Rank 1:10"),
      size = 0.2, linetype = "dashed"
    ) +

    # VI: rank 11:89
    geom_point(
      data = df_wavelengths_from_indices_imp[which(!(df_wavelengths_from_indices_imp$rank %in% c(1:10))), ],
      aes(x = wavelength, y = importance, group = class),
      color = "grey",
      shape = 1, size = 2.5
    ) +
    # VI: rank 11:89
    geom_line(
      data = df_wavelengths_from_indices_imp[which(!(df_wavelengths_from_indices_imp$rank %in% c(1:10))), ],
      aes(x = wavelength, y = importance, group = class, color = "VI: Rank 11:89"),
      size = 0.2, linetype = "dashed"
    ) +

    scale_color_manual(values = c(
      "VI: Rank 11:89" = "grey",
      "VI: Rank 1:10" = "black",
      "Spectral Signature (mean) of Laukiz1" = "#BC3C29",
      "Spectral Signature (mean) of Laukiz2" = "#0072B5",
      "Spectral Signature (mean) of Luiando" = "#E18727",
      "Spectral Signature (mean) of Oiartzun" = "#20854E"
    )) +
    scale_x_continuous(limits = c(400, 1000), breaks = scales::pretty_breaks()) +
    scale_y_continuous(
      sec.axis = sec_axis(~ scale(-.,
        center = FALSE,
        scale = max(., na.rm = TRUE)
      ),
      labels = c(1.0, 0.75, 0.55, 0.25, 0),
      name = "Scaled Reflectance [0, 1]"
      )
    ) +
    guides(color = guide_legend(
      title = NULL,
      override.aes = list(
        linetype = c("solid", "solid", "solid", "solid", "dashed", "dashed"),
        shape = c(NA, NA, NA, NA, 20, 1),
        color = c("#BC3C29", "#0072B5", "#E18727", "#20854E", "black", "grey")
      )
    )) +
    labs(
      title = "Permutation-based Variable Importance for dataset 'VI'",
      subtitle = paste0(
        "The ten most important features are labeled by their index name."
      ),
      caption = "Learner: SVM; 100 Monte-Carlo Iterations",
      y = "Mean Decrease in RMSE", x = "Wavelength [nm]"
    ) +
    ggrepel::geom_label_repel(
      # only label the best 10 features
      data = df_wavelengths_from_indices_label,
      label = df_wavelengths_from_indices_label$class,
      size = 4,
      na.rm = TRUE
    ) +
    ggpubr::theme_pubclean(base_size = 14)
}

plot_reflectance_imp_absolute <- function(data, x_identifier, class) {
  pl <- ggplot(data, aes(x = .data[[x_identifier]], y = .data[["importance"]])) +
    labs(y = "Importance", x = "Band number") +
    geom_segment(aes(
      x = .data[[x_identifier]], y = 0,
      xend = .data[[x_identifier]], yend = .data[["importance"]]
    ),
    color = "grey", show.legend = FALSE
    ) +
    geom_point(size = 1, color = "black", show.legend = T) +
    labs(
      title = glue::glue("Permutation-based Variable Importance for Dataset '{class}'"),
      subtitle = "Absolute importance values by band",
      caption = "Learner: SVM; 100 Monte-Carlo iterations"
    ) +
    ggpubr::theme_pubclean()

  if (is.character(data[[x_identifier]])) {
    pl
  } else {
    pl + scale_x_continuous(breaks = seq(5, 125, 5))
    pl
  }
}
```

# P1 Main plot

```{r fi-permut-hr, warning=FALSE, fig.show='hide'}
p11 <- data_hr_merged %>%
  plot_reflectance_imp_hr()
p11
```

```{r fi-permut-vi, warning=FALSE, fig.show='hide'}
p12 <- data_hr_merged %>%
  plot_reflectance_imp_vi()
p12
```

```{r fi-permut-vi-hr, dev = c("png", "pdf"), warning=FALSE, fig.asp=1.2}
p11 / p12
```

# Plots by dataset {.tabset .tabset-fade}

## HR

### P2 Absolute permutation based Var Imp 

```{r fi-permut-hr-abs, dev = c("png", "pdf")}
p2 <- fi_ranked_hr %>%
  plot_reflectance_imp_absolute("numeric_id", class = "HR")
p2
```

## VI

### P3 Absolute permutation based Var Imp 

```{r fi-permut-vi-abs, dev = c("png", "pdf")}
p3 <- fi_ranked_vi %>%
  plot_reflectance_imp_absolute("feature", class = "VI") +
  ggpubr::rotate_x_text()
p3
```

Vogelmann2 $(R_{734}-R_{747})/(R_{715}+R_{726})$ Vogelmann et al. (1993)

Vogelmann4 $(R_{734}-R_{747})/(R_{715}+R_{720})$ Vogelmann et al. (1993)

Vogelmann3 $D_{715}/D_{705}$ Vogelmann et al. (1993)

Vogelmann  $R_{740}/R_{720}$ Vogelmann et al. (1993)

NPCI       $(R_{680}-R_{430})/(R_{680}+R_{430})$

D2         $D_{705}/D_{722}$

Datt3      $D_{754}/D_{704}$

PWI        $R_{900}/R_{970}$

SR7        $R_{440}/R_{690}$

SRPI       $R_{430}/R_{680}$

Dxxx: First derivation of reflectance values at wavelength 'xxx'.
Rxxx: Reflectance at wavelength 'xxx'.

Reference: `?hsdar::vegindex()`

## Combined

```{r fi-hr-vi-abs-comb, fig.width=9.5, fig.asp=1.1}
p2 / p3
```

# ALE plots {.tabset .tabset-fade}

ALE plots via package {iml}

## P2 HR

### Grid size: 100

Top ten HR features from permutation Vimp

```{r fi-hr-ale, dev = c("png", "pdf"), warning=FALSE}
ale_p1 = fi_ale_hr_buffer2$plot(
  features = df_wavelengths_from_hr$class,
  ncol = 2
) & scale_y_continuous(
  n.breaks = 2
) &
  labs(y = "ALE") &
  theme_pubr(base_size = 14) &
  theme(
    legend.text = element_text(size = 13),
    legend.title = element_text(size = 13),
    axis.title.y = element_text(size = 8),
    axis.text.y = element_text(size = 9)
  )

# Custom y axis breaks for subplots
ale_p1[[2]] = ale_p1[[2]] + scale_y_continuous(
  n.breaks = 3
)

ale_p1[[3]] = ale_p1[[3]] + scale_y_continuous(
  n.breaks = 4
)

ale_p1[[7]] = ale_p1[[7]] + scale_y_continuous(
  n.breaks = 4
)

ale_p1[[8]] = ale_p1[[8]] + scale_y_continuous(
  n.breaks = 4
)

ale_p1[[9]] = ale_p1[[9]] + scale_y_continuous(
  n.breaks = 4
)

ale_p1[[10]] = ale_p1[[10]] + scale_y_continuous(
  n.breaks = 4
)
ale_p1
```

### Grid size: 20

Top ten HR features from permutation Vimp

```{r, warning=FALSE}
ale_p2 = fi_ale_hr_buffer2_gs20$plot(
  features = df_wavelengths_from_hr$class,
  ncol = 2
) & scale_y_continuous(
  n.breaks = 2
) &
  labs(y = "ALE") &
  theme_pubr(base_size = 14) &
  theme(
    legend.text = element_text(size = 13),
    legend.title = element_text(size = 13),
    axis.title.y = element_text(size = 8),
    axis.text.y = element_text(size = 9)
  )

# Custom y axis breaks for subplots
ale_p2[[1]] = ale_p2[[1]] + scale_y_continuous(
  n.breaks = 4
)
ale_p2[[2]] = ale_p2[[2]] + scale_y_continuous(
  n.breaks = 5
)
ale_p2[[3]] = ale_p2[[3]] + scale_y_continuous(
  n.breaks = 4
)
ale_p2[[4]] = ale_p2[[4]] + scale_y_continuous(
  n.breaks = 4
)
ale_p2[[6]] = ale_p2[[6]] + scale_y_continuous(
  n.breaks = 4
)
ale_p2[[7]] = ale_p2[[7]] + scale_y_continuous(
  n.breaks = 4
)
ale_p2[[10]] = ale_p2[[10]] + scale_y_continuous(
  n.breaks = 3
)
ale_p2
```

## P3 VI

### Grid size: 100

Top ten VI features from permutation Vimp

```{r fi-vi-ale, dev = c("png", "pdf"), warning=FALSE}
ale_p3 = fi_ale_vi_buffer2$plot(
  features = paste0("bf2_", df_wavelengths_from_indices_label$class),
  ncol = 2
) & scale_y_continuous(
  n.breaks = 2
) &
  labs(y = "ALE") &
  theme_pubr(base_size = 14) &
  theme(
    legend.text = element_text(size = 13),
    legend.title = element_text(size = 13),
    axis.title.y = element_text(size = 8),
    axis.text.y = element_text(size = 9)
  )

# Custom y axis breaks for subplots
ale_p3[[1]] = ale_p3[[1]] + scale_y_continuous(
  n.breaks = 4
)
ale_p3[[2]] = ale_p3[[2]] + scale_y_continuous(
  n.breaks = 4
)
ale_p3[[3]] = ale_p3[[3]] + scale_y_continuous(
  n.breaks = 4
)
ale_p3[[4]] = ale_p3[[4]] + scale_y_continuous(
  n.breaks = 4
)
ale_p3
```

### Grid size: 20

Top ten HR features from permutation Vimp

```{r, warning=FALSE}
ale_p4 = fi_ale_vi_buffer2_gs20$plot(
  features = paste0("bf2_", df_wavelengths_from_indices_label$class),
  ncol = 2
) & scale_y_continuous(
  n.breaks = 2
) &
  labs(y = "ALE") &
  theme_pubr(base_size = 14) &
  theme(
    legend.text = element_text(size = 13),
    legend.title = element_text(size = 13),
    axis.title.y = element_text(size = 8),
    axis.text.y = element_text(size = 9)
  )

# Custom y axis breaks for subplots
ale_p4[[2]] = ale_p3[[2]] + scale_y_continuous(
  n.breaks = 4
)
ale_p4[[3]] = ale_p3[[3]] + scale_y_continuous(
  n.breaks = 4
)
ale_p4[[4]] = ale_p3[[4]] + scale_y_continuous(
  n.breaks = 4
)
ale_p4
```