analysis/eda.Rmd

---
title: "Exploratory data analysis"
output: 
  workflowr::wflow_html:
    includes:
      in_header: header.html
editor_options:
  chunk_output_type: console
author: "Patrick Schratz"
---

```{r setup, include=FALSE}
# Default chunk options for the whole report: high-resolution, centred,
# full-width figures; `echo = FALSE` keeps the code out of the rendered output.
knitr::opts_chunk$set(
  fig.retina = 3,
  fig.align = "center",
  fig.width = 6.93,
  fig.height = 6.13,
  out.width = "100%",
  echo = FALSE
)

# drake supplies loadd(); magrittr supplies the %>% pipe used below.
library("drake")
library("magrittr")
# Source the project's own scripts from the "R" and "code" directories.
# NOTE(review): both paths are relative, so this presumably relies on the
# document being knitted from the project root -- confirm.
R.utils::sourceDirectory("R")
R.utils::sourceDirectory("code")

# load drake objects
# NOTE(review): `filter_values` and `filter_info_gain_nbins` are not referenced
# anywhere in the visible part of this document -- confirm they are needed.
loadd(
  vi_data_corrected_buffer2, nri_data_corrected_buffer2,
  bands_data_corrected_buffer2, coords_vi_nri_clean_corrected,

  filter_values,
  filter_info_gain_nbins,

  trees_with_bands_corrected_buffer2,
  trees_with_bands_corrected_no_buffer
)
# Packages used by the chunks below. The attach order decides which package
# wins when two export the same name.
# NOTE(review): raster is attached after dplyr; the point-density chunk calls
# `dplyr::select()` explicitly, presumably to avoid raster's `select` -- keep
# this order in mind before rearranging.
library("DataExplorer")
library("dplyr")
library("ggsci")
library("ggpubr")
library("ggplot2")
library("knitr")
library("purrr")
library("sp")
library("raster")
library("fs")
```

# Datasets {.tabset .tabset-fade}

## VI

### Overview

```{r eda-1 }
# Key figures of the VI dataset, shown as a two-column Name/Value table.
intro <- introduce(vi_data_corrected_buffer2)
intro_df <- data.frame(
  "Name" = c(
    "Rows",
    "Columns",
    "Discrete columns",
    "Continuous columns",
    "All missing columns",
    "Missing observations",
    "Complete Rows",
    "Total observations"
  ),
  # One `intro` field per label above, in the same order. Each value is
  # formatted on its own so no common-width padding is introduced.
  "Value" = vapply(
    c(
      "rows",
      "columns",
      "discrete_columns",
      "continuous_columns",
      "all_missing_columns",
      "total_missing_values",
      "complete_rows",
      "total_observations"
    ),
    function(field) format(intro[[field]], big.mark = ","),
    character(1),
    USE.NAMES = FALSE
  )
)
knitr::kable(intro_df)
```

### Histograms

```{r eda-2 }
# Histograms for the VI dataset (plot_histogram() is presumably DataExplorer's,
# attached in the setup chunk).
plot_histogram(vi_data_corrected_buffer2)
```

### PCA

```{r eda-3 }
# Principal component plots for the VI dataset (plot_prcomp() is presumably
# DataExplorer's).
plot_prcomp(vi_data_corrected_buffer2)
```

### Corr

```{r eda-4 }
# Correlation plot for the VI dataset (plot_correlation() is presumably
# DataExplorer's).
plot_correlation(vi_data_corrected_buffer2)
```

## NRI

### Overview

```{r eda-5 }
# Key figures of the NRI dataset, shown as a two-column Name/Value table.
intro <- introduce(nri_data_corrected_buffer2)
intro_df <- data.frame(
  "Name" = c(
    "Rows",
    "Columns",
    "Discrete columns",
    "Continuous columns",
    "All missing columns",
    "Missing observations",
    "Complete Rows",
    "Total observations"
  ),
  # One `intro` field per label above, in the same order. Each value is
  # formatted on its own so no common-width padding is introduced.
  "Value" = vapply(
    c(
      "rows",
      "columns",
      "discrete_columns",
      "continuous_columns",
      "all_missing_columns",
      "total_missing_values",
      "complete_rows",
      "total_observations"
    ),
    function(field) format(intro[[field]], big.mark = ","),
    character(1),
    USE.NAMES = FALSE
  )
)
kable(intro_df)
```

### Histograms

Histograms are omitted for the NRI dataset because it contains too many features.

### PCA

```{r eda-6 }
# Principal component plots for the NRI dataset (plot_prcomp() is presumably
# DataExplorer's).
plot_prcomp(nri_data_corrected_buffer2)
```

### Corr

```{r eda-7 }
# Correlation plot for the NRI dataset (plot_correlation() is presumably
# DataExplorer's).
plot_correlation(nri_data_corrected_buffer2)
```

## HR

### Overview

```{r eda-8 }
# Key figures of the HR (bands) dataset, shown as a two-column Name/Value
# table.
intro <- introduce(bands_data_corrected_buffer2)
intro_df <- data.frame(
  "Name" = c(
    "Rows",
    "Columns",
    "Discrete columns",
    "Continuous columns",
    "All missing columns",
    "Missing observations",
    "Complete Rows",
    "Total observations"
  ),
  # One `intro` field per label above, in the same order. Each value is
  # formatted on its own so no common-width padding is introduced.
  "Value" = vapply(
    c(
      "rows",
      "columns",
      "discrete_columns",
      "continuous_columns",
      "all_missing_columns",
      "total_missing_values",
      "complete_rows",
      "total_observations"
    ),
    function(field) format(intro[[field]], big.mark = ","),
    character(1),
    USE.NAMES = FALSE
  )
)
kable(intro_df)
```

### Histograms

```{r eda-9 }
# Histograms for the HR (bands) dataset (plot_histogram() is presumably
# DataExplorer's).
plot_histogram(bands_data_corrected_buffer2)
```

### PCA

```{r eda-10 }
# Principal component plots for the HR (bands) dataset (plot_prcomp() is
# presumably DataExplorer's).
plot_prcomp(bands_data_corrected_buffer2)
```

### Corr

```{r eda-11 }
# Correlation plot for the HR (bands) dataset (plot_correlation() is presumably
# DataExplorer's).
plot_correlation(bands_data_corrected_buffer2)
```

# Custom plots

```{r eda-12 }
# Attach a `plot` factor to the VI data. Labels are assigned purely by row
# position: the first 559 rows become "Laukiz 1", the next 451 "Laukiz 2",
# then 301 "Luiando" and 497 "Oiartzun".
vi_data_plot <- mutate(
  vi_data_corrected_buffer2,
  plot = factor(
    rep(
      x = c("Laukiz 1", "Laukiz 2", "Luiando", "Oiartzun"),
      times = c(559, 451, 301, 497)
    )
  )
)
```

# Mean defoliation per plot

```{r eda-13 }
# Mean defoliation of each plot, extracted as a plain vector (pull() with no
# column argument takes the last column, i.e. the summary value).
mean_defol <- pull(
  summarise(group_by(vi_data_plot, plot), mean(defoliation))
)
# The same summary, left unassigned so the table is rendered in the report.
summarise(group_by(vi_data_plot, plot), mean(defoliation))
```

# Coefficient of variation

```{r eda-14 }
# Coefficient of variation of defoliation per plot (sd / mean, in percent),
# extracted as a plain vector.
cov_defol <- pull(
  summarise(
    group_by(vi_data_plot, plot),
    (sd(defoliation) / mean(defoliation)) * 100
  )
)
# The same summary, left unassigned so the table is rendered in the report.
summarise(
  group_by(vi_data_plot, plot),
  (sd(defoliation) / mean(defoliation)) * 100
)
```

# sd / skewness

```{r eda-15 }
# Per plot: coefficient of variation (sd / mean, in percent) divided by the
# skewness of defoliation, extracted as a plain vector.
sd_skewness_defol <- pull(
  summarise(
    group_by(vi_data_plot, plot),
    ((sd(defoliation) / mean(defoliation)) * 100) / e1071::skewness(defoliation)
  )
)
# The same summary, left unassigned so the table is rendered in the report.
summarise(
  group_by(vi_data_plot, plot),
  ((sd(defoliation) / mean(defoliation)) * 100) / e1071::skewness(defoliation)
)
```

```{r eda-16 }
# Standard deviation of defoliation per plot, extracted as a plain vector.
sd_defol <- pull(
  summarise(group_by(vi_data_plot, plot), sd(defoliation))
)
# The same summary, left unassigned so the table is rendered in the report.
summarise(group_by(vi_data_plot, plot), sd(defoliation))
```

```{r defoliation-distribution-plot, dev = c("png", "pdf")}
# Boxplot of per-tree defoliation for each plot, with jittered points and a
# text label (sample size and mean) above each box.
# NOTE(review): the group_by() is passed straight into ggboxplot(), which is
# given its columns by name -- it is unclear whether the grouping has any
# effect here; confirm before removing it.
boxplot_defol <- vi_data_plot %>%
  group_by(plot) %>%
  ggboxplot(
    x = "plot", y = "defoliation", color = "plot",
    add = "jitter", add.params = list(size = "defoliation")
  ) +
  # Jitter size is mapped to defoliation above, but the size range is collapsed
  # to a single value here, so all points are drawn at the same size.
  scale_size(range = c(0.5, 0.5)) +
  # NOTE(review): the n and mean values in the four labels below are literals,
  # not computed from `vi_data_plot`. The n values match the counts used to
  # build the `plot` factor; the means must be kept in sync by hand.
  # NOTE(review): the hex colours presumably mirror the first four colours of
  # the NEJM palette applied by scale_color_nejm() -- confirm.
  annotate("text",
    label = expression(bold(atop("n = 559", bar(x) ~ "= 57.23"))), x = 1,
    y = 112, size = 4, colour = "#BC3C29", fontface = 2
  ) +
  annotate("text",
    label = expression(bold(atop("n = 451", bar(x) ~ "= 13.54"))), x = 2,
    y = 112, size = 4, colour = "#0072B5", fontface = 2
  ) +
  annotate("text",
    label = expression(bold(atop("n = 301", bar(x) ~ "= 68.36"))), x = 3,
    y = 112, size = 4, colour = "#E18727", fontface = 2
  ) +
  annotate("text",
    label = expression(bold(atop("n = 497", bar(x) ~ "= 69.07"))), x = 4,
    y = 112, size = 4, colour = "#20854E", fontface = 2
  ) +
  scale_color_nejm() +
  ggpubr::theme_pubr(base_size = 14) +
  # The x axis already names each plot, so the colour legend is dropped.
  theme(legend.position = "none") +
  labs(y = "Total defoliation per tree (%)", x = "Plot")
# Print the plot so it is rendered in the report.
boxplot_defol
```

# Point density

Mean pairwise distance between the trees of each plot, in meters.

```{r eda-18 }
# Mean pairwise distance between the trees of each plot.
plots <- list("Laukiz 1", "Laukiz 2", "Luiando", "Oiartzun")

# Label every coordinate row with its plot once, instead of rebuilding the
# factor on every iteration. Labels are assigned by row position, using the
# same counts as for `vi_data_plot`.
coords_with_plot <- coords_vi_nri_clean_corrected %>%
  mutate(plot = factor(rep(
    c("Laukiz 1", "Laukiz 2", "Luiando", "Oiartzun"),
    c(559, 451, 301, 497)
  )))

dist <- map(plots, ~ {
  # Coordinates of the current plot only; the helper column is dropped so that
  # only the coordinate columns reach SpatialPoints().
  coords <- coords_with_plot %>%
    filter(plot == .x) %>%
    dplyr::select(-plot)
  points <- SpatialPoints(
    coords = coords,
    proj4string = CRS("+proj=utm +zone=30 +ellps=WGS84 +datum=WGS84 +units=m +no_defs")
  )
  # Planar (lonlat = FALSE) distances between all pairs of points; as.dist()
  # keeps each pair once, so the mean is not skewed by the zero diagonal or by
  # duplicated pairs.
  mean(as.dist(pointDistance(points,
    allpairs = TRUE,
    lonlat = FALSE
  )))
})
# Keep the names on the result: previously the named copy was only printed, so
# `dist` and `dist_plots` stayed unnamed.
dist <- set_names(dist, plots)
dist
dist_plots <- unlist(dist)
```

```{r eda-19, echo = FALSE}
# file_move("docs/figure/eda.Rmd/defoliation-distribution-plot-1.pdf",
#           "code/98-paper/journal/")
```

# Effects of different buffer sizes when extracting values to trees

The following buffer sizes were used to extract the remote sensing information (1 m spatial resolution) to the trees (points):

- 0 m
- 0.5 m
- 1 m
- 1.5 m
- 2 m

For this EDA only the direct reflectance values are used.
It is assumed that the VI and NRI datasets will be affected in the same way by the different buffer sizes.

```{r buffer-size-comp}
# Plot the mean reflectance per band for one plot, with one line per buffer
# variant plus a "No Buffer" line.
#
# Arguments:
#   trees_with_buffer    List of sf objects with band columns starting with
#                        "B". Elements plot-index, plot-index + 4 and
#                        plot-index + 8 are used for the given plot.
#                        NOTE(review): presumably the list is ordered as plots
#                        within buffer sizes and has 12 elements -- confirm.
#   trees_without_buffer List of sf objects, one per plot, in the order
#                        laukiz1, laukiz2, luiando, oiartzun.
#   plot_name            One of "laukiz1", "laukiz2", "luiando", "oiartzun".
#
# Returns a ggplot object. Stops with an informative error if `plot_name` is
# not one of the known plots.
compare_buffer <- function(trees_with_buffer, trees_without_buffer, plot_name) {
  plot_ids <- c("laukiz1", "laukiz2", "luiando", "oiartzun")
  # Fail early with a clear message; an unknown name used to surface later as
  # "object 'targets_sub_ids' not found".
  if (!is.character(plot_name) || length(plot_name) != 1L ||
    !plot_name %in% plot_ids) {
    stop(
      "`plot_name` must be one of: ", paste(plot_ids, collapse = ", "),
      call. = FALSE
    )
  }
  targets_sub_ids <- match(plot_name, plot_ids)

  # Mean reflectance per band for every buffered variant of this plot. The
  # "Buffer size" column takes the list names, or the positions if the list is
  # unnamed.
  foo_buffer <- trees_with_buffer[seq(targets_sub_ids, 12, 4)] %>%
    map(~ sf::st_set_geometry(., NULL)) %>%
    map(~ dplyr::summarise_at(., vars(starts_with("B")), mean, na.rm = TRUE)) %>%
    map(~ tidyr::pivot_longer(., everything(),
      names_to = "band", values_to = "reflectance"
    )) %>%
    dplyr::bind_rows(.id = "Buffer size")

  # Same summary for the unbuffered extraction. `na.rm = TRUE` matches the
  # buffered branch, so a single missing value no longer blanks a band.
  foo_no_buffer <- trees_without_buffer[[targets_sub_ids]] %>%
    sf::st_set_geometry(NULL) %>%
    dplyr::summarise_at(vars(starts_with("B")), mean, na.rm = TRUE) %>%
    tidyr::pivot_longer(everything(),
      names_to = "band", values_to = "reflectance"
    ) %>%
    dplyr::mutate(`Buffer size` = "No Buffer")

  # Band names such as "B12" become the numeric band index for the x axis.
  reflectance_long <- bind_rows(foo_buffer, foo_no_buffer) %>%
    dplyr::mutate(band = as.numeric(stringr::str_replace(band, "B", "")))

  ggplot(reflectance_long, aes(x = band, y = reflectance)) +
    geom_line(aes(color = `Buffer size`), size = 0.6) +
    labs(title = tools::toTitleCase(plot_name)) +
    ggsci::scale_color_nejm() +
    ggpubr::theme_pubr(legend = "right")
}
```

```{r}
# One buffer-comparison panel per plot, arranged on a 3 x 2 grid. The legends
# of all panels are collected into a single shared legend, which is placed in
# the dedicated guide area added after the four panels.
map(
  c("laukiz1", "laukiz2", "luiando", "oiartzun"),
  ~ compare_buffer(
    trees_with_bands_corrected_buffer2,
    trees_with_bands_corrected_no_buffer,
    .x
  )
) %>%
  patchwork::wrap_plots() +
  patchwork::guide_area() +
  patchwork::plot_layout(guides = "collect", ncol = 3, nrow = 2) +
  patchwork::plot_annotation(
    title = "Comparing the effect of different buffer sizes when extracting reflectance values to point observations (trees)",
    subtitle = "Mean values of each hyperspectral band for each plot "
  )
```