In [1]:
# Set RETICULATE_PYTHON before keras is loaded.
# NOTE(review): presumably this makes reticulate bind to the conda
# interpreter at this path -- confirm the path exists in the target image.
Sys.setenv("RETICULATE_PYTHON" = "/srv/conda/bin/python")

library(keras)
library(dplyr)

# Run the local script tutorial_functions.R. make_image_list() and
# pool_images_and_labels() are used below but not defined in this file,
# so they are presumably defined there -- verify.
source("tutorial_functions.R")


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union




In [2]:
# Read the tutorial CSV; readr guesses the column types and reports them.
raw_data <- readr::read_csv(file = "tutorial_data_test.csv")

# Preview the first rows of the table.
utils::head(raw_data)


[36m──[39m [1m[1mColumn specification[1m[22m [36m────────────────────────────────────────────────────────[39m
cols(
  .default = col_double(),
  id = [31mcol_character()[39m,
  location_id = [31mcol_character()[39m,
  date = [34mcol_date(format = "")[39m
)
[36mℹ[39m Use [30m[47m[30m[47m`spec()`[47m[30m[49m[39m for the full column specifications.




id,location_id,date,gap_days,year,classification,total_toxicity,t1,t2,t3,⋯,t6,t7,t8,t9,t10,t11,t12,sst,sst_cum,par_8DR
<chr>,<chr>,<date>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
PSP10.011_2014-05-19_mytilus,PSP10.011,2014-05-19,0,2014,0,4.195796,0,4.195796,0,⋯,0.0,0,0,0,0.0,0,0,283.199,38557.52,45.05425
PSP10.011_2014-05-26_mytilus,PSP10.011,2014-05-26,7,2014,0,0.2631432,0,0.0,0,⋯,0.2631432,0,0,0,0.0,0,0,284.331,40546.11,38.4885
PSP10.011_2014-06-02_mytilus,PSP10.011,2014-06-02,7,2014,0,7.637664,0,0.0,0,⋯,3.7181819,0,0,0,3.919482,0,0,285.625,42536.2,49.99571
PSP10.011_2014-06-03_mytilus,PSP10.011,2014-06-03,1,2014,0,1.58497,0,0.0,0,⋯,1.5849696,0,0,0,0.0,0,0,285.979,42822.18,52.47667
PSP10.011_2014-06-09_mytilus,PSP10.011,2014-06-09,6,2014,0,0.4214835,0,0.0,0,⋯,0.4214835,0,0,0,0.0,0,0,287.473,44542.7,49.40933
PSP10.011_2014-06-16_mytilus,PSP10.011,2014-06-16,7,2014,0,15.77553,0,15.775531,0,⋯,0.0,0,0,0,0.0,0,0,287.477,46556.85,41.76971


In [4]:
# Configuration: which years feed the training set and which the test set.
# The values are character because they index the year-named list below.
YEARS_TRAINING <- c("2014", "2015", "2016")
YEARS_TESTING <- "2017"

# Build the list of images from the raw table. make_image_list() is a
# project helper (not defined in this file); its settings are passed as-is.
image_list <- make_image_list(
  raw_data,
  tox_levels = c(0, 10, 30, 80),
  forecast_steps = 1,
  n_steps = 2,
  minimum_gap = 4,
  maximum_gap = 10,
  toxins = paste0("t", 1:12),
  environmentals = "sst_cum"
)

# Group the images by their `year` element so whole years can be selected.
years <- sapply(image_list, function(img) img$year)
image_list <- split(image_list, years)

# Pool the selected years into a training set and a test set.
train <- pool_images_and_labels(image_list[YEARS_TRAINING], num_classes = 4)
test <- pool_images_and_labels(image_list[YEARS_TESTING], num_classes = 4)

# Inspect the structure of the training set and the image matrix dimensions.
str(train)
dim(train$image)

List of 7
 $ labels         : num [1:3894, 1:4] 1 1 0 0 0 1 1 1 1 0 ...
 $ image          : num [1:3894, 1:26] 0 0 0 0 0.0315 ...
 $ classifications: num [1:3894] 0 0 1 2 2 0 0 0 0 1 ...
 $ toxicity       : num [1:3894] 1.01 0 26.3 39.83 50.17 ...
 $ locations      : chr [1:3894] "PSP21.2" "PSP24.13" "PSP12.002" "PSP16.25" ...
 $ dates          : num [1:3894] 16588 16595 16581 16218 16657 ...
 $ scaling_factors: NULL


In [5]:
# Sequential network: three ReLU dense layers (64, 32, 32 units), each
# followed by dropout (0.4, 0.3, 0.2), ending in a 4-unit softmax layer.
# The input width is the number of columns of the training image matrix.
model <- keras::keras_model_sequential() %>%
  keras::layer_dense(
    units = 64,
    activation = "relu",
    input_shape = ncol(train$image),
    name = "input_layer"
  ) %>%
  keras::layer_dropout(rate = 0.4, name = "dropout_1") %>%
  keras::layer_dense(units = 32, activation = "relu", name = "hidden_1") %>%
  keras::layer_dropout(rate = 0.3, name = "dropout_2") %>%
  keras::layer_dense(units = 32, activation = "relu", name = "hidden_2") %>%
  keras::layer_dropout(rate = 0.2, name = "dropout_3") %>%
  keras::layer_dense(units = 4, activation = "softmax", name = "output")

# Print the layer-by-layer summary.
summary(model)

Model: "sequential"
________________________________________________________________________________
Layer (type)                        Output Shape                    Param #     
input_layer (Dense)                 (None, 64)                      1728        
________________________________________________________________________________
dropout_1 (Dropout)                 (None, 64)                      0           
________________________________________________________________________________
hidden_1 (Dense)                    (None, 32)                      2080        
________________________________________________________________________________
dropout_2 (Dropout)                 (None, 32)                      0           
________________________________________________________________________________
hidden_2 (Dense)                    (None, 32)                      1056        
________________________________________________________________________________
dropout_

In [6]:
# Configure training: Adam optimizer, categorical cross-entropy loss
# (labels are one-hot rows), and categorical accuracy as the metric.
keras::compile(
  model,
  optimizer = "adam",
  loss = "categorical_crossentropy",
  metrics = "categorical_accuracy"
)

In [7]:
# Train for 64 epochs in batches of 128, holding out 20% of the training
# rows for validation and shuffling the training data each epoch.
keras::fit(
  model,
  x = train$image,
  y = train$labels,
  batch_size = 128,
  epochs = 64,
  verbose = 2,
  validation_split = 0.2,
  shuffle = TRUE
)

In [None]:
# Score the trained model on the held-out test year (loss first, then the
# metric configured at compile time).
metrics <- model %>%
  keras::evaluate(x = test$image,
                  y = test$labels)

# Class probabilities, one row per test sample and one column per class.
# predict_proba() / predict_classes() are deprecated and were removed in
# TensorFlow >= 2.6; predict() on a softmax output yields the same
# probabilities.
predicted_probs <- model %>%
  stats::predict(test$image, verbose = 0)

# Predicted class = column with the highest probability. Subtract 1 so the
# labels are 0-based like test$classifications; ties go to the first
# column, matching the argmax that predict_classes() used.
predictions <- max.col(predicted_probs, ties.method = "first") - 1L

metrics

In [None]:
# One row per test sample: where and when it was taken, the actual and the
# predicted class, and the predicted probability of each class in percent.
results <- dplyr::tibble(
  location = test$locations,
  # Dates arrive as day counts; convert them back relative to 1970-01-01.
  date = as.Date(as.numeric(test$dates), origin = as.Date("1970-01-01")),
  actual_classification = test$classifications,
  predicted_classification = predictions,
  prob_0 = predicted_probs[, 1] * 100,
  prob_1 = predicted_probs[, 2] * 100,
  prob_2 = predicted_probs[, 3] * 100,
  prob_3 = predicted_probs[, 4] * 100
)

results

In [None]:
# Class labels 0 .. num_levels - 1; passing them as factor levels keeps
# classes that never occur in the confusion table (with a count of 0).
num_levels <- 4
levels <- seq_len(num_levels) - 1L

# Long-format confusion table: one row per (predicted, actual) pair.
cm <- as.data.frame(
  table(predicted = factor(predictions, levels),
        actual = factor(test$classifications, levels))
)

# Tile plot of the confusion table. The fill uses log(Freq + 1) so a few
# very large cells do not wash out the rest; the text shows the raw counts.
# metrics[[i]] (not metrics[i]) is used because evaluate() returns a named
# list in older keras releases and round() fails on a list.
confusion_matrix <- ggplot2::ggplot(
  data = cm,
  mapping = ggplot2::aes(x = .data$predicted, y = .data$actual)
) +
  ggplot2::geom_tile(ggplot2::aes(fill = log(.data$Freq + 1))) +
  ggplot2::geom_text(
    ggplot2::aes(label = sprintf("%1.0f", .data$Freq)),
    vjust = 1,
    size = 8
  ) +
  ggplot2::scale_fill_gradient(low = "white", high = "blue") +
  ggplot2::labs(
    x = "Predicted Classifications",
    y = "Actual Classifications",
    title = paste("Confusion Matrix -", YEARS_TESTING,
                  "Toxin Testing Season Hindcast", sep = " "),
    subtitle = paste("Loss:", round(metrics[[1]], 3),
                     "Accuracy:", round(metrics[[2]], 3), sep = " "),
    caption = paste(Sys.Date())
  ) +
  ggplot2::theme_linedraw() +
  ggplot2::theme(
    axis.text = ggplot2::element_text(size = 14),
    axis.title = ggplot2::element_text(size = 14, face = "bold"),
    title = ggplot2::element_text(size = 14, face = "bold"),
    legend.position = "none"
  )

confusion_matrix