Most of the required files are now included. Still need some work to put plots in a new file and QA/QC.
brianburkenoaa committed May 4, 2024
1 parent 830b537 commit cf3d2d9
Showing 8 changed files with 449 additions and 209 deletions.
176 changes: 0 additions & 176 deletions R/createStoredObjects.R
@@ -2,13 +2,6 @@
# This code creates those datasets and saves them as RData files
# This only has to be run to update the data, not each time you want to use the code

library(reshape2)
library(sf)
library(ncdf4)

source('R/get_index.R') # This is not the same script as the app version
#source('scripts/dam_counts/run_fetch_FPC_counts_single_species.R')

#********************************************************
# get the land for plotting (wrap across antimeridian)
#********************************************************
@@ -45,172 +38,3 @@ oceanData_ERSST<-getOceanData(dataSet=dataSet,
save(x = "oceanData_ERSST", file = 'data/oceanSSTData.RData')
load('data/oceanSSTData.RData')

# SSH
dataSet='SSH'
# Full globe
# SSH longitude goes from 0.5 to 359.5, by 1
min.lon=0
max.lon=360
# SSH latitude goes from -74 to 65, by 1/3?
min.lat=-90
max.lat=90
years=seq(1980, 2022, 1)
months=seq(1,12,1)
removeBering=FALSE
returnDataType='anom'
returnObjectType='array'

oceanData_SSH<-getOceanData(dataSet=dataSet,
returnDataType=returnDataType, returnObjectType=returnObjectType,
min.lon=min.lon, max.lon=max.lon,
min.lat=min.lat, max.lat=max.lat,
years = years, months = months,
removeBering = removeBering)
save(x = "oceanData_SSH", file = 'data/oceanSSHData.RData')
load('data/oceanSSHData.RData')


# Code to extract dam counts and save to a file
# The data this retrieves has errors in 2023, so I'm moving to DART as the source
# spCK <- fetch_FPC_counts_single_species(my_species="spCK", my_age="adult", Year_start = 1970, Year_end = 2023)
# spCK$val <- spCK$BON[[1]]
# spCK <- spCK[,c("Year","val")]
# colnames(spCK) <- c("year","spCK")
# faCK <- fetch_FPC_counts_single_species(my_species="faCK", my_age="adult", Year_start = 1970, Year_end = 2023)
# faCK$val <- faCK$BON[[1]]
# faCK <- faCK[,c("Year","val")]
# colnames(faCK) <- c("year","faCK")
# steel <- fetch_FPC_counts_single_species(my_species="steel", my_age="adult", Year_start = 1970, Year_end = 2023)
# steel$val <- steel$BON[[1]]
# steel <- steel[,c("Year","val")]
# colnames(steel) <- c("year","steel")
# response <- merge(merge(spCK, faCK), steel)
response <- read.csv("data/response/response.csv")

# lag response (this is now done in the app)
#response$year <- response$year - 2
response <- response[response$year %in% c(1970:2023), ]
save(x = "response", file = 'CMISSTapp/data/responseData.RData')
load(file = 'CMISSTapp/data/responseData.RData')

# Use MCN data as a test for user-defined files
# (requires fetch_FPC_counts_single_species(); the source() call for the FPC script is commented out above)
spCK <- fetch_FPC_counts_single_species(my_species="spCK", my_age="adult")
spCK$val <- spCK$MCN[[1]]
response <- spCK[,c("Year","val")]
colnames(response) <- c("year","val")
# lag response
#response$year <- response$year - 2
response <- response[response$year %in% c(1981:2023), ]
write.csv(x = response, file = 'CMISSTapp/data/user_response.csv', row.names = FALSE)



#*******************************
#* PDO, NPGO, ONI, SSTarc, and PC1
#*******************************

pdo<-read.csv("https://oceanview.pfeg.noaa.gov/erddap/tabledap/cciea_OC_PDO.csv?time,PDO", header = TRUE, stringsAsFactors = FALSE)
pdo<-pdo[-1,]
pdo$PDO<-as.numeric(pdo$PDO)
pdo$year<-as.numeric(substr(pdo$time,1,4))
pdo$month <- as.numeric(substr(pdo$time,6,7))
pdo<-pdo[,c(3,4,2)]
colnames(pdo)[3]<-"index"

# Make the seasonal indices
pdo.win<-pdo
pdo.win<-pdo.win[pdo.win$month %in% 1:3,]
pdo.win<-aggregate(pdo.win$index, by=list(pdo.win$year), mean, na.rm=TRUE)
colnames(pdo.win)<-c("year","pdo.win")
pdo.spr<-pdo[pdo$month %in% 4:6,]
pdo.spr<-aggregate(pdo.spr$index, by=list(pdo.spr$year), mean, na.rm=TRUE)
colnames(pdo.spr)<-c("year","pdo.spr")
pdo.sum<-pdo[pdo$month %in% 7:9,]
pdo.sum<-aggregate(pdo.sum$index, by=list(pdo.sum$year), mean, na.rm=TRUE)
colnames(pdo.sum)<-c("year","pdo.sum")
pdo.aut<-pdo[pdo$month %in% 10:12,]
pdo.aut<-aggregate(pdo.aut$index, by=list(pdo.aut$year), mean, na.rm=TRUE)
colnames(pdo.aut)<-c("year","pdo.aut")

otherIndicators<-merge(merge(merge(pdo.win, pdo.spr), pdo.sum), pdo.aut)

# NPGO
npgo<-read.csv("https://oceanview.pfeg.noaa.gov/erddap/tabledap/cciea_OC_NPGO.csv?time,NPGO", header = TRUE, stringsAsFactors = FALSE)
npgo<-npgo[-1,]
npgo$NPGO<-as.numeric(npgo$NPGO)
npgo$year<-as.numeric(substr(npgo$time,1,4))
npgo$month <- as.numeric(substr(npgo$time,6,7))
npgo<-npgo[,c(3,4,2)]
colnames(npgo)[3]<-"index"
# Make the seasonal indices
npgo.win<-npgo
npgo.win<-npgo.win[npgo.win$month %in% 1:3,]
npgo.win<-aggregate(npgo.win$index, by=list(npgo.win$year), mean, na.rm=TRUE)
colnames(npgo.win)<-c("year","npgo.win")
npgo.spr<-npgo[npgo$month %in% 4:6,]
npgo.spr<-aggregate(npgo.spr$index, by=list(npgo.spr$year), mean, na.rm=TRUE)
colnames(npgo.spr)<-c("year","npgo.spr")
npgo.sum<-npgo[npgo$month %in% 7:9,]
npgo.sum<-aggregate(npgo.sum$index, by=list(npgo.sum$year), mean, na.rm=TRUE)
colnames(npgo.sum)<-c("year","npgo.sum")
npgo.aut<-npgo[npgo$month %in% 10:12,]
npgo.aut<-aggregate(npgo.aut$index, by=list(npgo.aut$year), mean, na.rm=TRUE)
colnames(npgo.aut)<-c("year","npgo.aut")

otherIndicators<-merge(merge(merge(merge(otherIndicators, npgo.win), npgo.spr), npgo.sum), npgo.aut)

# ONI
oni<-read.csv("https://oceanview.pfeg.noaa.gov/erddap/tabledap/cciea_OC_ONI.csv?time,ONI", header = TRUE, stringsAsFactors = FALSE)
oni<-oni[-1,]
oni$ONI<-as.numeric(oni$ONI)
oni$year<-as.numeric(substr(oni$time,1,4))
oni$month <- as.numeric(substr(oni$time,6,7))
oni<-oni[,c(3,4,2)]
colnames(oni)[3]<-"index"
# Make the seasonal indices
oni.win<-oni
oni.win<-oni.win[oni.win$month %in% 1:3,]
oni.win<-aggregate(oni.win$index, by=list(oni.win$year), mean, na.rm=TRUE)
colnames(oni.win)<-c("year","oni.win")
oni.spr<-oni[oni$month %in% 4:6,]
oni.spr<-aggregate(oni.spr$index, by=list(oni.spr$year), mean, na.rm=TRUE)
colnames(oni.spr)<-c("year","oni.spr")
oni.sum<-oni[oni$month %in% 7:9,]
oni.sum<-aggregate(oni.sum$index, by=list(oni.sum$year), mean, na.rm=TRUE)
colnames(oni.sum)<-c("year","oni.sum")
oni.aut<-oni[oni$month %in% 10:12,]
oni.aut<-aggregate(oni.aut$index, by=list(oni.aut$year), mean, na.rm=TRUE)
colnames(oni.aut)<-c("year","oni.aut")

otherIndicators<-merge(merge(merge(merge(otherIndicators, oni.win), oni.spr), oni.sum), oni.aut)

# SSTarc
arc<-read.csv("../OceanTeamData/scripts/indicators/ersstArc/ersstArc.anom.csv", header = TRUE, stringsAsFactors = FALSE)
colnames(arc)[3]<-"index"
# Make the seasonal indices
arc.win<-arc
arc.win<-arc.win[arc.win$month %in% 1:3,]
arc.win<-aggregate(arc.win$index, by=list(arc.win$year), mean, na.rm=TRUE)
colnames(arc.win)<-c("year","arc.win")
arc.spr<-arc[arc$month %in% 4:6,]
arc.spr<-aggregate(arc.spr$index, by=list(arc.spr$year), mean, na.rm=TRUE)
colnames(arc.spr)<-c("year","arc.spr")
arc.sum<-arc[arc$month %in% 7:9,]
arc.sum<-aggregate(arc.sum$index, by=list(arc.sum$year), mean, na.rm=TRUE)
colnames(arc.sum)<-c("year","arc.sum")
arc.aut<-arc[arc$month %in% 10:12,]
arc.aut<-aggregate(arc.aut$index, by=list(arc.aut$year), mean, na.rm=TRUE)
colnames(arc.aut)<-c("year","arc.aut")

otherIndicators<-merge(merge(merge(merge(otherIndicators, arc.win), arc.spr), arc.sum), arc.aut)
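# The four seasonal-mean blocks above (PDO, NPGO, ONI, SSTarc) repeat the same
# pattern; a small helper like the hypothetical seasonal_means() below could
# replace them. This is a sketch only and is not used by the current script.
seasonal_means <- function(df, prefix) {
  # df: data frame with columns year, month, index (as built above)
  # prefix: short name used in the output column names, e.g. "pdo"
  seasons <- list(win = 1:3, spr = 4:6, sum = 7:9, aut = 10:12)
  out <- data.frame(year = sort(unique(df$year)))
  for (s in names(seasons)) {
    sub <- df[df$month %in% seasons[[s]], ]
    agg <- aggregate(sub$index, by = list(sub$year), mean, na.rm = TRUE)
    colnames(agg) <- c("year", paste(prefix, s, sep = "."))
    out <- merge(out, agg, all.x = TRUE)
  }
  out
}
# otherIndicators could then be built in one step, e.g.:
# otherIndicators <- Reduce(merge, list(seasonal_means(pdo, "pdo"), seasonal_means(npgo, "npgo"),
#                                       seasonal_means(oni, "oni"), seasonal_means(arc, "arc")))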

save(otherIndicators, file = 'CMISSTapp/data/otherIndicators.RData')

# If we add PC1 from the stoplight, we're going to have lots of NAs,
# so let's save it in a different file
# PC1 from the stoplight file
pc1<-read.csv("../../../indicators/2024/Stoplight_for_forecasts.csv", header = TRUE, stringsAsFactors = FALSE)
pc1<-pc1[,c("Year","PC1")]; colnames(pc1)[1]<-"year"

otherIndicators_pc1<-merge(otherIndicators, pc1)
save(otherIndicators_pc1, file = 'CMISSTapp/data/otherIndicators_pc1.RData')
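
# A hypothetical quick check of the stored objects (a sketch only, not yet the
# QA/QC this script still needs); the file paths are the ones saved above.
if (FALSE) {
  load('data/oceanSSTData.RData')              # oceanData_ERSST
  load('data/oceanSSHData.RData')              # oceanData_SSH
  load('CMISSTapp/data/responseData.RData')    # response
  load('CMISSTapp/data/otherIndicators.RData') # otherIndicators
  dim(oceanData_ERSST); dim(oceanData_SSH)     # lon x lat x year_month
  range(response$year); colSums(is.na(response))
  summary(otherIndicators)
}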
10 changes: 10 additions & 0 deletions R/create_OceanData_Object.R
@@ -1,6 +1,16 @@
# Extract data from global data sets
# User sets extent and return object type


#*****************************************
#*****************************************
# Not needed by most users. The resulting data
# have been stored in RData files, which will
# be updated annually by Brian Burke
#*****************************************
#*****************************************


# SST data are from ERSST (https://www.ncei.noaa.gov/pub/data/cmb/ersst/v5/netcdf/)
# ERSST is a 2x2-degree global dataset
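
# A minimal sketch of reading a single monthly ERSST v5 file with ncdf4
# (the file name pattern and the "sst"/"lon"/"lat" variable names are
#  assumptions based on the ERSST v5 archive, not taken from this repo).
# library(ncdf4)
# nc  <- nc_open("ersst.v5.202001.nc")  # one month of data
# sst <- ncvar_get(nc, "sst")           # lon x lat matrix of SST (deg C)
# lon <- ncvar_get(nc, "lon")
# lat <- ncvar_get(nc, "lat")
# nc_close(nc)
# dim(sst)  # roughly 180 x 89 for the 2x2-degree grid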

128 changes: 128 additions & 0 deletions R/crossValidation.R
@@ -0,0 +1,128 @@
# Run LOO cross-validation, removing each year sequentially and predicting that year

# library(ncdf4)
# library(RColorBrewer)
# library(sp)
# library(maptools)
# library(reshape2)
# library(ggplot2)
# library(sdmTMB)
# library(ggeffects)
# library(visreg)
# library(doBy)


#*************************************************************
# Create a large loop right here for cross-validation
#*************************************************************
LOO_CV <- function(response = response,
oceanData = oceanData, loocvYears = 5,
min.lon = min.lon, max.lon = max.lon,
min.lat = min.lat, max.lat = max.lat,
years = years, months = months,
includePDO = FALSE, includePC1 = FALSE) {

# Verify that the response is what we expect
if (ncol(response)!=2) { print("incorrect data - requires a 2-column data frame with year and the response"); return(NA) }
colnames(response)<-c("year","val")

year_mo<-data.frame(year=rep(years, each=length(months)), month=rep(months, length(years)),
label=paste(rep(years, each=length(months)), rep(months, length(years)), sep = "_"))

# Index the locations in the file
lons <- as.numeric(dimnames(oceanData)[[1]])
lats <- as.numeric(dimnames(oceanData)[[2]])
yr_mo <- dimnames(oceanData)[[3]]
lon.index<-which(lons >= min.lon & lons <= max.lon)
lat.index<-which(lats >= min.lat & lats <= max.lat)
yr_mo.index<-which(yr_mo %in% year_mo$label)
oceanData <- oceanData[lon.index, lat.index, yr_mo.index]

# Container for results
mae<-data.frame(model=as.character(), season=as.character(), year=as.numeric(),
response=as.numeric(), pred=as.numeric(),
mae=as.numeric(), stringsAsFactors = FALSE)
n<-length(years)
for (this_year in years[(n-(loocvYears-1)):n]) { # just the last "loocvYears" years
years.fit<-years[!years %in% this_year]
years.pred<-years[years %in% this_year]


#************************************************
# Create seasonal datasets
#************************************************

createSeasonalData_LOOCV<-function(oceanData, years = years, months = months, year_mo=year_mo, season=1) {

seasonal<-array(NA, c(dim(oceanData)[1], dim(oceanData)[2], length(years)), dimnames = list(dimnames(oceanData)[[1]], dimnames(oceanData)[[2]], years))
for (yy in 1:length(years)) {
if (season==1) seasonal[,,yy]<-(oceanData[,,year_mo$month == 1 & year_mo$year==years[yy]]+oceanData[,,year_mo$month == 2 & year_mo$year==years[yy]]+oceanData[,,year_mo$month == 3 & year_mo$year==years[yy]])/3
if (season==2) seasonal[,,yy]<-(oceanData[,,year_mo$month == 4 & year_mo$year==years[yy]]+oceanData[,,year_mo$month == 5 & year_mo$year==years[yy]]+oceanData[,,year_mo$month == 6 & year_mo$year==years[yy]])/3
if (season==3) seasonal[,,yy]<-(oceanData[,,year_mo$month == 7 & year_mo$year==years[yy]]+oceanData[,,year_mo$month == 8 & year_mo$year==years[yy]]+oceanData[,,year_mo$month == 9 & year_mo$year==years[yy]])/3
if (season==4) seasonal[,,yy]<-(oceanData[,,year_mo$month == 10 & year_mo$year==years[yy]]+oceanData[,,year_mo$month == 11 & year_mo$year==years[yy]]+oceanData[,,year_mo$month == 12 & year_mo$year==years[yy]])/3
}
# This scales (Z-score) the data cell-wise
# aperm() is needed because apply() over MARGIN = 1:2 returns the scaled time series as the first dimension, so we permute time back to the third dimension
oceanData.scl <- aperm(apply(seasonal, 1:2, scale), c(2,3,1))
dimnames(oceanData.scl)[[3]]<-years
return(oceanData.scl)
}

# Create the data by calling our function (returns scaled sst array and full dataset with fish)
oceanData.s1.scl <- createSeasonalData_LOOCV(oceanData = oceanData, years = years, months = months, year_mo=year_mo, season = 1)
oceanData.s2.scl <- createSeasonalData_LOOCV(oceanData = oceanData, years = years, months = months, year_mo=year_mo, season = 2)
oceanData.s3.scl <- createSeasonalData_LOOCV(oceanData = oceanData, years = years, months = months, year_mo=year_mo, season = 3)
oceanData.s4.scl <- createSeasonalData_LOOCV(oceanData = oceanData, years = years, months = months, year_mo=year_mo, season = 4)

# Get covariance between each cell's temperature and survival
covs1<-apply(oceanData.s1.scl[,,as.character(eval(years.fit))], 1:2, function(x) cov(x,response$val[years %in% years.fit], use="pairwise.complete.obs"))
covs2<-apply(oceanData.s2.scl[,,as.character(eval(years.fit))], 1:2, function(x) cov(x,response$val[years %in% years.fit], use="pairwise.complete.obs"))
covs3<-apply(oceanData.s3.scl[,,as.character(eval(years.fit))], 1:2, function(x) cov(x,response$val[years %in% years.fit], use="pairwise.complete.obs"))
covs4<-apply(oceanData.s4.scl[,,as.character(eval(years.fit))], 1:2, function(x) cov(x,response$val[years %in% years.fit], use="pairwise.complete.obs"))

#********************************************************************
# Create the index (how similar is each year to the covariance map)
# The covariance maps were created without the holdout year
# but we want to create the index value for all years, so we can forecast
#********************************************************************
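# (i.e., each year's index is the slope from regressing that year's scaled field on the covariance map, with no intercept)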
coefs_cov<-NULL
options(na.action="na.omit")
for (tt in 1:dim(oceanData.s1.scl)[3])
coefs_cov<-rbind(coefs_cov, c(lm(as.vector(oceanData.s1.scl[,,tt]) ~ -1 + as.vector(covs1))$coef,
lm(as.vector(oceanData.s2.scl[,,tt]) ~ -1 + as.vector(covs2))$coef,
lm(as.vector(oceanData.s3.scl[,,tt]) ~ -1 + as.vector(covs3))$coef,
lm(as.vector(oceanData.s4.scl[,,tt]) ~ -1 + as.vector(covs4))$coef))
coefs_cov<-data.frame(coefs_cov)
coefs_cov$year<-years
index_cov<-cbind(coefs_cov,response$val)
colnames(index_cov)<-c("win.cov","spr.cov","sum.cov","aut.cov","year","val")

#*****************************************
# Calculate MAE
#*****************************************

index.fit<-index_cov[index_cov$year %in% years.fit,]
index.pred<-index_cov[index_cov$year %in% years.pred,]

# Need to loop over seasons
for(season in c("win","spr","sum","aut")) {

index.fit$var <- index.fit[,paste0(season,".cov")]
index.pred$var <- index.pred[,paste0(season,".cov")]
mdl<-lm(val~var, data=index.fit)
pred<-predict(mdl, newdata = index.pred)
#cov_mae<-sqrt(mean((pred - index.pred$val)^2)) # This is RMSEP
cov_mae<-mean(abs(pred - index.pred$val)) # This is MAE
# Assign it to the data.frame
mae<-rbind(mae, data.frame(model="cmisst", season=season, year=this_year,
response=index.pred$val, pred=pred,
mae=cov_mae, stringsAsFactors = FALSE))
} # End looping over season
} # End looping over years

se <- function(x) { return(sd(x)/sqrt(length(x))) }
mae$season<-factor(mae$season, levels = c("win","spr","sum","aut"))
# If pred_out is FALSE, collapse to mean MAE (and SE) by model and season; otherwise keep the individual yearly predictions
if(exists("pred_out") && !pred_out) mae<-doBy::summaryBy(mae ~ model+season, data = mae, FUN = c(mean, se))
return(list(mae))
}
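
# A minimal usage sketch, guarded by if (FALSE) so sourcing this file does not
# run it. The spatial extent and year range are illustrative assumptions, and
# the response columns (year, spCK, ...) are those built in createStoredObjects.R.
if (FALSE) {
  load('data/oceanSSTData.RData')            # provides oceanData_ERSST
  load('CMISSTapp/data/responseData.RData')  # provides response
  yrs  <- 1981:2020                          # must be covered by both datasets
  resp <- response[response$year %in% yrs, c("year", "spCK")]
  pred_out <- FALSE  # collapse to per-season mean MAE (uses doBy::summaryBy)
  cv <- LOO_CV(response = resp, oceanData = oceanData_ERSST, loocvYears = 5,
               min.lon = 158, max.lon = 246, min.lat = 10, max.lat = 62,
               years = yrs, months = 1:12)
  cv[[1]]  # MAE by model and season (or individual predictions)
}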
22 changes: 9 additions & 13 deletions R/get_index.R
@@ -26,25 +26,17 @@ get_CMISST_index <- function(response, oceanData=oceanData_ERSST,
# Extract and scale the spatial data
#***************************************************************

# The original script loaded individual .nc files as needed
# The revised version, for shiny, loads a saved version of the
# full globe and subsets it.
# Data that needs to be loaded is in createStoredObjects.R

# if (dataSet == "ERSST") oceanData <- oceanData_ERSST
# if (dataSet == "SSH") oceanData <- oceanData_SSH

# Index the locations in the file
lons <- as.numeric(dimnames(oceanData)[[1]])
lats <- as.numeric(dimnames(oceanData)[[2]])
yr_mo <- dimnames(oceanData)[[3]]
lon.index<-which(lons >= min.lon & lons <= max.lon)
lat.index<-which(lats >= min.lat & lats <= max.lat)
yr_mo.index<-which(yr_mo %in% year_mo$label)
# Subset the ocean data with user-defined extent
oceanData <- oceanData[lon.index, lat.index, yr_mo.index]



# Create the function to calculate seasonal averages
createSeasonalData<-function(oceanData,
years = years, months = months, year_mo=year_mo, season=1) {
seasonal<-array(NA, c(dim(oceanData)[1], dim(oceanData)[2], length(years)), dimnames = list(dimnames(oceanData)[[1]], dimnames(oceanData)[[2]], years))
@@ -75,8 +67,6 @@ get_CMISST_index <- function(response, oceanData=oceanData_ERSST,

#********************************************************************
# Create the index (how similar is each year to the covariance map)
# For the PDO, I think they regress each year onto the pdo pattern
# But are there other methods of comparing similarity of matrices that we should think about?
#********************************************************************
coefs_cov<-NULL
options(na.action="na.omit")
@@ -87,9 +77,15 @@ get_CMISST_index <- function(response, oceanData=oceanData_ERSST,
lm(as.vector(oceanData.s4.scl[,,tt]) ~ -1 + as.vector(covs4))$coef))
coefs_cov<-data.frame(coefs_cov)
coefs_cov$year<-years
#index_cov<-cbind(coefs_cov,response$val)
index_cov<-merge(coefs_cov, response[response$year %in% years.fit,], all.x=TRUE)
colnames(index_cov)<-c("year","win.cov","spr.cov","sum.cov","aut.cov","val")

# Returns index as a list
# cmisst[[1]] is a 6-column data frame (one list item): year, the 4 seasonal indices, and the response
# cmisst[[2]] winter spatial covariance values (for maps)
# cmisst[[3]] spring spatial covariance values (for maps)
# cmisst[[4]] summer spatial covariance values (for maps)
# cmisst[[5]] autumn spatial covariance values (for maps)
# cmisst[[6]] lat, long min and max, 1 list item
return(list(index_cov, covs1, covs2, covs3, covs4, c(min.lat, max.lat, min.lon, max.lon)))
}
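
# A minimal sketch of calling get_CMISST_index() and unpacking its return list.
# The arguments beyond response/oceanData are not shown in this diff, so the
# extent and years below are assumptions (mirroring LOO_CV); item names follow
# the comments above. Guarded by if (FALSE) so sourcing this file does not run it.
if (FALSE) {
  load('data/oceanSSTData.RData')            # provides oceanData_ERSST
  load('CMISSTapp/data/responseData.RData')  # provides response
  yrs  <- 1981:2020
  resp <- response[response$year %in% yrs, c("year", "spCK")]
  cmisst <- get_CMISST_index(response = resp, oceanData = oceanData_ERSST,
                             min.lon = 158, max.lon = 246,
                             min.lat = 10, max.lat = 62,
                             years = yrs, months = 1:12)
  index    <- cmisst[[1]]   # year, win/spr/sum/aut covariance indices, response (val)
  covs.win <- cmisst[[2]]   # winter covariance map, for plotting
  extent   <- cmisst[[6]]   # c(min.lat, max.lat, min.lon, max.lon)
  plot(index$year, index$spr.cov, type = "b",
       xlab = "Year", ylab = "Spring CMISST index")
}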