In [165]:
#load packages
.libPaths("/local/home/katrinac/miniconda3/envs/r4.0/lib/R/library") #rgdal won't work with the older version of R, so specify libpaths to the more current R version 
Packages <- c("tidyr", "data.table", "lubridate", "broom", "sdmpredictors")
invisible(suppressPackageStartupMessages(lapply(Packages, library, character.only = TRUE)))

#define functions
"%!in%" <- function(x,table) match(x,table, nomatch = 0) == 0

#read in data
SimConnMeta <- fread(file="~/oceanography/script_output/ROMSDataTables/SimConnectivityTableWithMeta.csv") #this is all of the simulated data plus the meta data for each site
GenSimConn <- fread(file="~/oceanography/script_output/ROMSDataTables/GenSimConnectivityFullTable.csv")[, -"AnnRecPart"] #this is the simulated data AND the observations of parentage matches and other field surveying data
#rename the data.table so that columns names are clear and verbose!
setnames(GenSimConn, c("SourceSampled", "DestSampled", "SimMonth", "SimDay", "SimYear", "YearSampled", "SimMonsoon", "DailyParticles", "ParticlesReleasedDaily", "SourcePropSamp", "DestPropSamp"), c("source_sampled", "dest_sampled", "sim_month", "sim_day", "sim_year", "year_sampled", "sim_monsoon", "daily_particles_recruited", "daily_particles_released", "source_prop_samp", "dest_prop_samp") )

#read in the discretized dispersal kernel connectivity matrices from the genetics
GenConnMat <- fread(file="~/DispersalVariation/data/script_output/final_results/dispersal_route_correlation/KernelConnMatrices.csv")[, -c("prob_disp15", "prob_disp16", "prob_disp17", "prob_disp18")] #drop the years not coinciding with model  ]
setnames(GenConnMat, c("org_site", "dest_site", "prob_disp12", "prob_disp13", "prob_disp14"), c("source", "destination", "2012", "2013", "2014"))

#read in environmental data
Enso <- fread("~/oceanography/empirical_data/environment/enso/meiv2.csv")
#reformat to match month ID in ROMs data. MEI has overlapping 2 month seasons
setnames(Enso, c("YEAR","DJ", "JF", "FM", "MA", "AM", "MJ", "JJ", "JA", "AS", "SO", "ON", "ND"), c("year","12,1", "1,2", "2,3", "3,4", "4,5", "5,6", "6,7", "7,8", "8,9", "9,10", "10,11", "11,12"))

#read in the centroids adjusted for the simulation, so the Magbangons combined 
centroids <- fread(file="~/oceanography/empirical_data/site_centroids_SimTest.csv")
setorder(centroids, site)

UnsampledSites <- c("SF1", "SF2", "SF3", "SF4", "SF5", "SF6", "Pangasugan", "Other", "CAI") 


In [166]:
#add in the discretized kernel connectivity matrix from the genetics
#reformat to be long form and fixed discrepancies in Magbangon names
GenConnMat <- unique(melt(GenConnMat, id.vars=c("source", "destination"), variable.name="year", value.name="prob_disp")[#make long format
    grep("N. Magbangon|S. Magbangon", GenConnMat$source), source :="Magbangon"][ #replace the S/N Magbangons with just Magbangon in the source column
    grep("N. Magbangon|S. Magbangon", GenConnMat$destination), destination :="Magbangon"][ #then the destination
    , prob_disp := sum(prob_disp), by=c("source", "destination", "year")], by=c("source", "destination", "year"))[#finally aggregate and sum the data by source/dest and only the rows with Magbangons will be collapsed into one upon using unique to get rid of repeats
    , year := as.numeric(as.character(year))]#melt turns year into a factor for a reason I don't understand. change back to a number
                                                                                                                  #note to self, I checked this code worked as expected by making sure the combined Magbangon prob_disp values added to what the prob_disp column ends up saying after the unique()

GenSimConn <- GenConnMat[GenSimConn, on=.(source, destination, year=year_sampled)] #join discretized kernels into sim connectivity table
setnames(GenSimConn, "year", "year_sampled")
#for each source, what is the normalized recruitment at each destination? 
GenSimConn[, annual_source_normalized_recruitment := sum(daily_particles_recruited)/sum(daily_particles_released), by=c("source", "destination","year_sampled")]

#check that it worked as expected 
sum(unique(GenSimConn[source=="Sitio Baybayon" & year_sampled ==2014], by=c("source", "destination"))[, annual_source_normalized_recruitment]) #should equal 1




In [167]:
#add in all of the physical environmental data

#add in the enso data
#convert to long form
Enso <- melt(Enso, id.vars=c("year"), variable.name="month", value.name="mei")[#make long form with month id column to house what were column headers
    , lapply(.SD, function(x) unlist(tstrsplit(x, ","))), #split the comma separated month numbers into separate rows
   .SDcols = "month",by = c("year","mei")][, month := as.numeric(month)] #change month column from character to number
Enso <- unique(Enso, by=c("year", "month"))      #mei is calculated in 2 month overlapping windows... not sure how to deal with this if MEI is two values for one month. I will just pick the first row and be consistent with using that, but follow up later with others to think if that is fine.

nrow(Enso) #should be 504

ConnPhysMeta <- Enso[GenSimConn, on=.(year=sim_year, month=sim_month)][source %!in% UnsampledSites & destination %!in% UnsampledSites]

In [168]:
head(ConnPhysMeta)

year,mei,month,source,destination,year_sampled,prob_disp,source_sampled,dest_sampled,sim_day,sim_monsoon,daily_particles_recruited,daily_particles_released,dist_km,bearing,direction,source_prop_samp,dest_prop_samp,obs_disp,annual_source_normalized_recruitment
2011,-1.37,10,Sitio Baybayon,Palanas,2012,0.0003697343,yes,yes,10,SWM,0,1984,27.29194,-14.977111,345.0229,0,0.2898551,0,0
2011,-1.37,10,Sitio Baybayon,Wangag,2012,0.0007438418,yes,yes,10,SWM,0,1984,26.86207,-14.278206,345.7218,0,0.1821306,0,0
2011,-1.37,10,Sitio Baybayon,Magbangon,2012,0.001345831,yes,yes,10,SWM,0,1984,25.11346,-12.688177,347.3118,0,0.4593838,0,0
2011,-1.37,10,Sitio Baybayon,Cabatoan,2012,0.0003951127,yes,yes,10,SWM,0,1984,24.91056,-12.353326,347.6467,0,0.4230769,0,0
2011,-1.37,10,Sitio Baybayon,Caridad Cemetery,2012,0.0009045703,yes,yes,10,SWM,0,1984,22.43538,-8.054163,351.9458,0,0.0,0,0
2011,-1.37,10,Sitio Baybayon,Caridad Proper,2012,0.0004313866,yes,yes,10,SWM,0,1984,21.75097,-7.170156,352.8298,0,0.0,0,0


In [175]:
test_mod <- glm(data=ConnPhysMeta, prob_disp ~ as.factor(year))

In [176]:
summary(test_mod)


Call:
glm(formula = prob_disp ~ as.factor(year), data = ConnPhysMeta)

Deviance Residuals: 
      Min         1Q     Median         3Q        Max  
-0.006863  -0.004402  -0.001590   0.002793   0.054914  

Coefficients:
                      Estimate Std. Error t value Pr(>|t|)    
(Intercept)          0.0071480  0.0003615  19.775   <2e-16 ***
as.factor(year)2012 -0.0010035  0.0005274  -1.903   0.0574 .  
as.factor(year)2013 -0.0010730  0.0005274  -2.035   0.0422 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for gaussian family taken to be 3.775797e-05)

    Null deviance: 0.030331  on 800  degrees of freedom
Residual deviance: 0.030131  on 798  degrees of freedom
  (172 observations deleted due to missingness)
AIC: -5879.5

Number of Fisher Scoring iterations: 2


In [128]:

head(GenSimConn)

source,destination,year_sampled,prob_disp,source_sampled,dest_sampled,sim_month,sim_day,sim_year,sim_monsoon,daily_particles_recruited,daily_particles_released,dist_km,bearing,direction,source_prop_samp,dest_prop_samp,obs_disp,annual_source_normalized_recruitment
Other,Palanas,2012,,no,yes,10,10,2011,SWM,69,4497728,,,,,0.2898551,0,1.534108e-05
Other,Wangag,2012,,no,yes,10,10,2011,SWM,68,4497728,,,,,0.1821306,0,1.511874e-05
Other,Magbangon,2012,,no,yes,10,10,2011,SWM,84,4497728,,,,,0.4593838,0,1.86761e-05
Other,Cabatoan,2012,,no,yes,10,10,2011,SWM,59,4497728,,,,,0.4230769,0,1.311773e-05
Other,Caridad Cemetery,2012,,no,yes,10,10,2011,SWM,54,4497728,,,,,0.0,0,1.200606e-05
Other,Caridad Proper,2012,,no,yes,10,10,2011,SWM,101,4497728,,,,,0.0,0,2.245578e-05


In [138]:
#add in MARSPEC data
test <- list_layers(marine=TRUE)
setDT(test)

test[ end_year >=2014]

dataset_code,layer_code,name,description,terrestrial,marine,freshwater,cellsize_equalarea,cellsize_lonlat,units,...,start_year,start_month,start_day,end_year,end_month,end_day,derivation,month,is_surface,version
Bio-ORACLE,BO_bathymin,Bathymetry (minimum),Minimum depth of the seafloor,FALSE,TRUE,FALSE,7000,0.08333333,meters,...,2016,3,18,2016,3,18,minimum,,TRUE,1
Bio-ORACLE,BO_bathymax,Bathymetry (maximum),Maximum depth of the seafloor,FALSE,TRUE,FALSE,7000,0.08333333,meters,...,2016,3,18,2016,3,18,maximum,,TRUE,1
Bio-ORACLE,BO_bathymean,Bathymetry (mean),Average depth of the seafloor,FALSE,TRUE,FALSE,7000,0.08333333,meters,...,2016,3,18,2016,3,18,mean,,TRUE,1
Bio-ORACLE,BO2_chlomax_bdmax,Chlorophyll concentration (maximum at max depth),Maximum mass concentration of chlorophyll in sea water at maximum bottom depth,FALSE,TRUE,FALSE,7000,0.08333333,mg/mŸ,...,2000,,,2014,,,maximum value at maximum bottom depth,,FALSE,2
Bio-ORACLE,BO2_chlomax_bdmean,Chlorophyll concentration (maximum at mean depth),Maximum mass concentration of chlorophyll in sea water at mean bottom depth,FALSE,TRUE,FALSE,7000,0.08333333,mg/mŸ,...,2000,,,2014,,,maximum value at mean bottom depth,,FALSE,2
Bio-ORACLE,BO2_chlomax_bdmin,Chlorophyll concentration (maximum at min depth),Maximum mass concentration of chlorophyll in sea water at minimum bottom depth,FALSE,TRUE,FALSE,7000,0.08333333,mg/mŸ,...,2000,,,2014,,,maximum value at minimum bottom depth,,FALSE,2
Bio-ORACLE,BO2_chlomean_bdmax,Chlorophyll concentration (mean at max depth),Mean mass concentration of chlorophyll in sea water at maximum bottom depth,FALSE,TRUE,FALSE,7000,0.08333333,mg/mŸ,...,2000,,,2014,,,mean value at maximum bottom depth,,FALSE,2
Bio-ORACLE,BO2_chlomean_bdmean,Chlorophyll concentration (mean at mean depth),Mean mass concentration of chlorophyll in sea water at mean bottom depth,FALSE,TRUE,FALSE,7000,0.08333333,mg/mŸ,...,2000,,,2014,,,mean value at mean bottom depth,,FALSE,2
Bio-ORACLE,BO2_chlomean_bdmin,Chlorophyll concentration (mean at min depth),Mean mass concentration of chlorophyll in sea water at minimum bottom depth,FALSE,TRUE,FALSE,7000,0.08333333,mg/mŸ,...,2000,,,2014,,,mean value at minimum bottom depth,,FALSE,2
Bio-ORACLE,BO2_chlomin_bdmax,Chlorophyll concentration (minimum at max depth),Minimum mass concentration of chlorophyll in sea water at maximum bottom depth,FALSE,TRUE,FALSE,7000,0.08333333,mg/mŸ,...,2000,,,2014,,,minimum value at maximum bottom depth,,FALSE,2


In [150]:
bath_mean <- load_layers("BO_bathymean", datadir = "~/oceanography/empirical_data/environment/") 
bath_mean
max_vel <- load_layers("BO2_curvelmax_bdmin", datadir = "~/oceanography/empirical_data/environment/")
max_vel
sst <- load_layers("BO21_tempmean_ss" , datadir = "~/oceanography/empirical_data/environment/")
sst

class      : RasterStack 
dimensions : 2160, 4320, 9331200, 1  (nrow, ncol, ncell, nlayers)
resolution : 0.08333333, 0.08333333  (x, y)
extent     : -180, 180, -90, 90  (xmin, xmax, ymin, ymax)
crs        : +proj=longlat +datum=WGS84 +no_defs 
names      : BO_bathymean 
min values :       -10494 
max values :         2138 


class      : RasterStack 
dimensions : 2160, 4320, 9331200, 1  (nrow, ncol, ncell, nlayers)
resolution : 0.08333333, 0.08333333  (x, y)
extent     : -180, 180, -90, 90  (xmin, xmax, ymin, ymax)
crs        : +proj=longlat +datum=WGS84 +no_defs 
names      : BO2_curvelmax_bdmin 
min values :        6.589556e-05 
max values :             2.44297 


class      : RasterStack 
dimensions : 2160, 4320, 9331200, 1  (nrow, ncol, ncell, nlayers)
resolution : 0.08333333, 0.08333333  (x, y)
extent     : -180, 180, -90, 90  (xmin, xmax, ymin, ymax)
crs        : +proj=longlat +datum=WGS84 +no_defs 
names      : BO21_tempmean_ss 
min values :        -1.797733 
max values :         30.17863 


In [156]:
sst$year

ERROR: Error in .local(x, ...): invalid layer names


In [145]:
#geographic extent of our area
extent <-c(min(centroids$lon), max(centroids$lon), min(centroids$lat), max(centroids$lat))

In [178]:
install.packages("oce", dependencies = TRUE)

Installing package into ‘/local/home/katrinac/miniconda3/envs/r4.0/lib/R/library’
(as ‘lib’ is unspecified)
also installing the dependencies ‘intervals’, ‘spacetime’, ‘FNN’, ‘filehash’, ‘bit’, ‘units’, ‘gstat’, ‘gdistance’, ‘plotrix’, ‘shape’, ‘adehabitatMA’, ‘bit64’, ‘blob’, ‘DBI’, ‘gsw’, ‘sf’, ‘akima’, ‘automap’, ‘marmap’, ‘ocedata’, ‘RSQLite’, ‘tiff’, ‘XML’

“installation of package ‘marmap’ had non-zero exit status”

In [180]:
library("ncdf4")

In [187]:
nc <- nc_open("~/oceanography/empirical_data/environment/noaa_sst/sst.wkmean.1990-present.nc")

In [188]:
print(nc)

File ~/oceanography/empirical_data/environment/noaa_sst/sst.wkmean.1990-present.nc (NC_FORMAT_CLASSIC):

     2 variables (excluding dimension variables):
        short sst[lon,lat,time]   
            long_name: Weekly Mean of Sea Surface Temperature
            unpacked_valid_range: -5
             unpacked_valid_range: 40
            actual_range: -1.79999995231628
             actual_range: 36.1599998474121
            units: degC
            add_offset: 0
            scale_factor: 0.00999999977648258
            missing_value: 32767
            precision: 2
            least_significant_digit: 2
            var_desc: Sea Surface Temperature
            dataset: NOAA Optimum Interpolation (OI) SST V2
            level_desc: Surface
            statistic: Weekly Mean
            parent_stat: Individual obs
            standard_name: sea_surface_temperature
            valid_range: -500
             valid_range: 4000
        double time_bnds[nbnds,time]   
            long_name: Time

In [194]:
sst <- data.table(lat = ncvar_get(nc, "lat"),
lon = ncvar_get(nc, "lon"),
time = ncvar_get(nc, "time"),
sst = ncvar_get(nc, "sst"))[, time := as.POSIXct(time, origin = '1981-01-01 00:00:00', tz = 'UTC')]


“Item 4 is of size 360 but maximum size is 1638 (recycled leaving remainder of 198 items)”

In [200]:
summary(sst$time)

                 Min.               1st Qu.                Median 
"1981-01-01 19:16:35" "1981-01-01 20:04:19" "1981-01-01 20:52:04" 
                 Mean               3rd Qu.                  Max. 
"1981-01-01 20:52:04" "1981-01-01 21:39:49" "1981-01-01 22:27:34" 

In [None]:

SST_DT <- setDT(matrix(nrow=0, ncol=4))

cols_conn_mat <- c("date","lat", "lon", "weekly_mean")
cols_tmp <- c("lat", "lon", "weekly_mean")

names(conn_mat_full) <- cols_conn_mat

for(i in 1:nrow(all_dates2)){
    
    index_eval <- all_dates2$index[i]
    
    tmp <- as.data.frame(conn_mat_all[[i]], xy=T)   #pull out a data frame for each date
    names(tmp) <- cols_tmp
    
    date <- filter(all_dates2, index==index_eval)[2] #grab the date
    date_df <- do.call("rbind", replicate(784, date, simplify = FALSE)) #make a df of the date
    tmp2 <- bind_cols(date_df, tmp)

 
    conn_mat_full <- rbind(conn_mat_full, tmp2) 
}
