In [13]:
Packages <- c("dplyr",  "nleqslv", "broom","cubature", "geosphere", "data.table",  "ggplot2", "bbmle", "stringr",  "lubridate", "RColorBrewer")

invisible(suppressPackageStartupMessages(lapply(Packages, library, character.only = TRUE)))

setwd('/local/home/katrinac/oceanography')
"%!in%" <- function(x,table) match(x,table, nomatch = 0) == 0
source("~/parentage/kernel_fitting/1340_loci/functions/ll_kt_both_bbmle.R")
#source("~/oceanography/scripts/PredictedProportions.R")

#read in the kernel fitting summary
kernels <- fread(file="~/parentage/kernel_fitting/1340_loci/final_results/tables/kernel_fitting_summary.csv")
kernel2012_14 <- fread(file="~/oceanography/empirical_data/genetics/GenKernelsForROMSComp2012-14.csv")

#read in the centroids adjusted for the simulation, so the Magbangons combined 
#centroids <- fread(file="~/oceanography/script_output/SurveyData/SimulationCentroids.csv")
Centroids <- fread(file="~/oceanography/empirical_data/site_centroids_SimTest.csv")
setorder(Centroids, site)
#read in the table with number of recruits sampled at each site for each year
AnnualRecsSamp <- fread(file="~/oceanography/script_output/SurveyData/AnnualRecruitsSampled.csv")
#read in the table of the proportion of anemones sampled at each site for each year
PropSamp <- fread(file="~/oceanography/script_output/SurveyData/ProportionHabitatSampled.csv")
setnames(PropSamp, c("PropAnemSamp", "TotalAnems"), c("prop_anem_samp", "total_anems"))
#read in the ROMS simulation connectivity table with metadata, not yet subsetted (*but check this)
#DateJoin <- fread(file="~/oceanography/script_output/ROMSDataTables/LongFormConnWithProbs.csv")
#setnames(DateJoin, c("YearSampled", "SurvWeight", "Dest", "DestNOffsAnnual", "DestPropSamp", "DestTotalAnems", "Source", "SourceNOffsAnnual", "SourcePropSamp", "SourceTotalAnems", "SimMonth", "SimDay", "SimYear", "SimMonsoon", "DailyParticles", "ParticleID", "SourceSize"), 
#        c("year_sampled", "surv_weight", "dest", "dest_n_rec_annual", "dest_prop_samp", "dest_total_anems", "source", "source_n_rec_annual", "source_prop_samp", "source_total_anems", "sim_month", "sim_day", "sim_year", "sim_monsoon", "daily_particles", "particle_id", "source_size"))

#make vectors defining sites we didn't sample, but that are in the model, and the sandflats specifically 
unsampled_sites <- c("SF1", "SF2", "SF3", "SF4", "SF5", "SF6", "Pangasugan", "Other", "CAI") 
sand_flats <- c("SF1", "SF2", "SF3", "SF4", "SF5", "SF6") 
sites_to_remove <- c("SF1", "SF2", "SF3", "SF4", "SF5", "SF6", "Pangasugan") 
#make the constant inputs for the kernel fitting function
#distance matrix using the centroids with combined Magbangon
### List of source locations
SitesSource <- Centroids

### List of destination locations
SitesDest <- Centroids

DistMatm <- distm(SitesSource[,c('lon','lat')], SitesSource[,c('lon','lat')], fun=distVincentyEllipsoid)
Distances <- DistMatm*10^-3
#read in the reef areas for the kernel fitting
Area <- fread("~/oceanography/empirical_data/site_area_header_nonsurveyed_simulation_kernels_test.csv") %>%
    arrange(site) %>%
    filter(site %!in% c("near_north_full1", "near_north_full2", "near_north_full3", "near_south_full1", "near_south_full2", "near_south_full3")) %>%
    mutate(kmsq=msq*10^-6)# %>%
    #select(kmsq) #need to uncomment for functions to work
setorder(Area, site)
reef_sizes <- as.matrix(Area$kmsq)

#make a table with the survey information for each site (how many fish sampled, prop anems sampled, total number of anems at site)
SurveyData <- AnnualRecsSamp[PropSamp, on=.(year=end_year, site)][#join the sampling tables together
    is.na(n_offs_gen), n_offs_gen := 0][#change NA's to 0
    ,-"time_frame"]#drop the time_frame column, we can key with end_year
#setnames(SurveyData, c("PropAnemSamp", "TotalAnems"), c("prop_anem_samp", "total_anems"))
#setkey(SurveyData, site)
#check all sites are represented in centroids and area (and indirectly distances, which comes from centroids)
#Area[site %!in% centroids$site] #should be nothing

#genetic data to make predicted proportions
Assignments2012_4 <- as.matrix(read.csv("~/parentage/kernel_fitting/1340_loci/input/parentage_matrix_2012-14ForROMSComp.csv", header=FALSE))
Adult_sample_proportions2012_4 <- as.matrix(read.csv("~/parentage/kernel_fitting/1340_loci/input/prop_samp12-14.csv", header=FALSE))
Sampled_reefs2012_4 <- as.matrix(read.csv("~/parentage/kernel_fitting/1340_loci/input/site_index12-14.csv", header=FALSE))

Distances_gen <- as.matrix(read.csv("~/parentage/kernel_fitting/1340_loci/input/distance_matrix_unsurveyed.csv", header=FALSE))
Reef_sizes_gen <- as.matrix(read.csv("~/parentage/kernel_fitting/894_loci/area_unsurveyed.csv", header=FALSE))
Centroids_gen <- as.matrix(read.csv("~/parentage/kernel_fitting/894_loci/centroids_unsurveyed.csv", header=T))

Assignments2012 <- as.matrix(read.csv("~/parentage/kernel_fitting/1340_loci/input/parentage_matrix12_corrected.csv", header=FALSE))
Adult_sample_proportions2012 <- as.matrix(read.csv("~/parentage/kernel_fitting/1340_loci/input/prop_samp12.csv", header=FALSE))
Sampled_reefs2012 <- as.matrix(read.csv("~/parentage/kernel_fitting/1340_loci/input/site_index12.csv", header=FALSE))

Assignments2013 <- as.matrix(read.csv("~/parentage/kernel_fitting/1340_loci/input/parentage_matrix13_corrected.csv", header=FALSE))
Adult_sample_proportions2013 <- as.matrix(read.csv("~/parentage/kernel_fitting/1340_loci/input/prop_samp13.csv", header=FALSE))
Sampled_reefs2013 <- as.matrix(read.csv("~/parentage/kernel_fitting/1340_loci/input/site_index13.csv", header=FALSE))

Assignments2014 <- as.matrix(read.csv("~/parentage/kernel_fitting/1340_loci/input/parentage_matrix14_corrected.csv", header=FALSE))
Adult_sample_proportions2014 <- as.matrix(read.csv("~/parentage/kernel_fitting/1340_loci/input/prop_samp14.csv", header=FALSE))
Sampled_reefs2014 <- as.matrix(read.csv("~/parentage/kernel_fitting/1340_loci/input/site_index14.csv", header=FALSE))

#Allison's abundance time series data 
#download.file(url = "https://github.com/pinskylab/Clownfish_persistence/blob/master/Data/Script_outputs/females_df_F.RData?raw=true", destfile = "~/oceanography/empirical_data/genetics/females_df_F.RData")
load("~/oceanography/empirical_data/genetics/females_df_F.RData")
Abundance <- as.data.table(females_df_F)

“Previous fread() session was not cleaned up properly. Cleaned up ok at the beginning of this fread() call.”

In [74]:
#make a site index table, use this for Sampled_reefs input in kernel fitting
SiteIndex <- unique(Centroids, by="site")[, "site"][, index := .I] #add the row number as the unique site index


In [15]:
#join the survey sampling tables together
SurveyData <- AnnualRecsSamp[PropSamp, on=.(year=end_year, site)][
    is.na(n_offs_gen), n_offs_gen := 0][#change NA's to 0
    ,-"time_frame"]#drop the time_frame

SurveyData <- Abundance[, c("year", "site", "nF")][SurveyData, on=.(year, site)]#join in Allison's estimate of female abundance. There are NA values, but that's okay we can figure those out when we start thinking about incorporating uncertainty in this


In [106]:
sites_to_remove <- c("SF1", "SF2", "SF3", "SF4", "SF5", "SF6", "Pangasugan") 


date,source,destination,SourceSampled,DestSampled,SimMonth,SimDay,SimYear,YearSampled,SimMonsoon,DailyParticles,ParticlesReleasedDaily,dist_km,bearing,direction
2010-10-01,Other,Palanas,no,yes,10,10,2010,2011,SWM,9,4497728,,,
2010-10-01,Other,Wangag,no,yes,10,10,2010,2011,SWM,4,4497728,,,
2010-10-01,Other,Magbangon,no,yes,10,10,2010,2011,SWM,11,4497728,,,
2010-10-01,Other,Cabatoan,no,yes,10,10,2010,2011,SWM,12,4497728,,,
2010-10-01,Other,Caridad Cemetery,no,yes,10,10,2010,2011,SWM,0,4497728,,,
2010-10-01,Other,Caridad Proper,no,yes,10,10,2010,2011,SWM,1,4497728,,,


In [1]:
head(SurveyData)

ERROR: Error in head(SurveyData): object 'SurveyData' not found


In [62]:
head(DestSampled)

year,num_parentage_matches,date,surv_weight,dest,dest_n_offs_annual,dest_prop_samp,dest_total_anems,source,source_n_offs_annual,source_prop_samp,source_total_anems,sim_month,sim_day,sim_year,monsoon,daily_particles,particle_id,source_size
2012,3,2011-10-01,0.4016461,Palanas,13,0.2898551,138,Other,,,,10,10,2011,SWM,69,P54670320,
2012,3,2011-10-01,0.4016461,Palanas,13,0.2898551,138,Other,,,,10,10,2011,SWM,69,P54670321,
2012,3,2011-10-01,0.4016461,Palanas,13,0.2898551,138,Other,,,,10,10,2011,SWM,69,P54670322,
2012,3,2011-10-01,0.4016461,Palanas,13,0.2898551,138,Other,,,,10,10,2011,SWM,69,P54670323,
2012,3,2011-10-01,0.4016461,Palanas,13,0.2898551,138,Other,,,,10,10,2011,SWM,69,P54670324,
2012,3,2011-10-01,0.4016461,Palanas,13,0.2898551,138,Other,,,,10,10,2011,SWM,69,P54670325,


In [61]:
head(DestSampled)

year_sampled,date,surv_weight,dest,dest_n_rec_annual,dest_prop_samp,dest_total_anems,source,source_n_rec_annual,source_prop_samp,source_total_anems,sim_month,sim_day,sim_year,sim_monsoon,daily_particles,particle_id,source_size
2011,2010-10-01,0.003292181,Palanas,,,,Other,,,,10,10,2010,SWM,9,P1,
2011,2010-10-01,0.003292181,Palanas,,,,Other,,,,10,10,2010,SWM,9,P2,
2011,2010-10-01,0.003292181,Palanas,,,,Other,,,,10,10,2010,SWM,9,P3,
2011,2010-10-01,0.003292181,Palanas,,,,Other,,,,10,10,2010,SWM,9,P4,
2011,2010-10-01,0.003292181,Palanas,,,,Other,,,,10,10,2010,SWM,9,P5,
2011,2010-10-01,0.003292181,Palanas,,,,Other,,,,10,10,2010,SWM,9,P6,


__below is where I left off 06/01/2021. rename columns to be the vector format like_this, and finish making the connectivity matrix for the biophysical simulation next__

In [None]:
#prep biophysical connectivity matrix
#outside of the loop, trim this to only be the destinations we sampled

DestSampled <- DateJoin[dest_prop_samp >0]
DestSampled <- kernels[Year %in% c("2012", "2013", "2014")][, Year:=as.integer(Year)][,c("Year", "NumParentageMatches")][DestSampled, on=.(Year=year_sampled)]#add in a column for the observed number of parentage matches
#rename the monsoon column in the full table for consistency
setnames(DestSampled, c("sim_monsoon", "Year", "NumParentageMatches"), c("monsoon", "year", "num_parentage_matches"))

#each year will require a different set of survey data, so make a list of each and index by site for fast look up
PropSampTable <- SurveyData[prop_anem_samp >0, c("year", "site")]

#make a site index table, use this for Sampled_reefs input in kernel fitting
SiteIndex <- unique(Centroids, by="site")[, "site"][, index := .I] #add the row number as the unique site index

#make sure all sampled sites are represented by joining the survey data to the sampled simulation
PropSampTable <- rbind(SurveyData[prop_anem_samp >0 & year %in% c(2012, 2013, 2014), c("year", "site")][, .(source=site, dest=site, year=year)][ #will join to the simulated sampling table by source and dest, so make those each a column from site and preserve the year variable as a key
    , c("year", "source", "dest")][, monsoon := "NEM"], SurveyData[prop_anem_samp >0 & year %in% c(2012, 2013, 2014), c("year", "site")][, .(source=site, dest=site, year=year)][ #will join to the simulated sampling table by source and dest, so make those each a column from site and preserve the year variable as a key
    , c("year", "source", "dest")][, monsoon := "SWM"])
UnqSurvey <- unique(PropSampTable, by=c("source", "dest", "year", "monsoon"))#add in the diff Monsoon seasons so there are complete parentage matrices later
AddDest <- rbind(SurveyData[year %in% c(2012, 2013, 2014) & prop_anem_samp >0][, c("year", "site")][, monsoon := "NEM"], 
                  SurveyData[year %in% c(2012, 2013, 2014) & prop_anem_samp >0][ , c("year", "site")][, monsoon := "SWM"])  #what destinations were sampled, for use with unassigned table




In [100]:
head(DateJoin)

year_sampled,date,surv_weight,dest,dest_n_rec_annual,dest_prop_samp,dest_total_anems,source,source_n_rec_annual,source_prop_samp,source_total_anems,sim_month,sim_day,sim_year,sim_monsoon,daily_particles,particle_id,source_size
2011,2010-10-01,0.003292181,Palanas,,,,Other,,,,10,10,2010,SWM,9,P1,
2011,2010-10-01,0.003292181,Palanas,,,,Other,,,,10,10,2010,SWM,9,P2,
2011,2010-10-01,0.003292181,Palanas,,,,Other,,,,10,10,2010,SWM,9,P3,
2011,2010-10-01,0.003292181,Palanas,,,,Other,,,,10,10,2010,SWM,9,P4,
2011,2010-10-01,0.003292181,Palanas,,,,Other,,,,10,10,2010,SWM,9,P5,
2011,2010-10-01,0.003292181,Palanas,,,,Other,,,,10,10,2010,SWM,9,P6,


In [None]:
#make a parentage matrix for the whole biophysical results
FullBiophysMat <- as.matrix(rbind(dcast(DestSampled[source != "Other" & source !="CAI" & source != "Gabas", .(source, dest)][, parentage :=1][order(source, dest)], source ~ dest, value.var="parentage", fun.aggregate = sum)[,"source"], 
      dcast(DestSampled[source == "Other" |source == "CAI" , .(source, dest)][, parentage :=1][order(source, dest)][, source := "unsampled"], source ~ dest, value.var="parentage", fun.aggregate = sum)[,"source"] ))

In [None]:
FullBiophysMat

In [None]:
dim(FullBiophysMat)

In [89]:
dim(FullBiophysMat)

In [101]:
Area

site,msq,kmsq
CAI,61575000,61.575
Cabatoan,53004,0.053004
Caridad Cemetery,7963,0.007963
Caridad Proper,7829,0.007829
Elementary School,14214,0.014214
Gabas,3015,0.003015
Haina,42715,0.042715
Hicgop South,12401,0.012401
Magbangon,105998,0.105998
Palanas,40695,0.040695


In [93]:
x <- list(Distances=Distances, Assignments=FullBiophysMat, Sampled_reefs=t(SiteIndex[site %in% SurveyData[, site], index]),
                  Reef_sizes=reef_sizes, Adult_sample_proportions=matrix(nrow=ncol(FullBiophysMat), ncol=1, 1)) #put inputs into a list because that's the bbmle format
Sim2012_4Fit <- suppressWarnings(mle2(LL_kt_bbmle, start=list(k=-3, theta=1), lower=c(-10, 0.15), upper=c(10, 8), method="L-BFGS-B", data=x, control=list(maxit=500)))


ERROR: Error in ObsVector * log(ProbVector): non-numeric argument to binary operator


In [95]:
dim(Assignments2012)
dim(Adult_sample_proportions2012)
dim(Sampled_reefs2012)

In [96]:
x

0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
0.0,36.5946593,37.5112742,37.625351,38.1057602,39.049765,37.8230185,38.166948,36.5195041,36.237763,...,32.701393,28.926277,27.351389,33.695741,37.50328,38.516482,37.597513,37.054742,37.642947,39.033551
36.59466,0.0,3.0328732,3.7812405,7.7170642,15.862935,24.458546,6.099642,0.2491593,2.6540921,...,39.54491,35.144215,36.419898,40.865998,6.213381,10.206371,12.895821,28.272891,31.104711,34.88923
37.51127,3.0328732,0.0,0.7611792,4.7583636,13.024371,21.9388356,3.089401,3.2818305,5.6803278,...,37.871146,33.627822,35.066563,39.166201,9.13498,13.118841,15.893159,25.916903,28.787651,32.595295
37.62535,3.7812405,0.7611792,0.0,3.9999295,12.278269,21.2468135,2.328784,4.0295957,6.4329063,...,37.352525,33.148293,34.623834,38.640953,9.896052,13.879354,16.652684,25.25603,28.133246,31.943863
38.10576,7.7170642,4.7583636,3.9999295,0.0,8.315917,17.5336836,1.722545,7.9606595,10.3654013,...,34.492511,30.513924,32.177918,35.74176,13.887636,17.875087,20.612864,21.692121,24.593653,28.409519
39.04977,15.8629348,13.0243709,12.278269,8.3159171,0.0,9.7139044,10.038211,16.0957979,18.4582391,...,28.304747,24.963682,27.009579,29.438628,22.076224,26.069278,28.674096,14.147121,17.039911,20.803673
37.82302,24.458546,21.9388356,21.2468135,17.5336836,9.713904,0.0,19.212801,24.6650474,26.8648033,...,19.483034,16.994224,19.361938,20.477052,30.548069,34.469402,36.732489,4.514775,7.357179,11.093097
38.16695,6.0996424,3.089401,2.3287841,1.7225454,10.038211,19.2128007,0.0,6.3468215,8.7536412,...,35.900425,31.835169,33.429679,37.164646,12.221398,16.200943,18.981054,23.33051,26.226313,30.042362
36.5195,0.2491593,3.2818305,4.0295957,7.9606595,16.095798,24.6650474,6.346821,0.0,2.4075738,...,39.680972,35.268567,36.530808,41.003925,5.982029,9.973883,12.652356,28.46572,31.293899,35.075943
36.23776,2.6540921,5.6803278,6.4329063,10.3654013,18.458239,26.8648033,8.753641,2.4075738,0.0,...,41.331275,36.827657,37.975455,42.668201,3.683507,7.641936,10.25072,30.567283,33.36672,37.127744

source,Cabatoan,Caridad Cemetery,Caridad Proper,Elementary School,Haina,Hicgop South,Magbangon,Palanas,Poroc Rose,Poroc San Flower,San Agustin,Sitio Baybayon,Sitio Lonas,Sitio Tugas,Tamakin Dacot,Visca,Wangag
Cabatoan,921,328,858,211,48,271,693,78,69,122,330,341,286,274,55,1,192
Caridad Cemetery,620,329,998,85,12,156,636,66,48,121,446,480,613,268,1,0,76
Caridad Proper,484,105,636,39,8,167,333,4,53,134,540,216,548,208,4,2,48
Elementary School,250,416,803,953,27,2046,239,1,180,280,745,94,710,1141,8,1,8
Gabas,27,12,55,474,120,205,1,2,729,1056,5084,2,8475,291,96,8280,0
Haina,2,0,2,0,1999,0,2,0,0,2,54,43,9,6,357,41,4
Hicgop South,495,1023,1838,1239,415,2658,365,7,605,1052,1716,632,1971,1115,233,73,39
Magbangon,685,796,1454,128,46,462,387,240,41,81,499,793,211,188,23,13,492
Palanas,651,422,584,1,0,81,782,477,9,1,38,1697,1,38,0,0,480
Poroc Rose,109,84,233,351,125,192,22,9,702,1061,2056,29,4357,369,72,616,12

0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19

0
61.575
0.053004
0.007963
0.007829
0.014214
0.003015
0.042715
0.012401
0.105998
0.040695

0
1
1
1
1
1
1
1
1
1
1


In [94]:
dim(FullBiophysMat)

In [31]:
dcast(DestSampled[source == "Other", .(source, dest)][, parentage :=1][order(source, dest)], source ~ dest, value.var="parentage", fun.aggregate = sum)

source,Cabatoan,Caridad Cemetery,Caridad Proper,Elementary School,Haina,Hicgop South,Magbangon,Palanas,Poroc Rose,Poroc San Flower,San Agustin,Sitio Baybayon,Sitio Lonas,Sitio Tugas,Tamakin Dacot,Visca,Wangag
Other,15268,10495,12894,14377,30495,21701,15118,16818,6698,8343,10862,16942,17949,7338,82466,6206,14974


In [None]:
#Mike Bode said I should be less aggressively subsampling the biophysical data, but how should I do that? The kernel fitting still requires that I have a prop samp vector... but when I tried putting 1's in that vector and using all of the particles for fitting I got crazy 

In [64]:
#sample the particle data
SimSample <- DestSampled[, .SD[sample(.N, dest_n_rec_annual)], by = c("year", "dest")] #, prob=surv_weight #randomly sample rows (particles) from the table according to the survival weighting, based on the number we sampled at each site in each year of surveys
check1 <- nrow(SimSample)

#assign parentage
SimParentage <- SimSample[source_prop_samp > 0][, .SD[sample(.N, num_parentage_matches)], by = .(year)][#Prob=SurvWeight, #now randomly assign parentage or not parentage, based on how well we sampled the source and the number of parentage matches we had in that year
                , parentage := 1]
#for faster searching, set keys
setkey(SimParentage, particle_id)
setkey(SimSample, particle_id)

l <- list(SimSample[particle_id %!in% SimParentage$particle_id][, parentage := 0], SimParentage)
SimSample <- rbindlist(l, use.names = TRUE, fill=TRUE, idcol = NULL)[, c("year", "source", "dest", "parentage", "monsoon")] #add back in to the unassigned particles, select only the columns necessary

#check results, for testing loop only
#nrow(SimSample)==check1 #should be TRUE
#sum(SimSample$parentage) #should be 37

In [65]:
sum(SimSample$parentage)

In [48]:

#calculate the unassigned row
Unassigned <- unique(SimSample[parentage==0][#not counting parentage!
    , num_sampled := .(.N), by= c("dest", "year", "monsoon")], by=c("dest", "year", "monsoon"))[, -"source"]
#add destinations not sampled in loop iteration to unassigned 
Unassigned <- Unassigned[AddDest, on=.(year=year, dest=site, monsoon)]
Unassigned$num_sampled[is.na(Unassigned$num_sampled)] <- 0
#sum(Unassigned$num_sampled, na.rm=T)==check1-37 #total should be the total sampled particles minus the total assigned
setorder(Unassigned, year, dest)

##adding in the possible sampled routes needs to happen AFTER calculating unassigned because unassigned is calculated from row counts
SimSample <- PropSampTable[SimSample, on=.(year, source, dest, monsoon)]
#check all is well- for testing loop only
#sum(SimSample$parentage) #should be 37

#add in the routes we could have assigned given our sampling so the parentage matrix is complete
UnqSimSample <- unique(SimSample, by=c("source", "dest", "year", "monsoon"))

AddRoutes <- UnqSurvey[!UnqSimSample, on = names(UnqSurvey)][ #what combos are not appearing because we didn't sample particles, but the route is possible based on our survey sampling
    , `:=`(parentage= 0, num_sampled = 0) ] #add the parentage column 

#add back into the sampled simulation data
l <- list(SimSample, AddRoutes[,-"num_sampled"])
SimSample <- rbindlist(l, use.names = TRUE, fill=TRUE, idcol = NULL)
setorder(SimSample, year, source, dest)

#make summary tables for each time frame, to be used for making parentage matrix
SimSampleByYear <- SimSample[,  .(total_parentage =sum(parentage)), by=c("year", "source", "dest")]
#sum(SimSampleByYear$total_parentage)
UnassignedByYear <- Unassigned[, .(total_sampled = sum(num_sampled)), by=c("year", "dest")]


SimSampleInterannual <- SimSample[,  .(total_parentage =sum(parentage)), by=c("source", "dest")]
#sum(SimSampleInterannual$total_parentage)
UnassignedInterannual <- Unassigned[, .(total_sampled = sum(num_sampled)), by=c("dest")]


SimSampleMonsoon <- SimSample[,  .(total_parentage =sum(parentage)), by=c("monsoon", "source", "dest")]
sum(SimSampleMonsoon$total_parentage)
UnassignedMonsoon <- Unassigned[, .(total_sampled = sum(num_sampled)), by=c("monsoon", "dest")]

#make a parentage matrix for each year
mat2012 <- dcast(SimSampleByYear[year==2012], source ~ dest, value.var="total_parentage", fun.aggregate = sum)[source %in% SurveyData[year==2012 & prop_anem_samp >0, site]][, -"source"] 
mat2012 <- as.matrix(rbind(mat2012, t(UnassignedByYear[year==2012][, total_sampled]), use.names=F))

mat2013 <- dcast(SimSampleByYear[year==2013], source ~ dest, value.var="total_parentage", fun.aggregate = sum)[source %in% SurveyData[year==2013 & prop_anem_samp >0, site]][, -"source"] 
mat2013 <- as.matrix(rbind(mat2013, t(UnassignedByYear[year==2013][, total_sampled]), use.names=F))

mat2014 <- dcast(SimSampleByYear[year==2014], source ~ dest, value.var="total_parentage", fun.aggregate = sum)[source %in% SurveyData[year==2014 & prop_anem_samp >0, site]][, -"source"] 
mat2014 <- as.matrix(rbind(mat2014, t(UnassignedByYear[year==2014][, total_sampled]), use.names=F))

mat2012_4 <- dcast(SimSampleInterannual, source ~ dest, value.var="total_parentage", fun.aggregate = sum)[source %in% SurveyData[year %in% c(2012, 2013, 2014) & prop_anem_samp >0, site]][, -"source"] 
mat2012_4 <- as.matrix(rbind(mat2012_4, t(UnassignedInterannual[, total_sampled]), use.names=F))

matNEM <- dcast(SimSampleMonsoon[monsoon=="NEM"], source ~ dest, value.var="total_parentage", fun.aggregate = sum)[source %in% SurveyData[year %in% c(2012, 2013, 2014) & prop_anem_samp >0, site]][, -"source"] 
matNEM <- as.matrix(rbind(matNEM, t(UnassignedMonsoon[monsoon=="NEM"][, total_sampled]), use.names=F))

matSWM <- dcast(SimSampleMonsoon[monsoon=="SWM"], source ~ dest, value.var="total_parentage", fun.aggregate = sum)[source %in% SurveyData[year %in% c(2012, 2013, 2014) & prop_anem_samp >0, site]][, -"source"] 
matSWM <- as.matrix(rbind(matSWM, t(UnassignedMonsoon[monsoon=="SWM"][, total_sampled]), use.names=F))



In [49]:
matSWM

Cabatoan,Caridad Cemetery,Caridad Proper,Elementary School,Haina,Hicgop South,Magbangon,Palanas,Poroc Rose,Poroc San Flower,San Agustin,Sitio Baybayon,Sitio Lonas,Sitio Tugas,Tamakin Dacot,Visca,Wangag
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [18]:
#Biophysical source normalized matrix
#for each source, what is the normalized recruitment at each destination? 
GenSimConn[, annual_source_normalized_recruitment := sum(daily_particles_recruited)/sum(daily_particles_released), by=c("source", "destination","year_sampled")]


year,site,nF,n_offs_gen,PropAnemSamp,TotalAnems
2012,Cabatoan,8,4,0.42307692,26
2012,Caridad Cemetery,,0,0.00000000,4
2012,Caridad Proper,,0,0.00000000,4
2012,Elementary School,,0,0.00000000,7
2012,Gabas,,0,0.00000000,9
2012,Haina,,0,0.00000000,104
2012,Hicgop South,,0,0.00000000,18
2012,Magbangon,,10,0.45938375,139
2012,Palanas,49,13,0.28985507,138
2012,Poroc Rose,9,9,1.00000000,13


__Code function for likelihood of parentage data given a biophysical model, based on Bode et al. 2019 in Plos Bio__

In [None]:
#lay out all the pieces
pop_size_vec <- #vector of pop sizes for all reefs (a). This term is also used in parentage kernel fitting, but reef sizes are substituted as a proxy for pop size. This is should be bootstrapped to account for uncertainty.
BioPhysMat <- #source normalized biophysical connectivity matrix. In Eqn. S3.4, this is m ajt/r a (*should it be r at? As in all particles released in time period t?)
prop_samp_vec <- #vector of proportion of habitat sampled for all reefs in time period t
unassigned_vec#from genetic parentage data- a vector of the number of unassigned recruits at each destination reef in the system- we only have this for all sampled reefs.... what should the dimensions be?*


In [None]:
#format the data

In [None]:
reef

In [3]:
head(AnnualRecsSamp)

year,site,n_offs_gen
2012,Cabatoan,4
2012,Palanas,13
2012,Poroc Rose,9
2012,Poroc San Flower,9
2012,Sitio Lonas,1
2012,Visca,1


In [None]:
#Eqn. S3.4 term 1 for loop

for(j in 1:nrow(centroids)){ #for each destination reef in the whole Camotes system
    
dest_unassigned <- sum(unassigned_vec[t==time_period]) #the sum of all unassigned recruits at time period t
    
        for(i in 1:nrow(centroids)){ #term 1- for each source reef in the whole Camotes system, expected unassigned at each destination reef
            
            term1_num <- ((1-prop_samp_vec[i,t== time_period])^2)*pop_size_vec[i]*BioPhysMat[i,j,t]
            
            for(a in 1:nrow(centroids)){ #for all reefs total in the whole Camotes system, denominator of term 1
                
                value_a <-  pop_size_vec[a]*BioPhysMat[a, j, t]
                term1_denom <- sum(value_a)
            }
        
        term1 = term1_num/term1_denom
        }

    for(k in 1:nrow(centroids)){ #term 2- for each source reef in the whole Camotes system, expected assigned at each destination reef
        
        term2_num <- (1-(1-prop_samp_vec[i,t== time_period])^2)*pop_size_vec[i]*BioPhysMat[i,j,t]
            
            for(a in 1:nrow(centroids)){ #for all reefs total in the whole Camotes system, denominator of term 2
                
                value_a <-  pop_size_vec[a]*BioPhysMat[a, j, t]
                term2_denom <- sum(value_a)
            }
        
        term2 = term2_num/term2_denom
        
        }
        
    

prob_unsampled <- dest_unassigned*log(term1)  
prob_sampled <- dest_assigned*log(term2)
ll = prob_unsampled + prob_sampled
}

In [None]:

for(i in 1:NumSampledReefs){
   This_SS_A = Adult_sample_proportions[i]#same
   for(j in 1:NumSampledReefs){
    SettlersFromAssignedReefs = Settlers[Sampled_reefs[i],Sampled_reefs[j]]#same
    #Not all settlers from assigned reefs will be assigned, because not all adults were sampled
    AssignedSettlers[i,j] = SettlersFromAssignedReefs*(This_SS_A^2 + 2*This_SS_A*(1 - This_SS_A))
    AssignedSettlers[NumSampledReefs+1,j] = AssignedSettlers[NumSampledReefs+1,j] + SettlersFromAssignedReefs*(1-This_SS_A)^2 #The three dots '...' tell matlab that the code on a given line continues on the next line.
   }
}
Unsampled = as.matrix(setdiff(1:NumReefs,Sampled_reefs))


for(j in 1:NumSampledReefs){
   AssignedSettlers[NumSampledReefs+1,j] = AssignedSettlers[NumSampledReefs+1,j] + sum(Settlers[Unsampled,Sampled_reefs[,j]]) 
}
   



__Loop through sampling different proportions of other and CAI source particles and compare the unassigned proportions of total sample particles to the genetic observations from survey data__

In [48]:
PropAssignedTable <- rbind(kernels[Year %in% c("2012", "2013", "2014")][
    , PropAssigned := PercentAssigned/100][ #change to proportion note percent
    , c("Year", "NumParentageMatches", "NumOffsSampled", "PropAssigned")],                      
    unique(kernels[Year %in% c("2012", "2013", "2014")][ #only the years coinciding with the models
    , `:=` (NumParentageMatches=sum(NumParentageMatches), NumOffsSampled=sum(NumOffsSampled), PropAssigned = NumParentageMatches/NumOffsSampled, Year = "2012-4")][ #summarise across the 3 years
    , c("Year","NumParentageMatches", "NumOffsSampled", "PropAssigned")], by="Year"))
    

PropAssignedTable[]

#add in the average sampled proportion of anemones
AvgPropSamp <- SurveyData[PropAnemSamp >0, .(PropAnemSamp = mean(PropAnemSamp)), by="year"][ #average for each site we sampled, how well we sampled
    year %in% c("2012", "2013", "2014")][
    , year :=as.character(year)]

ExpectedPropAssigned <- AvgPropSamp[PropAssignedTable, on=.(year=Year)]
ExpectedPropAssigned$PropAnemSamp[is.na(ExpectedPropAssigned$PropAnemSamp)] <- mean(ExpectedPropAssigned$PropAnemSamp, na.rm = T) #replace the 2012-4 NA with the average from the 3 years

#what's the normalized self recruitment proportion back to the population
ExpectedPropAssigned[, ExpAssigned := NumParentageMatches/(NumOffsSampled*PropAnemSamp)][] #this is the expected assignment for the whole surveyed population if we had sampled all adults (which we kind of do when we use the simulation results)
#at some point... maybe it would be better to compare on a site to site level? idk that's pretty fine scale, I don't know that our ROMS model can be expected to compare so well with that

Year,NumParentageMatches,NumOffsSampled,PropAssigned
2012,3,63,0.0476
2013,21,150,0.14
2014,13,181,0.0718
2012-4,37,394,0.04761905


year,PropAnemSamp,NumParentageMatches,NumOffsSampled,PropAssigned,ExpAssigned
2012,0.6566669,3,63,0.0476,0.07251629
2013,0.5793625,21,150,0.14,0.24164492
2014,0.5714427,13,181,0.0718,0.12568749
2012-4,0.6024907,37,394,0.04761905,0.15586735


__For the expected values of recruits from outside of our sampled region, I'm using the intermediate "PredictedProportions" matrix from the Bode kernel fitting script (https://github.com/MikeBode/Parentage_kernel_fitting/blob/master/Kernel_Fitting_Function.m) because it accounts for how well sites were sampled when estimating proportions of recruits from unsampled sites__

In [55]:
#####outside of the loop*****

PropSampTable <- SurveyData[PropAnemSamp >0, c("year", "site")]

#make sure all sampled sites are represented by joining the survey data to the sampled simulation
PropSampTable <- rbind(SurveyData[PropAnemSamp >0 & year %in% c(2012, 2013, 2014), c("year", "site")][, .(Source=site, Dest=site, Year=year)][ #will join to the simulated sampling table by source and dest, so make those each a column from site and preserve the year variable as a key
    , c("Year", "Source", "Dest")][, Monsoon := "NEM"], SurveyData[PropAnemSamp >0 & year %in% c(2012, 2013, 2014), c("year", "site")][, .(Source=site, Dest=site, Year=year)][ #will join to the simulated sampling table by source and dest, so make those each a column from site and preserve the year variable as a key
    , c("Year", "Source", "Dest")][, Monsoon := "SWM"])
#unq_survey <- unique(PropSampTable, by=c("Source", "Dest", "Year", "Monsoon"))#, unique(PropSampTable, by=c("Source", "Dest", "Year"))[, Monsoon := "SWM"]) #add in the diff Monsoon seasons so there are complete parentage matrices later
#add_dest <- rbind(SurveyData[year %in% c(2012, 2013, 2014) & PropAnemSamp >0][, c("year", "site")][, Monsoon := "NEM"], SurveyData[year %in% c(2012, 2013, 2014) & PropAnemSamp >0][, c("year", "site")][, Monsoon := "SWM"])  #what destinations were sampled, for use with unassigned table

###outside of the loop
PropToEval <- seq(0.1, 1, 0.1) #make a vector of proportions to sample iterativaley and compare
#empty table to hold results
PropSampOtherCAI <- data.table(TimeScale=character(), TimeID=character(), PropUnassigned=numeric(), ExpUnassigned=numeric(),  PropSampEval=numeric(), Check1=character(), Check2=character(), NrowSimConn=numeric())



In [59]:
pb <- txtProgressBar(min = 0, max =length(PropToEval), style = 3)

StartTime <- Sys.time()

for(i in 1:length(PropToEval)){

PropSampOtherCAI_int <- data.table(TimeScale=character(), TimeID=character(), PropUnassigned=numeric(), ExpUnassigned=numeric(),  PropSampEval=numeric(), Check1=character(), Check2=character(), NrowSimConn=numeric())[1:4]

dest_sampled <- date_join[DestPropSamp >0]
check1 <- nrow(dest_sampled)
dest_sampled <- dest_sampled[c(dest_sampled[, .I[Source != "Other"]], sample(dest_sampled[, .I[Source == "Other"]], length(dest_sampled[, .I[Source == "Other"]])*PropToEval[i]))]
check2 <- nrow(dest_sampled)
dest_sampled <- dest_sampled[c(dest_sampled[, .I[Source != "CAI"]], sample(dest_sampled[, .I[Source == "CAI"]], length(dest_sampled[, .I[Source == "CAI"]])*PropToEval[i]))]
check3 <- nrow(dest_sampled)

#check that we have less rows, should both be TRUE
test1 <- check1 > check2
test2 <- check2 > check3
#check1 > check2
#check2 > check3


#join in the number of parentage matches observed by year
dest_sampled <- kernels[Year %in% c("2012", "2013", "2014")][, Year:=as.integer(Year)][,c("Year", "NumParentageMatches")][dest_sampled, on=.(Year=YearSampled)]#[

#randomly subsample the sampled particle data
sim_sample <- dest_sampled[, .SD[sample(.N, DestNOffsAnnual, prob=SurvWeight)], by = c("Year", "Dest")] #randomly sample rows (particles) from the table according to the survival weighting, based on the number we sampled at each site in each year of surveys

PropUnassignedByYear <- (sim_sample[Source == "CAI"| Source == "Other", .(.N), by="Year"][, N]/#total particales sampled from other/CAI sources
sim_sample[, .(.N), by="Year"][, N]) #total particles sampled

PropSampOtherCAI_int$TimeScale[1] <- "annual"
PropSampOtherCAI_int$TimeID[1] <- "2012"
PropSampOtherCAI_int$PropUnassigned[1] <- PropUnassignedByYear[1]
PropSampOtherCAI_int$ExpUnassigned[1] <- 1-(ExpectedPropAssigned[1, ExpAssigned])
PropSampOtherCAI_int$PropSampEval[1] <- PropToEval[i]
PropSampOtherCAI_int$Check1[1] <- test1
PropSampOtherCAI_int$Check2[1] <- test2
PropSampOtherCAI_int$NrowSimConn[1] <- nrow(dest_sampled)
    
PropSampOtherCAI_int$TimeScale[2] <- "annual"
PropSampOtherCAI_int$TimeID[2] <- "2013"
PropSampOtherCAI_int$PropUnassigned[2] <- PropUnassignedByYear[2]
PropSampOtherCAI_int$ExpUnassigned[2] <- 1-(ExpectedPropAssigned[2, ExpAssigned])
PropSampOtherCAI_int$PropSampEval[2] <- PropToEval[i]
PropSampOtherCAI_int$Check1[2] <- test1
PropSampOtherCAI_int$Check2[2] <- test2
PropSampOtherCAI_int$NrowSimConn[2] <- nrow(dest_sampled)
    
PropSampOtherCAI_int$TimeScale[3] <- "annual"
PropSampOtherCAI_int$TimeID[3] <- "2014"
PropSampOtherCAI_int$PropUnassigned[3] <- PropUnassignedByYear[3]
PropSampOtherCAI_int$ExpUnassigned[3] <- 1-(ExpectedPropAssigned[3, ExpAssigned])
PropSampOtherCAI_int$PropSampEval[3] <- PropToEval[i]
PropSampOtherCAI_int$Check1[3] <- test1
PropSampOtherCAI_int$Check2[3] <- test2
PropSampOtherCAI_int$NrowSimConn[3] <- nrow(dest_sampled)
    
PropSampOtherCAI_int$TimeScale[4] <- "interannual"
PropSampOtherCAI_int$TimeID[4] <- "2012_4"
PropSampOtherCAI_int$PropUnassigned[4] <- nrow(sim_sample[Source == "CAI"| Source == "Other"])/nrow(sim_sample)
PropSampOtherCAI_int$ExpUnassigned[4] <- 1-(ExpectedPropAssigned[4, ExpAssigned])
PropSampOtherCAI_int$PropSampEval[4] <- PropToEval[i]
PropSampOtherCAI_int$Check1[4] <- test1
PropSampOtherCAI_int$Check2[4] <- test2
PropSampOtherCAI_int$NrowSimConn[4] <- nrow(dest_sampled)

l <- list(PropSampOtherCAI, PropSampOtherCAI_int)
PropSampOtherCAI <- rbindlist(l, use.names = TRUE, fill=TRUE, idcol = FALSE)
setTxtProgressBar(pb, i)
    
}

close(pb)
EndTime <- Sys.time()
EndTime-StartTime

fwrite(PropSampOtherCAI, file="~/oceanography/script_output/SimulationSummaryTables/PropSampOtherCAIEvaluation.csv")




Time difference of 1.066185 mins

In [58]:
PropSampOtherCAI[order(-PropUnassigned)]

TimeScale,TimeID,PropUnassigned,ExpUnassigned,PropSampEval,Check1,Check2,NrowSimConn
annual,2013,0.6666667,0.7583551,0.9,True,True,579512
annual,2013,0.6666667,0.7583551,1.0,False,False,610419
annual,2012,0.6507937,0.9274837,0.8,True,True,548606
annual,2014,0.6464088,0.8743125,0.9,True,True,579512
interannual,2012_4,0.6395939,0.8441326,0.9,True,True,579512
annual,2014,0.6353591,0.8743125,0.8,True,True,548606
interannual,2012_4,0.6294416,0.8441326,1.0,False,False,610419
interannual,2012_4,0.6269036,0.8441326,0.8,True,True,548606
annual,2014,0.6187845,0.8743125,1.0,False,False,610419
annual,2013,0.6066667,0.7583551,0.8,True,True,548606


__Seems like good justification to not subsample the ROMS particles from other/Camotes Islands__

In [10]:
nrow(dest_sampled)
nrow(SimConn)

In [7]:
#save inter file
#fwrite(dest_sampled, file="~/oceanography/script_output/LongFormConnWithProbsTest.csv")
#see if I can write as a compressed file so it can be stored on github
#https://stackoverflow.com/questions/42788401/is-possible-to-use-fwrite-from-data-table-with-gzfile