In [328]:
# Packages attached by this notebook; every name in this vector is passed to library() below.
Packages <- c("dplyr","nleqslv", "broom","cubature", "geosphere", "data.table",  "ggplot2", "bbmle", "stringr",  "lubridate", "RColorBrewer")

# Attach each package; startup messages and the list returned by lapply() are suppressed.
invisible(suppressPackageStartupMessages(lapply(Packages, library, character.only = TRUE)))

# NOTE(review): machine-specific absolute working directory. The file paths in the visible cells are all "~"-anchored, so this call may not be needed — confirm.
setwd('/local/home/katrinac/oceanography')
# Negated %in%: returns TRUE for each element of `x` that has no match in `table`.
"%!in%" <- function(x, table) {
    !(x %in% table)
}
# Load the kernel-fitting helper scripts. The fitting cells below call LL_kt_bbmle, integrate_kernel_sum1,
# cdf_solve and cdf_solve90, which are presumably defined by these files — confirm.
source("~/parentage/kernel_fitting/1340_loci/functions/ll_kt_both_bbmle.R")
# NOTE(review): the trailing labels on the next two lines originally read the other way round
# (sum0.5 file labelled "sum1" and vice versa); they are matched to the file names here — confirm against the files.
source("~/parentage/kernel_fitting/1340_loci/functions/GenGausKernInt_sum0.5.R") #presumably integrate_kernel_sum0.5
source("~/parentage/kernel_fitting/1340_loci/functions/GenGausKernInt_sum1.R") #presumably integrate_kernel_sum1
source("~/parentage/kernel_fitting/1340_loci/functions/cdf_solve.R") #median
source("~/parentage/kernel_fitting/1340_loci/functions/cdf_solve90.R") #dist 90% retained

#read in the kernel fitting summary (the Year and NumParentageMatches columns are used further down)
kernels <- fread(file="~/parentage/kernel_fitting/1340_loci/final_results/tables/kernel_fitting_summary.csv")
#read in the site centroids adjusted for the simulation (Magbangon sites combined); the lon/lat columns feed the distance matrix
centroids <- fread(file="~/oceanography/script_output/SurveyData/SimulationCentroids.csv")
#read in the table with the number of recruits sampled at each site for each year
AnnualRecsSamp <- fread(file="~/oceanography/script_output/SurveyData/AnnualRecruitsSampled.csv")
#read in the table of the proportion of anemones sampled at each site for each year (joined to AnnualRecsSamp on end_year and site below)
PropSamp <- fread(file="~/oceanography/script_output/SurveyData/ProportionHabitatSampled.csv")

#read in the ROMS simulation connectivity table (one row per source/destination/date combination, with a DailyParticles count)
SimConn <- fread(file="~/oceanography/script_output/SimulationSummaryTables/SimConnectivityTable.csv")

#site-name vectors: the six sand flats, and every site that is in the model but was not sampled (the sand flats plus three others)
SandFlats <- paste0("SF", 1:6)
UnsampledSites <- c(SandFlats, "Pangasugan", "Other", "CAI")

#make the constant inputs for the kernel fitting function
#distance matrix using the centroids with combined Magbangon
### List of source locations
sites_source <- centroids

### List of destination locations
sites_dest <- centroids

#pairwise source (rows) by destination (columns) distances between centroids; sites_dest is now actually used for the
#second argument (it is identical to sites_source, so the matrix is unchanged)
dist_mat_m <- distm(sites_source[, c('lon', 'lat')], sites_dest[, c('lon', 'lat')], fun = distVincentyEllipsoid)
#rescale by 10^-3 (metres to kilometres, given the `_m` suffix and geosphere's default units)
Distances <- dist_mat_m*10^-3

#read in the reef areas for the kernel fitting: sort by site, drop the six "near_*_full*" rows, convert msq to kmsq
#NOTE(review): the row order of Reef_sizes (alphabetical by site) presumably has to match the centroid order used for Distances — confirm
Area <- read.csv("~/oceanography/empirical_data/site_area_header_nonsurveyed_simulation_kernels.csv", header = TRUE, stringsAsFactors = FALSE) %>%
    arrange(site) %>%
    filter(site %!in% c("near_north_full1", "near_north_full2", "near_north_full3", "near_south_full1", "near_south_full2", "near_south_full3")) %>%
    mutate(kmsq = msq*10^-6) %>%
    select(kmsq)
#one-column matrix of reef areas in square kilometres
Reef_sizes <- as.matrix(Area)

In [2]:
#make the connectivity data long form
#drop "Other" as a destination (not of interest) and sand flats as a source (they cannot be one); also drop the two
#*Sampled columns, which were only used for sorting in the process_ROMS script
SimConn <- SimConn[destination != "Other" & !(source %in% SandFlats), !c("SourceSampled", "DestSampled")]

#total particle count, used below to verify the expansion
check1 <- SimConn[, sum(DailyParticles)]
#setkey(SimConn, source)

#repeat each row DailyParticles times so that every row is one particle, then label the rows P1, P2, ...
SimConn <- SimConn[rep(seq_len(nrow(SimConn)), times = SimConn$DailyParticles)][
    , ParticleID := paste0("P", .I)]

check1 == nrow(SimConn) #should be TRUE, meaning the correct number of particles are retained

#the below code is subsampling particles, but we want to do that within the loop of kernel fitting
#SimConnDTDir3 <- SimConnDTDir2[c(SimConnDTDir2[, .I[source != "Other"]], sample(SimConnDTDir2[, .I[source == "Other"]], length(SimConnDTDir2[, .I[source == "Other"]])*.1))]
#SimConnDTDir3 <- SimConnDTDir2[c(SimConnDTDir3[, .I[source != "CAI"]], sample(SimConnDTDir3[, .I[source == "CAI"]], length(SimConnDTDir3[, .I[source == "CAI"]])*.1))]


In [1]:
#number of particles (rows) released from each source
SimConn[, .N, by = source]

ERROR: Error in eval(expr, envir, enclos): object 'SimConn' not found


In [8]:
#build the survey table for each site and year: recruits sampled, proportion of anemones sampled, total anemones at the site
SurveyData <- AnnualRecsSamp[
    PropSamp, on = .(year = end_year, site) #keep every PropSamp row and match recruit counts by year and site
][
    is.na(n_offs_gen), n_offs_gen := 0 #no matching recruit row means zero recruits were sampled
][
    , !"time_frame" #end_year (now `year`) is the key, so time_frame is not needed
]
#setkey(SurveyData, site)


In [5]:
#attach the survey data to every particle twice: once for its source site, once for its destination site
source_join <- SurveyData[SimConn, on = .(site = source, year = YearSampled)]
setnames(
    source_join,
    old = c("site", "year", "n_offs_gen", "PropAnemSamp", "TotalAnems"),
    new = c("Source", "YearSampled", "SourceNOffsAnnual", "SourcePropSamp", "SourceTotalAnems")
)

#same join keyed on the destination; the survey columns get the Dest* prefix
destination_join <- SurveyData[source_join, on = .(site = destination, year = YearSampled)]
setnames(
    destination_join,
    old = c("site", "year", "n_offs_gen", "PropAnemSamp", "TotalAnems"),
    new = c("Dest", "YearSampled", "DestNOffsAnnual", "DestPropSamp", "DestTotalAnems")
)



In [6]:
#lets see if we can do all the kernel fitting in one loop! be ambitious! this table created here will be what we loop through for kernel fitting
#SimDates: one row per (YearSampled, date) with a survival weight that rises linearly with the date's position within its year
#FIX: the position is now seq_len(.N). The previous `.I` is the row number in the WHOLE table even when `by` is used,
#so only the first year started near zero; later years started part-way up instead of "getting low again".
#NOTE(review): this changes SurvWeight for every year after the first — re-create any saved intermediate files.
SimDates <- destination_join[, .(date = unique(date)), by = .(YearSampled)][ #make a data.table of dates in that year
    , DateSeqLin := ((0.25/2)/5)*seq_len(.N), by = .(YearSampled)][ #within-year row number multiplied by the linear survival model slope
    , SurvWeight := DateSeqLin/max(DateSeqLin), by = .(YearSampled)][ #normalize as the survival probability, BUT if not eliminating new recruits (months 4/5) multiply the denominator by 1.25 so that nothing is an arbitrary prob=1
    , -"DateSeqLin"]

#check this looks right- 2014 has fewer dates than the other years because the simulation is truncated for that time, and should see the SurvWeight column get low again when the year changes
#SimDates[, .(.N), by = .(YearSampled)]
#SimDates[240:250,]


#now join with the simulation data table
#FIX: `SimMonth != c(4,5)` recycled c(4,5) along the rows (odd rows compared with 4, even rows with 5), so roughly
#half of the April/May particles were kept. `%in%` tests every row against both months.
date_join <- SimDates[destination_join, on = .(date, YearSampled)][ #joining with YearSampled too is redundant, can otherwise drop the column from SimDates before joining
    !(SimMonth %in% c(4, 5))] #filter out particles recruiting in April/May, they would be new recruits

nrow(date_join) == nrow(SimConn[!(SimMonth %in% c(4, 5))]) #should be TRUE

#finally, weight each source by its number of anemones relative to the total counted in the 2014 survey rows
total_anems <- sum(SurveyData[year == "2014", "TotalAnems"]) #total anems recorded for the simulation time period
date_join <- date_join[, SourceSize := SourceTotalAnems / total_anems] #element-wise, so no grouping is needed

#first thing is sampling the simulation data like we sampled for the survey data

#outside of the loop, keep only the destinations we sampled, then thin the "Other" and "CAI" (other/camotes) source particles to 10% each so the loop is faster. After finishing this code, we're going to figure out the most representative proportion to sample*

dest_sampled <- date_join[DestPropSamp > 0]
check1 <- nrow(dest_sampled)

#keep every non-"Other" row plus a random 10% of the "Other" rows
other_idx <- dest_sampled[, .I[Source == "Other"]]
dest_sampled <- dest_sampled[c(dest_sampled[, .I[Source != "Other"]], sample(other_idx, length(other_idx) * .1))]
check2 <- nrow(dest_sampled)

#same thinning for the "CAI" rows
cai_idx <- dest_sampled[, .I[Source == "CAI"]]
dest_sampled <- dest_sampled[c(dest_sampled[, .I[Source != "CAI"]], sample(cai_idx, length(cai_idx) * .1))]
check3 <- nrow(dest_sampled)

#each thinning step should have removed rows, so both should be TRUE
check1 > check2
check2 > check3

#join in the number of parentage matches observed by year (2012-2014 only); Year is cast to integer to match YearSampled
dest_sampled <- kernels[
    Year %in% c("2012", "2013", "2014"),
    .(Year = as.integer(Year), NumParentageMatches)
][dest_sampled, on = .(Year = YearSampled)]
    #, ParticleID := paste ("P", .I, sep = "", collapse = NULL)]#finally, make a new column that's the particle ID for each row

#setkey(dest_sampled, "Year", "Dest", "Source")




In [10]:
#compare row counts: the filtered/thinned particle table versus the full long-form particle table
nrow(dest_sampled)
nrow(SimConn)

In [7]:
#save inter file
#fwrite(dest_sampled, file="~/oceanography/script_output/LongFormConnWithProbsTest.csv")
#see if I can write as a compressed file so it can be stored on github
#https://stackoverflow.com/questions/42788401/is-possible-to-use-fwrite-from-data-table-with-gzfile

In [4]:
#read in the inter file instead of re-running above code
#NOTE: this replaces any in-memory dest_sampled with the saved copy; the fwrite() call that would refresh this file is commented out in the cell above
dest_sampled <- fread(file="~/oceanography/script_output/LongFormConnWithProbsTest.csv")
head(dest_sampled)

Year,NumParentageMatches,date,SurvWeight,Dest,DestNOffsAnnual,DestPropSamp,DestTotalAnems,Source,SourceNOffsAnnual,SourcePropSamp,SourceTotalAnems,SimMonth,SimDay,SimYear,SimMonsoon,DailyParticles,ParticleID,SourceSize
2012,3,2011-10-01,0.5020576,Sitio Lonas,1,0.6666667,3,Gabas,0.0,0.0,9.0,10,10,2011,SWM,2,P54899246,0.002719033
2012,3,2011-10-01,0.5020576,Sitio Lonas,1,0.6666667,3,Gabas,0.0,0.0,9.0,10,10,2011,SWM,2,P54899247,0.002719033
2012,3,2011-10-01,0.5020576,Sitio Lonas,1,0.6666667,3,Pangasugan,,,,10,10,2011,SWM,1,P54899263,
2012,3,2011-10-01,0.5020576,Sitio Lonas,1,0.6666667,3,Poroc San Flower,9.0,1.0,11.0,10,10,2011,SWM,5,P54899265,0.003323263
2012,3,2011-10-01,0.5020576,Sitio Lonas,1,0.6666667,3,Poroc San Flower,9.0,1.0,11.0,10,10,2011,SWM,5,P54899266,0.003323263
2012,3,2011-10-01,0.5020576,Sitio Lonas,1,0.6666667,3,Poroc San Flower,9.0,1.0,11.0,10,10,2011,SWM,5,P54899267,0.003323263


In [183]:
#####outside of the loop*****
#each year will require a different set of survey data, so build the lookup tables once here
#(the earlier, unfiltered PropSampTable assignment was removed: it was overwritten below before ever being used)

#make a site index table, use this for Sampled_reefs input in kernel fitting
#NOTE(review): the index is the order in which sites first appear in SurveyData; presumably this must match the row order of Distances and Reef_sizes — confirm
site_index <- unique(SurveyData, by="site")[, "site"][, index := .I] #add the row number as the unique site index

#make sure all sampled sites are represented by joining the survey data to the sampled simulation
#one row per sampled site-year for 2012-2014, with the site repeated as both Source and Dest so it can be joined to the simulated sampling table
PropSampTable <- SurveyData[PropAnemSamp > 0 & year %in% c(2012, 2013, 2014), .(Year = year, Source = site, Dest = site)]
unq_survey <- unique(PropSampTable, by=c("Source", "Dest", "Year"))
add_dest <- SurveyData[year %in% c(2012, 2013, 2014) & PropAnemSamp > 0][, c("year", "site")] #what destinations were sampled, for use with unassigned table



In [444]:
#Resample the simulated particles the way the field survey sampled recruits, rebuild the parentage matrices, and fit a
#dispersal kernel for 2012, 2013, 2014 and the three years pooled. Results go in SimKernels (one row per fit).
#Fixes relative to the previous version:
# - rows are written to 4*(i-1)+1 ... 4*(i-1)+4; before, iteration i wrote rows i..i+3 and was overwritten by iteration i+1
# - the progress bar is advanced with the loop variable i (it used an undefined/stale `n`)
# - write.csv() gets a file name; before, the path ended in "/" (a directory)
# - survey look-ups that do not change between iterations are computed once, before the loop

#number of resampling iterations, and number of kernels fit in each
n_iter <- 1000
fits_per_iter <- 4

#make an empty data.table to hold all of the simulation results
row_num <- fits_per_iter*n_iter
SimKernels <- data.table(year=character(), k=numeric(), theta=numeric(), mdd=numeric(), med=numeric(), dist90=numeric(), iteration=numeric())[1:row_num]

#loop-invariant inputs: sites sampled in each year, their index in site_index, and the proportion of anemones sampled
sites2012 <- SurveyData[year==2012 & PropAnemSamp >0, site]
sites2013 <- SurveyData[year==2013 & PropAnemSamp >0, site]
sites2014 <- SurveyData[year==2014 & PropAnemSamp >0, site]
sites2012_4 <- SurveyData[year %in% c(2012, 2013, 2014) & PropAnemSamp >0, site]

reefs2012 <- t(site_index[site %in% sites2012, index])
reefs2013 <- t(site_index[site %in% sites2013, index])
reefs2014 <- t(site_index[site %in% sites2014, index])

props2012 <- SurveyData[year==2012 & PropAnemSamp >0, PropAnemSamp]
props2013 <- SurveyData[year==2013 & PropAnemSamp >0, PropAnemSamp]
props2014 <- SurveyData[year==2014 & PropAnemSamp >0, PropAnemSamp]

pb <- txtProgressBar(min = 0, max = n_iter, style = 3) #one tick per iteration

StartTime <- Sys.time()


for(i in seq_len(n_iter)){
    #sample the particle data
    #randomly sample rows (particles) according to the survival weighting, based on the number we sampled at each site in each year of surveys
    #DestNOffsAnnual is constant within a Year/Dest group, so its first value is the sample size
    sim_sample <- dest_sampled[, .SD[sample(.N, DestNOffsAnnual[1], prob=SurvWeight)], by = c("Year", "Dest")]
    #check1 <- nrow(sim_sample)

    #assign parentage
    #randomly assign parentage or not, based on how well we sampled the source and the number of parentage matches we had in that year
    sim_parentage <- sim_sample[SourcePropSamp > 0][, .SD[sample(.N, NumParentageMatches[1], prob=SourcePropSamp)], by = .(Year)][
                    , Parentage := 1]

    #for faster searching, set keys
    setkey(sim_parentage, ParticleID)
    setkey(sim_sample, ParticleID)

    #add back in to the unassigned particles, select only the columns necessary
    sim_sample <- rbind(sim_sample[ParticleID %!in% sim_parentage$ParticleID][, Parentage := 0], sim_parentage)[, c("Year", "Source", "Dest", "Parentage")]

    #check results, for testing loop only
    #nrow(sim_sample)==check1 #should be TRUE
    #sum(sim_sample$Parentage) #should be 37

    #calculate the unassigned row
    unassigned <- unique(sim_sample[Parentage==0][#not counting parentage!
        , num_sampled := .(.N), by= c("Dest", "Year")], by=c("Dest", "Year"))[, -"Source"]
    #add destinations not sampled in loop iteration to unassigned
    unassigned <- unassigned[add_dest, on=.(Year=year, Dest=site)]
    unassigned$num_sampled[is.na(unassigned$num_sampled)] <- 0
    #sum(unassigned$num_sampled, na.rm=T)==check1-37 #total should be the total sampled particles minus the total assigned
    setorder(unassigned, Year, Dest)

    ##adding in the possible sampled routes needs to happen AFTER calculating unassigned because unassigned is calculated from row counts
    sim_sample <- PropSampTable[sim_sample, on=.(Year, Source, Dest)]
    #check all is well- for testing loop only
    #sum(sim_sample$Parentage) #should be 37

    #add in the routes we could have assigned given our sampling so the parentage matrix is complete
    unq_sim_sample <- unique(sim_sample, by=c("Source", "Dest", "Year"))

    add_routes <- unq_survey[!unq_sim_sample, on = names(unq_survey)][ #what combos are not appearing because we didn't sample particles, but the route is possible based on our survey sampling
        , `:=`(Parentage= 0, num_sampled = 0) ] #add the parentage column

    #add back into the sampled simulation data
    sim_sample <- rbind(sim_sample, add_routes[,-"num_sampled"])
    setorder(sim_sample, Year, Source, Dest)
    #check all is well-
    #sum(sim_sample$Parentage, na.rm=T) #should be 37

    #make a parentage matrix for each year: sampled sources (rows) by destinations (columns), with the unassigned counts appended as the last row
    mat2012 <- dcast(sim_sample[Year==2012], Source ~ Dest, value.var="Parentage", fun.aggregate = sum)[Source %in% sites2012, -"Source"]
    mat2012 <- as.matrix(rbind(mat2012, t(unassigned[Year==2012, num_sampled]), use.names=F))

    mat2013 <- dcast(sim_sample[Year==2013], Source ~ Dest, value.var="Parentage", fun.aggregate = sum)[Source %in% sites2013, -"Source"]
    mat2013 <- as.matrix(rbind(mat2013, t(unassigned[Year==2013, num_sampled]), use.names=F))

    mat2014 <- dcast(sim_sample[Year==2014], Source ~ Dest, value.var="Parentage", fun.aggregate = sum)[Source %in% sites2014, -"Source"]
    mat2014 <- as.matrix(rbind(mat2014, t(unassigned[Year==2014, num_sampled]), use.names=F))

    mat2012_4 <- dcast(sim_sample, Source ~ Dest, value.var="Parentage", fun.aggregate = sum)[Source %in% sites2012_4, -"Source"]
    mat2012_4 <- as.matrix(rbind(mat2012_4, t(unique(unassigned[, total_sampled := sum(num_sampled), by="Dest"], by="Dest")[, total_sampled]), use.names=F))


    #fit the kernels; inputs go into a list because that's the bbmle format
    x <- list(Distances=Distances, Assignments=mat2012, Sampled_reefs=reefs2012,
              Reef_sizes=Reef_sizes, Adult_sample_proportions=props2012)
    Sim2012Fit <- suppressWarnings(mle2(LL_kt_bbmle, start=list(k=-3, theta=1), lower=c(-10, 0.15), upper=c(10, 8), method="L-BFGS-B", data=x, control=list(maxit=500)))

    x <- list(Distances=Distances, Assignments=mat2013, Sampled_reefs=reefs2013,
              Reef_sizes=Reef_sizes, Adult_sample_proportions=props2013)
    Sim2013Fit <- suppressWarnings(mle2(LL_kt_bbmle, start=list(k=-3, theta=1), lower=c(-10, 0.15), upper=c(10, 8), method="L-BFGS-B", data=x, control=list(maxit=500)))

    x <- list(Distances=Distances, Assignments=mat2014, Sampled_reefs=reefs2014,
              Reef_sizes=Reef_sizes, Adult_sample_proportions=props2014)
    Sim2014Fit <- suppressWarnings(mle2(LL_kt_bbmle, start=list(k=-3, theta=1), lower=c(-10, 0.15), upper=c(10, 8), method="L-BFGS-B", data=x, control=list(maxit=500)))

    #NOTE(review): the pooled fit uses the 2014 sampled sites and proportions, while mat2012_4 keeps sources sampled in any of 2012-2014 — confirm the dimensions agree
    x <- list(Distances=Distances, Assignments=mat2012_4, Sampled_reefs=reefs2014,
              Reef_sizes=Reef_sizes, Adult_sample_proportions=matrix(props2014))
    Sim2012_4Fit <- suppressWarnings(mle2(LL_kt_bbmle, start=list(k=-3, theta=1), lower=c(-10, 0.15), upper=c(10, 8), method="L-BFGS-B", data=x, control=list(maxit=500)))

    #store data from the fits; the list names are the labels written to the `year` column
    fits <- list("2012"=Sim2012Fit, "2013"=Sim2013Fit, "2014"=Sim2014Fit, "2012-4"=Sim2012_4Fit)
    first_row <- fits_per_iter*(i-1) #rows first_row+1 ... first_row+4 belong to this iteration

    for(f in seq_along(fits)){
        best_k <- as.numeric(coef(fits[[f]])[1])
        best_theta <- as.numeric(coef(fits[[f]])[2])
        fit_mdd <- as.numeric(cubintegrate(integrate_kernel_sum1, lower = 0, upper = Inf, k=best_k, theta=best_theta, method = "pcubature")$integral)
        #cdf_solve/cdf_solve90 are called without the fitted parameters, so they presumably read k_eval and theta_eval
        #from the global environment; these two assignments must therefore stay at top level (not inside a function) — confirm
        k_eval <- best_k
        theta_eval <- best_theta
        fit_med <- round(nleqslv(x = 7, fn = cdf_solve)$x, 2)
        fit_dist90 <- round(nleqslv(x = 7, fn = cdf_solve90)$x, 2)

        set(SimKernels, i = as.integer(first_row + f),
            j = c("year", "k", "theta", "mdd", "med", "dist90", "iteration"),
            value = list(names(fits)[f], best_k, best_theta, fit_mdd, fit_med, fit_dist90, as.numeric(i)))
    }

    setTxtProgressBar(pb, i)

}
close(pb)
EndTime <- Sys.time()
EndTime-StartTime


#NOTE(review): the original path ended at the directory; this file name follows the pattern of the seasonal outputs — confirm the name you want
write.csv(SimKernels, file="~/oceanography/script_output/KernelFits/Adjust_Survival_NoRecentRecruits/1000SimulatedKernelsAnnualSurvNoRecentRec.csv", row.names=FALSE)



__Next step- seasonal!!__

In [None]:
#split the simulation table by monsoon season: south-west (SWM) and north-east (NEM)
SimulationSWM <- filter(SimulationAllSurv, SimMonsoon == "SWM")
SimulationNEM <- filter(SimulationAllSurv, SimMonsoon == "NEM")

In [None]:
#SWM of the corresponding simulation years combined (2011-SWM)
#For each of 1000 iterations: sample SWM particles per destination, randomly flag parentage matches, rebuild the
#parentage matrix, fit a kernel, and append one result row to SimulatedKernelsSWM.
#NOTE(review): this cell uses objects that are not created in the cells shown above (AllYearsRecruits, SimulationAll,
#Sampled_reefsAll, Adult_sample_proportionsAll) and calls spread(), a tidyr function, although tidyr is not in the
#Packages vector — confirm these are available before running.

col <- c("season", "k", "theta", "mdd", "med", "dist90", "iteration")
SimulatedKernelsSWM <- as.data.frame(matrix(nrow=0, ncol=7), stringsAsFactors = FALSE)
colnames(SimulatedKernelsSWM) <- col

#number of parentage matches to assign in every iteration; it does not change inside the loop, so compute it once
NumPar <- as.numeric(kernels %>% #should be 37 for 2012-2014
    filter(Year %in% c("2012", "2013", "2014")) %>%
    select(NumParentageMatches) %>%
    summarise(NumParentageMatches=sum(NumParentageMatches)))

#don't print warnings for this loop, they are only for setting row names in a tibble which is deprecated. But it works.
options(warn=-1)

pb <- txtProgressBar(min = 0, max = 1000, style = 3)

StartTime <- Sys.time()


for(n in 1:1000){

    #SAMPLE THE SIMULATION DATA, then fit a kernel

    SimulatedSamplingSWM <- SimulationSWM[FALSE,] #zero-row template with the same columns

    #seq_len() so that an empty AllYearsRecruits skips the loop instead of iterating over c(1, 0)
    for(i in seq_len(nrow(AllYearsRecruits))){

        destination_eval <- as.character(AllYearsRecruits$site[i]) #pick out a destination site

        SimSampDestination <- SimulationSWM %>%
                                    filter(destination==destination_eval) %>%
                                    sample_n(as.numeric(AllYearsRecruits$n_offs_gen_all_years[i]), weight=SurvivalWeightLin, replace=FALSE) %>% #sample particles that landed at that destination, number corresponding to the actual number sampled at that site
                                    select(one_of(names(SimulationAll)))

        SimulatedSamplingSWM <- bind_rows(SimulatedSamplingSWM, SimSampDestination) #build into a sampled particle df
    }

    #assign an ID for each row (which is a sampled particle)
    SimulatedSamplingSWM <- SimulatedSamplingSWM %>%
        mutate(ParticleSampID=paste0("P", row_number()))

    #nrow(SimulatedSamplingSWM)==sum(NGenOffsSWM$n_offs_gen) #should be TRUE

    #now randomly assign parentage match status to some of these rows
    SimulatedSamplingSWMPar <- SimulatedSamplingSWM %>%
        ungroup() %>%
        filter(source %in% AllYearsRecruits$site) %>% #we can only assign parentage if we sampled the source site, we've already ensured that we sampled the destination site in the for loop by indexing destination sites to evaluate
        sample_n(NumPar, replace=FALSE) %>% #sample number of rows consistent with numbers of parentage matches
        mutate(Parentage=1) #%>% #assign these rows a positive match value
        #arrange(date, source, destination) %>%#arrange by the same order as the original df, then bind columns together to avoid unexpected repeats from a join
        #select(date, source, destination, Parentage) #drop columns that will result in repeats

    #find the rows that haven't been assigned parentage
    SimulatedSamplingSWM_2 <- anti_join((SimulatedSamplingSWM %>% ungroup()), SimulatedSamplingSWMPar, by=c("ParticleSampID"))

    SimulatedSamplingSWM_2$Parentage <- 0 #add in the column for parentage

    #combine parentage and non-parentage dfs
    SimulatedSamplingSWMPar3 <- bind_rows(SimulatedSamplingSWMPar, SimulatedSamplingSWM_2) %>%
        mutate(YearSampled=as.character(YearSampled))


    #NOW FORMAT INTO PARENTAGE MATRIX

    #NOTE(review): `PropSamp > 0` and `-PropSamp` refer to a column named PropSamp, but the survey join earlier in this
    #notebook uses a column called PropAnemSamp from this table — confirm the column name
    PropSampSWM <- PropSamp %>%  #add in SWM sampled sites as possible parent sites
        filter(end_year =="2014" & PropSamp > 0) %>%
        #rename(source="site") %>%
        select(-end_year, -PropSamp)

    #PropSampSWM$destination <- PropSampSWM$source #make another column for destination

    #make a table with the total number of particles sampled at each site
    TotalSimSamp <- SimulatedSamplingSWMPar3 %>%
        select(source, destination) %>%
        group_by(destination) %>%
        summarise(NumSimSampRec=n()) %>%
        ungroup()

    #add in SWM surveyed sites
    TotalSimSamp2 <- left_join(PropSampSWM, TotalSimSamp, by=c(site="destination")) %>% #join sites to destination, which will add a row for the surveyed sites with no samples
        mutate(NumSimSampRec=ifelse(is.na(NumSimSampRec), 0, NumSimSampRec)) %>% #change the NAs that result to 0s, because we didn't get samples there
        rename(destination="site") #rename sites to destination after joining

    #make a table with the total number of parentage matches found in the simulated loop
    TotalSimPar <- SimulatedSamplingSWMPar3 %>%
        select(source, destination, Parentage) %>%
        group_by(source,destination) %>%
        mutate(NumPar=sum(Parentage)) %>% #sum the parentage observations along each ROUTE (this NumPar is a column, not the scalar above)
        select(-Parentage) %>%
        ungroup() %>%
        distinct(source, destination, .keep_all = TRUE)

    #combine for a summary table BY DESTINATION, this below will be for UNASSIGNED ROWS
    SimSampSummarySWM <- left_join(TotalSimSamp2, TotalSimPar, by="destination") %>%
        distinct(source, destination, .keep_all = TRUE) %>%
        group_by(destination) %>%
        mutate(NumPar=ifelse(is.na(NumPar), 0, NumPar)) %>% #change the NAs that result to 0s, because we didn't get samples there
        mutate(NumPar=sum(NumPar)) %>% #sum the parentage observations at each DESTINATION
        distinct(destination, .keep_all = TRUE) %>%
        select(-source) %>%
        mutate(NumUnassigned=NumSimSampRec-NumPar)

    #finally, join all of the sites surveyed for a complete parentage matrix
    #for this, need to have a source/destination column for the surveyed sites
    PropSampSWM <- PropSampSWM %>%
        rename(source="site")

    PropSampSWM$destination <- PropSampSWM$source #make another column for destination

    #this below will become parentage matrix!
    TotalSimPar2 <- full_join(TotalSimPar,PropSampSWM, by=c("source", "destination")) %>%
        mutate(NumPar=ifelse(is.na(NumPar), 0, NumPar))

    #spread into matrix format
    SimDispMat <- TotalSimPar2 %>%
        filter(source %in% AllYearsRecruits$site) %>%
        arrange(source, destination)  %>%
        spread(destination, NumPar) %>%
        select(-source)
    SimDispMat[is.na(SimDispMat)] <- 0

    #add in unassigned row
    unassigned <- t(as.matrix(SimSampSummarySWM %>% arrange(destination) %>% ungroup() %>% select(NumUnassigned)))
    rownames(unassigned)<-NULL
    colnames(unassigned)<-names(SimDispMat)

    SimDispMatFullSWM <- rbind(SimDispMat, unassigned)
    colnames(SimDispMatFullSWM) <- NULL

    #The full remade parentage matrix
    AssignmentsSWM <- SimDispMatFullSWM

    #NOW FIT THE KERNEL

    x <- list(Distances=Distances, Assignments=AssignmentsSWM, Sampled_reefs=Sampled_reefsAll, Reef_sizes=Reef_sizes, Adult_sample_proportions=Adult_sample_proportionsAll) #put inputs into a list because that's the bbmle format

    SimSWMFit <- suppressWarnings(mle2(LL_kt_bbmle, start=list(k=-3, theta=1), lower=c(-10, 0.15), upper=c(10, 8), method="L-BFGS-B", data=x, control=list(maxit=500)))

    BestKSWM <- as.numeric(coef(SimSWMFit)[1])
    BestThetaSWM <- as.numeric(coef(SimSWMFit)[2])
    MDDSWM <- as.numeric(cubintegrate(integrate_kernel_sum1, lower = 0, upper = Inf, k=BestKSWM, theta=BestThetaSWM, method = "pcubature")$integral)
    #cdf_solve/cdf_solve90 are called without the fitted parameters, so they presumably read these two globals — confirm
    k_eval <- BestKSWM
    theta_eval <- BestThetaSWM
    MedSWM  <- round(nleqslv(x = 7, fn = cdf_solve)$x, 2)
    Dist90_SWM <- round(nleqslv(x = 7, fn = cdf_solve90)$x, 2)

    #store the info in this one-row df
    SimulatedKernelsSWM_beta <- as.data.frame(matrix(nrow=1, ncol=7), stringsAsFactors = FALSE)
    colnames(SimulatedKernelsSWM_beta) <- col

    SimulatedKernelsSWM_beta$season <- NA
    SimulatedKernelsSWM_beta$k <- BestKSWM
    SimulatedKernelsSWM_beta$theta <- BestThetaSWM
    SimulatedKernelsSWM_beta$med <- MedSWM
    SimulatedKernelsSWM_beta$mdd <- MDDSWM
    SimulatedKernelsSWM_beta$dist90 <- Dist90_SWM
    SimulatedKernelsSWM_beta$iteration <- n

    #join results into larger df
    SimulatedKernelsSWM <- bind_rows(SimulatedKernelsSWM, SimulatedKernelsSWM_beta) %>%
        mutate(season="SWM")

    setTxtProgressBar(pb, n)

}
close(pb)
EndTime <- Sys.time()
EndTime-StartTime
options(warn=0) #turn warnings back on


write.csv(SimulatedKernelsSWM, file="~/oceanography/script_output/KernelFits/Adjust_Survival_NoRecentRecruits/1000SimulatedKernelsSWMSurvNoRecentRec.csv", row.names=FALSE)

In [None]:
# NEM season: repeatedly resample the simulated particles in SimulationNEM and
# fit a dispersal kernel to every resampled parentage matrix.
#
# Each iteration
#   1. draws, for every row of AllYearsRecruits, n_offs_gen_all_years particles
#      that arrived at that site, weighted by SurvivalWeightLin;
#   2. marks NumPar of the drawn particles whose source is a site listed in
#      AllYearsRecruits as parentage matches (Parentage = 1, all others 0);
#   3. reshapes the draw into an assignment matrix (one row per source, one
#      column per destination, plus a final row of unassigned counts);
#   4. fits k and theta with mle2() and derives mdd (integral of
#      integrate_kernel_sum1 over [0, Inf)), med (cdf_solve) and dist90
#      (cdf_solve90).
# The per-iteration results end up in SimulatedKernelsNEM and are written to CSV.

col <- c("season", "k", "theta", "mdd", "med", "dist90", "iteration")
NumIterNEM <- 1000

# One result row per iteration is stored here and bound once after the loop,
# rather than growing a data frame with bind_rows() on every pass.
# NOTE: SimulatedKernelsNEM therefore only exists once the loop has finished;
# after an interrupted run the partial results are in KernelRowsNEM.
KernelRowsNEM <- vector("list", NumIterNEM)

# don't print warnings for this loop, they are only for setting row names in a tibble, which is deprecated. But it works.
options(warn = -1)

# ---- inputs that do not change between iterations ----

# total number of parentage matches to hand out in every iteration
NumPar <- as.numeric(kernels %>% # should be 37 for 2012-2014
    filter(Year %in% c("2012", "2013", "2014")) %>%
    select(NumParentageMatches) %>%
    summarise(NumParentageMatches = sum(NumParentageMatches)))

# sites with a non-zero sampled proportion for end_year 2014 (still keyed by `site`)
PropSampNEMSites <- PropSamp %>%
    filter(end_year == "2014" & PropSamp > 0) %>%
    select(-end_year, -PropSamp)

# the same sites as source == destination pairs, so that every one of them
# shows up in the parentage matrix even when no match lands on it
PropSampNEM <- PropSampNEMSites %>%
    rename(source = "site") %>%
    mutate(destination = source)

pb <- txtProgressBar(min = 0, max = NumIterNEM, style = 3)

StartTime <- Sys.time()


for (n in seq_len(NumIterNEM)) {

    # SAMPLE THE SIMULATION DATA, then fit a kernel

    # seq_len() (unlike 1:nrow()) runs zero times when AllYearsRecruits is empty
    SampledByDestination <- vector("list", nrow(AllYearsRecruits))

    for (i in seq_len(nrow(AllYearsRecruits))) {

        destination_eval <- as.character(AllYearsRecruits$site[i]) # pick out a destination site

        SimSampDestination <- SimulationNEM %>%
            filter(destination == destination_eval) %>%
            sample_n(as.numeric(AllYearsRecruits$n_offs_gen_all_years[i]), weight = SurvivalWeightLin, replace = FALSE) %>% # sample particles that landed at that destination, number corresponding to the actual number sampled at that site
            select(one_of(names(SimulationAll)))

        SampledByDestination[[i]] <- SimSampDestination
    }

    # build the sampled particle df in one go; the leading empty slice of
    # SimulationNEM keeps its full column set, as the row-by-row build did
    SimulatedSamplingNEM <- bind_rows(c(list(SimulationNEM[FALSE, ]), SampledByDestination))

    # assign an ID to each row (which is a sampled particle)
    SimulatedSamplingNEM <- SimulatedSamplingNEM %>%
        mutate(ParticleSampID = paste0("P", row_number()))

    #nrow(SimulatedSamplingNEM)==sum(NGenOffsNEM$n_offs_gen) #should be TRUE

    # now randomly assign parentage match status to some of these rows
    SimulatedSamplingNEMPar <- SimulatedSamplingNEM %>%
        ungroup() %>%
        filter(source %in% AllYearsRecruits$site) %>% # we can only assign parentage if we sampled the source site; the destination site is sampled by construction of the draw above
        sample_n(NumPar, replace = FALSE) %>% # sample number of rows consistent with numbers of parentage matches
        mutate(Parentage = 1) # assign these rows a positive match value

    # find the rows that haven't been assigned parentage
    SimulatedSamplingNEM_2 <- anti_join((SimulatedSamplingNEM %>% ungroup()), SimulatedSamplingNEMPar, by = c("ParticleSampID"))

    SimulatedSamplingNEM_2$Parentage <- 0 # add in the column for parentage

    # combine parentage and non-parentage dfs
    SimulatedSamplingNEMPar3 <- bind_rows(SimulatedSamplingNEMPar, SimulatedSamplingNEM_2) %>%
        mutate(YearSampled = as.character(YearSampled))


    # NOW FORMAT INTO PARENTAGE MATRIX

    # total number of particles sampled at each destination
    TotalSimSamp <- SimulatedSamplingNEMPar3 %>%
        select(source, destination) %>%
        group_by(destination) %>%
        summarise(NumSimSampRec = n()) %>%
        ungroup()

    # add in all surveyed sites
    TotalSimSamp2 <- left_join(PropSampNEMSites, TotalSimSamp, by = c(site = "destination")) %>% # join sites to destination, which will add a row for the surveyed sites with no samples
        mutate(NumSimSampRec = ifelse(is.na(NumSimSampRec), 0, NumSimSampRec)) %>% # change the NAs that result to 0s, because we didn't get samples there
        rename(destination = "site") # rename sites to destination after joining

    # total number of parentage matches found in this iteration
    TotalSimPar <- SimulatedSamplingNEMPar3 %>%
        select(source, destination, Parentage) %>%
        group_by(source, destination) %>%
        mutate(NumPar = sum(Parentage)) %>% # sum all the parentage observations along each ROUTE
        select(-Parentage) %>%
        ungroup() %>%
        distinct(source, destination, .keep_all = TRUE)

    # summary table BY DESTINATION, this becomes the UNASSIGNED row
    SimSampSummaryNEM <- left_join(TotalSimSamp2, TotalSimPar, by = "destination") %>%
        distinct(source, destination, .keep_all = TRUE) %>%
        group_by(destination) %>%
        mutate(NumPar = ifelse(is.na(NumPar), 0, NumPar)) %>% # change the NAs that result to 0s, because we didn't get samples there
        mutate(NumPar = sum(NumPar)) %>% # sum all the parentage observations at each DESTINATION
        distinct(destination, .keep_all = TRUE) %>%
        select(-source) %>%
        mutate(NumUnassigned = NumSimSampRec - NumPar)

    # finally, join all of the sites surveyed for a complete parentage matrix
    # this below will become the parentage matrix!
    TotalSimPar2 <- full_join(TotalSimPar, PropSampNEM, by = c("source", "destination")) %>%
        mutate(NumPar = ifelse(is.na(NumPar), 0, NumPar))

    # spread into matrix format
    # tidyr is not in the Packages vector loaded at the top of the file, so the
    # call is namespace-qualified; spread() is kept (rather than pivot_wider())
    # because it orders the destination columns by sorted value, which the
    # unassigned row below relies on
    SimDispMat <- TotalSimPar2 %>%
        filter(source %in% AllYearsRecruits$site) %>%
        arrange(source, destination) %>%
        tidyr::spread(destination, NumPar) %>%
        select(-source)
    SimDispMat[is.na(SimDispMat)] <- 0

    # add in unassigned row (sorted by destination to line up with the columns)
    unassigned <- t(as.matrix(SimSampSummaryNEM %>% arrange(destination) %>% ungroup() %>% select(NumUnassigned)))
    rownames(unassigned) <- NULL
    colnames(unassigned) <- names(SimDispMat)

    SimDispMatFullNEM <- rbind(SimDispMat, unassigned)
    colnames(SimDispMatFullNEM) <- NULL

    # The full remade parentage matrix
    AssignmentsNEM <- SimDispMatFullNEM

    # NOW FIT THE KERNEL

    x <- list(Distances = Distances, Assignments = AssignmentsNEM, Sampled_reefs = Sampled_reefsAll, Reef_sizes = Reef_sizes, Adult_sample_proportions = Adult_sample_proportionsAll) # put inputs into a list because that's the bbmle format

    SimNEMFit <- suppressWarnings(mle2(LL_kt_bbmle, start = list(k = -3, theta = 1), lower = c(-10, 0.15), upper = c(10, 8), method = "L-BFGS-B", data = x, control = list(maxit = 500)))

    BestKNEM <- as.numeric(coef(SimNEMFit)[1])
    BestThetaNEM <- as.numeric(coef(SimNEMFit)[2])
    # (the stray empty argument of the original call has been dropped)
    MDDNEM <- as.numeric(cubintegrate(integrate_kernel_sum1, lower = 0, upper = Inf, k = BestKNEM, theta = BestThetaNEM, method = "pcubature")$integral)
    # k_eval / theta_eval are set right before solving because cdf_solve and
    # cdf_solve90 are called without parameter arguments
    # NOTE(review): presumably they read these two globals; confirm in cdf_solve.R
    k_eval <- BestKNEM
    theta_eval <- BestThetaNEM
    MedNEM <- round(nleqslv(x = 7, fn = cdf_solve)$x, 2)
    Dist90_NEM <- round(nleqslv(x = 7, fn = cdf_solve90)$x, 2)

    # store the info for this iteration (columns in the order given by `col`)
    SimulatedKernelsNEM_beta <- data.frame(
        season = "NEM",
        k = BestKNEM,
        theta = BestThetaNEM,
        mdd = MDDNEM,
        med = MedNEM,
        dist90 = Dist90_NEM,
        iteration = n,
        stringsAsFactors = FALSE
    )
    KernelRowsNEM[[n]] <- SimulatedKernelsNEM_beta

    setTxtProgressBar(pb, n)

}
close(pb)
EndTime <- Sys.time()
EndTime - StartTime
options(warn = 0) # turn warnings back on

# join the per-iteration results into one df
SimulatedKernelsNEM <- bind_rows(KernelRowsNEM)[, col]

write.csv(SimulatedKernelsNEM, file = "~/oceanography/script_output/KernelFits/Adjust_Survival_NoRecentRecruits/1000SimulatedKernelsNEMSurvNoRecentRec.csv", row.names = FALSE)

In [None]:
# Quick visual check: show the first rows of each table of simulated kernel fits.
# Only SimulatedKernelsNEM and SimulatedKernelsSWM are written in the cells
# directly above; the other four are presumably built by earlier cells of this
# notebook and must exist in the session for these calls to succeed.
head(SimulatedKernels2012)
head(SimulatedKernels2013)
head(SimulatedKernels2014)
head(SimulatedKernelsAll)
head(SimulatedKernelsNEM)
head(SimulatedKernelsSWM)