In [252]:
# Packages required by this notebook; each one is attached below.
Packages <- c(
    "dplyr", "R.utils", "nleqslv", "broom", "cubature", "geosphere",
    "data.table", "ggplot2", "bbmle", "stringr", "lubridate", "RColorBrewer"
)

# Attach every package, silencing start-up banners and the list returned by lapply().
invisible(
    suppressPackageStartupMessages(
        lapply(Packages, function(pkg) library(pkg, character.only = TRUE))
    )
)

setwd("/local/home/katrinac/oceanography")

# Negated %in%: TRUE for each element of `x` that has no match in `table`.
"%!in%" <- function(x, table) {
    !(x %in% table)
}

# Project functions defined in external scripts (contents not visible from this notebook).
source("~/parentage/kernel_fitting/1340_loci/functions/ll_kt_both_bbmle.R")
source("~/oceanography/scripts/PredictedProportions.R")

# ---- Kernel fitting summaries ----
kernels <- fread(file = "~/parentage/kernel_fitting/1340_loci/final_results/tables/kernel_fitting_summary.csv")
kernel2012_14 <- fread(file = "~/oceanography/empirical_data/genetics/GenKernelsForROMSComp2012-14.csv")

# ---- Site centroids adjusted for the simulation (Magbangons combined) ----
# Alternative source, kept for reference:
#centroids <- fread(file="~/oceanography/script_output/SurveyData/SimulationCentroids.csv")
centroids <- fread(file = "~/oceanography/empirical_data/site_centroids_SimTest.csv")
setorderv(centroids, "site") # sort by site in place

# ---- Survey tables ----
# Number of recruits sampled at each site in each year.
AnnualRecsSamp <- fread(file = "~/oceanography/script_output/SurveyData/AnnualRecruitsSampled.csv")
# Proportion of anemones sampled at each site in each year.
PropSamp <- fread(file = "~/oceanography/script_output/SurveyData/ProportionHabitatSampled.csv")

# ---- ROMS simulation connectivity table with metadata ----
# Not yet subsetted (*but check this).
date_join <- fread(file = "~/oceanography/script_output/ROMSDataTables/LongFormConnWithProbs.csv")

# ---- Sites present in the model but not sampled ----
# The sand flats are a named subset of the unsampled sites.
SandFlats <- c("SF1", "SF2", "SF3", "SF4", "SF5", "SF6")
UnsampledSites <- c(SandFlats, "Pangasugan", "Other", "CAI")

# ---- Constant inputs for the kernel fitting function ----

# Source and destination site lists; both are the centroid table with the combined Magbangon.
sites_source <- centroids
sites_dest <- centroids

# Pairwise source-by-destination distances from distm(), rescaled by 10^-3.
dist_mat_m <- distm(
    sites_source[, c("lon", "lat")],
    sites_dest[, c("lon", "lat")],
    fun = distVincentyEllipsoid
)
Distances <- dist_mat_m * 10^-3

# Reef areas for the kernel fitting, leaving out the "near_*_full*" entries.
excluded_area_sites <- c(
    "near_north_full1", "near_north_full2", "near_north_full3",
    "near_south_full1", "near_south_full2", "near_south_full3"
)
Area <- fread("~/oceanography/empirical_data/site_area_header_nonsurveyed_simulation_kernels_test.csv") %>%
    filter(!(site %in% excluded_area_sites)) %>%
    mutate(kmsq = msq * 10^-6) %>%
    arrange(site)
# Original author's note: a trailing `%>% select(kmsq)` "need[s] to uncomment for functions to work".
setorder(Area, site)

# One-column matrix of reef sizes (kmsq), in site order.
Reef_sizes <- as.matrix(Area$kmsq)

# Survey information per site and year: recruits sampled joined to the proportion of anemones sampled.
# The join keeps every row of PropSamp, matching AnnualRecsSamp on year == end_year and on site.
SurveyData <- AnnualRecsSamp[PropSamp, on = .(year = end_year, site)]

# Rows without a recruit count get 0 instead of NA.
SurveyData[is.na(n_offs_gen), n_offs_gen := 0]

# time_frame is dropped; the joined year column (matched to end_year) serves as the key.
SurveyData[, time_frame := NULL]

#setkey(SurveyData, site)
# Sanity check (disabled): every site in Area should also be in centroids
# (and therefore in Distances, which is built from centroids).
#Area[site %!in% centroids$site] #should be nothing

# ---- Genetic data used to build the predicted proportions ----

# Read a CSV and return its contents as a matrix.
#   path:   file path passed to read.csv()
#   header: whether the first row holds column names (FALSE for the raw numeric inputs)
read_input_matrix <- function(path, header = FALSE) {
    as.matrix(read.csv(path, header = header))
}

# Pooled 2012-14 inputs
Assignments2012_4 <- read_input_matrix("~/parentage/kernel_fitting/1340_loci/input/parentage_matrix_2012-14ForROMSComp.csv")
Adult_sample_proportions2012_4 <- read_input_matrix("~/parentage/kernel_fitting/1340_loci/input/prop_samp12-14.csv")
Sampled_reefs2012_4 <- read_input_matrix("~/parentage/kernel_fitting/1340_loci/input/site_index12-14.csv")

# Landscape inputs shared by every time frame.
# NOTE(review): the area and centroid files are read from the 894_loci directory while everything
# else comes from 1340_loci — confirm that this is intentional.
Distances_gen <- read_input_matrix("~/parentage/kernel_fitting/1340_loci/input/distance_matrix_unsurveyed.csv")
Reef_sizes_gen <- read_input_matrix("~/parentage/kernel_fitting/894_loci/area_unsurveyed.csv")
# The centroid file is the only input with a header row; spelled TRUE because `T` can be reassigned.
Centroids_gen <- read_input_matrix("~/parentage/kernel_fitting/894_loci/centroids_unsurveyed.csv", header = TRUE)

# 2012
Assignments2012 <- read_input_matrix("~/parentage/kernel_fitting/1340_loci/input/parentage_matrix12_corrected.csv")
Adult_sample_proportions2012 <- read_input_matrix("~/parentage/kernel_fitting/1340_loci/input/prop_samp12.csv")
Sampled_reefs2012 <- read_input_matrix("~/parentage/kernel_fitting/1340_loci/input/site_index12.csv")

# 2013
Assignments2013 <- read_input_matrix("~/parentage/kernel_fitting/1340_loci/input/parentage_matrix13_corrected.csv")
Adult_sample_proportions2013 <- read_input_matrix("~/parentage/kernel_fitting/1340_loci/input/prop_samp13.csv")
Sampled_reefs2013 <- read_input_matrix("~/parentage/kernel_fitting/1340_loci/input/site_index13.csv")

# 2014
Assignments2014 <- read_input_matrix("~/parentage/kernel_fitting/1340_loci/input/parentage_matrix14_corrected.csv")
Adult_sample_proportions2014 <- read_input_matrix("~/parentage/kernel_fitting/1340_loci/input/prop_samp14.csv")
Sampled_reefs2014 <- read_input_matrix("~/parentage/kernel_fitting/1340_loci/input/site_index14.csv")


__Loop over the proportion of particles retained from the unsampled "Other" and CAI sources, and compare the resulting proportion of unassigned (Other/CAI-sourced) particles in the simulated sample with the unassigned proportion expected from the genetic survey data__

In [229]:
# Proportion of sampled offspring assigned to a parent, per model year and pooled over all three.
model_years <- c("2012", "2013", "2014") # only the years coinciding with the models

# Annual rows: convert the reported percentage to a proportion.
annual_assigned <- kernels[
    Year %in% model_years,
    .(Year, PropAssigned = PercentAssigned / 100)
]

# Pooled row: total parentage matches over total offspring sampled across the model years.
pooled_assigned <- kernels[
    Year %in% model_years,
    .(Year = "2012-14", PropAssigned = sum(NumParentageMatches) / sum(NumOffsSampled))
]

PropAssignedTable <- rbind(annual_assigned, pooled_assigned)

PropAssignedTable[]

Year,PropAssigned
2012,0.0476
2013,0.14
2014,0.0718
2012-14,0.09390863


__For the expected values of recruits from outside of our sampled region, I'm using the intermediate "PredictedProportions" matrix from the Bode kernel fitting script (https://github.com/MikeBode/Parentage_kernel_fitting/blob/master/Kernel_Fitting_Function.m) because it accounts for how well sites were sampled when estimating proportions of recruits from unsampled sites__

In [253]:
##### Set-up that stays outside of the subsampling loop #####

# Site index table (one row per site, numbered by row); use this for the Sampled_reefs input
# in kernel fitting.
site_index <- unique(centroids, by = "site")[, "site"][, index := .I]

# Sites with any anemones sampled in the model years. This single filter feeds every table below;
# the original repeated it four times and also assigned a PropSampTable that was overwritten
# before being used.
sampled_sites <- SurveyData[PropAnemSamp > 0 & year %in% c(2012, 2013, 2014), c("year", "site")]

# Each sampled site paired with itself as Source and Dest (the join columns of the simulated
# sampling table), with Year kept as a key. Duplicated for both monsoon seasons so the parentage
# matrices are complete later.
self_pairs <- sampled_sites[, .(Source = site, Dest = site, Year = year)][, c("Year", "Source", "Dest")]
PropSampTable <- rbind(
    copy(self_pairs)[, Monsoon := "NEM"],
    copy(self_pairs)[, Monsoon := "SWM"]
)
unq_survey <- unique(PropSampTable, by = c("Source", "Dest", "Year", "Monsoon"))

# Destinations that were sampled, per year and monsoon, for use with the unassigned table.
add_dest <- rbind(
    copy(sampled_sites)[, Monsoon := "NEM"],
    copy(sampled_sites)[, Monsoon := "SWM"]
)

# Proportions of Other/CAI particles to retain, evaluated iteratively and compared.
PropToEval <- seq(0.1, 1, 0.1)

# Empty table that accumulates the results of the evaluation loop.
PropSampOtherCAI <- data.table(
    TimeScale = character(), TimeID = character(),
    PropUnassigned = numeric(), ExpUnassigned = numeric(), PropSampEval = numeric(),
    Check1 = character(), Check2 = character(), NrowSimConn = numeric()
)

# Predicted-proportion matrices from the genetic data, one per year plus the pooled 2012-14 fit.
# The last row of each matrix (unsampled source sites) is averaged later on.

# Thin wrapper around PredProp() that supplies the distance and reef-size matrices shared by
# every time frame.
run_pred_prop <- function(k, theta, assignments, sampled_reefs, adult_props) {
    PredProp(
        k = k,
        theta = theta,
        Assignments = assignments,
        Sampled_reefs = sampled_reefs,
        Distances = Distances_gen,
        Reef_sizes = Reef_sizes_gen,
        Adult_sample_proportions = adult_props
    )
}

PredProp2012 <- run_pred_prop(
    kernels[Year == "2012", best_k], kernels[Year == "2012", best_theta],
    Assignments2012, Sampled_reefs2012, Adult_sample_proportions2012
)

PredProp2013 <- run_pred_prop(
    kernels[Year == "2013", best_k], kernels[Year == "2013", best_theta],
    Assignments2013, Sampled_reefs2013, Adult_sample_proportions2013
)

PredProp2014 <- run_pred_prop(
    kernels[Year == "2014", best_k], kernels[Year == "2014", best_theta],
    Assignments2014, Sampled_reefs2014, Adult_sample_proportions2014
)

# The pooled fit takes its parameters from the separate 2012-14 kernel table.
PredProp2012_4 <- run_pred_prop(
    kernel2012_14[year == "2012-14", k], kernel2012_14[year == "2012-14", theta],
    Assignments2012_4, Sampled_reefs2012_4, Adult_sample_proportions2012_4
)


In [254]:
# Evaluate how subsampling the "Other" and "CAI" source particles changes the proportion of
# simulated recruits that come from those sources.
#
# For each proportion in PropToEval:
#   1. keep only connectivity rows whose destination was sampled (DestPropSamp > 0);
#   2. keep a random fraction of the "Other" rows, then of the "CAI" rows;
#   3. draw simulated recruits per Year/Dest, weighted by SurvWeight;
#   4. record the Other/CAI share of the draw per year and pooled, next to the expected value
#      taken from the last row of the matching PredProp matrix.
# The rows are appended to PropSampOtherCAI and written to disk at the end.

outside_sources <- c("CAI", "Other")
annual_ids <- c("2012", "2013", "2014")

# Loop-invariant: mean of the last row of each predicted-proportion matrix.
exp_unassigned <- c(
    mean(PredProp2012[nrow(PredProp2012), ]),
    mean(PredProp2013[nrow(PredProp2013), ]),
    mean(PredProp2014[nrow(PredProp2014), ]),
    mean(PredProp2012_4[nrow(PredProp2012_4), ])
)

# Loop-invariant: parentage matches observed per year, with Year as integer for the join.
parentage_by_year <- kernels[Year %in% annual_ids][
    , Year := as.integer(Year)][, c("Year", "NumParentageMatches")]

# Keep every row of `dt` that is not from `source_name`, plus a random `keep_prop` fraction of
# the rows that are. Positions are drawn with sample.int() so that a single matching row is never
# misread by sample() as "sample from 1:x".
subsample_source <- function(dt, source_name, keep_prop) {
    from_source <- which(dt$Source == source_name)
    others <- setdiff(seq_len(nrow(dt)), from_source)
    n_keep <- floor(length(from_source) * keep_prop)
    dt[c(others, from_source[sample.int(length(from_source), n_keep)])]
}

results <- vector("list", length(PropToEval))
pb <- txtProgressBar(min = 0, max = length(PropToEval), style = 3)

StartTime <- Sys.time()

for (i in seq_along(PropToEval)) {

    dest_sampled <- date_join[DestPropSamp > 0]
    check1 <- nrow(dest_sampled)
    dest_sampled <- subsample_source(dest_sampled, "Other", PropToEval[i])
    check2 <- nrow(dest_sampled)
    dest_sampled <- subsample_source(dest_sampled, "CAI", PropToEval[i])
    check3 <- nrow(dest_sampled)

    # Each subsampling step should have removed rows. These are FALSE when nothing was dropped,
    # e.g. when the proportion being evaluated is 1.
    test1 <- check1 > check2
    test2 <- check2 > check3

    # Rename the monsoon column for consistency with the other tables.
    setnames(dest_sampled, "SimMonsoon", "Monsoon")

    # Join in the number of parentage matches observed by year.
    dest_sampled <- parentage_by_year[dest_sampled, on = .(Year = YearSampled)]

    # Randomly draw particles per Year/Dest according to the survival weighting. The draw size is
    # the group's DestNOffsAnnual; only its first value is used because sample() takes a scalar
    # size (presumably the column is constant within a Year/Dest group — confirm).
    sim_sample <- dest_sampled[
        , .SD[sample(.N, DestNOffsAnnual[1L], prob = SurvWeight)],
        by = c("Year", "Dest")
    ]

    # Share of drawn particles from Other/CAI, looked up by year label rather than by position:
    # group order is not guaranteed to be 2012, 2013, 2014, and a year with no Other/CAI
    # particles would otherwise shift or recycle the values. A year absent from the draw gives NA.
    is_outside <- sim_sample$Source %in% outside_sources
    prop_by_year <- tapply(is_outside, sim_sample$Year, mean)
    annual_prop <- as.numeric(prop_by_year[annual_ids])
    pooled_prop <- sum(is_outside) / nrow(sim_sample)

    results[[i]] <- data.table(
        TimeScale = c(rep("annual", length(annual_ids)), "interannual"),
        TimeID = c(annual_ids, "2012_4"),
        PropUnassigned = c(annual_prop, pooled_prop),
        ExpUnassigned = exp_unassigned,
        PropSampEval = PropToEval[i],
        Check1 = as.character(test1),
        Check2 = as.character(test2),
        NrowSimConn = as.numeric(nrow(dest_sampled))
    )

    setTxtProgressBar(pb, i)
}

close(pb)

# Bind once after the loop instead of growing the table on every iteration. Rows are appended to
# whatever PropSampOtherCAI already holds.
PropSampOtherCAI <- rbindlist(
    c(list(PropSampOtherCAI), results),
    use.names = TRUE, fill = TRUE, idcol = FALSE
)

EndTime <- Sys.time()
EndTime - StartTime

fwrite(PropSampOtherCAI, file = "~/oceanography/script_output/PropSampOtherCAIEvaluation.csv")




Time difference of 27.85093 secs

In [255]:
# Display the accumulated evaluation table (four rows per proportion evaluated: three annual, one interannual).
PropSampOtherCAI

TimeScale,TimeID,PropUnassigned,ExpUnassigned,PropSampEval,Check1,Check2,NrowSimConn
annual,2012,0.1904762,0.9750237,0.1,True,True,332264
annual,2013,0.22,0.9562503,0.1,True,True,332264
annual,2014,0.2044199,0.9574422,0.1,True,True,332264
interannual,2012_4,0.2081218,0.965407,0.1,True,True,332264
annual,2012,0.3333333,0.9750237,0.2,True,True,363170
annual,2013,0.3733333,0.9562503,0.2,True,True,363170
annual,2014,0.281768,0.9574422,0.2,True,True,363170
interannual,2012_4,0.3248731,0.965407,0.2,True,True,363170
annual,2012,0.4444444,0.9750237,0.3,True,True,394076
annual,2013,0.36,0.9562503,0.3,True,True,394076


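__In the rows shown above, the simulated unassigned proportion (about 0.19–0.44) stays well below the expectation from the genetic data (about 0.96–0.98) and increases as more Other/CAI particles are retained, so this seems like good justification to not subsample the ROMS particles from Other/Camotes Islands__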

In [10]:
# Row count of dest_sampled, which is assigned inside the subsampling loop.
nrow(dest_sampled)
# NOTE(review): SimConn is not defined in any visible cell of this notebook — confirm it exists in the session before running.
nrow(SimConn)

In [7]:
#save inter file
#fwrite(dest_sampled, file="~/oceanography/script_output/LongFormConnWithProbsTest.csv")
#see if I can write as a compressed file so it can be stored on github
#https://stackoverflow.com/questions/42788401/is-possible-to-use-fwrite-from-data-table-with-gzfile