# A notebook in R for modelling species distributions and sample effort using CAR models
## CAR models

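The conditional autoregressive (CAR) model (Besag et al., 1991), in the BYM form implemented by CARBayes (Lee, 2013), has the following specification for a binomial response:

$$Y_k \sim \mathrm{Binomial}(n_k, p_k), \qquad \mathrm{logit}(p_k) = x_k^\top \beta + \psi_k, \qquad \psi_k = \phi_k + \theta_k,$$

$$\phi_k \mid \phi_{-k} \sim \mathrm{N}\!\left(\frac{\sum_i w_{ki}\,\phi_i}{\sum_i w_{ki}},\ \frac{\tau^2}{\sum_i w_{ki}}\right), \qquad \theta_k \sim \mathrm{N}(0, \sigma^2),$$

where $W = (w_{ki})$ is the neighbourhood (adjacency) matrix, $\phi_k$ is the spatially structured random effect and $\theta_k$ is the unstructured one.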


In [21]:
library(CARBayes)
library(dplyr)
library(purrr)
library(biospytial.rwrapper)

In [22]:
## Read the adjacency matrix (NumPy .npy file) and the training data
mat_filename <- "/outputs/training_data_sample_puebla_p9_abies_pinophyta_adjmat.npy"
library(reticulate)
np <- import("numpy")
M <- np$load(mat_filename)
TDF <- read.csv("/outputs/training_data_sample_puebla_p9_abies_pinophyta.csv")
## Order the rows by cell id.
## This is important because row i of the adjacency matrix must describe row i
## of TDF.
## NOTE(review): assumes M was exported in ascending cell-id order -- confirm.
TDF <- TDF[order(TDF$cell_ids), ]
# Fail early if the matrix and the data cannot be aligned row by row.
stopifnot(nrow(M) == ncol(M), nrow(M) == nrow(TDF))
# Convert the covariates to numeric
# NOTE(review): if read.csv produced factors for any of these columns,
# as.numeric returns the level codes, not the values -- confirm column types.
covariate_cols <- c(
  "Dist.to.road_m", "Elevation_m", "MaxTemp_m", "MeanTemp_m", "MinTemp_m",
  "Population_m", "Precipitation_m", "SolarRadiation_m", "VaporPres_m",
  "WindSp_m"
)
TDF[covariate_cols] <- lapply(TDF[covariate_cols], as.numeric)
# Strip "_" and "." from the column names (e.g. cell_ids -> cellids,
# Dist.to.road_m -> Disttoroadm); gsub is vectorised over the names.
names(TDF) <- gsub("[_.]", "", names(TDF))

### Remove the cells that have no neighbours
D <- rowSums(M)
# All isolated cells, not only the first one; integer(0) when there are none.
idx <- which(D == 0)
## Kept so the isolated cells can be looked up in the GIS
cell_with_no_neighbour <- TDF$cellids[idx]
if (length(idx) > 0) {
  ## Drop the isolated cells from BOTH the adjacency matrix and the data so
  ## that W, trials and the model data keep the same number of rows.
  M_bis <- M[-idx, -idx, drop = FALSE]
  TDF <- TDF[-idx, , drop = FALSE]
} else {
  M_bis <- M
}

# NOTE(review): renames whatever column sits at position 23 -- confirm which
# column this is meant to be; a name-based rename would be safer.
names(TDF)[23] <- "covid2"
# Row-wise pseudo-absence responses; ungroup() drops the rowwise grouping so
# that later operations on DataFrame are not silently evaluated per row.
DataFrame <- TDF %>%
  rowwise() %>%
  mutate(
    sample = pseudo_absence_naive(Plantae, LUCA),
    species = pseudo_absence_naive(Pinophyta, Plantae)
  ) %>%
  ungroup()

formula_sample <- sample ~ Disttoroadm + Populationm # + factor(tipos)
formula_presence <- species ~ Elevationm + MeanTempm
# One Bernoulli trial per (remaining) cell
n <- nrow(TDF)
trials <- rep(1, n)

# MCMC settings: (n.sample - burnin) / thin = 1000 retained draws
burnin <- 50000
n.sample <- 100000
thin <- 50

In [3]:
# Fit the binomial BYM CAR model for the sampling-effort response.
model.sample <- S.CARbym(
  formula = formula_sample,
  family = "binomial",
  data = DataFrame,
  trials = trials,
  W = M_bis,
  burnin = burnin,
  n.sample = n.sample,
  thin = thin,
  verbose = TRUE
)

Setting up the model.
Generating 1000 post burnin and thinned (if requested) samples.
Summarising results.
Finished in  819.8 seconds.


In [4]:
# Fit the binomial BYM CAR model for the species-presence response.
model.presence <- S.CARbym(
  formula = formula_presence,
  family = "binomial",
  data = DataFrame,
  trials = trials,
  W = M_bis,
  burnin = burnin,
  n.sample = n.sample,
  thin = thin,
  verbose = TRUE
)

Setting up the model.
Generating 1000 post burnin and thinned (if requested) samples.
Summarising results.
Finished in  673.4 seconds.


In [13]:
#DIC for model with polygon 4 611.3458
# Print the CARBayes summary (posterior quantiles, DIC, diagnostics) of the
# sampling-effort model.
print(model.sample)


#################
#### Model fitted
#################
Likelihood model - Binomial (logit link function) 
Random effects model - BYM CAR
Regression equation - sample ~ Disttoroadm + Populationm
Number of missing observations - 731

############
#### Results
############
Posterior quantities and DIC

             Median    2.5%   97.5% n.sample % accept n.effective Geweke.diag
(Intercept)  3.2695  2.7012  4.8562     1000     46.6        13.2        -1.7
Disttoroadm -0.0002 -0.0004 -0.0001     1000     46.6        97.9         1.8
Populationm  0.0000 -0.0001  0.0002     1000     46.6       907.8         0.3
tau2         5.7359  3.2187 14.3688     1000    100.0        16.1        -1.5
sigma2       0.1078  0.0059  3.7740     1000    100.0         6.1        -1.2

DIC =  2136.889       p.d =  496.3488       LMPL =  -1111.23 


In [14]:
# Print the CARBayes summary (posterior quantiles, DIC, diagnostics) of the
# species-presence model.
print(model.presence)


#################
#### Model fitted
#################
Likelihood model - Binomial (logit link function) 
Random effects model - BYM CAR
Regression equation - species ~ Elevationm + MeanTempm
Number of missing observations - 1161

############
#### Results
############
Posterior quantities and DIC

             Median    2.5%   97.5% n.sample % accept n.effective Geweke.diag
(Intercept) -4.3725 -6.2358 -3.1681     1000     46.5        44.9         0.1
Elevationm   0.0002 -0.0002  0.0005     1000     46.5       815.9         0.6
MeanTempm   -0.0006 -0.0011 -0.0002     1000     46.5       380.5         0.9
tau2        13.6601  8.4661 30.3181     1000    100.0        25.8        -0.6
sigma2       0.0254  0.0082  0.1320     1000    100.0        13.6         2.1

DIC =  1423.11       p.d =  433.7204       LMPL =  -790.36 


In [15]:
## Extract the posterior draws used for the independent joint distribution
library(boot)
## The inverse-logit variant is kept below for reference only; the active code
## uses the fitted draws unchanged.
## NOTE(review): presumably the CARBayes fitted values are already on the
## probability scale (trials = 1), which is why inv.logit is disabled -- confirm.
# fitted_presences = apply(model.presence$samples$fitted, MARGIN = 1, inv.logit)
# fitted_sample = apply(model.sample$samples$fitted, MARGIN = 1, inv.logit)

## apply(..., MARGIN = 1, identity) returns each row of the input as a column,
## i.e. the result is the transpose of the draws matrix.
fitted_presences <- apply(
  X = model.presence$samples$fitted,
  MARGIN = 1,
  FUN = identity
)
fitted_sample <- apply(
  X = model.sample$samples$fitted,
  MARGIN = 1,
  FUN = identity
)

## Same transposition for the random-effect (psi) draws
re_presence <- apply(
  X = model.presence$samples$psi,
  MARGIN = 1,
  FUN = identity
)
re_sample <- apply(
  X = model.sample$samples$psi,
  MARGIN = 1,
  FUN = identity
)

In [16]:
#' Summarise draws with a 95% interval, median, mean and variance
#'
#' @param fitted_sample Matrix of draws, or any object `as.matrix()` can
#'   convert (a plain vector is treated as a one-column matrix).
#' @param prefix String prepended, followed by "_", to every output column name.
#' @param MARGIN 1 to summarise each row, 2 to summarise each column.
#' @return A matrix with one row per summarised row/column and the columns
#'   `<prefix>_q025`, `<prefix>_median`, `<prefix>_q0975`, `<prefix>_mean`
#'   and `<prefix>_variance`.
getUpperLowerCI <- function(fitted_sample, prefix = "", MARGIN = 1) {
  draws <- as.matrix(fitted_sample)
  # Extra arguments to apply() are forwarded to quantile(); no purrr needed.
  quants <- apply(draws, MARGIN, quantile, probs = c(0.025, 0.5, 0.975))
  mfitt <- apply(draws, MARGIN, mean)
  vfitt <- apply(draws, MARGIN, var)
  # quants is 3 x k, so transpose it to get one row per summarised unit.
  new_data <- cbind(t(quants), mfitt, vfitt)
  # Labels are kept exactly as before because downstream readers of the
  # exported CSV may rely on them.
  stat_labels <- c("q025", "median", "q0975", "mean", "variance")
  colnames(new_data) <- paste(prefix, stat_labels, sep = "_")
  new_data
}



In [17]:
## Sample from the posterior
# Draw one Bernoulli outcome for every fitted probability. Each probability
# gets its OWN uniform draw: drawing a single uniform per column and comparing
# it with all the probabilities of that column would make every outcome of a
# cell depend on the same random number.
bernoulli_draws <- function(prob) {
  draws <- data.frame(t(prob))
  draws[] <- lapply(draws, function(p) runif(length(p)) < p)
  draws
}

# Rows are draws and columns are the rows of the fitted_* matrices.
sample.x <- bernoulli_draws(fitted_presences)
sample.y <- bernoulli_draws(fitted_sample)
nsample <- nrow(sample.x)
# Monte Carlo frequency of a positive outcome, per column
sum.x <- colSums(sample.x)
PX <- sum.x / nsample
sum.y <- colSums(sample.y)
PY <- sum.y / nsample
# Joint probability as the product of the two marginals, i.e. treating
# presence and sampling as independent.
PXY <- PX * PY

In [23]:
# Row-wise summaries (quantiles, mean, variance) of the fitted values and of
# the random effects, for the presence (P, Gp) and sample (S, Gs) models.
P <- getUpperLowerCI(fitted_sample = fitted_presences, prefix = "P")
S <- getUpperLowerCI(fitted_sample = fitted_sample, prefix = "S")
Gp <- getUpperLowerCI(fitted_sample = re_presence, prefix = "Gp")
Gs <- getUpperLowerCI(fitted_sample = re_sample, prefix = "Gs")

In [24]:
# Bind the cell ids, the four summary matrices and the sampled probabilities
# column-wise into one output matrix.
dataout <- cbind(
  TDF$cellids,
  P, S, Gp, Gs,
  PX, PY, PXY
)

In [25]:
## Write the combined output matrix (dataout) to CSV for analysis in Python
file_ <- "/outputs/presence_only_models/modelCAR1.csv"
write.csv(x = dataout, file = file_)

### Postprocess the posterior dist.

In [None]:
# Row-wise summary of the joint presence/sample draws.
# NOTE(review): joint_ind_ps is not assigned anywhere in the visible notebook --
# confirm where it is supposed to come from.
joint_ind_ps_summary <- getUpperLowerCI(
  fitted_sample = joint_ind_ps,
  prefix = "join_ind_ps",
  MARGIN = 1
)

In [None]:
# Row-wise summary of the joint-distribution variance draws.
# NOTE(review): var_joint_ind_ps is not assigned anywhere in the visible
# notebook -- confirm where it is supposed to come from.
var_joint_ind_ps_summary <- getUpperLowerCI(
  fitted_sample = var_joint_ind_ps,
  prefix = "var_joint",
  MARGIN = 1
)

In [None]:
# Summary of the fitted values of the sample model.
# The raw samples$fitted matrix is used here without the transposition that
# the notebook applies elsewhere (apply(..., MARGIN = 1, identity)) before
# summarising by row, so the columns must be summarised (MARGIN = 2) to obtain
# the same per-column-of-fitted summaries instead of one row per MCMC draw.
sample_ci <- getUpperLowerCI(
  model.sample$samples$fitted,
  prefix = "sample",
  MARGIN = 2
)

In [None]:
# Summary of the fitted values of the presence model.
# The raw samples$fitted matrix is used here without the transposition that
# the notebook applies elsewhere (apply(..., MARGIN = 1, identity)) before
# summarising by row, so the columns must be summarised (MARGIN = 2) to obtain
# the same per-column-of-fitted summaries instead of one row per MCMC draw.
presence_ci <- getUpperLowerCI(
  model.presence$samples$fitted,
  prefix = "pinophyta",
  MARGIN = 2
)

In [None]:
# Append the joint-distribution summaries to the training data, column-wise.
# Earlier variants, kept for reference:
#   TDF$fitted_values = model.spatial$fitted.values
#   TDF = cbind(TDF, joint_ind_ps_summary, var_joint_ind_ps_summary,
#               sample_ci, presence_ci)
TDF <- cbind(
  TDF,
  joint_ind_ps_summary,
  var_joint_ind_ps_summary
)


In [None]:
## Write the augmented training data to CSV
write.csv(
  x = TDF,
  file = "/outputs/resultsCAR_sampleeffort1_puebla_p9_pinophyta_luca.csv"
)

In [None]:
# TODO: Stopped here for today. Still need to check what happens with the models and make the map.

In [None]:
# Posterior densities of the variance parameters of the presence model
dtau <- density(model.presence$samples$tau2)
dsigma <- density(model.presence$samples$sigma2)

# Posterior densities of the regression coefficients. beta1..beta3 were never
# defined; the plot titles (Intercept, Distance to Road, Population) match the
# terms of formula_sample, so they are taken from the sample model.
# NOTE(review): tau2/sigma2 above come from model.presence while the
# coefficients come from model.sample -- confirm which model is intended.
beta_draws <- as.matrix(model.sample$samples$beta)
beta1 <- density(beta_draws[, 1])
beta2 <- density(beta_draws[, 2])
beta3 <- density(beta_draws[, 3])

plot(dtau, main = expression(tau^2))
plot(dsigma, main = expression(sigma^2))
plot(beta1, main = "Intercept")
plot(beta2, main = "Distance to Road")
plot(beta3, main = "Population")

In [None]:
# Logistic regression (binomial GLM) fitted on TDF.
# NOTE(review): `formula` is never assigned in the visible notebook, so this
# resolves to the stats::formula function -- confirm which model formula
# (e.g. formula_sample or formula_presence) was intended.
mod_sig <- glm(
  formula = formula,
  family = binomial(),
  data = TDF
)
# mod1.summary()

In [None]:
# Show the summary of the fitted GLM.
summary(mod_sig)

In [None]:
# Confidence intervals for the GLM coefficients.
confint(mod_sig)

## Instantiate Geospatial data

In [None]:
library(lattice)
library(sp)


In [None]:
# Declare the Longitude/Latitude columns as the spatial coordinates of TDF
# (sp replacement function; TDF becomes a spatial object afterwards).
coordinates(TDF) <- c("Longitude","Latitude")

In [None]:
# Bubble plot of the Pinophyta column at the cell coordinates.
bubble(TDF,"Pinophyta")

## Spatial analysis in R
First, exploratory

In [None]:
library(gstat)

In [None]:
# Plot the variogram of Pinophyta with an intercept-only (constant mean) trend.
plot(variogram(Pinophyta ~ 1,TDF))

In [None]:
# Install geoR only when it is not already available, so re-running the
# notebook does not reinstall the package every time.
if (!requireNamespace("geoR", quietly = TRUE)) {
  install.packages("geoR")
}

In [None]:
# Install geoRglm only when it is not already available, so re-running the
# notebook does not reinstall the package every time.
if (!requireNamespace("geoRglm", quietly = TRUE)) {
  install.packages("geoRglm")
}

In [None]:
library('spdep')

In [None]:
# CAR model fitted with spautolm.
# The covariate names follow the cleaned column names of TDF: the data
# preparation cell strips "_" and "." from every name, so Dist.to.road_m and
# Population_m no longer exist (they are Disttoroadm and Populationm).
# spautolm also requires the spatial weights (listw), which were missing.
# NOTE(review): the weights are built from M_bis with binary ("B") style to
# keep them symmetric for the CAR family; this assumes M_bis has one row per
# row of TDF, in the same order -- confirm.
model_car <- spautolm(
  Pinophyta ~ Disttoroadm + Populationm + tipos,
  data = TDF,
  listw = mat2listw(M_bis, style = "B"),
  family = "CAR"
)