In [1]:
%matplotlib inline
# Add /apps to the import path so the project packages imported below can be found.
# NOTE(review): hard-coded absolute path; assumes the project is mounted at /apps.
import sys
sys.path.append('/apps')
# Django is initialised before the project modules are imported.
import django
django.setup()
from drivers.tree_builder import TreeNeo
from drivers.graph_models import TreeNode, Order, Family, graph,Kingdom,Occurrence
from drivers.graph_models import Cell,Mex4km, countObjectsOf
from drivers.graph_models import pickNode
import matplotlib.pyplot as plt
import pandas as pd
import itertools as it
import numpy as np

## Use the ggplot style for every figure in this notebook
plt.style.use('ggplot')

# Spatial Regression for richness of certain taxon
---
Abstraction dashboard for joining the data with the model

***
Here I show how to extract different taxonomic information at cell level.
Although there exists a method for building the taxonomic tree within a single cell, the process can be computationally intensive because it depends on extracting the total amount of occurrences in each cell. From there, it traverses the tree from top to bottom looking for the corresponding nodes.

The approach is useful when one needs a small number of trees, but it becomes increasingly slow as the number of cells or occurrences increases.

---

## Extracting specific taxonomic levels in each cell

The method studied here makes use of the relationship type `IS_IN` stored in the knowledge graph.

> Developer's note: *There was a problem with the design of the OGM implementation (py2neo.ogm). The retrieval of linked nodes based on a specific relation does not distinguish different labels. In other words it returns the totality of the data that has the  specific relationship given a node.*

> Patchy solution: 
The solution was to include extra methods for the class Cell `has_[taxas]`. This method/attribute returns a graph selector that points to the corresponding nodes.

> Stable Fix: 
Make relationships as specific as possible (given the data). For example, instead of using the relation type
* *IS_IN* for (Bursera:Family) -[IS_IN]-> (Grid:Cell)

change it to:
* *Family_IS_IN* for (Bursera:Family) -[Family_IS_IN]-> (Grid:Cell)

Let's get started.
As usual we need to load the necessary modules


In [2]:
## Let's pick the bats node: the node of class Order whose name is 'Chiroptera'
bats = pickNode(Order,name='Chiroptera')

In [4]:
# Presumably the ids of the cells related to the bats node (judging by the method name) - TODO confirm.
ids4bats = bats.getCellsById()

In [None]:
# NOTE(review): `cc` is not defined in any cell of this notebook, so this cell and the
# next ones fail on a fresh kernel. It is presumably a query result related to
# `ids4bats` - TODO restore the cell that creates it.
# Materialise `cc` into a list and time how long that takes.
%time cells_w_bats = list(cc)

In [None]:
import pandas as pd

In [None]:
# Load the records of `cc` into a DataFrame.
# NOTE(review): if `cc` is a one-shot iterator it is already exhausted by the list(cc) above - verify.
dd = pd.DataFrame(cc)

In [None]:
# Collect the value stored under the 'c.id' key of every record in `cc`.
[o['c.id'] for o in cc]

In [None]:
# Time a second materialisation of `cc`.
%time list(cc)

In [None]:
# Materialise the result of giveNCells(3); presumably three cells related to bats - TODO confirm.
ccs = list(bats.giveNCells(3))

In [None]:
# Class at the other end of the `is_in` relationship of the bats node.
C = bats.is_in.related_class

In [None]:
# Inspect the `select` attribute of that related class.
C.select

In [None]:
## Explore why it is taking so much time
# NOTE(review): `_related_objects` is a private attribute of the relationship object;
# it may change between library versions.
cells = bats.is_in._related_objects

In [None]:
# Each entry of `cells` is unpacked as a pair; the first components are kept as
# `cells_w_bats` (this overwrites the value assigned earlier from `cc`).
cells_w_bats,b = zip(*cells)

## Random selection of cells.

> Note: Data Architecture. For storage reasons I couldn't load the complete world bioclimatic layers. Therefore I needed to use a regional subset that comprises only the Mexican territory.
For this reason, any approach for selecting subsamples needs to be constrained (filtered) by this geometry.
We can do that with this:

In [None]:
from sketches.models import Country
from mesh.models import MexMesh

# Fetch the Country whose name contains "exico"; .get() requires the filter to match exactly one row.
Mexico = Country.objects.filter(name__contains="exico").get()
# Mesh cells whose `cell` geometry intersects the geometry of that country.
mexican_cells = MexMesh.objects.filter(cell__intersects=Mexico.geom)

Obtain list of cells within the Mexican Territory.
> `mexican_cells.values` returns a lazily evaluated object of type QuerySet. We need to cast it to a list to load all the data into memory.


In [None]:
# Choose the pool of cells used in the rest of the notebook: here, every cell that
# intersects Mexico. The commented alternative uses the cells with bats instead.
selected_cells = mexican_cells
#selected_cells = cells_w_bats
#ids = list(selected_cells.values('pk'))

The UniformRandomCellSample is a method for sampling cells; in the example below we give it the selected cells and the sample size as arguments.

## Extract richness and Environmental covariates from cells at a given taxonomic level
Options are: Family, Order, Species, etc.

In [None]:
from traversals import strategies as st

In [None]:
from traversals import sampling as sm

In [None]:
# Draw a sample of size 150 from the selected cells.
# NOTE(review): no random seed is set in this notebook, so the sample presumably changes between runs.
trees = sm.UniformRandomSampleForest(selected_cells,size=150)

In [None]:
# Materialise the sample into a list and time it.
%time ts = list(trees)

In [None]:
# Environmental covariates for the whole list of sampled trees (timed).
# NOTE(review): `data` is reassigned a few cells below from getEnvironmentAndRichnessFromListOfCells.
%time data = st.getEnvironmentalCovariatesFromListOfTrees(ts)

In [None]:
%time x = arbol1.associatedData.getEnvironmentalVariablesCells()

In [None]:
arbol1 = ts[7]

In [None]:
# NOTE(review): the function is only referenced here, not called, so this times an
# attribute lookup. The actual call is timed in an earlier cell.
%time st.getEnvironmentalCovariatesFromListOfTrees

In [None]:
# Environmental covariates plus richness at the 'Family' level for the selected cells (timed).
# This replaces the `data` computed from the sampled trees above.
%time data = st.getEnvironmentAndRichnessFromListOfCells(list_of_cells=selected_cells,taxonomic_level_name='Family')

In [None]:
# Peek at the first rows (.loc slicing is label-based and includes the end label).
data.loc[:2]

In [None]:
# Mean of the n_Family column over all rows.
data.n_Family.mean()

It takes time because it needs to calculate the summary statistics of each cell on the fly. It is using the PostGIS backend.

In [None]:
# Plot the data coloured by the n_Family column.
# Assumes `data` is a GeoDataFrame (the `column=` keyword belongs to its plot method) - TODO confirm.
data.plot(column='n_Family')

## Obtaining the predictors
In this case we will bring all the variables to start working with everything

In [None]:
from raster_api.tools import RasterData
from raster_api.models import raster_models_dic as models


### Obtaining everything

In [None]:
# One RasterData per entry of the raster-model dictionary, each built with Mexico's geometry as border.
datadict = {name: RasterData(raster_model, border=Mexico.geom) for name, raster_model in models.iteritems()}

In [None]:
## Without resamling (whole data)
pixel_size = 0.25
%time datacube_field = map(lambda raster : raster.rescale(pixel_size),datadict.itervalues())
datacube = datacube_field

In [None]:
# Display every raster layer, titled with its key in `datadict`.
# An explicit loop is used because the calls are made only for their side effect.
for name, layer in datadict.iteritems():
    layer.display_field(title=name,origin='Lower')

## For the moment we don't want to do temporal analysis so we need to aggregate the array by the mean.

Using the new 'resample' method

In [None]:
# Take the mean of everything: collapse the first axis of each layer's array.
cubes = [np.mean(layer.toNumpyArray(), axis=0) for layer in datadict.itervalues()]

The coordinates are the same, so we can extract them with getCoordinates and then append everything as a flat array

In [None]:
# Coordinates of every raster layer, in the iteration order of `datadict`.
coords = [layer.getCoordinates() for layer in datadict.itervalues()]

In [None]:
# Put the per-layer coordinate frames side by side (one block of columns per layer).
# NOTE(review): this rebinds `coords`, so re-running this cell alone fails - verify.
coords = pd.concat(coords,axis=1)

In [None]:
# Keep a single pair of coordinate columns.
# NOTE(review): [[0,1]] is a label-based lookup; whether it picks the first two columns
# depends on the column labels and the pandas version - confirm, or use .iloc.
coords1 = coords[[0,1]]

In [None]:
## OK, I need a way to extract the dataframe, maybe aggregate it by mean
# One single-column DataFrame per layer, holding the flattened array.
dataframe_cube = [pd.DataFrame(layer_mean.flatten()) for layer_mean in cubes]

In [None]:
# One column per layer, side by side.
datacube = pd.concat(dataframe_cube,axis=1)

In [None]:
# Append the coordinate columns.
# NOTE(review): not idempotent - re-running this cell appends the coordinates again.
datacube = pd.concat([datacube,coords1],axis=1)

In [None]:
# Name the layer columns after the `datadict` keys, followed by the coordinate column names.
# Relies on keys() being a list (Python 2) and on `datadict` iterating in the same
# order as when `cubes` was built.
datacube.columns = datadict.keys() + list(coords1.columns)

In [None]:
# Scatter of the grid points coloured by the SolarRadiation column.
plt.scatter(datacube.Longitude,datacube.Latitude,c=datacube.SolarRadiation)

In [None]:
datacube_clean.columns

In [None]:
datacube.loc[:10]
datacube_clean = datacube.dropna()

In [None]:
## Convert to geopandas, using the Longitude/Latitude columns as x/y coordinates
# NOTE(review): this rebinds `datacube_clean`, so re-running the cell converts an
# already converted frame - verify that this is harmless.
from external_plugins.spystats.spystats import tools as tl
datacube_clean = tl.toGeoDataFrame(datacube_clean,xcoord_name='Longitude',ycoord_name='Latitude')

# Bayesian Modelling

The Model is an inhomogeneous Poisson process.

$$y(x) \sim Poisson(\lambda(x)) $$
$$ \lambda(x) = exp\{\alpha + S(x)\}$$

Where $S(x)$ is a Gaussian Process such that:

$$S(x) \sim MVN(0,\sigma^2 \rho(||x - x'||)) $$

For this particular case:
$$\rho = Matern(\phi,\kappa = \frac{3}{2}) + \tau^2$$

I'm using a Bayesian approach with parameters $\phi$ and $\tau$ as random variables with their corresponding priors.

In [None]:
import pymc3 as pm
# Rename the columns of `data` positionally.
# NOTE(review): this assumes `data` has exactly these 12 columns in this order - verify
# against the output of getEnvironmentAndRichnessFromListOfCells.
data.columns = [u'n_Family', u'Longitude', u'Latitude', u'Elevation_mean',
       u'MaxTemperature_mean', u'MeanTemperature_mean', u'MinTemperature_mean',
       u'Precipitation_mean', u'SolarRadiation_mean', u'Vapor_mean',
       u'WindSpeed_mean', u'geometry']

In [None]:
from statsmodels.genmod.generalized_linear_model import GLM
# Regress family richness on mean elevation and mean maximum temperature.
# NOTE(review): no `family` argument is given, so the statsmodels default family is
# used; confirm this is intended, given that the model described above is Poisson.
glmodel = GLM.from_formula('n_Family ~ Elevation_mean + MaxTemperature_mean',data=data)
res = glmodel.fit()
print(res.summary())

In [None]:
## Zero coefficients for longitude and latitude, followed by the fitted GLM slopes
## (res.params.values[1:] skips the first entry, which is used as the intercept further below)
z = np.array([0.0,0.0])
coefs = np.append(z,res.params.values[1:])
print(coefs)

In [None]:
## Analysis, GP only one parameter to fit
# The variational method is much beter.
from pymc3.variational.callbacks import CheckParametersConvergence

with pm.Model() as model:
    sigma = 1.0
    #range_a=10.13
    
    
    tau = pm.Uniform('tau',0,5.0)
    #sigma = pm.Flat('sigma')
    #phi = pm.HalfNormal('phi',mu=8,sd=3)
    #phi = pm.Uniform('phi',6,12)
    phi = pm.Uniform('phi',0,15)
    
    Tau = pm.gp.cov.Constant(tau)
    
    cov = sigma * pm.gp.cov.Matern32(2,phi,active_dims=[0,1]) + Tau
    #K = cov(grid[['Lon','Lat']].values)
    #phiprint = tt.printing.Print('phi')(phi)
    
    
    mf = pm.gp.mean.Linear(coeffs=coefs,intercept=res.params.values[0])

    
    
    
    
    ## The latent function
    gp = pm.gp.Latent(cov_func=cov)
    
    
    
    
    
    ## I don't know why this
    #f = gp.prior("latent_field", X=data[['Longitude','Latitude']].values,reparameterize=False)
    
    f = gp.prior("latent_field", X=data[['Longitude','Latitude','Elevation_mean','MaxTemperature_mean']].values,reparameterize=False)
    
    
    
    
    
    
    
    #f_print = tt.printing.Print('latent_field')(f)
    
    y_obs = pm.Poisson('y_obs',mu=np.exp(f),observed=data[['n_Family']].values)
    
    #y_obs = pm.MvNormal('y_obs',mu=np.zeros(n*n),cov=K,observed=grid.Z)

    #gp = pm.gp.Latent(cov_func=cov,observed=sample)
    # Use elliptical slice sampling
    #ess_step = pm.EllipticalSlice(vars=[f_sample], prior_cov=K)
    #step = pm.HamiltonianMC()
    #step = pm.Metropolis()
    #%time trace = pm.sample(5000,step)#,tune=0,chains=1)
    ## Variational
    
    %time mean_field = pm.fit(method='advi', callbacks=[CheckParametersConvergence()],n=15000)    
    %time trace = mean_field.sample(draws=5000)

#with model:    
    
    ## For predicting
    #%time f_star = gp.conditional("f_star", data_star.iloc[:,1:3].values)
    #%time f_star = gp.conditional("f_star", small_sample.iloc[:,1:3].values)
    %time f_star = gp.conditional("f_star", datacube_clean[['Longitude','Latitude','Elevation','MeanTemperature']].values)

    ## Full data
    ##%time f_star = gp.conditional("f_star",elev_data.iloc[:,1:3].values)


    
#with model:
    ## sampling predictions posterior predictive checks
    pred_samples = pm.sample_ppc(trace, vars=[f_star], samples=10)

                    


In [None]:
# Transpose the posterior predictive draws of f_star into a DataFrame.
# Presumably one row per prediction point and one column per draw - TODO confirm.
preds = pd.DataFrame(pred_samples['f_star']).transpose()

In [None]:
# Row-wise mean over the posterior sample columns only. The helper columns added to
# `preds` in this notebook are excluded, so re-running this cell (even after 'idx'
# has been added) gives the same result instead of folding them into the mean.
preds['mean_sample'] = preds.drop(['mean_sample', 'idx'], axis=1, errors='ignore').mean(axis=1)

In [None]:
#preds['idx'] = data_star.index.values
# Tag every prediction row with the index of the `datacube_clean` row it was predicted
# for, so that the predictions can be joined back to the full grid.
preds['idx'] = datacube_clean.index.values

In [None]:


#test1 = data_s.merge(preds,how='left',left_index=True,right_on='idx',suffixes=('_obs','_pred'))
# Left-join the predictions onto the full (uncleaned) grid by index: rows of `datacube`
# with no matching 'idx' in `preds` are kept, with missing prediction values.
test1 = datacube.merge(preds,how='left',left_index=True,right_on='idx',suffixes=('_obs','_pred'))
## Only the values of small_sample
#test2 = elev_data.merge(preds,how='inner',left_index=True,right_on='idx',suffixes=('_obs','_pred'))

In [None]:
fig, ax = plt.subplots(figsize=(10, 9))
# Background: mean predicted value at every grid point.
ax.scatter(test1.Longitude, test1.Latitude, c=test1.mean_sample)
# Foreground: observed family counts, drawn larger and in grey scale.
observed_points = ax.scatter(data.Longitude, data.Latitude, c=data.n_Family, cmap=plt.cm.Greys, s=90)
# The colour bar describes the last scatter drawn (the observed counts).
fig.colorbar(observed_points, ax=ax)

# Convert to a raster format.
 ## Motivation
 It's important for visualization and compatibility with GIS software to generate the results in a standard raster format.
*Biospytial* has incorporated tools for reading and converting to the standard raster formats. In this case, GeoTIFF.

In [None]:
## Import raster container
from raster_api.tools import RasterContainer

### The easiest way is to take the metadata (geospatial parameters) from one of the RasterData we used as covariates

In [None]:
# Raster object behind the 'Elevation' layer; used below as the source of metadata.
elv_rast = datadict['Elevation'].rasterdata

In [None]:
# Mean predicted value per row of `test1`, as a flat array.
predicted_data = test1.mean_sample.values

In [None]:
# Wrap the predictions in a RasterContainer that takes its metadata from the elevation raster.
# Assumes the rows of `test1` are in the same order as the elevation raster's pixels - TODO confirm.
ncounts_families = RasterContainer(predicted_data,use_metadata_from=elv_rast)

In [None]:
# Band 1, titled as the log of the family richness.
ncounts_families.display_field(band=1,origin='Lower',title='log(family richness)')

In [None]:
# Band 2, titled as the family richness.
# NOTE(review): only one array was passed to RasterContainer above; confirm where band 2 comes from.
ncounts_families.display_field(band=2,origin='Lower',title='family richness')

### Export to GeoTIFF

In [None]:
# Export the raster under the given name.
# NOTE(review): the name says "meantemps", while the GLM above was fitted on MaxTemperature_mean - confirm.
ncounts_families.exportToGeoTiff('ncount_families_elev_meantemps_2.5')

There are several parameters hidden here:
    1. The size of the predictors grid
    2. The sample size of the training data
    3. The bayesian hyperparameters

## For implementing today
* The modeller function (structure):
    
   * Inputs: 
    
    ** Dataframe for training data
    
    ** numpy n-array or dataframe for predictors
    
    ** Model (specified as Pymc3 model)
    
    * Outputs:
        
        *** The new RasterContainer with the bands as: log, exp and expit (in case binomial)
        
* It seems there needs to be a pre-modeller to select the cells, the geometry, etc.




In [None]:
# Shape of the data array of the first band of the exported raster.
ncounts_families.rasterdata.bands[0].data().shape

## stages
1. Build a function for extracting the predictors given a scale parameter
2. Build a function for the "premodeling"
3. Build a function for bundling everything and return the prediction (needs to accept a model)

> Models to run
Show two maps of taxa. e.g. Agave and Bats for instance

Then also show the matrix distance for trees 
