# Model template using multiple polygons
This is an exercise to test the computing time for fitting and predicting spatial models using an assortment of different polygons.

This can work as a general template for other models and data pipelines.

In [1]:
%matplotlib inline
# Put the project code under /apps on the import path and initialise Django
# before any of the project's Django-dependent modules are imported below.
import sys
sys.path.append('/apps')
import django
django.setup()

#import traversals.strategies as st
#from os import walk
import matplotlib.pyplot as plt
import pandas as pd
import itertools as it
import numpy as np
import pymc3 as pm

## Use the ggplot style
plt.style.use('ggplot')

  from ._conv import register_converters as _register_converters


In [2]:
from external_plugins.spystats.spystats import utilities as ut

In [3]:
## Presence data for the Burseras
bursera_path = '/outputs/presence_only_models/data/burseras'
bursera_dataset = ut.loadDataset(bursera_path)

## Training data
train_path = '/outputs/presence_only_models/data/root'
train_dataset = ut.loadDataset(train_path)
## Predictors
pred_path = '/outputs/presence_only_models/predictors/datasetp2'
pred_dataset = ut.loadDataset(pred_path)
### PATCH, the thing is taking backwards the order of the lists of files, because of the name
#pred_dataset.reverse()
# Materialise the result as a list: later cells index it
# (prediction_dataset_dic[i]) and call len() on it, which a lazy map object
# (Python 3) does not support. On Python 2 this is the same list as before.
prediction_dataset_dic = list(map(ut.preparePredictors, pred_dataset))


INFO:external_plugins.spystats.spystats.utilities:Reading file /outputs/presence_only_models/data/burseras
INFO:external_plugins.spystats.spystats.utilities:Reading file /outputs/presence_only_models/data/root
INFO:external_plugins.spystats.spystats.utilities:Reading file /outputs/presence_only_models/predictors/datasetp2


In [4]:
## Recreating the polygons
## This should be included in the data structure
from django.contrib.gis.geos import Point, Polygon
xcoord = -99.76
ycoord = 17.55
p = Point(xcoord,ycoord,srid=4326)
# Ten buffer radii, evenly spaced between 0.08 and 1.
radii = np.linspace(0.08,1, 10)
# One buffer polygon around the point per radius. Kept as a real list because
# later cells index it (polys[i]); a lazy map object (Python 3) cannot be
# indexed.
polys = list(map(p.buffer, radii))


## Obtaining the predictors
In this case we load all the available variables, so that we can start working with the full set.

In [5]:
from raster_api.tools import RasterData
from raster_api.models import raster_models_dic as models


### OK, let's start the modelling here

In [6]:
#pm.traceplot(trace)
from raster_api.tools import RasterContainer
from raster_api.models import ETOPO1,MeanTemperature
from raster_api.tools import RasterData
from sketches.models import Country
from mesh.models import MexMesh

In [7]:
# Index of the polygon / dataset triple used for this single run.
i = 4

# Training table, prepared predictors and polygon that share this index.
datatrain = train_dataset[i]
datapred = prediction_dataset_dic[i]
polygon = polys[i]

# Dimensions handed to elevation.resize below.
width, height = 100, 100

# Elevation raster (ETOPO1) bounded by the selected polygon.
elevation = RasterData(rastermodelinstance=ETOPO1, border=polygon)
### Remember that the geoparams are not necessarily the same as the template.
## This is why it's important to import this object as well
elevation.resize(width, height)


<Raster object at 0x7f8fedb9cbe0>

In [8]:
# Using the patsy library to extract the data

In [14]:
## Cast the categorical columns to the pandas 'category' dtype,
## in the training table and in both predictor tables.
datatrain['inegiv5name'] = datatrain['inegiv5name'].astype('category')
for _key in ('full', 'clean'):
    datapred[_key]['name'] = datapred[_key]['name'].astype('category')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [54]:
# Build the response/design matrices for the training data from a formula.
# NOTE(review): `dmatrices` is not imported in any visible cell; presumably it
# comes from patsy (`from patsy import dmatrices`) - confirm and add the import.
#TM = dmatrices('Burseraceae ~ Longitude + Latitude + DistanceToRoadMex_mean + WorldPopLatam2010_mean + inegiv5name',datatrain,return_type="dataframe")
TM = dmatrices('Burseraceae ~ Longitude + Latitude + DistanceToRoadMex_mean + WorldPopLatam2010_mean + inegiv5name',datatrain)

In [55]:
# Build the design matrix for the 'clean' prediction table from a formula.
# NOTE(review): `dmatrix` is not imported in any visible cell; presumably it
# comes from patsy (`from patsy import dmatrix`) - confirm and add the import.
#PM = dmatrix('Longitude + Latitude + Q("Dist.to.road_m") + Population_m + name',datapred['clean'],return_type='dataframe')
PM = dmatrix('Longitude + Latitude + Q("Dist.to.road_m") + Population_m + name',datapred['clean'])

<read-write buffer for 0x7f8fea89faa0, size 2655000, offset 0 at 0x7f8fea8afc70>

In [None]:
## This is for calculating the signal
from pymc3.variational.callbacks import CheckParametersConvergence
def FitMyModel(Y,train_subset,predictordf):
    """Fit the Bernoulli presence model with ADVI and sample the GP at new points.

    Parameters
    ----------
    Y :
        Currently unused; the observed response is read from
        ``train_subset.Burseraceae``. Kept so existing calls keep working.
    train_subset :
        Training table. It is iterated for column names, indexed by a list of
        columns, and must expose ``.values`` and a ``Burseraceae`` column.
        Columns whose name starts with ``'dum_'`` feed the linear predictor;
        the kernel acts on the first two columns of ``.values``
        (``active_dims=[0,1]``).
    predictordf :
        Table of prediction locations; only ``predictordf.values`` is used.

    Returns
    -------
    tuple
        ``(pred_samples, trace)``: the samples of ``f_star`` returned by
        ``pm.sample_ppc`` and the 5000-draw trace sampled from the ADVI fit.
    """
    with pm.Model() as model:
        ## Building the kernel
        tau = pm.HalfNormal('tau',sd=10)
        sigma = pm.HalfNormal('sigma',sd=10)
        phi = pm.Uniform('phi',0,15)
        Tau = pm.gp.cov.Constant(tau)
        cov = (sigma * pm.gp.cov.Matern32(2,phi,active_dims=[0,1])) + Tau

        ## Parameters for linear predictor
        # Use the table passed in as `train_subset`. The previous code read a
        # global `traindf` here, silently ignoring the argument.
        dummies = [col for col in train_subset if str(col).startswith('dum_')]
        dumscols = train_subset[dummies]
        dumshape = dumscols.shape

        # NOTE(review): `b` takes the full (rows, columns) shape of the dummy
        # block, so `k` is a rows x rows matrix rather than one value per
        # observation - confirm whether one coefficient per column was meant.
        b = pm.Normal('b',mu=0,sd=1.5,shape=dumshape)
        k = pm.math.matrix_dot(dumscols,b.transpose())

        ## The latent function
        gp = pm.gp.Latent(cov_func=cov)
        # NOTE(review): `f` is not used in the likelihood below (logit_p=k),
        # so the observations do not inform the latent field - confirm whether
        # `k + f` was intended.
        f = gp.prior("latent_field", X=train_subset.values,reparameterize=False)

        yy = pm.Bernoulli("yy",logit_p=k,observed=train_subset.Burseraceae.values)

        # Fit by ADVI, then draw a trace from the fitted approximation.
        approx = pm.fit(method='advi', callbacks=[CheckParametersConvergence()],n=15000)
        trace = approx.sample(draws=5000)

        # Latent field conditioned on the prediction locations.
        f_star = gp.conditional("f_star", predictordf.values)

        pred_samples = pm.sample_ppc(trace, vars=[f_star], samples=100)
        return pred_samples,trace
   

In [None]:
# Time a single fit and keep both returned values.
# NOTE(review): `Y`, `traindf` and `predictordf` are not defined in any visible
# cell; they must already exist in the kernel - confirm where they come from.
%time pred_samples,trace = FitMyModel(Y,traindf,predictordf)

In [None]:
for i in range(len(prediction_dataset_dic)):
    datatrain = train_dataset[i]
    #Y = datatrain.Burseraceae
    Y = datatrain.Burseraceae
    datapred = prediction_dataset_dic[i]
    polygon = polys[i]

    elevation = RasterData(rastermodelinstance=ETOPO1,border=polygon)
    height = 100
    width = 100
    ### Remember that the geoparams are not necesarily the same as the template.
    ## This is why it's important to import this object as well
    elevation.resize(width,height)
    #elevation.display_field(origin='Lower',interpolation='None')
    %time pred_samples = FitMyModel(Y,datatrain,datapred)
    ut.plotThings(pred_samples,datapred,elevation)

In [None]:
# Plot the most recent prediction samples with the matching predictors and elevation raster.
ut.plotThings(pred_samples,datapred,elevation)

In [None]:
# Trace plot of the most recent `trace`.
pm.traceplot(trace)

In [None]:
from scipy.special import expit
fig, ax = plt.subplots(figsize=(10, 9));

# NOTE(review): `ql_presences_of_something` is not defined in any visible cell;
# it must already exist in the kernel.
#ncounts_families.display_field(band=2,origin='Lower',title='family richness')
# NOTE(review): the result of this expit call is discarded, and it reads
# bands[0] while the image below shows bands[1] - confirm what was intended.
expit(ql_presences_of_something.rasterdata.bands[0].data())
# origin must be the lower-case 'lower': current matplotlib rejects 'Lower'.
plt.imshow(ql_presences_of_something.rasterdata.bands[1].data(),origin='lower',cmap=plt.cm.Greens)
plt.title("Presences quantile 0.025")
plt.colorbar(orientation='horizontal')

In [None]:
from scipy.special import expit
fig, ax = plt.subplots(figsize=(10, 9));

# NOTE(review): `mean_presences_of_something` is not defined in any visible
# cell; it must already exist in the kernel.
#ncounts_families.display_field(band=2,origin='Lower',title='family richness')
# origin must be the lower-case 'lower': current matplotlib rejects 'Lower'.
plt.imshow(mean_presences_of_something.rasterdata.bands[1].data(),origin='lower',cmap=plt.cm.Greens,clim=(0,1))

plt.colorbar(orientation='horizontal')
plt.title('Probability of presences (mean surface)' )

In [None]:
from scipy.special import expit
fig, ax = plt.subplots(figsize=(10, 9));

# NOTE(review): `qh_presences_of_something` is not defined in any visible cell;
# it must already exist in the kernel.
#ncounts_families.display_field(band=2,origin='Lower',title='family richness')
# origin must be the lower-case 'lower': current matplotlib rejects 'Lower'.
plt.imshow(qh_presences_of_something.rasterdata.bands[1].data(),origin='lower',cmap=plt.cm.Greens,clim=(0,1))
plt.colorbar(orientation='horizontal')
plt.title("Presences quantile 0.975")

In [None]:
fig, ax = plt.subplots(figsize=(10, 9));

# NOTE(review): `prob5` is not defined in any visible cell; it must already
# exist in the kernel.
#ncounts_families.display_field(band=2,origin='Lower',title='family richness')
# origin must be the lower-case 'lower': current matplotlib rejects 'Lower'.
plt.imshow(prob5.rasterdata.bands[1].data(),origin='lower',cmap=plt.cm.RdBu,clim=(0,1))
plt.colorbar(orientation='horizontal')
plt.title("Probability of probability more than 0.5 of presence of Burseracea family")

## Export results to GeoTIFF!

In [None]:
# Suffix shared by the export cells in this section.
name = "sample_root"
# Export the lower-quantile surface.
ql_presences_of_something.exportToGeoTiff("ql_{}".format(name))


In [None]:
# Export the mean surface.
mean_presences_of_something.exportToGeoTiff("mean_{}".format(name))

In [None]:
# Export the upper-quantile surface.
qh_presences_of_something.exportToGeoTiff("qh_{}".format(name))

In [None]:
# Export the exceedance-probability surface.
# NOTE(review): unlike the other exports ("ql_", "mean_", "qh_") this prefix
# has no trailing underscore - confirm whether "prob05_" was intended.
prob5.exportToGeoTiff("prob05{}".format(name))