# This notebook demonstrates the focal module used to interpolate a Tree structure

## Import modules and libraries

In [1]:
from drivers.neo4j_reader import TreeNeo
from mesh.models import MexMesh
from gbif.taxonomy import Occurrence, Taxonomy, GriddedTaxonomy
from drivers.neo4j_reader import Cell,extractOccurrencesFromTaxonomies
from py2neo import Graph
import pandas as pd
import matplotlib.pyplot as plt
from drivers.neo4j_reader import RasterCollection

## Initialize geographic context and graph object

In [2]:
# Open a py2neo Graph handle; no connection arguments are given, so py2neo's defaults apply.
g = Graph()

# Study region as a WKT polygon: a closed ring of five "x y" pairs spanning
# -109..-106 on the first axis and 27..30 on the second
# (presumably lon/lat degrees — TODO confirm the SRID expected by the models).
from django.contrib.gis.geos import GEOSGeometry
polystr = "POLYGON((-109 27,-106 27,-106 30,-109 30,-109 27))"
polygon = GEOSGeometry(polystr)


In [3]:
# MexMesh cells that intersect the study polygon.
mexgrid = MexMesh.objects.filter(cell__intersects=polygon)

# The "biosphere" is every stored occurrence; keep only those intersecting the polygon.
biosphere = Occurrence.objects.all()
subbiosphere = biosphere.filter(geom__intersects=polygon)

# Bind the occurrences to the grid. The mesh is filtered once more, this time
# with the raw WKT string, exactly as before.
# generate_tree_now=False presumably defers tree construction — TODO confirm.
ggg = GriddedTaxonomy(
    subbiosphere,
    mexgrid.filter(cell__intersects=polystr),
    generate_tree_now=False,
    use_id_as_name=False,
)


Exception django.contrib.gis.gdal.error.GDALException: GDALException('Invalid pointer returned from "GDALClose"',) in 

In [4]:
# Order the taxonomies in place, ascending by how many occurrences each one holds
ggg.taxonomies.sort(key=lambda taxonomy: len(taxonomy.occurrences))


In [5]:
# Build the tree for a single taxonomy: position 3400 of the list that was
# sorted by occurrence count in the previous cell.
# NOTE(review): 3400 is a hard-coded index — confirm it is valid for the chosen region.
trees_f = TreeNeo(extractOccurrencesFromTaxonomies([ggg.taxonomies[3400]]))

# `central` is a second name for the same tree object
central = trees_f

## Expand the central cell to its neighbouring cells (this takes time; the current implementation is slow)

In [6]:
# Presumably collects the trees of the cells surrounding `central` — TODO confirm.
# NOTE(review): the meaning of the argument 4 is not visible here — check expandNeighbouringTrees.
neighbours = central.expandNeighbouringTrees(4)

In [7]:
## We need to wipe repeated trees
# De-duplication goes through set(), so it relies on the trees' __hash__/__eq__
# and the order of the resulting list is arbitrary.
neighbours = list(set(neighbours))

In [8]:
## Merge the trees into a bigger tree
# Left fold with `+`: ((t0 + t1) + t2) + ... ; raises TypeError if `neighbours` is empty.
# `reduce` is a builtin only on Python 2.
bigt = reduce(lambda a, b : a + b , neighbours)  

In [9]:
## Build the catalogue of names at the taxonomic level of interest.
## Kingdom is used here; the same steps apply to any other level (e.g. Family).
## The names are returned in sorted (string) order.
catalog = sorted(kingdom.name for kingdom in bigt.kingdoms)


In [10]:
bigt.children

[<TreeNode | Kingdom: Animalia - n.count : 512- >,
 <TreeNode | Kingdom: Fungi - n.count : 248- >,
 <TreeNode | Kingdom: Plantae - n.count : 1301- >]

In [11]:
catalog

[u'Animalia', u'Fungi', u'Plantae']

## We now have the catalogue for the upper scale (the big region). It will be used to generate the presence/absence tables


In [12]:
neighbours

[<LocalTree Of Life | Root: LUCA - n.count : 4- >,
 <LocalTree Of Life | Root: LUCA - n.count : 180- >,
 <LocalTree Of Life | Root: LUCA - n.count : 266- >,
 <LocalTree Of Life | Root: LUCA - n.count : 173- >,
 <LocalTree Of Life | Root: LUCA - n.count : 38- >,
 <LocalTree Of Life | Root: LUCA - n.count : 34- >,
 <LocalTree Of Life | Root: LUCA - n.count : 14- >,
 <LocalTree Of Life | Root: LUCA - n.count : 34- >,
 <LocalTree Of Life | Root: LUCA - n.count : 14- >,
 <LocalTree Of Life | Root: LUCA - n.count : 2- >,
 <LocalTree Of Life | Root: LUCA - n.count : 4- >,
 <LocalTree Of Life | Root: LUCA - n.count : 81- >,
 <LocalTree Of Life | Root: LUCA - n.count : 596- >,
 <LocalTree Of Life | Root: LUCA - n.count : 36- >,
 <LocalTree Of Life | Root: LUCA - n.count : 64- >,
 <LocalTree Of Life | Root: LUCA - n.count : 4- >,
 <LocalTree Of Life | Root: LUCA - n.count : 177- >,
 <LocalTree Of Life | Root: LUCA - n.count : 10- >,
 <LocalTree Of Life | Root: LUCA - n.count : 330- >]

In [13]:
## Let's see what's in the first member:
# (the list order is arbitrary after the set() de-duplication, so "first" is not a fixed cell)
n0 = neighbours[0]

In [14]:
# Presumably a presence/absence table of the catalog names for n0 (it is displayed later as 0/1 values).
# NOTE(review): the meaning of the positional argument 1 is not visible here — confirm.
a = n0.pseudoPresenceAbsence(catalog,1,selected_field='name')

In [15]:
# Second tree in the list, used for a side-by-side comparison with n0.
n1 = neighbours[1]

In [16]:
# Same presence/absence query as for n0, on the second tree.
b = n1.pseudoPresenceAbsence(catalog,1,selected_field='name')

In [17]:
# Side-by-side table of the two columns (exploratory only: c is rebuilt from `a` in the next cell).
c = pd.concat([a,b],axis=1)

In [18]:
# Build the presence/absence table: one column per neighbouring tree, labelled
# with the id of the tree's first exact cell.
#
# The per-neighbour frames are collected in a list and concatenated once,
# instead of calling pd.concat inside the loop (which re-copies the growing
# table on every iteration). The unused loop counter and the `names=` argument
# (which has no effect without `keys`) are gone.
#
# NOTE: the seed frame `a` is kept as the first column (label 0) because a
# later cell drops that column explicitly; it repeats the data of neighbours[0].
frames = [a]
for neighbour in neighbours:
    presence = neighbour.pseudoPresenceAbsence(catalog, 1, selected_field='name')
    presence.columns = [neighbour.getExactCells()[0].id]
    frames.append(presence)
c = pd.concat(frames, axis=1)

In [19]:
c

Unnamed: 0,0,195326,196487,195713,196488,196103,195325,195714,195717,196483,196100,195715,195716,196871,196102,197249,196484,196486,196485,196870
Animalia,0,0,1,0,0,1,0,0,0,1,0,0,1,1,0,0,0,0,0,0
Plantae,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1
Fungi,1,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [73]:
# Preview the table without the placeholder column labelled 0 (the result is not stored).
# `axis` is passed by keyword: positional `axis` was deprecated in pandas 1.3 and removed in 2.0.
c.drop(0, axis=1)

Unnamed: 0,195326,196487,195713,196488,196103,195325,195714,195717,196483,196100,195715,195716,196871,196102,197249,196484,196486,196485,196870
Animalia,0,1,0,0,1,0,0,0,1,0,0,1,1,0,0,0,0,0,0
Plantae,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1
Fungi,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [76]:
# Keep only the per-cell columns. Column 0 is the seed frame `a`, which repeats
# the first neighbour; the drop shown in the previous cell was only displayed,
# never stored, so it has to be applied here before the table is exported.
cooccurrences = c.drop(0, axis=1)

In [77]:
central.pseudoPresenceAbsence(catalog, 1, selected_field='name')

Unnamed: 0,0
Animalia,1
Plantae,0
Fungi,0


In [78]:
catalog

[u'Animalia', u'Fungi', u'Plantae']

## Now, let's calculate the environment

In [79]:
# One entry of environmental variables per neighbouring tree, in list order
environments = [
    tree.associatedData.getEnvironmentalVariablesCells()
    for tree in neighbours
]

In [80]:
# Cell id (as text) for each neighbouring tree, in the same order as `environments`.
# Index [0] is used, as in the co-occurrence loop, instead of .pop(): pop() removes
# an element from the list it is called on and returns the LAST one, which could
# disagree with the co-occurrence column labels if a tree reports more than one cell.
# NOTE(review): whether getExactCells() returns a fresh list each call is not visible here.
cell_ids = [str(tree.getExactCells()[0].id) for tree in neighbours]

In [81]:
# Tabulate the collected entries: one row per neighbour
envs = pd.DataFrame(environments)

In [82]:
# Prepend the cell ids as a 'neighbour_id' column (rows are aligned on the default integer index).
# NOTE: this rebinds `envs`, so re-running the cell adds another 'neighbour_id' column.
envs = pd.concat([pd.DataFrame({'neighbour_id':cell_ids}),envs],axis=1)

In [83]:
# Display the table indexed by cell id; the result is not assigned, so `envs` itself keeps its integer index.
envs.set_index('neighbour_id')

Unnamed: 0_level_0,MaxTemperature_mean,MaxTemperature_std,MeanTemperature_mean,MeanTemperature_std,MinTemperature_mean,MinTemperature_std,Precipitation_mean,Precipitation_std,SolarRadiation_mean,SolarRadiation_std,Vapor_mean,Vapor_std,WindSpeed_mean,WindSpeed_std
neighbour_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
195326,24.257407,0.254007,14.802778,0.341812,5.349074,0.447316,77.569444,2.042493,18575.19213,31.49698,0.874861,0.022549,2.741204,0.060159
196487,22.789352,0.889141,13.497222,0.770713,4.199769,0.677539,83.803241,4.19477,18497.122685,63.871331,0.797477,0.052171,2.936806,0.157682
195713,24.893056,0.203955,15.727778,0.256806,6.562222,0.323763,76.925,1.63111,18609.458333,27.278082,0.936,0.018719,2.611111,0.04362
196488,22.472685,0.492145,12.953009,0.420161,3.434954,0.367104,84.037037,2.468258,18454.939815,44.599961,0.772731,0.030286,3.003472,0.098579
196103,23.60625,0.568028,14.141435,0.564612,4.675463,0.588116,80.604167,3.335017,18481.791667,53.350899,0.829028,0.038374,2.846991,0.109634
195325,24.81412,0.190622,15.529167,0.250565,6.244907,0.317938,76.064815,1.576403,18583.291667,28.489497,0.913009,0.022666,2.658796,0.057749
195714,24.558102,0.311941,15.381713,0.332983,6.199768,0.357752,77.594907,2.319583,18590.407407,35.581397,0.912616,0.023828,2.656019,0.060069
195717,23.954861,0.295559,14.166667,0.280878,4.380787,0.288897,80.085648,1.677545,18505.099537,34.470394,0.832778,0.021545,2.8375,0.068161
196483,24.189815,0.746059,15.468287,0.943928,6.743287,1.150485,83.210648,4.091704,18527.546296,61.1992,0.883403,0.054675,2.725926,0.133956
196100,22.66088,0.664006,13.432176,0.553997,4.206019,0.477548,86.162037,2.840747,18532.081019,53.903177,0.793542,0.041881,2.955787,0.127942


In [89]:
from sklearn import linear_model

In [90]:
# NOTE(review): this cell fails — `environments` is a plain list (see the TypeError below),
# so multi-axis indexing is not available. The intended selection is unclear; fix or remove.
environments[:,2,2]

TypeError: list indices must be integers, not tuple

## Let's save it into a CSV


In [86]:
# Write the environmental table (integer index plus the 'neighbour_id' column) to disk.
envs.to_csv('environmental_neighbourstest1.csv')

In [88]:
# Write the presence/absence table, transposed so that each row is a cell and each column a catalog name.
cooccurrences.transpose().to_csv('environmental_kingdoms_test1.csv')

In [79]:
# NOTE(review): scratch arithmetic; what 0.0061 and 108117 stand for is not documented — confirm or remove.
0.0061 * 108117

659.5137000000001

In [80]:
# NOTE(review): scratch arithmetic; 659 matches the truncated product from the previous cell (659.51...),
# the meaning of 854.17 is not documented — confirm or remove.
854.17 / 659


1.2961608497723824

In [None]:
En este trabajo describo un método para modelar distribuciones geográficas de grupos de organismos. Utilizo una base de datos global de ocurrencias de especies y, con base en la taxonomía de cada una, construyo un árbol (gráfica acíclica). La gráfica resultante tiene hojas (nodos con grado = 1) con atributos espaciales y temporales. Esto es, cada ocurrencia tiene un lugar en el espacio-tiempo. De aquí se desprenden varias preguntas acerca de las asociaciones (correlaciones) de distintas especies, familias, clases u otra jerarquía taxonómica en el espacio (tiempo). Además de la información de correlación entre grupos taxonómicos, hay una componente ambiental, de manera que el modelo utiliza la información ambiental, las co-ocurrencias de otros grupos y una regularización gaussiana para minimizar el ajuste.

El método que presento consta de dos partes. Una teórica, donde propongo un modelo de regresión múltiple mixto generalizado (LME); y una de implementación, donde describo las herramientas de grandes datos utilizadas para estructurar explícitamente la base de datos.

Debido a esto, el proyecto puede suscribirse dentro del rubro "Medio Ambiente" o "Tecnologías de la información".

In [None]:
I'm proposing a method for modeling the geographic distributions of organisms based on their environmental and ecological niche. I use historical records at a global scale (GBIF), accessed through a RESTful system for filtering, querying and performing spatial processing.
A global tree of life is built using the taxonomic relationships of each occurrence, i.e. each occurrence has a species name, each species a genus, and so on.
Each occurrence (a realisation of a random variable) has spatial and temporal components. Using these location attributes, it's possible to bind each occurrence to other environmental data (e.g. temperature, precipitation, solar radiation, wind speed, etc.).
The global tree of life is bound to a lattice that spans the entire planet.
From here it's possible to derive interesting questions regarding stationary distributions, correlations of taxonomic groups across space (or time), etc.

The model is defined as a Generalized Linear Mixed Model with random effects that models a joint probability distribution of the environmental covariates, the co-occurring groups and a regularizing Gaussian kernel to account for the spatial variation.

The talk I'm proposing will be divided into two parts: i) a theoretical part, where I'll define the model, and ii) an implementation part, where I'll explain the big data framework and tools I've been developing.

The talk is aimed at a general audience. I'm looking forward to building collaborations.