## Here I show two methods for getting data from the knowledge graph. 
1. The `naive` method which means that at all the data from a graph query (taxonomical query) will be exported first as a complete list and then processed accordingly. 

2. The `lazy-evaluation` method which uses nested iterators from the data retrieval (GraphSelector) to the processing. Only at the end a flattening is done. Which means, it gets the data from the bottom of the nested iterator stack.


In [24]:
%matplotlib inline
import sys
sys.path.append('/apps')
import django
django.setup()
from drivers.tree_builder import TreeNeo
from drivers.graph_models import TreeNode, Order, Family, graph,Kingdom,Occurrence
from drivers.graph_models import Cell,Mex4km, countObjectsOf
import matplotlib.pyplot as plt
## Use the ggplot style
plt.style.use('ggplot')

## Take a subsample of the Cells.

In [25]:
n = graph.data("MATCH (n:mex4km) RETURN Count(n)").pop()['Count(n)']

In [26]:
n = countObjectsOf(Mex4km)
print(n)

279277


### A lot of data we need to use a sampling method.

In [27]:
import numpy as np
np.random.seed(12345)
sample_size = 50
choices = np.random.choice(range(1,n),sample_size,replace='False')

### Query for exporting a selection of cells.

** THis needs to be added in the code as utilities

In [28]:
c = list(choices)
## This will stringify the id list to get the selected cells.
sel = Mex4km.select(graph).where("_.id IN  %s "%str(c))

### Using iterators (imap + graphselector_iterator)

In [29]:
import itertools as it

In [30]:
%time ocs = it.imap(lambda c : c.has_occurrences,sel)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 15 µs


## Getting the first N data sets
1. First Build an iterable slice 


In [31]:
N = sample_size
%time f_10k = it.islice(ocs,N)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 19.8 µs


### Flatten the data
Because are nested iterables and according to the "Zen of Python" 
` Flat is better than nested `
We will *flatten* the list to get the amount of data we want. 
This increases dramatically the efficiency because we don't need to get the all the data in the shape of list and then perform the needed operations.
We can define them abstractly and the only flatten the data whenever is needed.


In [32]:
cosas = it.chain.from_iterable(ocs)

In [33]:
ccc = list(cosas)

In [34]:
ccc

[<Occurrence pk=1511965>,
 <Occurrence pk=849598>,
 <Occurrence pk=710899>,
 <Occurrence pk=3178968>,
 <Occurrence pk=2807291>,
 <Occurrence pk=2395867>,
 <Occurrence pk=1471529>,
 <Occurrence pk=311658>,
 <Occurrence pk=2794415>,
 <Occurrence pk=2391625>,
 <Occurrence pk=3178392>,
 <Occurrence pk=312456>,
 <Occurrence pk=2799065>,
 <Occurrence pk=2384205>,
 <Occurrence pk=2795497>,
 <Occurrence pk=472057>,
 <Occurrence pk=689407>,
 <Occurrence pk=2794414>,
 <Occurrence pk=2808041>,
 <Occurrence pk=3188328>,
 <Occurrence pk=2416561>,
 <Occurrence pk=471833>,
 <Occurrence pk=3216290>,
 <Occurrence pk=2075876>,
 <Occurrence pk=2390662>,
 <Occurrence pk=2402321>,
 <Occurrence pk=1512607>,
 <Occurrence pk=313303>,
 <Occurrence pk=471835>,
 <Occurrence pk=710897>,
 <Occurrence pk=3178394>,
 <Occurrence pk=2147850>,
 <Occurrence pk=2794626>,
 <Occurrence pk=473300>,
 <Occurrence pk=471631>,
 <Occurrence pk=310624>,
 <Occurrence pk=1954371>,
 <Occurrence pk=1572471>,
 <Occurrence pk=1211106>,

### Benchmarking time for retrieval using explicit lists vs lazy-evaluation


In [35]:
%time samples = list(sel)

CPU times: user 300 ms, sys: 12 ms, total: 312 ms
Wall time: 418 ms


In [36]:
%time ocs2 = map(lambda c : list(c.has_occurrences),samples)

CPU times: user 204 ms, sys: 8 ms, total: 212 ms
Wall time: 255 ms


In [37]:
ocs2_l = filter(lambda k : k != [] ,ocs2)

In [38]:
len(ocs2_l)

7

In [39]:
lll = reduce(lambda a,b : a+b,ocs2_l)

In [40]:
lll

[<Occurrence pk=1511965>,
 <Occurrence pk=849598>,
 <Occurrence pk=710899>,
 <Occurrence pk=3178968>,
 <Occurrence pk=2807291>,
 <Occurrence pk=2395867>,
 <Occurrence pk=1471529>,
 <Occurrence pk=311658>,
 <Occurrence pk=2794415>,
 <Occurrence pk=2391625>,
 <Occurrence pk=3178392>,
 <Occurrence pk=312456>,
 <Occurrence pk=2799065>,
 <Occurrence pk=2384205>,
 <Occurrence pk=2795497>,
 <Occurrence pk=472057>,
 <Occurrence pk=689407>,
 <Occurrence pk=2794414>,
 <Occurrence pk=2808041>,
 <Occurrence pk=3188328>,
 <Occurrence pk=2416561>,
 <Occurrence pk=471833>,
 <Occurrence pk=3216290>,
 <Occurrence pk=2075876>,
 <Occurrence pk=2390662>,
 <Occurrence pk=2402321>,
 <Occurrence pk=1512607>,
 <Occurrence pk=313303>,
 <Occurrence pk=471835>,
 <Occurrence pk=710897>,
 <Occurrence pk=3178394>,
 <Occurrence pk=2147850>,
 <Occurrence pk=2794626>,
 <Occurrence pk=473300>,
 <Occurrence pk=471631>,
 <Occurrence pk=310624>,
 <Occurrence pk=1954371>,
 <Occurrence pk=1572471>,
 <Occurrence pk=1211106>,

In [41]:
lll == ccc

True

In [42]:
sel = Mex4km.select(graph).where("_.id IN  %s "%str(c))

In [None]:
def _try_levelnames_extraction(relationship):
    """
    Extracts the end node relationship name.
    for use with map functions.
    """
    try:
        a = relationship.start_node()['levelname']
        return a
    except:
        return None
    
types = map(lambda r : map(lambda t : _try_levelnames_extraction(t),r),available_rels)

In [None]:
types

In [None]:
tt = tb.buildTreeNeo(samples[26])

In [None]:
#For now not run
#big_tree = reduce(lambda a,b : a+b , trees)
import seaborn as sns

In [None]:
t = trees[2]

In [None]:
ll = map(lambda t : t.richness , trees)

In [None]:
sns.distplot(ll)

In [None]:
tl.plotTree(tt)

In [None]:
import traversals.strategies as strg

In [None]:
type(root)

In [None]:
root = t.node

In [None]:
a = strg.getPresencesForNode(root,trees)

In [None]:
data_t = strg.getPresencesForListOfNodes([root],trees)

In [None]:
data_t

# The model

In [None]:
import pymc3 as pm

In [None]:
-

In [None]:
from pymc3 import find_MAP
map_estimate = find_MAP(model=model)
map_estimate

In [None]:
import pandas as pd

In [None]:
mapxy = pd.concat([data_t[['Longitude','Latitude']],pd.DataFrame({'map': map_estimate['latent_field']})],axis=1)

In [None]:
gmapxy = tools.toGeoDataFrame(mapxy,xcoord_name='Longitude',ycoord_name='Latitude')

In [None]:
fig, ax = plt.subplots(figsize=(14, 9));
gmapxy.plot(ax=ax,column='map')

## Prediction
The conditional method creates the conditional, or predictive, distribution over the latent function at arbitrary x∗x∗ input points, f(x∗)f(x∗). To construct the conditional distribution we write:

In [None]:
minx = min(data_t.Longitude)
maxx = max(data_t.Longitude)
miny = min(data_t.Latitude)
maxy = max(data_t.Latitude)

In [None]:
from external_plugins.spystats.spystats import tools

In [None]:
grid = tools.createGrid(grid_sizex=10,grid_sizey=10,minx=minx,miny=miny,maxx=maxx,maxy=maxy)

In [None]:
gp.predict(grid[['Lon','Lat']])

In [None]:
%time f_star = gp.conditional("f_star", X=grid[['Lon','Lat']])

In [None]:
getdata = lambda tree : tree.associatedData.getEnvironmentalVariablesCells()

In [None]:
ts[1].associatedData.getEnvironmentalVariablesCells()

In [None]:
list(choices)

In [None]:
n