Mahalanobis covariance working
apatil committed Jun 16, 2009
1 parent 19c3e22 commit 53c9d59
Showing 6 changed files with 204 additions and 0 deletions.
69 changes: 69 additions & 0 deletions anopheles/mahalanobis.f
@@ -0,0 +1,69 @@
      SUBROUTINE mahal(c,x,y,symm,a,l,s,nx,ny,nd,cmin,cmax)
cf2py intent(hide) nx,ny,nd
cf2py intent(inplace) c
cf2py threadsafe
      DOUBLE PRECISION x(nx,nd), y(ny,nd), s(nd,nd), l(nd)
      DOUBLE PRECISION c(nx,ny), dev(nd), this, a, tdev(nd)
      INTEGER i,j,k,m,nx,ny,nd,cmin,cmax
      LOGICAL symm

!     The matrix-vector products below are written out by hand rather
!     than calling BLAS DGEMV, so each thread stays single-threaded.

      do j=cmin+1,cmax
        if (symm) then

          c(j,j)=a*a

          do i=1,j-1
            do k=1,nd
              dev(k) = x(i,k) - y(j,k)
            end do

!           tdev = transpose(s)*dev
            do k=1,nd
              tdev(k) = 0.0D0
              do m=1,nd
                tdev(k)=tdev(k)+s(m,k)*dev(m)
              end do
            end do

            this = 0.0D0
            do k=1,nd
              this = this - tdev(k)*tdev(k)/l(k)
            end do

            c(i,j) = dexp(this)*a*a

          end do

        else

          do i=1,nx

            do k=1,nd
              dev(k) = x(i,k) - y(j,k)
            end do

!           tdev = transpose(s)*dev
            do k=1,nd
              tdev(k) = 0.0D0
              do m=1,nd
                tdev(k)=tdev(k)+s(m,k)*dev(m)
              end do
            end do

            this = 0.0D0
            do k=1,nd
              this = this - tdev(k)*tdev(k)/l(k)
            end do

            c(i,j) = dexp(this)*a*a

          end do
        end if
      end do

      RETURN
      END
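For reference, this is the quantity the kernel above computes for a single pair of points, sketched in NumPy (mahal_one_pair is a name introduced here purely for illustration; the argument names mirror the Fortran arguments):

import numpy as np

def mahal_one_pair(x_i, y_j, a, l, s):
    # c(i,j) = a**2 * exp(-dev' * s * diag(1/l) * s' * dev)
    dev = x_i - y_j                 # dev(k) = x(i,k) - y(j,k)
    tdev = np.dot(s.T, dev)         # tdev(k) = sum over m of s(m,k)*dev(m)
    return a**2 * np.exp(-np.sum(tdev**2 / l))

# With s the identity and l all ones this reduces to a squared exponential in
# Euclidean distance, e.g.
# mahal_one_pair(np.zeros(3), np.ones(3), 1., np.ones(3), np.eye(3)) equals exp(-3).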
76 changes: 76 additions & 0 deletions anopheles/mahalanobis_covariance.py
@@ -0,0 +1,76 @@
import numpy as np
import pymc as pm
from mahalanobis import mahal

def mahalanobis_covariance(x,y,amp,val,vec,symm=None):
"""
Spatiotemporal covariance function. Converts x and y
to a matrix of covariances. x and y are assumed to have
columns (long,lat,t). Parameters are:
- t_gam_fun: A function returning a matrix of variogram values.
Inputs will be the 't' columns of x and y, as well as kwds.
- amp: The MS amplitude of realizations.
- scale: Scales distance.
- inc, ecc: Anisotropy parameters.
- n_threads: Maximum number of threads available to function.
- symm: Flag indicating whether matrix will be symmetric (optional).
- kwds: Passed to t_gam_fun.
Output value should never drop below -1. This happens when:
-1 > -sf*c+k
"""
    # Allocate
    nx = x.shape[0]
    ny = y.shape[0]
    ndim = x.shape[1]

    C = np.asmatrix(np.empty((nx,ny),order='F'))

    # Figure out symmetry and threading
    if symm is None:
        symm = (x is y)

    n_threads = min(pm.get_threadpool_size(), nx*ny / 2000)

    if n_threads > 1:
        if not symm:
            bounds = np.array(np.linspace(0,ny,n_threads+1),dtype=int)
        else:
            # In the symmetric (triangular) case the work in column j grows
            # like j, so square-root spacing gives each thread roughly equal work.
            bounds = np.array(np.sqrt(np.linspace(0,ny*ny,n_threads+1)),dtype=int)

    # Target function for threads
    def targ(C,x,y,symm,amp,val,vec,cmin,cmax):
        mahal(C,x,y,symm,amp,val,vec,cmin,cmax)

    # Dispatch threads
    if n_threads <= 1:
        targ(C,x,y,symm,amp,val,vec,0,C.shape[1])
    else:
        thread_args = [(C,x,y,symm,amp,val,vec,bounds[i],bounds[i+1]) for i in xrange(n_threads)]
        pm.map_noreturn(targ, thread_args)

    if symm:
        pm.gp.symmetrize(C)

    return C

if __name__ == '__main__':
    nd = 200
    x = np.linspace(-1,1,1001)
    xo = np.vstack((x,)*nd).T
    # x,y = np.meshgrid(x,x)
    # z = np.empty((x.shape[0],x.shape[0],2))
    # z[:,:,0] = x
    # z[:,:,1] = y

    C = pm.gp.Covariance(mahalanobis_covariance, amp=1, val=np.ones(nd), vec=np.array(np.eye(nd),order='F')*2.)
    C(xo,xo)
    # A = np.asarray(C(np.atleast_2d(np.zeros(nd)),z)).reshape((x.shape[0],x.shape[0]))
    from pylab import *
    close('all')
    # imshow(A)
    # title('Autocov')
    figure()
    imshow(C(xo,xo).view(np.ndarray))
    title('Evaluation')
53 changes: 53 additions & 0 deletions reading-notes/brt.tex
@@ -0,0 +1,53 @@
%!TEX root = reading-notes.tex

\section{Boosted Regression Trees}
\label{sec:brt}

Boosted regression trees, like MaxEnt (section \ref{sec:maxent}), come from machine learning. In contrast to MaxEnt, which has been developed almost exclusively by three people (in the context of species mapping), there are many people working energetically on boosted regression tree algorithms that are directly applicable to the mapping situation. The BRT algorithm makes much more sense than the MaxEnt algorithm. In further contrast to MaxEnt, boosting has been given a clear likelihood-based analysis by Friedman et al. \cite{Friedman:2000p11413}, making it much easier to see how to import its ideas into the Bayesian context. For that reason there is not much point in writing extensive additional notes; I'll just briefly summarize their take-home messages here.

\subsection{Likelihood interpretation}
\label{sec:likelihood-interp}
Friedman et al.\ demonstrated that the boosting algorithm approximately finds a maximum likelihood estimate for a certain generalized additive model on the logistic scale. Specifically, it fits the model
\begin{eqnarray*}
y_i|p(x_i) \stackrel{\textup{\tiny ind}}{\sim} 2\ \textup{Bernoulli}(p(x_i))-1 \\
p(x_i) = \textup{logit}^{-1}\left(\sum_{m=1}^M f_m(x_i)\right),
\end{eqnarray*}
but rather than maximizing the likelihood
\begin{eqnarray*}
\prod_i p(x_i)^{1_{y_i=1}}(1-p(x_i))^{1_{y_i=-1}}
\end{eqnarray*}
directly, it minimizes the exponential criterion
\begin{eqnarray*}
\sum_ie^{-y_i\sum_{m=1}^Mf_m(x_i)/2}.
\end{eqnarray*}
Writing $F=\sum_{m=1}^Mf_m$ and letting $n^+(\xi)$ and $n^-(\xi)$ count the $y$ values at covariate value $\xi$ that are positive and negative respectively, this criterion can be grouped as $\sum_\xi\left[n^+(\xi)e^{-F(\xi)/2}+n^-(\xi)e^{F(\xi)/2}\right]$, where $\xi$ ranges over the unique values of $x$. Oddly enough, the optimizers of the two objective functions coincide:
\begin{eqnarray*}
\sum_{m=1}^M\hat f_m(x) = \log \frac{n^+(x)}{n^-(x)}
\end{eqnarray*}
Since the $f_m$'s are not chosen to be flexible enough to interpolate the data, this optimum is usually not attained exactly.
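A quick pointwise check of the last claim: minimizing the grouped form of the exponential criterion separately over each value $F(\xi)$ gives
\begin{eqnarray*}
\frac{\partial}{\partial F(\xi)}\left[n^+(\xi)e^{-F(\xi)/2}+n^-(\xi)e^{F(\xi)/2}\right]=0
\quad\Longrightarrow\quad
F(\xi)=\log\frac{n^+(\xi)}{n^-(\xi)},
\end{eqnarray*}
and the Bernoulli likelihood is maximized by the empirical frequencies $p(\xi)=n^+(\xi)/\left(n^+(\xi)+n^-(\xi)\right)$, which correspond to the same $F=\textup{logit}(p)$.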

Boosting minimizes this criterion using an algorithm which, as one discussant notes, most statisticians would see as a capitulation: the component functions are fitted one at a time in a greedy fashion, rather than jointly. Such algorithms are fast, but usually stop well short of the optimum.
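Schematically, the stage-$m$ update keeps the earlier terms fixed and solves
\begin{eqnarray*}
\hat f_m = \mathop{\textup{argmin}}_{f}\ \sum_i e^{-y_i\left(F_{m-1}(x_i)+f(x_i)\right)/2},
\qquad F_m = F_{m-1}+\hat f_m,
\end{eqnarray*}
with $f$ ranging over single regression trees; the previously fitted $f_1,\ldots,f_{m-1}$ are never revisited.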

\subsection{Resistance to overfitting}
\label{sec:res-to-overfitting}
The discussants of Friedman et al.'s paper all expressed surprise and/or consternation at the revelation that boosting can be viewed as likelihood maximization. Boosted regression trees operate in a very complicated model space, and conventional wisdom would suggest that maximum likelihood estimates with such flexible models would overfit the data. However, boosting is empirically reluctant to overfit.

The most likely explanation seems to be that boosting's `sub-optimal' fitting algorithm does not usually find its optimum. I don't understand how this leads to resistance to overfitting, but no one else seems to either. At any rate, since Bayesian posteriors are much more robust against overfitting than maximum likelihood estimates, I don't think pondering this phenomenon is the best use of our time unless we actually observe overfitting in our results.

\subsection{Trees}
\label{sec:trees}
As the name implies, boosted regression trees make use of linear combinations of `decision trees': piecewise-flat functions whose pieces are rectangular regions with boundaries at right angles to the predictor axes. To see why they are called `decision trees', consider the case where $x$ consists of two predictors $x_1$ and $x_2$. A decision tree would be a one-way flowchart such as:
\begin{description}
\item[start] Is $x_1$ greater than $x_1^*$?
\begin{description}
\item[yes] Output $k_1$.
\item[no] Is $x_2$ greater than $x_2^*$?
\begin{description}
\item[yes] Output $k_2$.
\item[no] Output $k_3$.
\end{description}
\end{description}
\end{description}
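Written as a single expression, using the indicator notation of section \ref{sec:likelihood-interp}, this tree is
\begin{eqnarray*}
f(x_1,x_2) = k_1\,1_{x_1>x_1^*} + 1_{x_1\le x_1^*}\left(k_2\,1_{x_2>x_2^*} + k_3\,1_{x_2\le x_2^*}\right).
\end{eqnarray*}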
If you plot the output value as a function of $x_1$ and $x_2$, you end up with the aforementioned piecewise-flat function. If you add up lots of these functions, with the boundaries in different places, you can approximate more complicated functions.

These trees are competing for the same job as MaxEnt's features (section \ref{sec:maxent-success}). The fact that the two methods have been more or less equally successful is interesting. It's tempting to conclude that it doesn't much matter which basis functions you use, as long as they span a rich enough space of functions and you take steps to prevent overfitting. However, that doesn't explain why both methods have been more successful than more traditional regression-based methods.
1 change: 1 addition & 0 deletions reading-notes/max-ent.tex
@@ -66,6 +66,7 @@ \subsection{What does MaxEnt really do?}


\subsection{Why has MaxEnt been successful?}
\label{sec:maxent-success}
MaxEnt has been one of the most successful species-mapping techniques. Here are a few possible reasons:
\begin{itemize}
\item The presence of both smooth and quickly-changing feature shapes. The developers claim that hinge features greatly improve performance, indicating that this expressiveness may be helpful. An important question is whether the `sharpness' of the features is important in addition to their ability to span function spaces; Gaussian processes are very flexible, but not sharp.
3 changes: 3 additions & 0 deletions reading-notes/reading-notes.tex
@@ -4,6 +4,7 @@
\usepackage{pdfsync}
\usepackage{amsfonts}
\usepackage{amsmath}
\usepackage{natbib}
\usepackage[colorlinks]{hyperref}
\begin{document}

@@ -14,7 +15,9 @@
\tableofcontents

\include{max-ent}
\include{brt}

\nocite{*}

\bibliographystyle{plain}
\bibliography{anopheles}
2 changes: 2 additions & 0 deletions setup.py
@@ -3,6 +3,8 @@
import os
config = Configuration('anopheles',parent_package=None,top_path=None)

config.add_extension(name='mahalanobis',sources=['anopheles/mahalanobis.f'])

config.packages = ["anopheles"]
if __name__ == '__main__':
from numpy.distutils.core import setup
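To make the new extension importable (mahalanobis_covariance.py does 'from mahalanobis import mahal'), the f2py wrapper has to be compiled; with a numpy.distutils setup script like this one, something along the lines of 'python setup.py build_ext --inplace', or a full 'python setup.py install', should do it.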
