diff --git a/.gitignore b/.gitignore index 51cbe85..d1271d7 100644 --- a/.gitignore +++ b/.gitignore @@ -52,3 +52,4 @@ coverage.xml # Sphinx documentation docs/_build/ +.DS_Store \ No newline at end of file diff --git a/MANIFEST.in b/MANIFEST.in index 741b620..ee2bbb3 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,5 +1,5 @@ global-include . *.py *.c *.h Makefile *.pyx -global-exclude optional_test_performance.py +global-exclude chumpy/optional_test_performance.py prune dist diff --git a/Makefile b/Makefile index ef44c4d..17f42db 100644 --- a/Makefile +++ b/Makefile @@ -11,7 +11,7 @@ tidy: test: clean qtest qtest: all - python -m unittest discover + python -m unittest discover -s chumpy coverage: clean qcov qcov: all diff --git a/README.md b/README.md index a3371e1..7670495 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,19 @@ chumpy ====== + +Chumpy is a Python-based framework designed to handle the auto-differentiation problem, +which is to evaluate an expression and its derivatives with respect to its inputs, with +the use of the chain rule. Chumpy is intended to make construction and local +minimization of objectives easier. Specifically, it provides: + +- Easy problem construction by using Numpy’s application interface +- Easy access to derivatives via auto differentiation +- Easy local optimization methods (12 of them: most of which use the derivatives) + +Chumpy comes with its own demos, which can be seen by typing the following:: + +>> import chumpy +>> chumpy.demo() # prints out a list of possible demos + +Licensing is specified in the attached LICENSE.txt. + diff --git a/README.txt b/README.txt deleted file mode 100644 index 6c1b8d9..0000000 --- a/README.txt +++ /dev/null @@ -1,16 +0,0 @@ -Chumpy is a Python-based framework designed to handle the auto-differentiation problem, -which is to evalute an expression and its derivatives with respect to its inputs, with -the use of the chain rule. Chumpy is intended to make construction and local -minimization of objectives easier. Specifically, it provides: - -- Easy problem construction by using Numpy’s application interface -- Easy access to derivatives via auto differentiation -- Easy local optimization methods (12 of them: most of which use the derivatives) - -Chumpy comes with its own demos, which can be seen by typing the following:: - ->> import chumpy ->> chumpy.demo() # prints out a list of possible demos - -Licensing is specified in the attached LICENSE.txt. -
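As a quick orientation to what the new README promises, a minimal sketch of chumpy's auto-differentiation workflow (standard public API; the variables x and y are illustrative):

    import chumpy as ch

    x = ch.array([1., 2., 3.])   # a differentiable array
    y = ch.sum(x ** 2)           # an objective built with numpy-like ops

    print(y.r)                   # evaluate the expression: 14.0
    print(y.dr_wrt(x))           # Jacobian of y wrt x: [[ 2.  4.  6.]]

dr_wrt walks the expression graph and applies the chain rule; the ch.py changes below instrument exactly that machinery with timing, cache bookkeeping, and an optional profiler.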
diff --git a/__init__.py b/chumpy/__init__.py similarity index 98% rename from __init__.py rename to chumpy/__init__.py index f090eb6..3adc38d 100644 --- a/__init__.py +++ b/chumpy/__init__.py @@ -6,6 +6,7 @@ import testing from version import version as __version__ + from numpy import bool, int, float, complex, object, unicode, str, nan, inf def test(): diff --git a/api_compatibility.py b/chumpy/api_compatibility.py similarity index 100% rename from api_compatibility.py rename to chumpy/api_compatibility.py diff --git a/ch.py b/chumpy/ch.py similarity index 70% rename from ch.py rename to chumpy/ch.py index d3b429f..f346e40 100755 --- a/ch.py +++ b/chumpy/ch.py @@ -9,7 +9,7 @@ __all__ = ['Ch', 'depends_on', 'MatVecMult', 'ChHandle', 'ChLambda'] -import sys +import os, sys, time import inspect import scipy.sparse as sp import numpy as np @@ -18,11 +18,22 @@ import copy as external_copy from functools import wraps from scipy.sparse.linalg.interface import LinearOperator +import utils from utils import row, col import collections from copy import deepcopy +from utils import timer +# Turn this on if you want the profiler injected +DEBUG = False +# Turn this on to make optimizations very chatty for debugging +VERBOSE = False +def pif(msg): + # print-if-verbose. + if DEBUG or VERBOSE: + sys.stdout.write(msg + '\n') + _props_for_dict = weakref.WeakKeyDictionary() def _props_for(cls): if cls not in _props_for_dict: @@ -61,9 +72,13 @@ class Ch(object): terms = [] dterms = ['x'] __array_priority__ = 2.0 - _cached_parms = {} _setup_terms = {} + _default_kwargs = {'make_dense' : False, 'make_sparse' : False} + _status = "undefined" + + called_dr_wrt = False + profiler = None ######################################################## # Construction @@ -82,6 +97,15 @@ def __new__(cls, *args, **kwargs): object.__setattr__(result, '_itr', None) object.__setattr__(result, '_parents', weakref.WeakKeyDictionary()) object.__setattr__(result, '_cache', {'r': None, 'drs': weakref.WeakKeyDictionary()}) + + if DEBUG: + object.__setattr__(result, '_cache_info', {}) + object.__setattr__(result, '_status', 'new') + + for name, default_val in cls._default_kwargs.items(): + object.__setattr__(result, '_%s' % name, kwargs.get(name, default_val)) + if name in kwargs: + del kwargs[name] # Set up storage that allows @depends_on to work #props = [p for p in inspect.getmembers(cls, lambda x : isinstance(x, property)) if hasattr(p[1].fget, 'deps')] @@ -153,6 +177,10 @@ def setup_terms(cls): ######################################################## # Identifiers + @property + def short_name(self): + return self.label if hasattr(self, 'label') else self.__class__.__name__ + @property def sid(self): """Semantic id.""" @@ -278,11 +306,8 @@ def _compute_dr_wrt_sliced(self, wrt): return None wrt._call_on_changed() - - try: - jac = wrt.compute_dr_wrt(inner).T - except Exception as e: - import pdb; pdb.set_trace() + + jac = wrt.compute_dr_wrt(inner).T return self._superdot(result, jac) @@ -408,8 +433,24 @@ def clear_cache(self, itr=None): todo.append(parent) done.add(id(next)) return nodes_visited - - + + + def clear_cache_wrt(self, wrt, itr=None): + if wrt in self._cache['drs']: + self._cache['drs'][wrt] = None + + if hasattr(self, 'dr_cached') and wrt in self.dr_cached: + self.dr_cached[wrt] = None + + if itr is None or itr != self._itr: + for parent, parent_dict in self._parents.items(): + if wrt in parent._cache['drs'] or (hasattr(parent, 'dr_cached') and wrt in
parent.dr_cached): + parent.clear_cache_wrt(wrt=wrt, itr=itr) + object.__setattr__(parent, '_dirty_vars', parent._dirty_vars.union(parent_dict['varnames'])) + parent._invalidate_cacheprop_names(parent_dict['varnames']) + + object.__setattr__(self, '_itr', itr) + def replace(self, old, new): if (hasattr(old, 'dterms') != hasattr(new, 'dterms')): raise Exception('Either "old" and "new" must both be "Ch", or they must both be neither.') @@ -541,6 +582,9 @@ def _call_on_changed(self): if hasattr(self, 'is_valid'): validity, msg = self.is_valid() assert validity, msg + if hasattr(self, '_status'): + self._status = 'new' + if len(self._dirty_vars) > 0: self.on_changed(self._dirty_vars) object.__setattr__(self, '_dirty_vars', set()) @@ -555,8 +599,8 @@ def r(self): return self._cache['rview'] + def _superdot(self, lhs, rhs, profiler=None): - def _superdot(self, lhs, rhs): try: if lhs is None: return None @@ -579,17 +623,26 @@ def _superdot(self, lhs, rhs): if sp.issparse(rhs): return LinearOperator((lhs.shape[0], rhs.shape[1]), lambda x : lhs.dot(rhs.dot(x))) else: + # TODO: ????????????? + # return lhs.matmat(rhs) return lhs.dot(rhs) # TODO: Figure out how/whether to do this. - #lhs, rhs = utils.convert_inputs_to_sparse_if_possible(lhs, rhs) + tm_maybe_sparse = timer() + lhs, rhs = utils.convert_inputs_to_sparse_if_necessary(lhs, rhs) + if tm_maybe_sparse() > 0.1: + pif('convert_inputs_to_sparse_if_necessary in {}sec'.format(tm_maybe_sparse())) if not sp.issparse(lhs) and sp.issparse(rhs): return rhs.T.dot(lhs.T).T - return lhs.dot(rhs) - except: - import pdb; pdb.set_trace() + except Exception as e: + import sys, traceback + traceback.print_exc(file=sys.stdout) + if DEBUG: + import pdb; pdb.post_mortem() + else: + raise def lmult_wrt(self, lhs, wrt): if lhs is None: @@ -610,7 +663,7 @@ def lmult_wrt(self, lhs, wrt): if hasattr(p, 'dterms') and p is not wrt and p.is_dr_wrt(wrt): if not isinstance(p, Ch): print 'BROKEN!' 
- import pdb; pdb.set_trace() + raise Exception('Broken: should be a Ch object') indirect_dr = p.lmult_wrt(self._superdot(lhs, self._compute_dr_wrt_sliced(p)), wrt) if indirect_dr is not None: @@ -673,21 +726,32 @@ def compute_rop(self, wrt, rhs): return self._superdot(dr, rhs) - def dr_wrt(self, wrt, reverse_mode=False): + def dr_wrt(self, wrt, reverse_mode=False, profiler=None): + tm_dr_wrt = timer() + self.called_dr_wrt = True self._call_on_changed() - drs = [] + drs = [] if wrt in self._cache['drs']: + if DEBUG: + if wrt not in self._cache_info: + self._cache_info[wrt] = 0 + self._cache_info[wrt] += 1 + self._status = 'cached' return self._cache['drs'][wrt] direct_dr = self._compute_dr_wrt_sliced(wrt) if direct_dr is not None: - drs.append(direct_dr) + drs.append(direct_dr) + if DEBUG: + self._status = 'pending' + propnames = set(_props_for(self.__class__)) for k in set(self.dterms).intersection(propnames.union(set(self.__dict__.keys()))): + p = getattr(self, k) if hasattr(p, 'dterms') and p is not wrt: @@ -697,12 +761,16 @@ def dr_wrt(self, wrt, reverse_mode=False): if reverse_mode: lhs = self._compute_dr_wrt_sliced(p) if isinstance(lhs, LinearOperator): + tm_dr_wrt.pause() dr2 = p.dr_wrt(wrt) + tm_dr_wrt.resume() indirect_dr = lhs.matmat(dr2) if dr2 != None else None else: indirect_dr = p.lmult_wrt(lhs, wrt) else: # forward mode - dr2 = p.dr_wrt(wrt) + tm_dr_wrt.pause() + dr2 = p.dr_wrt(wrt, profiler=profiler) + tm_dr_wrt.resume() if dr2 is not None: indirect_dr = self.compute_rop(p, rhs=dr2) @@ -711,38 +779,58 @@ if len(drs)==0: result = None - elif len(drs)==1: result = drs[0] - else: + # TODO: ???????? + # result = np.sum(x for x in drs) if not np.any([isinstance(a, LinearOperator) for a in drs]): result = reduce(lambda x, y: x+y, drs) else: result = LinearOperator(drs[0].shape, lambda x : reduce(lambda a, b: a.dot(x)+b.dot(x),drs)) # TODO: figure out how/whether to do this. - # if result is not None and not sp.issparse(result): - # nonzero = np.count_nonzero(result) - # if nonzero > 0 and hasattr(result, 'size') and result.size / nonzero >= 10.0: - # #import pdb; pdb.set_trace() - # result = sp.csc_matrix(result) - - + if result is not None and not sp.issparse(result): + tm_nonzero = timer() + nonzero = np.count_nonzero(result) + if tm_nonzero() > 0.1: + pif('count_nonzero in {}sec'.format(tm_nonzero())) + if nonzero == 0 or hasattr(result, 'size') and result.size / float(nonzero) >= 10.0: + tm_convert_to_sparse = timer() + result = sp.csc_matrix(result) + import gc + gc.collect() + pif('converting result to sparse in {}sec'.format(tm_convert_to_sparse())) + if (result is not None) and (not sp.issparse(result)) and (not isinstance(result, LinearOperator)): result = np.atleast_2d(result) - + # When the number of parents is one, it indicates that # caching this is probably not useful because not # more than one parent will likely ask for this same # thing again in the same iteration of an optimization. # + # When the number of parents is zero, this is the top + # level object and should be cached; when it's > 1 + # cache the combinations of the children. + # # If we *always* filled in the cache, it would require # more memory but would occasionally save a little cpu, # on average.
if len(self._parents.keys()) != 1: self._cache['drs'][wrt] = result + if DEBUG: + self._status = 'done' + + if getattr(self, '_make_dense', False) and sp.issparse(result): + result = result.todense() + if getattr(self, '_make_sparse', False) and not sp.issparse(result): + result = sp.csc_matrix(result) + + if tm_dr_wrt() > 0.1: + pif('dx of {} wrt {} in {}sec, sparse: {}'.format(self.short_name, wrt.short_name, tm_dr_wrt(), sp.issparse(result))) + return result @@ -754,11 +842,172 @@ def __call__(self, **kwargs): ######################################################## # Visualization + @property + def reset_flag(self): + """ + Used as fn in loop_children_do + """ + return lambda x: setattr(x, 'called_dr_wrt', False) + + def loop_children_do(self, fn): + fn(self) + for dterm in self.dterms: + if hasattr(self, dterm): + dtval = getattr(self, dterm) + if hasattr(dtval, 'dterms') or hasattr(dtval, 'terms'): + if hasattr(dtval, 'loop_children_do'): + dtval.loop_children_do(fn) + + + def show_tree_cache(self, label, current_node=None): + ''' + Show the tree and cache info, with colors representing _status. + Optionally accept a current_node arg to highlight the node we are currently in. + ''' + import os + import tempfile + import subprocess + + assert DEBUG, "Please use dr tree visualization functions in debug mode" + + cache_path = os.path.abspath('profiles') + def string_for(self, my_name): + + color_mapping = {'new' : 'grey', 'pending':'red', 'cached':'yellow', 'done': 'green'} + if hasattr(self, 'label'): + my_name = self.label + my_name = '%s (%s)' % (my_name, str(self.__class__.__name__)) + result = [] + if not hasattr(self, 'dterms'): + return result + for dterm in self.dterms: + if hasattr(self, dterm): + dtval = getattr(self, dterm) + if hasattr(dtval, 'dterms') or hasattr(dtval, 'terms'): + child_label = getattr(dtval, 'label') if hasattr(dtval, 'label') else dterm + child_label = '%s (%s)' % (child_label, str(dtval.__class__.__name__)) + src = 'aaa%d' % (id(self)) + dst = 'aaa%d' % (id(dtval)) + + s = '' + color = color_mapping[dtval._status] if hasattr(dtval, '_status') else 'grey' + if dtval == current_node: + color = 'blue' + if isinstance(dtval, reordering.Concatenate) and len(dtval.dr_cached) > 0: + s = 'dr_cached\n' + for k, v in dtval.dr_cached.iteritems(): + if v is not None: + issparse = sp.issparse(v) + size = v.size + if issparse: + size = v.shape[0] * v.shape[1] + nonzero = len(v.data) + else: + nonzero = np.count_nonzero(v) + s += '\nsparse: %s\nsize: %d\nnonzero: %d\n' % (issparse, size, nonzero) + # if dtval.called_dr_wrt: + # # dtval.called_dr_wrt = False + # color = 'brown3' + # else: + # color = 'azure1' + elif len(dtval._cache['drs']) > 0: + s = '_cache\n' + + for k, v in dtval._cache['drs'].iteritems(): + if v is not None: + issparse = sp.issparse(v) + size = v.size + if issparse: + size = v.shape[0] * v.shape[1] + nonzero = len(v.data) + else: + nonzero = np.count_nonzero(v) + + s += '\nsparse: %s\nsize: %d\nnonzero: %d\n' % (issparse, size, nonzero) + if hasattr(dtval, '_cache_info'): + s += '\ncache hit:%s\n' % dtval._cache_info[k] + # if hasattr(dtval,'called_dr_wrt') and dtval.called_dr_wrt: + # # dtval.called_dr_wrt = False + # color = 'brown3' + # else: + # color = 'azure1' + result += ['%s -> %s;' % (src, dst)] + # Do not overwrite src + #result += ['%s [label="%s"];' % (src, my_name)] + result += ['%s [label="%s\n%s\n", color=%s, style=filled];' % + (dst, child_label, s, color)] + result += string_for(getattr(self, dterm), dterm) + return result + + + dot_file_contents
= 'digraph G {\n%s\n}' % '\n'.join(list(set(string_for(self, 'root')))) + dot_file_name = os.path.join(cache_path, label) + png_file_name = os.path.join(cache_path, label+'.png') + with open(dot_file_name, 'w') as dot_file: + dot_file.write(dot_file_contents) + + # render the graph beside the dot source and open it + subprocess.call(['dot', '-Tpng', '-o', png_file_name, dot_file_name]) + + import webbrowser + webbrowser.open('file://' + png_file_name) + + self.loop_children_do(self.reset_flag) + + def show_tree_wrt(self, wrt): + import tempfile + import subprocess + + assert DEBUG, "Please use dr tree visualization functions in debug mode" + + def string_for(self, my_name, wrt): + if hasattr(self, 'label'): + my_name = self.label + my_name = '%s (%s)' % (my_name, str(self.__class__.__name__)) + result = [] + if not hasattr(self, 'dterms'): + return result + for dterm in self.dterms: + if hasattr(self, dterm): + dtval = getattr(self, dterm) + if hasattr(dtval, 'dterms') or hasattr(dtval, 'terms'): + child_label = getattr(dtval, 'label') if hasattr(dtval, 'label') else dterm + child_label = '%s (%s)' % (child_label, str(dtval.__class__.__name__)) + src = 'aaa%d' % (id(self)) + dst = 'aaa%d' % (id(dtval)) + result += ['%s -> %s;' % (src, dst)] + result += ['%s [label="%s"];' % (src, my_name)] + if wrt in dtval._cache['drs'] and dtval._cache['drs'][wrt] is not None: + issparse = sp.issparse(dtval._cache['drs'][wrt]) + size = dtval._cache['drs'][wrt].size + nonzero = np.count_nonzero(dtval._cache['drs'][wrt]) + result += ['%s [label="%s\n is_sparse: %s\nsize: %d\nnonzero: %d"];' % + (dst, child_label, issparse, size, + nonzero)] + else: + result += ['%s [label="%s"];' % (dst, child_label)] + result += string_for(getattr(self, dterm), dterm, wrt) + return result + + + dot_file_contents = 'digraph G {\n%s\n}' % '\n'.join(list(set(string_for(self, 'root', wrt)))) + dot_file = tempfile.NamedTemporaryFile() + dot_file.write(dot_file_contents) + dot_file.flush() + png_file = tempfile.NamedTemporaryFile(suffix='.png', delete=False) + subprocess.call(['dot', '-Tpng', '-o', png_file.name, dot_file.name]) + import webbrowser + webbrowser.open('file://' + png_file.name) + def show_tree(self, cachelim=np.inf): """Cachelim is in Mb. For any cached jacobians above cachelim, they are also added to the graph. """ - import tempfile import subprocess + + assert DEBUG, "Please use dr tree visualization functions in debug mode" + def string_for(self, my_name): if hasattr(self, 'label'): my_name = self.label @@ -835,6 +1084,31 @@ def string_for(self, my_name): import webbrowser webbrowser.open('file://' + png_file.name) + + def tree_iterator(self, visited=None, path=None): + ''' + Generator function that traverses the dr tree starting from this node (self).
+ ''' + if visited is None: + visited = set() + + if self not in visited: + if path and isinstance(path, list): + path.append(self) + + visited.add(self) + yield self + + if not hasattr(self, 'dterms'): + return + + for dterm in self.dterms: + if hasattr(self, dterm): + child = getattr(self, dterm) + if hasattr(child, 'dterms') or hasattr(child, 'terms'): + for node in child.tree_iterator(visited): + yield node + def floor(self): return ch_ops.floor(self) @@ -997,6 +1271,39 @@ def compute_dr_wrt(self, wrt): if wrt is self._result: return 1 +# ChGroup is similar to ChLambda in that it's designed to expose the "internal" +# inputs of result but, unlike ChLambda, result is kept internal and called when +# compute_r and compute_dr_wrt are called to compute the relevant Jacobians. +# This provides a way of effectively applying the chain rule in a different order. +class ChGroup(Ch): + terms = ['result', 'args'] + dterms = [] + term_order = ['result', 'args'] + + def on_changed(self, which): + for argname in set(which).intersection(set(self.args.keys())): + if self.args[argname].x is not getattr(self, argname): + self.args[argname].x = getattr(self, argname) + + # right now the entries in args have to refer to terms/dterms of result, + # it would be better if they could be "internal" as well, but for now the idea + # is that result may itself be a ChLambda. + def __init__(self, result, args): + self.args = { argname: ChHandle(x=arg) for argname, arg in args.items() } + for argname, arg in self.args.items(): + setattr(result, argname, arg) + if result.is_dr_wrt(arg.x): + self.add_dterm(argname, arg.x) + else: + self.terms.append(argname) + setattr(self, argname, arg.x) + self._result = result + + def compute_r(self): + return self._result.r + + def compute_dr_wrt(self, wrt): + return self._result.dr_wrt(wrt) import ch_ops from ch_ops import * diff --git a/ch_ops.py b/chumpy/ch_ops.py similarity index 98% rename from ch_ops.py rename to chumpy/ch_ops.py index 03ade3f..deea353 100755 --- a/ch_ops.py +++ b/chumpy/ch_ops.py @@ -19,7 +19,7 @@ 'greater', 'greater_equal', 'less', 'less_equal', 'equal', 'not_equal', 'nonzero', 'ascontiguousarray', 'asfarray', 'arange', 'asarray', 'copy', 'cross', - 'shape', 'tensordot', 'sign'] + 'shape', 'sign'] __all__ += ['SumOfSquares', @@ -361,7 +361,7 @@ def compute_dr_wrt(self, wrt): idxs_presum = np.arange(self.x.size).reshape(self.x.shape) idxs_presum = np.rollaxis(idxs_presum, self.axis, 0) idxs_postsum = np.arange(self.r.size).reshape(self.r.shape) - tp = np.ones(idxs_presum.ndim) + tp = np.ones(idxs_presum.ndim, dtype=np.uint32) tp[0] = idxs_presum.shape[0] idxs_postsum = np.tile(idxs_postsum, tp) data = np.ones(idxs_postsum.size) / self.x.shape[self.axis] @@ -519,7 +519,7 @@ def broadcast_shape(a_shape, b_shape): #print 'aaa' + str(our_result) #print 'bbb' + str(numpy_result) if not np.array_equal(our_result, numpy_result): - import pdb; pdb.set_trace() + raise Exception('numpy result not equal to our result') assert(np.array_equal(our_result, numpy_result)) broadcast_shape_cache[uid] = tuple(our_result) @@ -726,10 +726,7 @@ class dot(ch.Ch): dterms = 'a', 'b' def compute_r(self): - try: - return self.a.r.dot(self.b.r) - except: - import pdb; pdb.set_trace() + return self.a.r.dot(self.b.r) def compute_d1(self): # To stay consistent with numpy, we must upgrade 1D arrays to 2D @@ -796,12 +793,14 @@ def nonzero(a): a = a.r return np.nonzero(a) +# Try to pull the code for tensordot in from numpy and reinterpret it using chumpy ops try: import inspect
exec(''.join(inspect.getsourcelines(np.tensordot)[0])) -except: pass + __all__ += ['tensordot'] +except: + pass - def main(): diff --git a/ch_random.py b/chumpy/ch_random.py similarity index 100% rename from ch_random.py rename to chumpy/ch_random.py diff --git a/extras.py b/chumpy/extras.py similarity index 100% rename from extras.py rename to chumpy/extras.py diff --git a/linalg.py b/chumpy/linalg.py similarity index 100% rename from linalg.py rename to chumpy/linalg.py diff --git a/logic.py b/chumpy/logic.py similarity index 100% rename from logic.py rename to chumpy/logic.py diff --git a/chumpy/monitor.py b/chumpy/monitor.py new file mode 100644 index 0000000..35d3f71 --- /dev/null +++ b/chumpy/monitor.py @@ -0,0 +1,149 @@ +''' +Logging service for tracking dr tree changes from the root objective, +recording every step that incrementally changes the dr tree + +''' +import os, sys, time +import json +import psutil + +import scipy.sparse as sp +import numpy as np +import reordering + +_TWO_20 = float(2 **20) + +''' +memory utils + +''' +def pdb_mem(): + from monitor import get_current_memory + mem = get_current_memory() + if mem > 7000: + import pdb;pdb.set_trace() + +def get_peak_mem(): + ''' + this returns peak memory use from when the process started until the moment it's called + ''' + import resource + rusage_denom = 1024. + if sys.platform == 'darwin': + # ... it seems that on OSX the output is in different units ... + rusage_denom = rusage_denom * rusage_denom + mem = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / rusage_denom + return mem + +def get_current_memory(): + p = psutil.Process(os.getpid()) + mem = p.memory_info()[0]/_TWO_20 + + return mem + +''' +Helper for Profiler +''' + +def build_cache_info(k, v, info_dict): + if v is not None: + issparse = sp.issparse(v) + size = v.size + if issparse: + nonzero = len(v.data) + else: + nonzero = np.count_nonzero(v) + info_dict[k.short_name] = { + 'sparse': issparse, + 'size' : str(size), + 'nonzero' : nonzero, + } + + +def cache_info(ch_node): + result = {} + if isinstance(ch_node, reordering.Concatenate) and hasattr(ch_node, 'dr_cached') and len(ch_node.dr_cached) > 0: + for k, v in ch_node.dr_cached.iteritems(): + build_cache_info(k, v, result) + elif len(ch_node._cache['drs']) > 0: + for k, v in ch_node._cache['drs'].iteritems(): + build_cache_info(k, v, result) + + return result + +class DrWrtProfiler(object): + base_path = os.path.abspath('profiles') + + def __init__(self, root, base_path=None): + self.root = root.obj + self.history = [] + + ts = time.time() + if base_path: + self.base_path = base_path + + self.path = os.path.join(self.base_path, 'profile_%s.json' % str(ts)) + self.root_path = os.path.join(self.base_path, 'root_%s.json' % str(ts)) + + + with open(self.root_path, 'w') as f: + json.dump(self.dump_tree(self.root), f, indent=4) + + def dump_tree(self, node): + if not hasattr(node, 'dterms'): + return [] + + node_dict = self.serialize_node(node, verbose=False) + if hasattr(node, 'visited') and node.visited: + node_dict.update({'indirect':True}) + return node_dict + + node.visited = True + children_list = [] + for dterm in node.dterms: + if hasattr(node, dterm): + child = getattr(node, dterm) + if hasattr(child, 'dterms') or hasattr(child, 'terms'): + children_list.append(self.dump_tree(child)) + node_dict.update({'children':children_list}) + return node_dict + + def serialize_node(self, ch_node, verbose=True): + node_id = id(ch_node) + name = ch_node.short_name + ts = time.time() + status = ch_node._status + mem = 
get_current_memory() + node_cache_info = cache_info(ch_node) + + rec = { + 'id': str(node_id), + 'indirect' : False, + } + if verbose: + rec.update({ + 'name':name, + 'ts' : ts, + 'status':status, + 'mem': mem, + 'cache': node_cache_info, + }) + return rec + + def show_tree(self, label): + ''' + show tree from the root node + ''' + self.root.show_tree_cache(label) + + def record(self, ch_node): + ''' + Incremental changes + ''' + rec = self.serialize_node(ch_node) + self.history.append(rec) + + def harvest(self): + print 'collecting and dumping to file %s' % self.path + with open(self.path, 'w') as f: + json.dump(self.history, f, indent=4) \ No newline at end of file diff --git a/chumpy/optimization.py b/chumpy/optimization.py new file mode 100755 index 0000000..c315198 --- /dev/null +++ b/chumpy/optimization.py @@ -0,0 +1,161 @@ +#!/usr/bin/env python + +""" +Author(s): Matthew Loper + +See LICENCE.txt for licensing and contact information. +""" + +__all__ = ['minimize'] + +import numpy as np +import ch +import scipy.sparse as sp +import scipy.optimize + +from optimization_internal import minimize_dogleg + +#from memory_profiler import profile, memory_usage + +# def disable_cache_for_single_parent_node(node): +# if hasattr(node, '_parents') and len(node._parents.keys()) == 1: +# node.want_cache = False + + +# Nelder-Mead +# Powell +# CG +# BFGS +# Newton-CG +# Anneal +# L-BFGS-B +# TNC +# COBYLA +# SLSQP +# dogleg +# trust-ncg +def minimize(fun, x0, method='dogleg', bounds=None, constraints=(), tol=None, callback=None, options=None): + + if method == 'dogleg': + if options is None: options = {} + return minimize_dogleg(fun, free_variables=x0, on_step=callback, **options) + + if isinstance(fun, list) or isinstance(fun, tuple): + fun = ch.concatenate([f.ravel() for f in fun]) + if isinstance(fun, dict): + fun = ch.concatenate([f.ravel() for f in fun.values()]) + obj = fun + free_variables = x0 + + + from ch import SumOfSquares + + hessp = None + hess = None + if obj.size == 1: + obj_scalar = obj + else: + obj_scalar = SumOfSquares(obj) + + def hessp(vs, p,obj, obj_scalar, free_variables): + changevars(vs,obj,obj_scalar,free_variables) + if not hasattr(hessp, 'vs'): + hessp.vs = vs*0+1e16 + if np.max(np.abs(vs-hessp.vs)) > 0: + + J = ns_jacfunc(vs,obj,obj_scalar,free_variables) + hessp.J = J + hessp.H = 2. * J.T.dot(J) + hessp.vs = vs + return np.array(hessp.H.dot(p)).ravel() + #return 2*np.array(hessp.J.T.dot(hessp.J.dot(p))).ravel() + + if method.lower() != 'newton-cg': + def hess(vs, obj, obj_scalar, free_variables): + changevars(vs,obj,obj_scalar,free_variables) + if not hasattr(hessp, 'vs'): + hessp.vs = vs*0+1e16 + if np.max(np.abs(vs-hessp.vs)) > 0: + J = ns_jacfunc(vs,obj,obj_scalar,free_variables) + hessp.H = 2.
* J.T.dot(J) + return hessp.H + + def changevars(vs, obj, obj_scalar, free_variables): + cur = 0 + changed = False + for idx, freevar in enumerate(free_variables): + sz = freevar.r.size + newvals = vs[cur:cur+sz].copy().reshape(free_variables[idx].shape) + if np.max(np.abs(newvals-free_variables[idx]).ravel()) > 0: + free_variables[idx][:] = newvals + changed = True + + cur += sz + + methods_without_callback = ('anneal', 'powell', 'cobyla', 'slsqp') + if callback is not None and changed and method.lower() in methods_without_callback: + callback(None) + + return changed + + def residuals(vs,obj, obj_scalar, free_variables): + changevars(vs, obj, obj_scalar, free_variables) + residuals = obj_scalar.r.ravel()[0] + return residuals + + def scalar_jacfunc(vs,obj, obj_scalar, free_variables): + if not hasattr(scalar_jacfunc, 'vs'): + scalar_jacfunc.vs = vs*0+1e16 + if np.max(np.abs(vs-scalar_jacfunc.vs)) == 0: + return scalar_jacfunc.J + + changevars(vs, obj, obj_scalar, free_variables) + + if True: # faster, at least on some problems + result = np.concatenate([np.array(obj_scalar.lop(wrt, np.array([[1]]))).ravel() for wrt in free_variables]) + else: + jacs = [obj_scalar.dr_wrt(wrt) for wrt in free_variables] + for idx, jac in enumerate(jacs): + if sp.issparse(jac): + jacs[idx] = jacs[idx].todense() + result = np.concatenate([jac.ravel() for jac in jacs]) + + scalar_jacfunc.J = result + scalar_jacfunc.vs = vs + return result.ravel() + + def ns_jacfunc(vs,obj, obj_scalar, free_variables): + if not hasattr(ns_jacfunc, 'vs'): + ns_jacfunc.vs = vs*0+1e16 + if np.max(np.abs(vs-ns_jacfunc.vs)) == 0: + return ns_jacfunc.J + + changevars(vs, obj, obj_scalar, free_variables) + jacs = [obj.dr_wrt(wrt) for wrt in free_variables] + result = hstack(jacs) + + ns_jacfunc.J = result + ns_jacfunc.vs = vs + return result + + + x1 = scipy.optimize.minimize( + method=method, + fun=residuals, + callback=callback, + x0=np.concatenate([free_variable.r.ravel() for free_variable in free_variables]), + jac=scalar_jacfunc, + hessp=hessp, hess=hess, args=(obj, obj_scalar, free_variables), + bounds=bounds, constraints=constraints, tol=tol, options=options).x + + changevars(x1, obj, obj_scalar, free_variables) + return free_variables + + +def main(): + pass + + +if __name__ == '__main__': + main() + diff --git a/chumpy/optimization_internal.py b/chumpy/optimization_internal.py new file mode 100644 index 0000000..2102d3b --- /dev/null +++ b/chumpy/optimization_internal.py @@ -0,0 +1,455 @@ +import sys +import warnings +import numpy as np +import scipy.sparse as sp +import ch, utils +from ch import pif +from utils import timer + + +def clear_cache_single(node): + node._cache['drs'].clear() + if hasattr(node, 'dr_cached'): + node.dr_cached.clear() + +def vstack(x): + x = [a if not isinstance(a, sp.linalg.interface.LinearOperator) else a.dot(np.eye(a.shape[1])) for a in x] + return sp.vstack(x, format='csc') if any([sp.issparse(a) for a in x]) else np.vstack(x) +def hstack(x): + x = [a if not isinstance(a, sp.linalg.interface.LinearOperator) else a.dot(np.eye(a.shape[1])) for a in x] + return sp.hstack(x, format='csc') if any([sp.issparse(a) for a in x]) else np.hstack(x) + + +_giter = 0 +class ChInputsStacked(ch.Ch): + dterms = 'x', 'obj' + terms = 'free_variables' + + def compute_r(self): + if not hasattr(self, 'fevals'): + self.fevals = 0 + self.fevals += 1 + return self.obj.r.ravel() + + def dr_wrt(self, wrt, profiler=None): + ''' + Loop over free variables and delete cache for the whole tree after finishing each one + ''' + 
if wrt is self.x: + jacs = [] + for fvi, freevar in enumerate(self.free_variables): + tm = timer() + if isinstance(freevar, ch.Select): + new_jac = self.obj.dr_wrt(freevar.a, profiler=profiler) + try: + new_jac = new_jac[:, freevar.idxs] + except: + # non-csc sparse matrices may not support column-wise indexing + new_jac = new_jac.tocsc()[:, freevar.idxs] + else: + new_jac = self.obj.dr_wrt(freevar, profiler=profiler) + + pif('dx wrt {} in {}sec, sparse: {}'.format(freevar.short_name, tm(), sp.issparse(new_jac))) + + if self._make_dense and sp.issparse(new_jac): + new_jac = new_jac.todense() + if self._make_sparse and not sp.issparse(new_jac): + new_jac = sp.csc_matrix(new_jac) + + if new_jac is None: + raise Exception( + 'Objective has no derivative wrt free variable {}. ' + 'You should likely remove it.'.format(fvi)) + + jacs.append(new_jac) + tm = timer() + utils.dfs_do_func_on_graph(self.obj, clear_cache_single) + pif('dfs_do_func_on_graph in {}sec'.format(tm())) + tm = timer() + J = hstack(jacs) + pif('hstack in {}sec'.format(tm())) + return J + + def on_changed(self, which): + global _giter + _giter += 1 + if 'x' in which: + pos = 0 + for idx, freevar in enumerate(self.free_variables): + sz = freevar.r.size + rng = np.arange(pos, pos+sz) + if isinstance(self.free_variables[idx], ch.Select): + # Deal with nested selects + selects = [] + a = self.free_variables[idx] + while isinstance(a, ch.Select): + selects.append(a.idxs) + a = a.a + newv = a.x.copy() + idxs = selects.pop() + while len(selects) > 0: + idxs = idxs[selects.pop()] + newv.ravel()[idxs] = self.x.r.ravel()[rng] + a.__setattr__('x', newv, _giter) + elif isinstance(self.free_variables[idx].x, np.ndarray): + self.free_variables[idx].__setattr__('x', self.x.r[rng].copy().reshape(self.free_variables[idx].x.shape), _giter) + else: # a number + self.free_variables[idx].__setattr__('x', self.x.r[rng], _giter) + pos += sz + + @property + def J(self): + ''' + Compute Jacobian. Analyze dr graph first to disable unnecessary caching + ''' + result = self.dr_wrt(self.x, profiler=self.profiler).copy() + if self.profiler: + self.profiler.harvest() + return np.atleast_2d(result) if not sp.issparse(result) else result + + +def setup_sparse_solver(sparse_solver): + _solver_fns = { + 'cg': lambda A, x, M=None : sp.linalg.cg(A, x, M=M, tol=1e-10)[0], + 'spsolve': lambda A, x : sp.linalg.spsolve(A, x) + } + if callable(sparse_solver): + return sparse_solver + elif isinstance(sparse_solver, str) and sparse_solver in _solver_fns.keys(): + return _solver_fns[sparse_solver] + else: + raise Exception('sparse_solver argument must be either a string in the set (%s) or have the api of scipy.sparse.linalg.spsolve.' % ', '.join(_solver_fns.keys())) + + +def setup_objective(obj, free_variables, on_step=None, disp=True, make_dense=False): + ''' + obj here can be a list of ch objects or a dict of label: ch objects. Either way, the ch + objects will be merged into one objective using a ChInputsStacked. The labels are just used + for printing out values per objective with each iteration. 
If make_dense is True, the + resulting object will return a dense Jacobian + ''' + # Validate free variables + num_unique_ids = len(np.unique(np.array([id(freevar) for freevar in free_variables]))) + if num_unique_ids != len(free_variables): + raise Exception('The "free_variables" param contains duplicate variables.') + # Extract labels + labels = {} + if isinstance(obj, list) or isinstance(obj, tuple): + obj = ch.concatenate([f.ravel() for f in obj]) + elif isinstance(obj, dict): + labels = obj + obj = ch.concatenate([f.ravel() for f in obj.values()]) + # build objective + x = np.concatenate([freevar.r.ravel() for freevar in free_variables]) + obj = ChInputsStacked(obj=obj, free_variables=free_variables, x=x, make_dense=make_dense) + # build callback + def callback(): + if on_step is not None: + on_step(obj) + if disp: + report_line = ['%.2e' % (np.sum(obj.r**2),)] + for label, objective in sorted(labels.items(), key=lambda x: x[0]): + report_line.append('%s: %.2e' % (label, np.sum(objective.r**2))) + report_line = " | ".join(report_line) + '\n' + sys.stderr.write(report_line) + return obj, callback + + +class DoglegState(object): + ''' + Dogleg preserves a great deal of state from iteration to iteration. Many of the things + that we need to calculate are dependent only on this state (e.g. the various trust region + steps, the current jacobian and the A & g that depend on it, etc.). Holding the state and + the various methods based on that state here allows us to separate a lot of the jacobian + based calculation from the flow control of the optimization. + + There will be one instance of DoglegState per invocation of minimize_dogleg. + ''' + def __init__(self, delta, solve): + self.iteration = 0 + self._d_gn = None # gauss-newton + self._d_sd = None # steepest descent + self._d_dl = None # dogleg + self.J = None + self.A = None + self.g = None + self._p = None + self.delta = delta + self.solve = solve + self._r = None + self.rho = None + self.done = False + + @property + def p(self): + '''p is the current proposed input vector''' + return self._p + @p.setter + def p(self, val): + self._p = val.reshape((-1, 1)) + + # induce some certainty about what the shapes of the steps are + @property + def d_gn(self): + return self._d_gn + @d_gn.setter + def d_gn(self, val): + if val is not None: + val = val.reshape((-1, 1)) + self._d_gn = val + + @property + def d_sd(self): + return self._d_sd + @d_sd.setter + def d_sd(self, val): + if val is not None: + val = val.reshape((-1, 1)) + self._d_sd = val + + @property + def d_dl(self): + return self._d_dl + @d_dl.setter + def d_dl(self, val): + if val is not None: + val = val.reshape((-1, 1)) + self._d_dl = val + + @property + def step(self): + return self.d_dl.reshape((-1, 1)) + @property + def step_size(self): + return np.linalg.norm(self.d_dl) + + def start_iteration(self): + self.iteration += 1 + pif('beginning iteration %d' % (self.iteration,)) + self.d_sd = (np.linalg.norm(self.g)**2 / np.linalg.norm(self.J.dot(self.g))**2 * self.g).ravel() + self.d_gn = None + + @property + def r(self): + '''r is the residual at the current p''' + return self._r + @r.setter + def r(self, val): + self._r = val.copy().reshape((-1, 1)) + self.updateAg() + + def updateAg(self): + tm = timer() + pif('updating A and g...') + JT = self.J.T + self.A = JT.dot(self.J) + self.g = JT.dot(-self.r).reshape((-1, 1)) + pif('A and g updated in %.2fs' % tm()) + + def update_step(self): + # if the Cauchy point is outside the trust region, + # take that direction but only to the edge
of the trust region + if self.delta is not None and np.linalg.norm(self.d_sd) >= self.delta: + pif('PROGRESS: Using stunted cauchy') + self.d_dl = np.array(self.delta/np.linalg.norm(self.d_sd) * self.d_sd).ravel() + else: + if self.d_gn is None: + # We only need to compute this once per iteration + self.updateGN() + # if the gauss-newton solution is within the trust region, use it + if self.delta is None or np.linalg.norm(self.d_gn) <= self.delta: + pif('PROGRESS: Using gauss-newton solution') + self.d_dl = np.array(self.d_gn).ravel() + if self.delta is None: + self.delta = np.linalg.norm(self.d_gn) + else: # between cauchy step and gauss-newton step + pif('PROGRESS: between cauchy and gauss-newton') + # apply step + self.d_dl = self.d_sd + self.beta_multiplier * (self.d_gn - self.d_sd) + + @property + def beta_multiplier(self): + delta_sq = self.delta**2 + diff = self.d_gn - self.d_sd + sqnorm_sd = np.linalg.norm(self.d_sd)**2 + pnow = diff.T.dot(diff)*delta_sq + self.d_gn.T.dot(self.d_sd)**2 - np.linalg.norm(self.d_gn)**2 * sqnorm_sd + return float(delta_sq - sqnorm_sd) / float((diff).T.dot(self.d_sd) + np.sqrt(pnow)) + + def updateGN(self): + tm = timer() + if sp.issparse(self.A): + self.A.eliminate_zeros() + pif('sparse solve...sparsity infill is %.3f%% (hessian %dx%d)' % (100. * self.A.nnz / (self.A.shape[0] * self.A.shape[1]), self.A.shape[0], self.A.shape[1])) + if self.g.size > 1: + self.d_gn = self.solve(self.A, self.g).ravel() + if np.any(np.isnan(self.d_gn)) or np.any(np.isinf(self.d_gn)): + from scipy.sparse.linalg import lsqr + warnings.warn("sparse solve failed, falling back to lsqr") + self.d_gn = lsqr(self.A, self.g)[0].ravel() + else: + self.d_gn = np.atleast_1d(self.g.ravel()[0]/self.A[0,0]) + pif('sparse solve...done in %.2fs' % tm()) + else: + pif('dense solve...') + try: + self.d_gn = np.linalg.solve(self.A, self.g).ravel() + except Exception: + warnings.warn("dense solve failed, falling back to lsqr") + self.d_gn = np.linalg.lstsq(self.A, self.g)[0].ravel() + pif('dense solve...done in %.2fs' % tm()) + + def updateJ(self, obj): + tm = timer() + pif('computing Jacobian...') + self.J = obj.J + if self.J is None: + raise Exception("Computing Jacobian failed!") + if sp.issparse(self.J): + tm2 = timer() + self.J = self.J.tocsr() + pif('converted to csr in {}secs'.format(tm2())) + assert(self.J.nnz > 0) + elif ch.VERBOSE: + nonzero = np.count_nonzero(self.J) + pif('Jacobian dense with sparsity %.3f' % (nonzero/self.J.size)) + pif('Jacobian (%dx%d) computed in %.2fs' % (self.J.shape[0], self.J.shape[1], tm())) + if self.J.shape[1] != self.p.size: + raise Exception('Jacobian size mismatch with objective input') + return self.J + + class Trial(object): + ''' + Inside each iteration of dogleg we propose a step and check to see if it's actually + an improvement before we accept it. This class encapsulates that trial and the + testing to see if it is actually an improvement. + + There will be one instance of Trial per iteration in dogleg. + ''' + def __init__(self, proposed_r, state): + self.r = proposed_r + self.state = state + # rho is the ratio of... + # (improvement in SSE) / (predicted improvement in SSE) + self.rho = np.linalg.norm(state.r)**2 - np.linalg.norm(proposed_r)**2 + if self.rho > 0: + with warnings.catch_warnings(): + warnings.filterwarnings('ignore',category=RuntimeWarning) + predicted_improvement = 2. 
* state.g.T.dot(state.d_dl) - state.d_dl.T.dot(state.A.dot(state.d_dl)) + self.rho /= predicted_improvement + + @property + def is_improvement(self): + return self.rho > 0 + + @property + def improvement(self): + return (np.linalg.norm(self.state.r)**2 - np.linalg.norm(self.r)**2) / np.linalg.norm(self.state.r)**2 + + def trial_r(self, proposed_r): + return self.Trial(proposed_r, self) + + def updateRadius(self, rho, lb=.05, ub=.9): + if rho > ub: + self.delta = max(self.delta, 2.5*np.linalg.norm(self.d_dl)) + elif rho < lb: + self.delta *= .25 + + +def minimize_dogleg(obj, free_variables, on_step=None, + maxiter=200, max_fevals=np.inf, sparse_solver='spsolve', + disp=True, e_1=1e-15, e_2=1e-15, e_3=0., delta_0=None, + treat_as_dense=False): + """Nonlinear optimization using Powell's dogleg method. + See Lourakis et al, 2005, ICCV '05, "Is Levenberg-Marquardt the + Most Efficient Optimization for Implementing Bundle Adjustment?": + http://www.ics.forth.gr/cvrl/publications/conferences/0201-P0401-lourakis-levenberg.pdf + + e_N are stopping conditions: + e_1 is the gradient magnitude threshold + e_2 is the step size magnitude threshold + e_3 is the improvement threshold (as a ratio; 0.1 means it must improve by 10% at each step) + + maxiter and max_fevals are also stopping conditions. Note that they're not quite the same, + as an iteration may evaluate the function more than once. + + sparse_solver is the solver to use to calculate the Gauss-Newton step in the common case + that the Jacobian is sparse. It can be 'spsolve' (in which case scipy.sparse.linalg.spsolve + will be used), 'cg' (in which case scipy.sparse.linalg.cg will be used), or any callable + that matches the api of scipy.sparse.linalg.spsolve to solve `A x = b` for x where A is sparse. + + cg uses a conjugate gradient method and will be faster if A is sparse but x is dense. + spsolve will be faster if x is also sparse. + + delta_0 defines the initial trust region. Generally speaking, if this is set too low then + the optimization will never really go anywhere (too small a trust region to make any real + progress before running out of iterations) and if it's set too high then the optimization + will diverge immediately and go wild (so large a trust region that the initial step + overshoots so far that it can't recover). If it's left as None, it will be automatically + estimated on the first iteration; it's always updated at each iteration, so this is treated + only as an initialization.
+ + treat_as_dense explicitly converts all Jacobians of obj to dense matrices + """ + + + solve = setup_sparse_solver(sparse_solver) + obj, callback = setup_objective(obj, free_variables, on_step=on_step, disp=disp, + make_dense=treat_as_dense) + + state = DoglegState(delta=delta_0, solve=solve) + state.p = obj.x.r + + # inject profiler if in DEBUG mode + if ch.DEBUG: + from .monitor import DrWrtProfiler + obj.profiler = DrWrtProfiler(obj) + + callback() + state.updateJ(obj) + state.r = obj.r + + def stop(msg): + if not state.done: + pif(msg) + state.done = True + + if np.linalg.norm(state.g, np.inf) < e_1: + stop('stopping because norm(g, np.inf) < %.2e' % e_1) + while not state.done: + state.start_iteration() + while True: + state.update_step() + if state.step_size <= e_2 * np.linalg.norm(state.p): + stop('stopping because of small step size (norm_dl < %.2e)' % (e_2 * np.linalg.norm(state.p))) + else: + tm = timer() + obj.x = state.p + state.step + trial = state.trial_r(obj.r) + pif('Residuals computed in %.2fs' % tm()) + # if the objective function improved, update input parameter estimate. + # Note that the obj.x already has the new parms, + # and we should not set them again to the same (or we'll bust the cache) + if trial.is_improvement: + state.p = state.p + state.step + callback() + if e_3 > 0. and trial.improvement < e_3: + stop('stopping because improvement < %.1e%%' % (100*e_3)) + else: + state.updateJ(obj) + state.r = trial.r + if np.linalg.norm(state.g, np.inf) < e_1: + stop('stopping because norm(g, np.inf) < %.2e' % e_1) + else: # Put the old parms back + obj.x = ch.Ch(state.p) + obj.on_changed('x') # copies from flat vector to free variables + # update our trust region + state.updateRadius(trial.rho) + if state.delta <= e_2*np.linalg.norm(state.p): + stop('stopping because trust region is too small') + if state.done or trial.is_improvement or (obj.fevals >= max_fevals): + break + if state.iteration >= maxiter: + stop('stopping because max number of user-specified iterations (%d) has been met' % maxiter) + elif obj.fevals >= max_fevals: + stop('stopping because max number of user-specified func evals (%d) has been met' % max_fevals) + return obj.free_variables
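To make the new flow concrete, here is a sketch of driving the internal dogleg entry point directly; the toy residuals are illustrative, and ordinary callers would go through minimize(..., method='dogleg') in chumpy/optimization.py instead:

    import numpy as np
    import chumpy as ch
    from chumpy.optimization_internal import minimize_dogleg

    x = ch.Ch(np.array([3., 1.]))
    # Two toy residual terms; the objective is the sum of their squares.
    obj = {'e1': x[0] ** 2 + x[1] ** 2 - 1., 'e2': x[0] - x[1]}
    minimize_dogleg(obj, free_variables=[x], maxiter=50,
                    sparse_solver='spsolve', disp=False)
    print(x.r)  # both entries should approach 1/sqrt(2)

Passing a dict exercises the per-label reporting wired up in setup_objective; a plain list of residual terms works as well.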
diff --git a/optional_test_performance.py b/chumpy/optional_test_performance.py similarity index 100% rename from optional_test_performance.py rename to chumpy/optional_test_performance.py diff --git a/reordering.py b/chumpy/reordering.py similarity index 93% rename from reordering.py rename to chumpy/reordering.py index 71cdb21..1220349 100644 --- a/reordering.py +++ b/chumpy/reordering.py @@ -288,11 +288,11 @@ def compute_r(self): def compute_dr_wrt(self, obj): if obj is self.a: if not hasattr(self, '_dr_cached'): - IS = np.arange(len(self.idxs)).flatten() + IS = np.arange(len(self.idxs)) JS = self.idxs.ravel() ij = np.vstack((row(IS), row(JS))) data = np.ones(len(self.idxs)) - self._dr_cached = sp.csc_matrix((data, ij), shape=(len(self.idxs), len(self.a.r.ravel()))) + self._dr_cached = sp.csc_matrix((data, ij), shape=(len(self.idxs), np.prod(self.a.shape))) return self._dr_cached def on_changed(self, which): @@ -383,7 +383,9 @@ def everything(self): return self._everything def compute_dr_wrt(self, wrt): - if wrt in self.dr_cached: + if not hasattr(self, 'dr_cached'): + self.dr_cached = weakref.WeakKeyDictionary() + if wrt in self.dr_cached and self.dr_cached[wrt] is not None: return self.dr_cached[wrt] if wrt not in self.our_terms: @@ -408,8 +410,15 @@ def compute_dr_wrt(self, wrt): JS = np.concatenate(JS).ravel() data = np.concatenate(data) - self.dr_cached[wrt] = sp.csc_matrix((data, (IS, JS)), shape=(self.r.size, wrt.size)) - return self.dr_cached[wrt] + res = sp.csc_matrix((data, (IS, JS)), shape=(self.r.size, wrt.size)) + + if len(self._parents.keys()) != 1: + self.dr_cached[wrt] = res + else: + self.dr_cached[wrt] = None + + return res + def expand_concatenates(mtxs, axis=0): mtxs = list(mtxs) @@ -423,11 +432,11 @@ def expand_concatenates(mtxs, axis=0): return done -def concatenate(mtxs, axis=0): +def concatenate(mtxs, axis=0, **kwargs): mtxs = expand_concatenates(mtxs, axis) - result = Concatenate() + result = Concatenate(**kwargs) result.dterms = [] for i, mtx in enumerate(mtxs): result.dterms.append('m%d' % (i,)) @@ -435,11 +444,11 @@ result.axis = axis return result -def hstack(mtxs): - return concatenate(mtxs, axis=1) +def hstack(mtxs, **kwargs): + return concatenate(mtxs, axis=1, **kwargs) -def vstack(mtxs): - return concatenate([atleast_2d(m) for m in mtxs], axis=0) +def vstack(mtxs, **kwargs): + return concatenate([atleast_2d(m) for m in mtxs], axis=0, **kwargs) -def dstack(mtxs): - return concatenate([atleast_3d(m) for m in mtxs], axis=2) +def dstack(mtxs, **kwargs): + return concatenate([atleast_3d(m) for m in mtxs], axis=2, **kwargs) diff --git a/test_ch.py b/chumpy/test_ch.py similarity index 99% rename from test_ch.py rename to chumpy/test_ch.py index 689e687..85b6bfe 100755 --- a/test_ch.py +++ b/chumpy/test_ch.py @@ -123,8 +123,8 @@ def test_dr_wrt_selection(self): def test_sum_mean_std_var(self): - for fn in [ch.sum, ch.mean,ch.var, ch.std]: - + for fn in [ch.sum, ch.mean, ch.var, ch.std]: + # Create fake input and differences in input space data1 = ch.ones((3,4,7,2)) data2 = ch.array(data1.r + .1 * np.random.rand(data1.size).reshape(data1.shape)) @@ -137,20 +137,20 @@ def test_sum_mean_std_var(self): # Empirical and predicted derivatives gt = result2.r - result1.r pred = result1.dr_wrt(data1).dot(diff.ravel()).reshape(gt.shape) - + #print np.max(np.abs(gt - pred)) - + if fn in [ch.std, ch.var]: - self.assertTrue(1e-2 > np.max(np.abs(gt - pred))) + self.assertTrue(1e-2 > np.max(np.abs(gt - pred))) else: - self.assertTrue(1e-14 > np.max(np.abs(gt - pred))) + self.assertTrue(1e-14 > np.max(np.abs(gt - pred))) # test caching dr0 = result1.dr_wrt(data1) data1[:] = np.random.randn(data1.size).reshape(data1.shape) self.assertTrue(result1.dr_wrt(data1) is dr0) # changing values shouldn't force recompute result1.axis=1 self.assertTrue(result1.dr_wrt(data1) is not dr0) - + self.assertEqual(ch.mean(ch.eye(3),axis=1).ndim, np.mean(np.eye(3),axis=1).ndim) self.assertEqual(ch.mean(ch.eye(3),axis=0).ndim, np.mean(np.eye(3),axis=0).ndim) self.assertEqual(ch.sum(ch.eye(3),axis=1).ndim, np.sum(np.eye(3),axis=1).ndim) diff --git a/test_inner_composition.py b/chumpy/test_inner_composition.py similarity index 100% rename from test_inner_composition.py rename to chumpy/test_inner_composition.py diff --git a/test_linalg.py b/chumpy/test_linalg.py similarity index 100% rename from test_linalg.py rename to chumpy/test_linalg.py diff --git a/test_optimization.py b/chumpy/test_optimization.py similarity index 83% rename from test_optimization.py rename to chumpy/test_optimization.py index 349b6a2..d752acb 100755 --- a/test_optimization.py +++ b/chumpy/test_optimization.py @@ -123,13 +123,13 @@ class TestOptimization(unittest.TestCase): def test_dogleg_rosen(self): obj, freevars = Rosen() - minimize(fun=obj, x0=freevars, method='dogleg', options={'maxiter': 337}) + 
minimize(fun=obj, x0=freevars, method='dogleg', options={'maxiter': 337, 'disp': False}) self.assertTrue(freevars[0].r[0]==1.) self.assertTrue(freevars[1].r[0]==1.) def test_dogleg_madsen(self): obj = Madsen(x = Ch(np.array((3.,1.)))) - minimize(fun=obj, x0=[obj.x], method='dogleg', options={'maxiter': 34}) + minimize(fun=obj, x0=[obj.x], method='dogleg', options={'maxiter': 34, 'disp': False}) self.assertTrue(np.sum(obj.r**2)/2 < 0.386599528247) @unittest.skip('negative sign in exponent screws with reverse mode') @@ -161,12 +161,33 @@ def gradfunc(x): #tm = time.time() x1 = scipy.optimize.fmin_bfgs(errfunc, x0, fprime=gradfunc, maxiter=8, disp=0) #print 'forward: took %.es' % (time.time() - tm,) - self.assertTrue(obj.r/2. < 0.386599528247) + self.assertLess(obj.r/2., 0.4) # Optimize with chumpy's minimize (which uses scipy's bfgs). obj.x = x0 - minimize(fun=obj, x0=[obj.x], method='bfgs', options={'maxiter': 8}) - self.assertTrue(obj.r/2. < 0.386599528247) + minimize(fun=obj, x0=[obj.x], method='bfgs', options={'maxiter': 8, 'disp': False}) + self.assertLess(obj.r/2., 0.4) + + def test_nested_select(self): + def beales(x, y): + e1 = 1.5 - x + x*y + e2 = 2.25 - x + x*(y**2) + e3 = 2.625 - x + x*(y**3) + return {'e1': e1, 'e2': e2, 'e3': e3} + + x1 = ch.zeros(10) + y1 = ch.zeros(10) + + # With a single select this worked + minimize(beales(x1, y1), x0=[x1[1:4], y1], method='dogleg', options={'disp': False}) + + x2 = ch.zeros(10) + y2 = ch.zeros(10) + + # But this used to raise `AttributeError: 'Select' object has no attribute 'x'` + minimize(beales(x2, y2), x0=[x2[1:8][:3], y2], method='dogleg', options={'disp': False}) + np.testing.assert_array_equal(x1, x2) + np.testing.assert_array_equal(y1, y2) suite = unittest.TestLoader().loadTestsFromTestCase(TestOptimization) diff --git a/testing.py b/chumpy/testing.py similarity index 100% rename from testing.py rename to chumpy/testing.py diff --git a/chumpy/utils.py b/chumpy/utils.py new file mode 100644 index 0000000..19287d3 --- /dev/null +++ b/chumpy/utils.py @@ -0,0 +1,93 @@ +""" +Author(s): Matthew Loper + +See LICENCE.txt for licensing and contact information. +""" +import scipy.sparse as sp +import numpy as np + +def row(A): + return A.reshape((1, -1)) + + +def col(A): + return A.reshape((-1, 1)) + +class timer(object): + def time(self): + import time + return time.time() + def __init__(self): + self._elapsed = 0 + self._start = self.time() + def __call__(self): + if self._start is not None: + return self._elapsed + self.time() - self._start + else: + return self._elapsed + def pause(self): + assert self._start is not None + self._elapsed += self.time() - self._start + self._start = None + def resume(self): + assert self._start is None + self._start = self.time() + +def dfs_do_func_on_graph(node, func, *args, **kwargs): + ''' + invoke func on each node of the dr graph + ''' + for _node in node.tree_iterator(): + func(_node, *args, **kwargs) + + +def sparse_is_desireable(lhs, rhs): + ''' + Examines a pair of matrices and determines if the result of their multiplication should be sparse or not. 
+ ''' + return False + if len(lhs.shape) == 1: + return False + else: + lhs_rows, lhs_cols = lhs.shape + + if len(rhs.shape) == 1: + rhs_rows = 1 + rhs_cols = rhs.size + else: + rhs_rows, rhs_cols = rhs.shape + + result_size = lhs_rows * rhs_cols + + if sp.issparse(lhs) and sp.issparse(rhs): + return True + elif sp.issparse(lhs): + lhs_zero_rows = lhs_rows - np.unique(lhs.nonzero()[0]).size + rhs_zero_cols = np.all(rhs==0, axis=0).sum() + + elif sp.issparse(rhs): + lhs_zero_rows = np.all(lhs==0, axis=1).sum() + rhs_zero_cols = rhs_cols- np.unique(rhs.nonzero()[1]).size + else: + lhs_zero_rows = np.all(lhs==0, axis=1).sum() + rhs_zero_cols = np.all(rhs==0, axis=0).sum() + + num_zeros = lhs_zero_rows * rhs_cols + rhs_zero_cols * lhs_rows - lhs_zero_rows * rhs_zero_cols + + # A sparse matrix uses roughly 16 bytes per nonzero element (8 + 2 4-byte inds), while a dense matrix uses 8 bytes per element. So the break even point for sparsity is 50% nonzero. But in practice, it seems to be that the compression in a csc or csr matrix gets us break even at ~65% nonzero, which lets us say 50% is a conservative, worst cases cutoff. + return (float(num_zeros) / float(size)) >= 0.5 + + +def convert_inputs_to_sparse_if_necessary(lhs, rhs): + ''' + This function checks to see if a sparse output is desireable given the inputs and if so, casts the inputs to sparse in order to make it so. + ''' + if not sp.issparse(lhs) or not sp.issparse(rhs): + if sparse_is_desireable(lhs, rhs): + if not sp.issparse(lhs): + lhs = sp.csc_matrix(lhs) + #print "converting lhs into sparse matrix" + if not sp.issparse(rhs): + rhs = sp.csc_matrix(rhs) + #print "converting rhs into sparse matrix" + return lhs, rhs diff --git a/version.py b/chumpy/version.py similarity index 71% rename from version.py rename to chumpy/version.py index bb53753..30ceaf1 100644 --- a/version.py +++ b/chumpy/version.py @@ -1,3 +1,3 @@ -version = '0.66' +version = '0.67.1' short_version = version full_version = version diff --git a/circle.yml b/circle.yml new file mode 100644 index 0000000..0300127 --- /dev/null +++ b/circle.yml @@ -0,0 +1,18 @@ +general: + branches: + ignore: + - /zz.*/ # Don't run tests on deprecated branches. + +machine: + environment: + PYTHONPATH: /usr/local/lib/python2.7/dist-packages + +dependencies: + pre: + - sudo apt-get update + # Is gfortran needed? This was copied from `.travis.yml` which included it. + - sudo apt-get install -qq python-dev gfortran pkg-config liblapack-dev + +test: + override: + - make test diff --git a/optimization.py b/optimization.py deleted file mode 100755 index 16d6ef9..0000000 --- a/optimization.py +++ /dev/null @@ -1,526 +0,0 @@ -#!/usr/bin/env python - -""" -Author(s): Matthew Loper - -See LICENCE.txt for licensing and contact information. 
-""" - -__all__ = ['minimize'] - -import time -import math -import sys -import time -import numpy as np -from numpy.linalg import norm - -import ch, utils -from utils import row, col - -import scipy.sparse as sp -import scipy.sparse -import scipy.optimize -from scipy.sparse.linalg.interface import LinearOperator - - - -def vstack(x): - x = [a if not isinstance(a, LinearOperator) else a.dot(np.eye(a.shape[1])) for a in x] - return sp.vstack(x, format='csc') if any([sp.issparse(a) for a in x]) else np.vstack(x) -def hstack(x): - x = [a if not isinstance(a, LinearOperator) else a.dot(np.eye(a.shape[1])) for a in x] - return sp.hstack(x, format='csc') if any([sp.issparse(a) for a in x]) else np.hstack(x) - - -# Nelder-Mead -# Powell -# CG -# BFGS -# Newton-CG -# Anneal -# L-BFGS-B -# TNC -# COBYLA -# SLSQP -# dogleg -# trust-ncg -def minimize(fun, x0, method='dogleg', bounds=None, constraints=(), tol=None, callback=None, options=None): - - if method == 'dogleg': - if options is None: options = {} - return _minimize_dogleg(fun, free_variables=x0, on_step=callback, **options) - - if isinstance(fun, list) or isinstance(fun, tuple): - fun = ch.concatenate([f.ravel() for f in fun]) - if isinstance(fun, dict): - fun = ch.concatenate([f.ravel() for f in fun.values()]) - obj = fun - free_variables = x0 - - - from ch import SumOfSquares - - hessp = None - hess = None - if obj.size == 1: - obj_scalar = obj - else: - obj_scalar = SumOfSquares(obj) - - def hessp(vs, p,obj, obj_scalar, free_variables): - changevars(vs,obj,obj_scalar,free_variables) - if not hasattr(hessp, 'vs'): - hessp.vs = vs*0+1e16 - if np.max(np.abs(vs-hessp.vs)) > 0: - - J = ns_jacfunc(vs,obj,obj_scalar,free_variables) - hessp.J = J - hessp.H = 2. * J.T.dot(J) - hessp.vs = vs - return np.array(hessp.H.dot(p)).ravel() - #return 2*np.array(hessp.J.T.dot(hessp.J.dot(p))).ravel() - - if method.lower() != 'newton-cg': - def hess(vs, obj, obj_scalar, free_variables): - changevars(vs,obj,obj_scalar,free_variables) - if not hasattr(hessp, 'vs'): - hessp.vs = vs*0+1e16 - if np.max(np.abs(vs-hessp.vs)) > 0: - J = ns_jacfunc(vs,obj,obj_scalar,free_variables) - hessp.H = 2. 
-
-    def changevars(vs, obj, obj_scalar, free_variables):
-        cur = 0
-        changed = False
-        for idx, freevar in enumerate(free_variables):
-            sz = freevar.r.size
-            newvals = vs[cur:cur+sz].copy().reshape(free_variables[idx].shape)
-            if np.max(np.abs(newvals-free_variables[idx]).ravel()) > 0:
-                free_variables[idx][:] = newvals
-                changed = True
-            cur += sz
-
-        methods_without_callback = ('anneal', 'powell', 'cobyla', 'slsqp')
-        if callback is not None and changed and method.lower() in methods_without_callback:
-            callback(None)
-
-        return changed
-
-    def residuals(vs, obj, obj_scalar, free_variables):
-        changevars(vs, obj, obj_scalar, free_variables)
-        residuals = obj_scalar.r.ravel()[0]
-        return residuals
-
-    def scalar_jacfunc(vs, obj, obj_scalar, free_variables):
-        if not hasattr(scalar_jacfunc, 'vs'):
-            scalar_jacfunc.vs = vs*0+1e16
-        if np.max(np.abs(vs-scalar_jacfunc.vs)) == 0:
-            return scalar_jacfunc.J
-
-        changevars(vs, obj, obj_scalar, free_variables)
-
-        if True: # faster, at least on some problems
-            result = np.concatenate([np.array(obj_scalar.lop(wrt, np.array([[1]]))).ravel() for wrt in free_variables])
-        else:
-            jacs = [obj_scalar.dr_wrt(wrt) for wrt in free_variables]
-            for idx, jac in enumerate(jacs):
-                if sp.issparse(jac):
-                    jacs[idx] = jacs[idx].todense()
-            result = np.concatenate([jac.ravel() for jac in jacs])
-
-        scalar_jacfunc.J = result
-        scalar_jacfunc.vs = vs
-        return result.ravel()
-
-    def ns_jacfunc(vs, obj, obj_scalar, free_variables):
-        if not hasattr(ns_jacfunc, 'vs'):
-            ns_jacfunc.vs = vs*0+1e16
-        if np.max(np.abs(vs-ns_jacfunc.vs)) == 0:
-            return ns_jacfunc.J
-
-        changevars(vs, obj, obj_scalar, free_variables)
-        jacs = [obj.dr_wrt(wrt) for wrt in free_variables]
-        result = hstack(jacs)
-
-        ns_jacfunc.J = result
-        ns_jacfunc.vs = vs
-        return result
-
-    x1 = scipy.optimize.minimize(
-        method=method,
-        fun=residuals,
-        callback=callback,
-        x0=np.concatenate([free_variable.r.ravel() for free_variable in free_variables]),
-        jac=scalar_jacfunc,
-        hessp=hessp, hess=hess, args=(obj, obj_scalar, free_variables),
-        bounds=bounds, constraints=constraints, tol=tol, options=options).x
-
-    changevars(x1, obj, obj_scalar, free_variables)
-    return free_variables
-
-
-_giter = 0
-class ChInputsStacked(ch.Ch):
-    dterms = 'x', 'obj'
-    terms = 'free_variables'
-
-    def compute_r(self):
-        return self.obj.r.ravel()
-
-    # def compute_dr_wrt(self, wrt):
-    #     if wrt is self.x:
-    #         return hstack([self.obj.dr_wrt(freevar) for freevar in self.free_variables])
-
-    def dr_wrt(self, wrt):
-        if wrt is self.x:
-            mtxs = []
-            for freevar in self.free_variables:
-                if isinstance(freevar, ch.Select):
-                    new_mtx = self.obj.dr_wrt(freevar.a)
-                    try:
-                        mtxs.append(new_mtx[:,freevar.idxs])
-                    except:
-                        mtxs.append(new_mtx.tocsc()[:,freevar.idxs])
-                else:
-                    mtxs.append(self.obj.dr_wrt(freevar))
-            return hstack(mtxs)
-            #return hstack([self.obj.dr_wrt(freevar) for freevar in self.free_variables])
-
-    def on_changed(self, which):
-        global _giter
-        _giter += 1
-
-        if 'x' in which:
-            pos = 0
-            for idx, freevar in enumerate(self.free_variables):
-                sz = freevar.r.size
-                rng = np.arange(pos, pos+sz)
-
-                if isinstance(self.free_variables[idx], ch.Select):
-                    newv = self.free_variables[idx].a.x.copy()
-                    newv.ravel()[self.free_variables[idx].idxs] = self.x.r[rng]
-                    self.free_variables[idx].a.__setattr__('x', newv, _giter)
-                    #self.free_variables[idx].a.x = newv
-                elif isinstance(self.free_variables[idx].x, np.ndarray):
-                    #self.free_variables[idx].x = self.x.r[rng].copy().reshape(self.free_variables[idx].x.shape)
-                    self.free_variables[idx].__setattr__('x', self.x.r[rng].copy().reshape(self.free_variables[idx].x.shape), _giter)
-                else: # a number
-                    #self.free_variables[idx].x = self.x.r[rng]
-                    self.free_variables[idx].__setattr__('x', self.x.r[rng], _giter)
-                    #self.free_variables[idx] = self.obj.replace(freevar, Ch(self.x.r[rng].copy()))
-                pos += sz
-
-    @property
-    def J(self):
-        result = self.dr_wrt(self.x).copy()
-        return np.atleast_2d(result) if not sp.issparse(result) else result
-
-    def JT_dot(self, y):
-        return self.J.T.dot(y)
-
-    def J_dot(self, y):
-        return self.J.dot(y)
-
-    # Have not observed this to be faster than just using J directly
-    def JTJ(self):
-        if False:
-            return self.J.T.dot(self.J)
-        else:
-            Js = [self.obj.dr_wrt(freevar) for freevar in self.free_variables]
-            zeroArray = [None]*len(Js)
-            A = [zeroArray[:] for i in range(len(Js))]
-            for y in range(len(Js)):
-                for x in range(len(Js)):
-                    if y > x:
-                        A[y][x] = A[x][y].T
-                    else:
-                        A[y][x] = Js[y].T.dot(Js[x])
-            return vstack([hstack(A[y]) for y in range(len(Js))])
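`JTJ` above exploits the symmetry of JᵀJ: each block Js[y].T.dot(Js[x]) is computed once for y <= x, and the mirrored block below the diagonal is filled with its transpose. The same pattern in isolation, with hypothetical dense blocks (editorial sketch):

    import numpy as np

    Js = [np.random.randn(5, 2), np.random.randn(5, 3)]  # per-variable Jacobian blocks
    n = len(Js)
    A = [[None] * n for _ in range(n)]
    for y in range(n):
        for x in range(n):
            # J^T J is symmetric, so lower-triangle blocks are transposes
            # of blocks already computed above the diagonal.
            A[y][x] = A[x][y].T if y > x else Js[y].T.dot(Js[x])
    JTJ = np.vstack([np.hstack(row) for row in A])

    assert np.allclose(JTJ, np.hstack(Js).T.dot(np.hstack(Js)))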
-
-
-_solver_fns = {
-    'cg': lambda A, x, M=None : scipy.sparse.linalg.cg(A, x, M=M, tol=1e-10)[0],
-    'spsolve': lambda A, x : scipy.sparse.linalg.spsolve(A, x)
-}
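For orientation before the solver body below: each dogleg iteration chooses between the steepest-descent (Cauchy) point `d_sd` and the Gauss-Newton step `d_gn`, blending the two at the trust-region boundary. This sketch paraphrases that selection rule with plain 1-D arrays; the deleted code computes the same boundary intersection via an algebraically equivalent rationalized form (editorial illustration, not the shipped code):

    import numpy as np
    from numpy.linalg import norm

    def dogleg_step(d_sd, d_gn, delta):
        if norm(d_sd) >= delta:
            # Cauchy point already outside the trust region: truncate it.
            return (delta / norm(d_sd)) * d_sd
        if norm(d_gn) <= delta:
            # Gauss-Newton step fits inside the trust region: take it whole.
            return d_gn
        # Otherwise walk from d_sd toward d_gn until the boundary: beta
        # solves ||d_sd + beta*(d_gn - d_sd)|| = delta (positive root).
        diff = d_gn - d_sd
        a = diff.dot(diff)
        b = 2. * d_sd.dot(diff)
        c = d_sd.dot(d_sd) - delta**2
        beta = (-b + np.sqrt(b*b - 4.*a*c)) / (2.*a)
        return d_sd + beta * diff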
-
-
-def _minimize_dogleg(obj, free_variables, on_step=None,
-                     maxiter=200, max_fevals=np.inf, sparse_solver='spsolve',
-                     disp=False, show_residuals=None, e_1=1e-15, e_2=1e-15, e_3=0., delta_0=None):
-
-    """Nonlinear optimization using Powell's dogleg method.
-
-    See Lourakis et al, 2005, ICCV '05, "Is Levenberg-Marquardt
-    the Most Efficient Optimization for Implementing Bundle
-    Adjustment?":
-    http://www.ics.forth.gr/cvrl/publications/conferences/0201-P0401-lourakis-levenberg.pdf
-    """
-
-    import warnings
-    if show_residuals is not None:
-        import warnings
-        warnings.warn('minimize_dogleg: show_residuals param is deprecated, pass a dict instead.')
-
-    labels = {}
-    if isinstance(obj, list) or isinstance(obj, tuple):
-        obj = ch.concatenate([f.ravel() for f in obj])
-    elif isinstance(obj, dict):
-        labels = obj
-        obj = ch.concatenate([f.ravel() for f in obj.values()])
-
-    niters = maxiter
-    verbose = disp
-    num_unique_ids = len(np.unique(np.array([id(freevar) for freevar in free_variables])))
-    if num_unique_ids != len(free_variables):
-        raise Exception('The "free_variables" param contains duplicate variables.')
-
-    obj = ChInputsStacked(obj=obj, free_variables=free_variables, x=np.concatenate([freevar.r.ravel() for freevar in free_variables]))
-
-    def call_cb():
-        if on_step is not None:
-            on_step(obj)
-
-        report_line = ""
-        if len(labels) > 0:
-            report_line += '%.2e | ' % (np.sum(obj.r**2),)
-        for label in sorted(labels.keys()):
-            objective = labels[label]
-            report_line += '%s: %.2e | ' % (label, np.sum(objective.r**2))
-        if len(labels) > 0:
-            report_line += '\n'
-        sys.stderr.write(report_line)
-
-    call_cb()
-
-    # pif = print-if-verbose.
-    # can't use "print" because it's a statement, not a fn
-    pif = lambda x: sys.stdout.write(x + '\n') if verbose else 0
-
-    if callable(sparse_solver):
-        solve = sparse_solver
-    elif isinstance(sparse_solver, str) and sparse_solver in _solver_fns.keys():
-        solve = _solver_fns[sparse_solver]
-    else:
-        raise Exception('sparse_solver argument must be either a string in the set (%s) or have the api of scipy.sparse.linalg.spsolve.'
-                        % ' '.join(_solver_fns.keys()))
-
-    # optimization parms
-    k_max = niters
-    fevals = 0
-
-    k = 0
-    delta = delta_0
-    p = col(obj.x.r)
-
-    fevals += 1
-
-    tm = time.time()
-    pif('computing Jacobian...')
-    J = obj.J
-
-    if sp.issparse(J):
-        assert(J.nnz > 0)
-    pif('Jacobian (%dx%d) computed in %.2fs' % (J.shape[0], J.shape[1], time.time() - tm))
-
-    if J.shape[1] != p.size:
-        import pdb; pdb.set_trace()
-    assert(J.shape[1] == p.size)
-
-    tm = time.time()
-    pif('updating A and g...')
-    A = J.T.dot(J)
-    r = col(obj.r.copy())
-
-    g = col(J.T.dot(-r))
-    pif('A and g updated in %.2fs' % (time.time() - tm))
-
-    stop = norm(g, np.inf) < e_1
-    while (not stop) and (k < k_max) and (fevals < max_fevals):
-        k += 1
-        pif('beginning iteration %d' % (k,))
-        d_sd = col(sqnorm(g) / sqnorm(J.dot(g)) * g)
-        GNcomputed = False
-
-        while True:
-            # if the Cauchy point is outside the trust region,
-            # take that direction but only to the edge of the trust region
-            if delta is not None and norm(d_sd) >= delta:
-                pif('PROGRESS: Using stunted cauchy')
-                d_dl = np.array(col(delta/norm(d_sd) * d_sd))
-            else:
-                if not GNcomputed:
-                    tm = time.time()
-                    if scipy.sparse.issparse(A):
-                        A.eliminate_zeros()
-                        pif('sparse solve...sparsity infill is %.3f%% (hessian %dx%d), J infill %.3f%%' % (
-                            100. * A.nnz / (A.shape[0] * A.shape[1]),
-                            A.shape[0],
-                            A.shape[1],
-                            100. * J.nnz / (J.shape[0] * J.shape[1])))
-                        if g.size > 1:
-                            d_gn = col(solve(A, g))
-                            if np.any(np.isnan(d_gn)) or np.any(np.isinf(d_gn)):
-                                from scipy.sparse.linalg import lsqr
-                                d_gn = col(lsqr(A, g)[0])
-                        else:
-                            d_gn = np.atleast_1d(g.ravel()[0]/A[0,0])
-                        pif('sparse solve...done in %.2fs' % (time.time() - tm))
-                    else:
-                        pif('dense solve...')
-                        try:
-                            d_gn = col(np.linalg.solve(A, g))
-                        except Exception:
-                            d_gn = col(np.linalg.lstsq(A, g)[0])
-                        pif('dense solve...done in %.2fs' % (time.time() - tm))
-                    GNcomputed = True
-
-                # if the gauss-newton solution is within the trust region, use it
-                if delta is None or norm(d_gn) <= delta:
-                    pif('PROGRESS: Using gauss-newton solution')
-                    d_dl = np.array(d_gn)
-                    if delta is None:
-                        delta = norm(d_gn)
-                else: # between cauchy step and gauss-newton step
-                    pif('PROGRESS: between cauchy and gauss-newton')
-
-                    # compute beta multiplier
-                    delta_sq = delta**2
-                    pnow = (
-                        (d_gn-d_sd).T.dot(d_gn-d_sd)*delta_sq
-                        + d_gn.T.dot(d_sd)**2
-                        - sqnorm(d_gn) * sqnorm(d_sd))
-                    B = delta_sq - sqnorm(d_sd)
-                    B /= ((d_gn-d_sd).T.dot(d_sd) + math.sqrt(pnow))
-
-                    # apply step
-                    d_dl = np.array(d_sd + float(B) * (d_gn - d_sd))
-
-            #assert(math.fabs(norm(d_dl) - delta) < 1e-12)
-            if norm(d_dl) <= e_2*norm(p):
-                pif('stopping because of small step size (norm_dl < %.2e)' % (e_2*norm(p)))
-                stop = True
-            else:
-                p_new = p + d_dl
-
-                tm_residuals = time.time()
-                obj.x = p_new
-                fevals += 1
-
-                r_trial = obj.r.copy()
-                tm_residuals = time.time() - tm_residuals
-
-                # rho is the ratio of...
-                # (improvement in SSE) / (predicted improvement in SSE)
-
-                # slower:
-                #rho = norm(e_p)**2 - norm(e_p_trial)**2
-                #rho = rho / (L(d_dl*0, e_p, J) - L(d_dl, e_p, J))
-
-                # faster:
-                sqnorm_ep = sqnorm(r)
-                rho = sqnorm_ep - norm(r_trial)**2
-
-                with warnings.catch_warnings():
-                    warnings.filterwarnings('ignore', category=RuntimeWarning)
-                    if rho > 0:
-                        rho /= predicted_improvement(d_dl, -r, J, sqnorm_ep, A, g)
-
-                improvement_occurred = rho > 0
-                # if the objective function improved, update input parameter estimate.
-                # Note that the obj.x already has the new parms,
-                # and we should not set them again to the same (or we'll bust the cache)
-                if improvement_occurred:
-                    p = col(p_new)
-                    call_cb()
-
-                    if (sqnorm_ep - norm(r_trial)**2) / sqnorm_ep < e_3:
-                        stop = True
-                        pif('stopping because improvement < %.1e%%' % (100*e_3,))
-
-                else: # Put the old parms back
-                    obj.x = ch.Ch(p)
-                    obj.on_changed('x') # copies from flat vector to free variables
-
-                # if the objective function improved and we're not done,
-                # get ready for the next iteration
-                if improvement_occurred and not stop:
-                    tm_jac = time.time()
-                    pif('computing Jacobian...')
-                    J = obj.J.copy()
-                    tm_jac = time.time() - tm_jac
-                    pif('Jacobian (%dx%d) computed in %.2fs' % (J.shape[0], J.shape[1], tm_jac))
-
-                    pif('Residuals+Jac computed in %.2fs' % (tm_jac + tm_residuals,))
-
-                    tm = time.time()
-                    pif('updating A and g...')
-                    A = J.T.dot(J)
-                    r = col(r_trial)
-                    g = col(J.T.dot(-r))
-                    pif('A and g updated in %.2fs' % (time.time() - tm))
-
-                    if norm(g, np.inf) < e_1:
-                        stop = True
-                        pif('stopping because norm(g, np.inf) < %.2e' % (e_1))
-
-                # update our trust region
-                delta = updateRadius(rho, delta, d_dl)
-
-                if delta <= e_2*norm(p):
-                    stop = True
-                    pif('stopping because trust region is too small')
-
-            # the following "collect" is very expensive.
-            # please contact matt if you find situations where it actually helps things.
-            #import gc; gc.collect()
-            if stop or improvement_occurred or (fevals >= max_fevals):
-                break
-
-    if k >= k_max:
-        pif('stopping because max number of user-specified iterations (%d) has been met' % (k_max,))
-    elif fevals >= max_fevals:
-        pif('stopping because max number of user-specified func evals (%d) has been met' % (max_fevals,))
-
-    return obj.free_variables
-
-
-def sqnorm(a):
-    return norm(a)**2
-
-def updateRadius(rho, delta, d_dl, lb=.05, ub=.9):
-    if rho > ub:
-        delta = max(delta, 2.5*norm(d_dl))
-    elif rho < lb:
-        delta *= .25
-    return delta
-
-
-def predicted_improvement(d, e, J, sqnorm_e, JTJ, JTe):
-    d = col(d)
-    e = col(e)
-    aa = .5 * sqnorm_e
-    bb = JTe.T.dot(d)
-    c1 = .5 * d.T
-    c2 = JTJ
-    c3 = d
-    cc = c1.dot(c2.dot(c3))
-    result = 2. * (aa - bb + cc)[0,0]
-    return sqnorm_e - result
-
-
-def main():
-    pass
-
-
-if __name__ == '__main__':
-    main()
-
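The gain ratio `rho` above divides the actual drop in squared error by the drop predicted by the linearization r(p + d) ≈ r(p) + J d, and `updateRadius` grows or shrinks the trust region accordingly. A small numeric check that the expanded form used by `predicted_improvement` matches the direct linear-model computation (editorial sketch with made-up J, r, and d):

    import numpy as np
    from numpy.linalg import norm

    J = np.array([[1., .5], [0., 2.], [1., 1.]])
    r = np.array([[.3], [-.2], [.1]])   # column residual, as in the code above
    d = np.array([[-.05], [.04]])       # candidate step

    # Direct form: ||r||^2 - ||r + J d||^2
    direct = norm(r)**2 - norm(r + J.dot(d))**2

    # Expanded form from predicted_improvement(), with e = -r and JTe = J^T e
    # (the caller passes g = J^T(-r) for JTe):
    e = -r
    expanded = norm(e)**2 - 2. * (.5 * norm(e)**2 - (J.T.dot(e)).T.dot(d)
                                  + .5 * d.T.dot(J.T.dot(J)).dot(d))[0, 0]

    assert np.isclose(direct, expanded)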
diff --git a/requirements.txt b/requirements.txt
index 4402122..7125758 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,2 @@
-
--e .
+numpy>=1.8.1
+scipy>=0.13.0,<0.19.0
diff --git a/setup.py b/setup.py
index 69d0006..cc55079 100644
--- a/setup.py
+++ b/setup.py
@@ -5,19 +5,21 @@
 """
 from distutils.core import setup
-from version import version
+import importlib
+from pip.req import parse_requirements
+
+install_reqs = parse_requirements('requirements.txt', session=False)
+install_requires = [str(ir.req) for ir in install_reqs]
 
 setup(name='chumpy',
-      version=version,
-      #py_modules=['ch', 'ch_ops', 'linalg', 'utils', 'api_compatibility', 'ch_random', 'test_ch', 'test_inner_composition', 'test_linalg'],
+      version=importlib.import_module('chumpy').__version__,
       packages = ['chumpy'],
-      package_dir = {'chumpy': '.'},
       author='Matthew Loper',
       author_email='matt.loper@gmail.com',
       url='https://github.com/mattloper/chumpy',
       description='chumpy',
       license='MIT',
-      install_requires=['numpy >= 1.8.1', 'scipy >= 0.13.0'],
+      install_requires=install_requires,
 
       # See https://pypi.python.org/pypi?%3Aaction=list_classifiers
       classifiers=[
diff --git a/utils.py b/utils.py
deleted file mode 100644
index 18056d3..0000000
--- a/utils.py
+++ /dev/null
@@ -1,12 +0,0 @@
-"""
-Author(s): Matthew Loper
-
-See LICENCE.txt for licensing and contact information.
-"""
-
-def row(A):
-    return A.reshape((1, -1))
-
-
-def col(A):
-    return A.reshape((-1, 1))
\ No newline at end of file
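One caution on the setup.py change above: `pip.req.parse_requirements` is a pip-internal API, not a stable one, so builds pinned to other pip versions may prefer a dependency-free equivalent. A sketch that yields the same `install_requires` for a flat requirements file like this one (hypothetical helper, assuming one specifier per line):

    def parse_simple_requirements(path='requirements.txt'):
        # Minimal stand-in for pip.req.parse_requirements: one specifier per
        # line, skipping blanks, comments, and editable installs ('-e ...').
        # Sufficient for this project's two-line requirements.txt only.
        with open(path) as f:
            lines = (line.strip() for line in f)
            return [l for l in lines
                    if l and not l.startswith('#') and not l.startswith('-e')]

    install_requires = parse_simple_requirements()
    # -> ['numpy>=1.8.1', 'scipy>=0.13.0,<0.19.0']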