-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
sqlarray functions from latest version of AdK/lib/python/Transitions …
…as a separate package git-svn-id: svn+ssh://gonzo.med.jhmi.edu/scratch/svn/woolf_repository/users/oliver/Library/RecSQL@3490 df5ba8eb-4b0b-0410-8c14-c10f23b0129c
- Loading branch information
0 parents
commit 54dbb9a
Showing
7 changed files
with
977 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
# $Id$ | ||
# Copyright (C) 2009 Oliver Beckstein <orbeckst@gmail.com> | ||
# Released under the GNU Public License, version 3 or higher (your choice) | ||
|
||
__all__ = ['sqlarray'] | ||
|
||
from sqlarray import SQLarray |
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,264 @@ | ||
# $Id: sqlfunctions.py 3379 2009-04-21 17:57:29Z oliver $ | ||
# Copyright (C) 2009 Oliver Beckstein <orbeckst@gmail.com> | ||
# Released under the GNU Public License, version 3 or higher (your choice) | ||
|
||
"""SQL functions to be added to a SQLite database | ||
Example: | ||
from sqlfunctions import * | ||
self.connection.create_function("sqrt", 1, _sqrt) | ||
self.connection.create_function("fformat",2,_fformat) | ||
self.connection.create_aggregate("std",1,_Stdev) | ||
self.connection.create_aggregate("median",1,_Median) | ||
self.connection.create_aggregate("array",1,_NumpyArray) | ||
self.connection.create_aggregate("histogram",4,_NumpyHistogram) | ||
self.connection.create_aggregate("distribution",4,_NormedNumpyHistogram) | ||
self.connection.create_aggregate("meanhistogram",5,_MeanHistogram) | ||
self.connection.create_aggregate("stdhistogram",5,_StdHistogram) | ||
self.connection.create_aggregate("minhistogram",5,_MinHistogram) | ||
self.connection.create_aggregate("maxhistogram",5,_MaxHistogram) | ||
self.connection.create_aggregate("medianhistogram",5,_MedianHistogram) | ||
self.connection.create_aggregate("zscorehistogram",5,_ZscoreHistogram) | ||
""" | ||
import numpy | ||
# Compatibility layer: the rest of this module needs ONE consistent 1D
# histogram function that follows the numpy >= 1.1 convention of returning
# all bin edges (len(edges) == len(hist) + 1), NOT only the lower bin edges.
def _numpy_major_minor(version):
    """Return (major, minor) of a numpy version string as a tuple of ints.

    Only the leading digits of each component are used so that pre-release
    or development versions such as '1.3.0rc1' or '1.4.dev' do not raise
    ValueError; a component without leading digits counts as 0.
    """
    numbers = []
    for component in version.split('.')[:2]:
        digits = ''
        for char in component:
            if not char.isdigit():
                break
            digits += char
        numbers.append(int(digits) if digits else 0)
    while len(numbers) < 2:
        numbers.append(0)
    return tuple(numbers)


_numpyversion = _numpy_major_minor(numpy.version.version)
if _numpyversion[0] < 1:
    raise ImportError('Need at least numpy 1.x, only have %r' % numpy.version.version)
# Compare (major, minor) as a tuple so that e.g. 2.0 is not mistaken for 1.0.
if _numpyversion < (1, 1):
    # numpy 1.0.x: numpy.histogram returns lower bin edges only, so build the
    # 1D histogram from histogramdd, which returns full edges.
    def histogram1d(*args, **kwargs):
        _range = kwargs.pop('range', None)
        if _range is not None:
            kwargs['range'] = (_range,)   # needs to be a sequence
        h, e = numpy.histogramdd(*args, **kwargs)
        return h, e[0]
    histogram1d.__doc__ = "1D histogram, based on numpy histogramdd; returns edges as in numpy 1.1.x\n" + \
        (numpy.histogram.__doc__ or "")
else:
    def histogram1d(*args, **kwargs):
        # numpy releases that still know the transitional 'new' keyword need
        # new=True to return edges; releases that dropped the keyword raise
        # TypeError, in which case the plain call already has that behaviour.
        kwargs.pop('new', None)
        try:
            h, e = numpy.histogram(*args, **dict(kwargs, new=True))
        except TypeError:
            h, e = numpy.histogram(*args, **kwargs)
        return h, e
    histogram1d.__doc__ = numpy.histogram.__doc__
|
||
|
||
from sqlutil import adapt_numpyarray, convert_numpyarray,\ | ||
adapt_object, convert_object | ||
|
||
|
||
def _sqrt(x):
    """SQL scalar function: square root of x.

    Returns None when x cannot be converted to a float: float(None) raises
    TypeError and float() of non-numeric text raises ValueError, and neither
    should abort the whole query.
    """
    try:
        x = float(x)
    except (TypeError, ValueError):
        return None
    return numpy.sqrt(x)
|
||
def _fformat(format, x):
    """SQL scalar function: apply the %-style format string to x.

    Returns the result of ``format % x``.
    """
    formatted = format % x
    return formatted
|
||
class _Stdev(object):
    """Implement standard deviation of the sample as SQL aggregate function.

    (Uses N-1 variance.)

    The values are processed in one pass with Welford's update of the running
    mean and of the sum of squared deviations. This avoids the cancellation of
    the textbook formula N/(N-1) * (<X^2> - <X>^2), which can produce a
    negative variance (and hence NaN) for data with a large mean and a small
    spread.

    Values that cannot be converted to float (NULL, non-numeric text) are
    ignored.
    """
    def __init__(self):
        self.n = 0        # number of values accepted so far
        self.mean = 0.0   # running mean of the accepted values
        self.m2 = 0.0     # running sum of squared deviations from the mean

    def step(self, x):
        """Add one value to the running statistics."""
        try:
            x = float(x)
        except (TypeError, ValueError):
            return          # don't contribute to average
        self.n += 1
        delta = x - self.mean
        self.mean += delta / self.n
        # uses the deviation from the old AND the new mean (Welford)
        self.m2 += delta * (x - self.mean)

    def finalize(self):
        """Return the sample standard deviation; 0.0 for fewer than 2 values."""
        if self.n < 2:
            return 0.0
        return numpy.sqrt(self.m2 / (self.n - 1))
|
||
class _Median(object):
    """SQL aggregate function: median of all values in the column.

    Values that cannot be converted to float (NULL, non-numeric text) are
    ignored. If no value was accepted, the result is None (SQL NULL).
    """
    def __init__(self):
        self.data = []     # accepted values as floats

    def step(self, x):
        """Collect one value."""
        try:
            x = float(x)
        except (TypeError, ValueError):
            return         # don't contribute
        self.data.append(x)

    def finalize(self):
        """Return numpy.median of the collected values, or None if empty."""
        if not self.data:
            # numpy.median of an empty list is NaN (with a warning)
            return None
        return numpy.median(self.data)
|
||
class _NumpyArray(object):
    """SQL aggregate function: collect a column into a numpy array.

    The array is returned in the serialized form produced by
    adapt_numpyarray().
    """

    def __init__(self):
        self.data = []     # values in the order in which step() received them

    def step(self, x):
        """Store one value; no conversion or filtering is applied."""
        self.data.append(x)

    def finalize(self):
        """Return the collected values as an adapted numpy array."""
        collected = numpy.array(self.data)
        return adapt_numpyarray(collected)
|
||
class _NumpyHistogram(object):
    """SQL aggregate function: 1D histogram of a column.

    step() receives (x, bins, xmin, xmax) for every row; bins and the range
    (xmin, xmax) are taken from the first row only. finalize() returns the
    tuple (hist, edges) from histogram1d(), serialized with adapt_object().
    """

    def __init__(self):
        self.data = []
        self.is_initialized = False

    def step(self, x, bins, xmin, xmax):
        """Collect x; remember the binning parameters of the first call."""
        if not self.is_initialized:
            # later rows cannot change the binning
            self.bins, self.range = bins, (xmin, xmax)
            self.is_initialized = True
        self.data.append(x)

    def finalize(self):
        """Return the adapted (hist, edges) tuple of the unnormalized histogram."""
        hist, edges = histogram1d(self.data, bins=self.bins,
                                  range=self.range, normed=False)
        result = (hist, edges)
        return adapt_object(result)
|
||
class _NormedNumpyHistogram(_NumpyHistogram):
    """SQL aggregate function: normalized 1D histogram of a column.

    Same as _NumpyHistogram except that histogram1d() is called with
    normed=True.
    """

    def finalize(self):
        """Return the adapted (hist, edges) tuple of the normalized histogram."""
        hist, edges = histogram1d(self.data, bins=self.bins,
                                  range=self.range, normed=True)
        result = (hist, edges)
        return adapt_object(result)
|
||
class _FunctionHistogram(_NumpyHistogram):
    """Base class for histogrammed functions.

    A histogrammed function is obtained by applying a function to all
    values y that were accumulated in a bin along x. This class only
    collects the (x, y) pairs and the binning parameters; derived classes
    must provide finalize().
    """

    def __init__(self):
        super(_FunctionHistogram, self).__init__()
        self.y = []     # y values, in the same order as self.data (the x values)

    def step(self, x, y, bins, xmin, xmax):
        """Collect one (x, y) pair; binning is handled by _NumpyHistogram.step."""
        super(_FunctionHistogram, self).step(x, bins, xmin, xmax)
        self.y.append(y)

    def finalize(self):
        """Not implemented here; derived classes return an adapted result."""
        raise NotImplementedError("_FunctionHistogram must be inherited from.")
|
||
class _MeanHistogram(_FunctionHistogram):
    """Mean of the y values (weights) in each x bin.

    Takes TWO column arguments: value and weight.
    """

    def finalize(self):
        """Return the adapted result of regularized_function() with numpy.mean."""
        binned = regularized_function(self.data, self.y, numpy.mean,
                                      bins=self.bins, range=self.range)
        return adapt_object(binned)
|
||
class _StdHistogram(_FunctionHistogram):
    """Standard deviation (numpy.std) of the y values (weights) in each x bin.

    Takes TWO column arguments: value and weight.
    """

    def finalize(self):
        """Return the adapted result of regularized_function() with numpy.std."""
        binned = regularized_function(self.data, self.y, numpy.std,
                                      bins=self.bins, range=self.range)
        return adapt_object(binned)
|
||
class _MinHistogram(_FunctionHistogram):
    """Smallest y value (weight) in each x bin.

    Takes TWO column arguments: value and weight.
    """

    def _min(self, v):
        """Return numpy.min(v), or NaN if numpy.min raises ValueError."""
        try:
            smallest = numpy.min(v)
        except ValueError:
            # raised for an empty array, i.e. a bin without data
            smallest = numpy.nan
        return smallest

    def finalize(self):
        """Return the adapted result of regularized_function() with _min."""
        binned = regularized_function(self.data, self.y, self._min,
                                      bins=self.bins, range=self.range)
        return adapt_object(binned)
|
||
class _MaxHistogram(_FunctionHistogram):
    """Largest y value (weight) in each x bin.

    Takes TWO column arguments: value and weight.
    """

    def _max(self, v):
        """Return numpy.max(v), or NaN if numpy.max raises ValueError."""
        try:
            largest = numpy.max(v)
        except ValueError:
            # raised for an empty array, i.e. a bin without data
            largest = numpy.nan
        return largest

    def finalize(self):
        """Return the adapted result of regularized_function() with _max."""
        binned = regularized_function(self.data, self.y, self._max,
                                      bins=self.bins, range=self.range)
        return adapt_object(binned)
|
||
class _MedianHistogram(_FunctionHistogram):
    """Median (numpy.median) of the y values (weights) in each x bin.

    Takes TWO column arguments: value and weight.
    """

    def finalize(self):
        """Return the adapted result of regularized_function() with numpy.median."""
        binned = regularized_function(self.data, self.y, numpy.median,
                                      bins=self.bins, range=self.range)
        return adapt_object(binned)
|
||
class _ZscoreHistogram(_FunctionHistogram):
    """Z-score of the y values (weights) in each x bin.

    For each bin the mean of abs(Y - <Y>) is divided by std(Y).
    Takes TWO column arguments: value and weight.
    """

    def Zscore(self, v):
        """Return mean(abs(v - mean(v))) / std(v), passed through nan_to_num.

        v must be a numpy array (its mean() and std() methods are used).
        numpy.nan_to_num replaces a NaN result by 0.
        """
        center = v.mean()
        spread = v.std()
        mean_abs_deviation = numpy.mean(numpy.abs(v - center))
        return numpy.nan_to_num(mean_abs_deviation / spread)

    def finalize(self):
        """Return the adapted result of regularized_function() with Zscore."""
        binned = regularized_function(self.data, self.y, self.Zscore,
                                      bins=self.bins, range=self.range)
        return adapt_object(binned)
|
||
|
||
# Helper functions | ||
|
||
def regularized_function(x, y, func, bins=None, range=None):
    """Compute func() over data aggregated in bins.

    (x,y) --> (x', func(Y'))  with Y' = {y: y(x) where x in x' bin}

    The data are first sorted along x and split into the bins x'; func is
    then called once per bin with the array of all y values whose x fell
    into that bin. Points with x outside [bins[0], bins[-1]] are not used.
    Every bin is closed on the left and open on the right, except the last
    one, which also contains its right edge.

    :Arguments:
       x        abscissa values (for binning)
       y        ordinate values (func is applied)
       func     callable taking one array argument, func(Y'); it is also
                called with an empty array for bins without data
       bins     number of bins or array of bin edges
       range    (min, max) limits (only used with a number of bins);
                defaults to (min(x), max(x))

    :Returns:
       F,edges  function and edges (midpoints = 0.5*(edges[:-1]+edges[1:]))

    :Raises:
       AttributeError if range is not ordered or the bin edges decrease.
    """
    xs = numpy.asarray(x)
    ys = numpy.asarray(y)

    # bin setup follows numpy.histogram
    if range is not None:
        lower, upper = range
        if lower > upper:
            raise AttributeError('max must be larger than min in range parameter.')

    if numpy.iterable(bins):
        # explicit edges were supplied
        bins = numpy.asarray(bins)
        if (numpy.diff(bins) < 0).any():
            raise AttributeError('bins must increase monotonically.')
    else:
        # a number of equally spaced bins between the limits
        if range is None:
            range = (xs.min(), xs.max())
        lower, upper = tuple(float(limit) for limit in range)
        if lower == upper:
            # degenerate range: widen it so that the bins have finite width
            lower -= 0.5
            upper += 0.5
        bins = numpy.linspace(lower, upper, bins + 1, endpoint=True)

    order = numpy.argsort(xs)
    sorted_x = xs[order]
    sorted_y = ys[order]

    # Positions in the SORTED data at which each bin starts, followed by the
    # position just past the last bin; entry i and i+1 delimit bin number i.
    bin_starts = sorted_x.searchsorted(bins[:-1], 'left')
    last_stop = sorted_x.searchsorted(bins[-1], 'right')
    boundaries = numpy.r_[bin_starts, last_stop]

    # Naive implementation: func is applied to each chunk separately. There
    # is no general way to process the data in blocks and combine partial
    # results afterwards (think of func=median).
    per_bin = [func(sorted_y[first:last])
               for first, last in zip(boundaries[:-1], boundaries[1:])]
    F = numpy.zeros(len(bins) - 1)     # final function
    F[:] = per_bin
    return F, bins
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
# $Id: sqlutil.py 2346 2008-10-06 19:36:21Z oliver $ | ||
# Copyright (C) 2009 Oliver Beckstein <orbeckst@gmail.com> | ||
# Released under the GNU Public License, version 3 or higher (your choice) | ||
|
||
"""Helper functions that are used throughout the SQLarray package.""" | ||
|
||
import cPickle | ||
|
||
# storing numpy arrays in the db as pickles | ||
def adapt_numpyarray(a):
    """Return a pickled with protocol 0, for storing numpy arrays in the db."""
    # must use text protocol for use with sqlite
    pickled = cPickle.dumps(a, protocol=0)
    return pickled
|
||
def convert_numpyarray(s):
    """Return the object unpickled from s (inverse of adapt_numpyarray)."""
    # NOTE(review): unpickling can execute arbitrary code; only use this on
    # databases from a trusted source.
    restored = cPickle.loads(s)
    return restored
|
||
def adapt_object(a):
    """Return a pickled with protocol 0, for storing Python objects in the db."""
    # must use text protocol for use with sqlite
    pickled = cPickle.dumps(a, protocol=0)
    return pickled
|
||
def convert_object(s):
    """Return the object unpickled from s (inverse of adapt_object)."""
    # NOTE(review): unpickling can execute arbitrary code; only use this on
    # databases from a trusted source.
    restored = cPickle.loads(s)
    return restored
|
||
# declare types as 'NumpyArray': | ||
# cur.execute("CREATE TABLE test(a NumpyArray)") | ||
# cur.execute("INSERT INTO test(a) values (?)", (my_array,)) | ||
# or as column types | ||
# cur.execute('SELECT a as "a [NumpyArray]" from test') | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
# $Id: setup.py 3489 2009-05-26 21:55:54Z root $ | ||
# setuptools installation of RecSQL | ||
# Copyright (c) 2007-2009 Oliver Beckstein <orbeckst@gmail.com> | ||
# Released under the GNU Public License 2 (or higher, your choice) | ||
|
||
|
||
from ez_setup import use_setuptools | ||
use_setuptools() | ||
from setuptools import setup, find_packages | ||
|
||
# Package metadata for setuptools. The descriptions spell the product names
# correctly ("SQLite", "numpy.recarray").
setup(name="RecSQL",
      version="0.1rc1",
      description="Treat SQLite tables as recarrays",
      long_description="""\
A simple implementation of numpy.recarray-like tables that can
be operated on via SQL. The underlying tables are SQLite tables
that are built from a numpy.recarray.
""",
      author="Oliver Beckstein",
      author_email="orbeckst@gmail.com",
      license="GPLv3",
      url="http://sbcb.bioch.ox.ac.uk/oliver/software",  # not set up yet
      keywords="utilities",
      # NOTE(review): find_packages() matches dotted package names, so the
      # 'doc/examples' pattern may never match -- confirm and use
      # 'doc.examples' if the exclusion is needed.
      packages=find_packages(exclude=['tests', 'extras', 'doc/examples']),
      # NOTE(review): 'pysqlite2' is the name of the importable module; verify
      # that a distribution of this name can actually be resolved by the
      # installer.
      install_requires=['numpy>=1.0',
                        'pysqlite2',
                        ],
      )
|
||
|
Oops, something went wrong.