initial commit
tyarkoni committed Oct 28, 2012
0 parents commit c584aa6
Showing 23 changed files with 962 additions and 0 deletions.
17 changes: 17 additions & 0 deletions LICENSE
@@ -0,0 +1,17 @@
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
61 changes: 61 additions & 0 deletions README.md
@@ -0,0 +1,61 @@
# What is Neurosynth?

Neurosynth is a Python package for large-scale synthesis of functional neuroimaging data.

## Installation

Neurosynth depends on the NumPy and SciPy libraries, as well as NiBabel. Assuming you have those packages in working order, download this package and install it from source:

> python setup.py install

Depending on your operating system, you may need superuser privileges (prefix the above line with 'sudo').
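
For example, on most Linux or macOS systems:

> sudo python setup.py install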

That's it! You should now be ready to roll.


## Usage

Running analyses in Neurosynth is pretty straightforward. For a reasonably thorough walk-through, see the Getting Started page in the documentation (well, once it exists). This Quickstart guide just covers the bare minimum.

First, [download some data](http://neurosynth.org/data/current_data.tar.gz) from the Neurosynth website:

> curl -O http://neurosynth.org/data/current_data.tar.gz

Unpack the archive, which should contain 2 files: database.txt and features.txt.
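
For example, using a standard tar command:

> tar -xzvf current_data.tar.gz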

Now generate a new Dataset instance from the database.txt file:

> from neurosynth.base.dataset import Dataset
> dataset = Dataset('database.txt')

This should take several minutes to process.

Once initialized, the Dataset instance contains activation data from nearly 6,000 published neuroimaging articles. But it doesn't yet have any features attached to those data, so let's add some:

> dataset.add_features('features.txt')

Now our Dataset has both activation data and some features we can use to manipulate the data. In this case, the features are just term-based tags--i.e., words that occur frequently in the articles from which the dataset is drawn (for details, see this [Nature Methods] paper, or the Neurosynth website).

We can now do various kinds of analyses with the data. For example, we can use the features we just added to perform automated large-scale meta-analyses. Let's see what features we have:

> dataset.list_features()
['phonetic', 'associative', 'cues', 'visually', ... ]

We can use these features--either in isolation or in combination--to select articles for inclusion in a meta-analysis. For example, suppose we want to run a meta-analysis of emotion studies. We could operationally define a study of emotion as one in which the authors used words starting with 'emo' with high frequency:

> ids = dataset.get_ids_by_features('emo*', threshold=0.001)

Here we're asking for a list of IDs of all studies that use words starting with 'emo' (e.g., 'emotion', 'emotional', 'emotionally', etc.) at a frequency of 1 in 1,000 words or greater (in other words, if an article has 5,000 words of text, it will only be included in our set if it uses words starting with 'emo' at least 5 times).
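
To make the frequency threshold concrete, here's a quick back-of-the-envelope check in plain Python (this is just arithmetic, not part of the Neurosynth API):

> word_count = 5000   # total words in a hypothetical article
> emo_count = 5       # occurrences of words starting with 'emo'
> emo_count / float(word_count) >= 0.001   # True, so this article passes threshold=0.001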

> len(ids)
639

The resulting set includes 639 studies.

Once we've got a set of studies we're happy with, we can run a simple meta-analysis, prefixing all output files with the string 'emotion' to distinguish them from other analyses we might run:

> from neurosynth.analysis import meta
> ma = meta.MetaAnalysis(dataset, ids)
> ma.save_results('some_directory/emotion')

You should now have a set of Nifti-format brain images on your drive that display various meta-analytic results. The image names are somewhat cryptic; see the Documentation for details. It's important to note that the meta-analysis routines currently implemented in Neurosynth aren't very sophisticated; they're designed primarily for efficiency (most analyses should take just a few seconds), and they take multiple shortcuts compared to other packages like ALE or MKDA. But with that caveat in mind (one that will hopefully be remedied in the near future), Neurosynth gives you a streamlined and quick way of running large-scale meta-analyses of fMRI data.
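
Because the outputs are standard Nifti volumes, you can inspect them with NiBabel (already a Neurosynth dependency). The filename below is just an illustration, built from the 'emotion' prefix used above and the default FDR threshold of 0.01:

> import nibabel as nib
> img = nib.load('some_directory/emotion_pSgA_z_FDR_0.01.nii.gz')
> data = img.get_data()
> data.shape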
1 change: 1 addition & 0 deletions neurosynth/__init__.py
@@ -0,0 +1 @@
__all__ = ["analysis", "base"]
1 change: 1 addition & 0 deletions neurosynth/analysis/__init__.py
@@ -0,0 +1 @@
__all__ = ['decode', 'meta', 'ml', 'network', 'reduce', 'stats']
3 changes: 3 additions & 0 deletions neurosynth/analysis/decode.py
@@ -0,0 +1,3 @@
""" Decoding-related methods """

import numpy as np
112 changes: 112 additions & 0 deletions neurosynth/analysis/meta.py
@@ -0,0 +1,112 @@

import numpy as np
from neurosynth.base import imageutils
import stats
from scipy import sparse
from scipy.stats import norm


class MetaAnalysis:

""" Meta-analysis of a Dataset. Currently contrasts two subsets of
studies within a Dataset and saves a bunch of statistical images.
Only one list of study IDs (ids) needs to be passed; the Universe will
be bisected into studies that are and are not included in the
list, and the contrast is then performed across these two groups.
If a second, optional study list is provided (ids2), the Dataset
is first constrained to the union of ids and ids2, and the standard
contrast is then performed."""

# DESPERATELY NEEDS REFACTORING!!!

def __init__(self, dataset, ids, ids2=None, **kwargs):

self.dataset = dataset
mt = dataset.image_table
self.selected_ids = list(set(mt.ids) & set(ids))
self.selected_id_indices = np.in1d(mt.ids, ids)

# Calculate different count variables
print "Calculating counts..."
n_mappables = len(mt.ids)
n_selected = len(self.selected_ids)
n_unselected = n_mappables - n_selected

# If ids2 is provided, we only use mappables explicitly in either ids or ids2.
# Otherwise, all mappables not in the ids list are used as the control condition.
unselected_id_indices = ~self.selected_id_indices if ids2 is None else np.in1d(mt.ids, ids2)

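# mt.data is assumed to be a (voxels x studies) binary matrix, so these dot
# products give per-voxel counts of selected and unselected studies with activation.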
n_selected_active_voxels = mt.data.dot(self.selected_id_indices)
n_unselected_active_voxels = mt.data.dot(unselected_id_indices)

# Nomenclature for variables below: p = probability, S = selected, g = given,
# U = unselected, A = activation. So, e.g., pAgS = p(A|S) = probability of activation
# in voxel given that the mappable is selected (i.e., is included in the ids list
# passed to the constructor).
pS = (n_selected+1.0)/(n_mappables+2)

# Conditional probabilities, with Laplace smoothing
print "Calculating conditional probabilities..."
# pA = np.asarray(sparse.spmatrix.mean(mt.data, 1)) + 1.0/n_mappables
pA = (n_selected_active_voxels+1.0) / (n_mappables+2)
pAgS = (n_selected_active_voxels+1.0)/(n_selected+2)
pAgU = (n_unselected_active_voxels+1.0)/(n_unselected+2)
pSgA = pAgS * pS / pA

# Recompute conditionals with uniform prior
print "Recomputing with uniform priors..."
prior_pS = kwargs.get('prior', 0.5)
pAgS_unif = prior_pS * pAgS + (1-prior_pS) * pAgU
pSgA_unif = pAgS * prior_pS / pAgS_unif

# Set voxel-wise FDR threshold to .01 unless explicitly specified
q = kwargs.get('q', 0.01)

# One-way chi-square test for consistency of activation
p_vals = stats.one_way(np.squeeze(n_selected_active_voxels), n_selected)
p_vals[p_vals < 1e-240] = 1e-240 # prevents overflow due to loss of precision
z_sign = np.sign(n_selected_active_voxels - np.mean(n_selected_active_voxels)).ravel()
pAgS_z = np.abs(norm.ppf(p_vals/2)) * z_sign

fdr_thresh = stats.fdr(p_vals, q)
pAgS_z_FDR = imageutils.threshold_img(pAgS_z, fdr_thresh, p_vals, mask_out='above')


# Two-way chi-square for specificity of activation
cells = np.squeeze(np.array([[n_selected_active_voxels, n_unselected_active_voxels],
[n_selected-n_selected_active_voxels, n_unselected-n_unselected_active_voxels]]).T)
p_vals = stats.two_way(cells)
p_vals[p_vals < 1e-240] = 1e-240 # prevents overflow
z_sign = np.sign(pAgS - pAgU).ravel()
pSgA_z = np.abs(norm.ppf(p_vals/2)) * z_sign
fdr_thresh = stats.fdr(p_vals, q)
pSgA_z_FDR = imageutils.threshold_img(pSgA_z, fdr_thresh, p_vals, mask_out='above')

# Retain any images we may want to save later
self.images = { 'pAgS': pAgS,
'pSgA': pSgA,
'pAgS_unif': pAgS_unif,
'pSgA_unif': pSgA_unif,
'pAgS_z': pAgS_z,
'pSgA_z': pSgA_z,
('pAgS_z_FDR_%s' % q): pAgS_z_FDR,
('pSgA_z_FDR_%s' % q): pSgA_z_FDR }

# def _setup(self):
# pass

# def one_sample_test(self, ids):
# pass

# def two_sample_test(self, ids1, ids2):
# pass

def save_results(self, outroot, image_list=None):
""" Write out any images generated by the meta-analysis. The outroot argument is prepended
to all file names. Optionally, a restricted list of images to save can be passed; otherwise,
all images currently stored in self.images will be saved. """
print "Saving results..."
if image_list is None: image_list = self.images.keys()
for suffix, img in self.images.items():
if suffix in image_list:
imageutils.save_img(img, '%s_%s.nii.gz' % (outroot, suffix), self.dataset.volume)
1 change: 1 addition & 0 deletions neurosynth/analysis/misc.py
@@ -0,0 +1 @@
""" Miscellaneous analysis methods. """
1 change: 1 addition & 0 deletions neurosynth/analysis/ml.py
@@ -0,0 +1 @@
""" Machine-learning-related methods. """
2 changes: 2 additions & 0 deletions neurosynth/analysis/network.py
@@ -0,0 +1,2 @@

""" Network analysis-related methods """
46 changes: 46 additions & 0 deletions neurosynth/analysis/reduce.py
@@ -0,0 +1,46 @@
""" Dimensional/data reduction methods. """

import numpy as np

def average_within_regions(dataset, img, threshold=None, remove_zero=True, replace=False):
""" Averages over all voxels within each ROI in the input image.
Takes a Dataset and a Nifti image that defines distinct regions, and
returns a numpy matrix of ROIs x mappables, where the value at each ROI is
the proportion of active voxels in that ROI. Each distinct ROI must have a
unique value in the image; non-contiguous voxels with the same value will
be assigned to the same ROI.
Args:
dataset: A Dataset instance
img: A NIFTI or Analyze-format image that provides the ROI definitions
threshold: An optional float in the range of 0 - 1. If passed, the array
will be binarized, with ROI values above the threshold assigned to True
and values below the threshold assigned to False. (E.g., if threshold =
0.05, only ROIs in which more than 5% of voxels are active will be
considered active.)
remove_zero: An optional boolean; when True, voxels with a value of 0 are not
treated as a separate ROI and are ignored.
replace: When True, the voxel x mappable array contained within the Dataset's
ImageTable will be replaced with the resulting array.
Returns:
If replace == True, nothing is returned (the Dataset is modified in-place).
Otherwise, returns a 2D numpy array with ROIs in rows and mappables in columns.
"""
regions = dataset.volume.mask(img)
labels = np.unique(regions)
if remove_zero: labels = labels[np.nonzero(labels)]
n_regions = labels.size
m = np.zeros((regions.size, n_regions))
for i in range(n_regions):
m[regions==labels[i],i] = 1.0/np.sum(regions==labels[i])
# produces roi x study matrix
result = np.transpose(m) * dataset.get_image_data(ids=None, dense=False)
if threshold is not None:
result[result < threshold] = 0.0
result = result.astype(bool)
return result
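
# Example usage sketch (hypothetical atlas path, for illustration only):
#   roi_by_study = average_within_regions(dataset, 'my_atlas.nii.gz', threshold=0.05)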

def random_voxels(dataset, img, n_voxels):
pass
61 changes: 61 additions & 0 deletions neurosynth/analysis/stats.py
@@ -0,0 +1,61 @@
""" Various statistical helper methods. """

from scipy import special
from scipy.stats import ss
import numpy as np

def pearson(x, y):
""" Correlates row vector x with each row vector in 2D array y. """
data = np.vstack((x,y))
ms = data.mean(axis=1)[(slice(None,None,None),None)]
datam = data - ms
datass = np.sqrt(ss(datam,axis=1))
temp = np.dot(datam[1:],datam[0].T)
rs = temp / (datass[1:]*datass[0])
return rs


def two_way(cells):
""" Two-way chi-square test of independence.
Takes a 3D array as input: N(voxels) x 2 x 2, where the last two dimensions
are the contingency table for each of N voxels. Returns an array of p-values.
"""
# dt = cells.dtype
cells = cells.astype('float64') # Make sure we don't overflow
total = np.apply_over_axes(np.sum, cells, [1,2]).ravel()
chi_sq = np.zeros(cells.shape, dtype='float64')
for i in range(2):
for j in range(2):
exp = np.sum(cells[:,i,:], 1).ravel() * np.sum(cells[:,:,j], 1).ravel() / total
chi_sq[:,i,j] = (cells[:,i,j] - exp)**2 / exp
chi_sq = np.apply_over_axes(np.sum, chi_sq, [1,2]).ravel()
return special.chdtrc(1, chi_sq)#.astype(dt)


def one_way(data, n):
""" One-way chi-square goodness-of-fit test.
Takes a 1D array as input and compares activation at each voxel to proportion
expected under a uniform distribution throughout the array. Note that
if you're testing activation with this, make sure that only valid voxels
(e.g., in-mask gray matter voxels) are included in the array, or results
won't make any sense!
"""
# dt = data.dtype
term = data.astype('float64')
no_term = n - term
t_exp = np.mean(term, 0)
t_exp = np.array([t_exp,]*data.shape[0])
nt_exp = n - t_exp
t_mss = (term - t_exp)**2/t_exp
nt_mss = (no_term - nt_exp)**2/nt_exp
chi2 = t_mss + nt_mss
return special.chdtrc(1, chi2)#.astype(dt)


def fdr(p, q=.05):
""" Determine FDR threshold given a p value array and desired false discovery rate q. """
s = np.sort(p)
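# Benjamini-Hochberg step-up procedure: compare each sorted p value to its
# rank-scaled cutoff (i/N)*q and return the largest p value at or below the line.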
nvox = p.shape[0]
null = np.array(range(1, nvox+1), dtype='float')*q/nvox
below = np.where(s<=null)[0]
return s[max(below)] if below.size else -1
1 change: 1 addition & 0 deletions neurosynth/base/__init__.py
@@ -0,0 +1 @@
__all__ = ['dataset', 'mask', 'imageutils', 'mappable', 'transformations']