diff --git a/neurosynth/base/dataset.py b/neurosynth/base/dataset.py
index d07a6ba..4906f5c 100644
--- a/neurosynth/base/dataset.py
+++ b/neurosynth/base/dataset.py
@@ -621,7 +621,7 @@ def add_features(self, features, merge='outer', duplicates='ignore',
                 "instance from a newer database file that uses PMIDs rather "
                 "than doi's as the study identifiers in the first column.")
 
-        old_data = self.data.to_dense()
+        old_data = self.data
         # Handle features with duplicate names
         common_features = list(set(old_data.columns) & set(features.columns))
         if duplicates == 'ignore':
@@ -631,7 +631,7 @@ def add_features(self, features, merge='outer', duplicates='ignore',
 
         data = old_data.merge(
             features, how=merge, left_index=True, right_index=True)
-        self.data = data.fillna(0.0).to_sparse()
+        self.data = data.fillna(0.0).astype(pd.SparseDtype(np.float64))
 
     @property
     def feature_names(self):
@@ -657,12 +657,12 @@ def get_feature_data(self, ids=None, features=None, dense=True):
         result = self.data
 
         if ids is not None:
-            result = result.ix[ids]
+            result = result.loc[ids]
 
         if features is not None:
-            result = result.ix[:, features]
+            result = result.loc[:, features]
 
-        return result.to_dense() if dense else result
+        return result.astype(np.float64) if dense else result
 
     def get_ordered_names(self, features):
         """ Given a list of features, returns features in order that they
@@ -713,7 +713,7 @@ def get_ids(self, features, threshold=0.0, func=np.sum, get_weights=False):
         if isinstance(features, str):
             features = [features]
         features = self.search_features(features)  # Expand wild cards
-        feature_weights = self.data.ix[:, features]
+        feature_weights = self.data.loc[:, features]
         weights = feature_weights.apply(func, 1)
         above_thresh = weights[weights >= threshold]
         # ids_to_keep = self.ids[above_thresh]
@@ -751,13 +751,13 @@ def get_features_by_ids(self, ids=None, threshold=0.0001, func=np.mean,
                             get_weights=False):
         ''' Returns features for which the mean loading across all specified
         studies (in ids) is >= threshold. '''
-        weights = self.data.ix[ids].apply(func, 0)
+        weights = func(self.data.loc[ids], axis=0)
         above_thresh = weights[weights >= threshold]
         return above_thresh if get_weights else list(above_thresh.index)
 
     def _sdf_to_csr(self):
         """ Convert FeatureTable to SciPy CSR matrix. """
-        data = self.data.to_dense()
+        data = self.data.astype(np.float64)
         self.data = {
             'columns': list(data.columns),
             'index': list(data.index),
@@ -766,6 +766,8 @@ def _csr_to_sdf(self):
 
     def _csr_to_sdf(self):
         """ Inverse of _sdf_to_csr(). """
-        self.data = pd.DataFrame(self.data['values'].todense(),
-                                 index=self.data['index'],
-                                 columns=self.data['columns']).to_sparse()
+        self.data = pd.DataFrame(
+            self.data["values"].todense(),
+            index=self.data["index"],
+            columns=self.data["columns"],
+        ).astype(pd.SparseDtype(np.float64))
diff --git a/neurosynth/tests/test_base.py b/neurosynth/tests/test_base.py
index 206426d..3fd701a 100644
--- a/neurosynth/tests/test_base.py
+++ b/neurosynth/tests/test_base.py
@@ -67,7 +67,7 @@ def test_feature_table_loads(self):
         self.assertEqual(len(self.dataset.get_feature_names()), 5)
         self.assertEqual(tt.data.shape, (5, 5))
         self.assertEqual(tt.data.columns[3], 'f4')
-        self.assertEqual(tt.data.to_dense().iloc[0, 0], 0.0003)
+        self.assertEqual(tt.data.astype(np.float64).iloc[0, 0], 0.0003)
 
     def test_feature_addition(self):
         """ Add feature data from multiple sources to FeatureTable. """
@@ -225,8 +225,11 @@ def test_get_feature_counts(self):
         self.assertGreater(c, 0, "feature %s has no hits" % f)
         # and should be equal to the ones computed directly (we do not do
         # any fancy queries atm), assumes default threshold of 0.001
-        feature_counts_ = dict([(feature, np.sum(self.dataset.feature_table.data.to_dense().ix[:, col] > 0.001))
-                                for col, feature in enumerate(self.dataset.feature_table.feature_names)])
+        ft = self.dataset.feature_table
+        feature_counts_ = dict(
+            (feature, np.sum(ft.data.iloc[:, col].sparse.to_dense() > 0.001))
+            for col, feature in enumerate(ft.feature_names)
+        )
         self.assertEqual(feature_counts, feature_counts_)
 
     def test_get_feature_data(self):