Skip to content

Commit

Permalink
make the code compatible with newer versions of pandas
Browse files Browse the repository at this point in the history
  • Loading branch information
tgy committed Mar 29, 2020
1 parent 948ce7e commit f44165a
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 14 deletions.
24 changes: 13 additions & 11 deletions neurosynth/base/dataset.py
Expand Up @@ -621,7 +621,7 @@ def add_features(self, features, merge='outer', duplicates='ignore',
"instance from a newer database file that uses PMIDs rather "
"than doi's as the study identifiers in the first column.")

old_data = self.data.to_dense()
old_data = self.data
# Handle features with duplicate names
common_features = list(set(old_data.columns) & set(features.columns))
if duplicates == 'ignore':
Expand All @@ -631,7 +631,7 @@ def add_features(self, features, merge='outer', duplicates='ignore',

data = old_data.merge(
features, how=merge, left_index=True, right_index=True)
self.data = data.fillna(0.0).to_sparse()
self.data = data.fillna(0.0).astype(pd.SparseDtype(np.float64))

@property
def feature_names(self):
Expand All @@ -657,12 +657,12 @@ def get_feature_data(self, ids=None, features=None, dense=True):
result = self.data

if ids is not None:
result = result.ix[ids]
result = result.loc[ids]

if features is not None:
result = result.ix[:, features]
result = result.loc[:, features]

return result.to_dense() if dense else result
return result.astype(np.float64) if dense else result

def get_ordered_names(self, features):
""" Given a list of features, returns features in order that they
Expand Down Expand Up @@ -713,7 +713,7 @@ def get_ids(self, features, threshold=0.0, func=np.sum, get_weights=False):
if isinstance(features, str):
features = [features]
features = self.search_features(features) # Expand wild cards
feature_weights = self.data.ix[:, features]
feature_weights = self.data.loc[:, features]
weights = feature_weights.apply(func, 1)
above_thresh = weights[weights >= threshold]
# ids_to_keep = self.ids[above_thresh]
def get_features_by_ids(self, ids=None, threshold=0.0001, func=np.mean,
                        get_weights=False):
    """ Returns features for which the aggregated loading across all
    specified studies is >= threshold.

    Args:
        ids: list of study identifiers to aggregate over (rows of
            self.data). Passed to DataFrame.loc, so unknown ids raise
            KeyError on modern pandas.
        threshold: minimum aggregated loading a feature must reach.
        func: aggregation function applied column-wise (axis=0);
            defaults to np.mean. Must accept an `axis` keyword
            (e.g. np.mean, np.sum).
        get_weights: if True, return the pandas Series of aggregated
            weights for the passing features; otherwise return just
            the list of feature names.
    """
    # .loc replaces the long-deprecated .ix; calling func(..., axis=0)
    # instead of DataFrame.apply keeps this working on sparse-dtype
    # DataFrames in newer pandas versions.
    weights = func(self.data.loc[ids], axis=0)
    above_thresh = weights[weights >= threshold]
    return above_thresh if get_weights else list(above_thresh.index)

def _sdf_to_csr(self):
""" Convert FeatureTable to SciPy CSR matrix. """
data = self.data.to_dense()
data = self.data.astype(np.float64)
self.data = {
'columns': list(data.columns),
'index': list(data.index),
Expand All @@ -766,6 +766,8 @@ def _sdf_to_csr(self):

def _csr_to_sdf(self):
""" Inverse of _sdf_to_csr(). """
self.data = pd.DataFrame(self.data['values'].todense(),
index=self.data['index'],
columns=self.data['columns']).to_sparse()
self.data = pd.DataFrame(
self.data["values"].todense(),
index=self.data["index"],
columns=self.data["columns"],
).astype(pd.SparseDtype(np.float64))
9 changes: 6 additions & 3 deletions neurosynth/tests/test_base.py
Expand Up @@ -67,7 +67,7 @@ def test_feature_table_loads(self):
self.assertEqual(len(self.dataset.get_feature_names()), 5)
self.assertEqual(tt.data.shape, (5, 5))
self.assertEqual(tt.data.columns[3], 'f4')
self.assertEqual(tt.data.to_dense().iloc[0, 0], 0.0003)
self.assertEqual(tt.data.astype(np.float64).iloc[0, 0], 0.0003)

def test_feature_addition(self):
""" Add feature data from multiple sources to FeatureTable. """
Expand Down Expand Up @@ -225,8 +225,11 @@ def test_get_feature_counts(self):
self.assertGreater(c, 0, "feature %s has no hits" % f)
# and should be equal to the ones computed directly (we do not do
# any fancy queries atm), assumes default threshold of 0.001
feature_counts_ = dict([(feature, np.sum(self.dataset.feature_table.data.to_dense().ix[:, col] > 0.001))
for col, feature in enumerate(self.dataset.feature_table.feature_names)])
ft = self.dataset.feature_table
feature_counts_ = dict(
(feature, np.sum(ft.data.iloc[:, col].sparse.to_dense() > 0.001))
for col, feature in enumerate(ft.feature_names)
)
self.assertEqual(feature_counts, feature_counts_)

def test_get_feature_data(self):
Expand Down

0 comments on commit f44165a

Please sign in to comment.