Merge pull request #781 from lsst/tickets/DM-38969

DM-38969: Update to correct pandas usage that does not fragment dataframes.
lsst · May 3, 2023 · e3f20a9 · e3f20a9
2 parents 80cdd8e + 298162b
commit e3f20a9
Showing 1 changed file with 3 additions and 3 deletions.
diff --git a/python/lsst/pipe/tasks/postprocess.py b/python/lsst/pipe/tasks/postprocess.py
@@ -174,7 +174,6 @@ def run(self, catalogs, tract, patch):
         catalog : `pandas.DataFrame`
             Merged dataframe.
         """
-
         dfs = []
         for filt, tableDict in catalogs.items():
             for dataset, table in tableDict.items():
@@ -183,14 +182,15 @@ def run(self, catalogs, tract, patch):
 
                 # Sort columns by name, to ensure matching schema among patches
                 df = df.reindex(sorted(df.columns), axis=1)
-                df['tractId'] = tract
-                df['patchId'] = patch
+                df = df.assign(tractId=tract, patchId=patch)
 
                 # Make columns a 3-level MultiIndex
                 df.columns = pd.MultiIndex.from_tuples([(dataset, filt, c) for c in df.columns],
                                                        names=('dataset', 'band', 'column'))
                 dfs.append(df)
 
+        # Join the frames pairwise instead of calling `pd.concat(dfs)`,
+        # because pandas concatenation consumes an impractical amount of memory.
         catalog = functools.reduce(lambda d1, d2: d1.join(d2), dfs)
         return catalog