From a01bb83bc4e3946999d42cd123985253216d0ada Mon Sep 17 00:00:00 2001 From: Alvaro Fuentes Date: Tue, 9 Sep 2025 16:34:18 +0200 Subject: [PATCH] [IMP] util/orm: `recompute_fields` add query parameter In most cases when we pass a list of `ids` to `recompute_fields` we just run a query to get them. The main downside of this approach is when the list is too big: it can cause MemoryErrors. Instead we can let `recompute_fields` use a query to get the ids and take the necessary precautions to not fetch millions of entries all at once. As a nice side effect, the code becomes easier to review since we don't need to check that the fetching of ids is done correctly. --- src/util/orm.py | 46 ++++++++++++++++++++++++++++++---------------- 1 file changed, 30 insertions(+), 16 deletions(-) diff --git a/src/util/orm.py b/src/util/orm.py index 489032c31..0c97b9943 100644 --- a/src/util/orm.py +++ b/src/util/orm.py @@ -251,7 +251,7 @@ def wrapper(*args, **kwargs): @no_selection_cache_validation -def recompute_fields(cr, model, fields, ids=None, logger=_logger, chunk_size=256, strategy="auto"): +def recompute_fields(cr, model, fields, ids=None, logger=_logger, chunk_size=256, strategy="auto", query=None): """ Recompute field values. 
@@ -271,31 +271,45 @@ def recompute_fields(cr, model, fields, ids=None, logger=_logger, chunk_size=256 :param str model: name of the model to recompute :param list(str) fields: list of the name of the fields to recompute :param list(int) or None ids: list of the IDs of the records to recompute, when `None` - recompute *all* records + recompute *all* records, unless `query` is also set (see + below) :param logger: logger used to report the progress :type logger: :class:`logging.Logger` :param int chunk_size: number of records per chunk - used to split the processing :param str strategy: strategy used to process the re-computation + :param str query: query to get the IDs of records to recompute, it is an error to set + both `ids` and `query` """ - assert strategy in {"flush", "commit", "auto"} + if strategy not in {"flush", "commit", "auto"}: + raise ValueError("Invalid strategy {!r}".format(strategy)) + if ids is not None and query is not None: + raise ValueError("Cannot set both `ids` and `query`") Model = env(cr)[model] if isinstance(model, basestring) else model model = Model._name - def get_ids(): - if ids is not None: - for id_ in ids: - yield id_ + if ids is None: + query = format_query(cr, "SELECT id FROM {} ORDER BY id", table_of_model(cr, model)) if query is None else query + cr.execute(query) + count = cr.rowcount + if count < 2**21: # avoid the overhead of a named cursor unless we have at least two chunks + ids_ = (id_ for (id_,) in cr.fetchall()) else: - with named_cursor(cr, itersize=2**20) as ncr: - ncr.execute(format_query(cr, "SELECT id FROM {t} ORDER BY id", t=table_of_model(cr, model))) - for (id_,) in ncr: - yield id_ - if ids is None: - cr.execute(format_query(cr, "SELECT COUNT(id) FROM {t}", t=table_of_model(cr, model))) - (count,) = cr.fetchone() + def get_ids(): + with named_cursor(cr, itersize=2**20) as ncr: + ncr.execute(query) + for (id_,) in ncr: + yield id_ + + ids_ = get_ids() else: count = len(ids) + ids_ = ids + + if not count: + 
return + + logger.info("Computing fields %s of %r on %d records", fields, model, count) if strategy == "auto": big_table = count > BIG_TABLE_THRESHOLD @@ -303,8 +317,8 @@ def get_ids(): strategy = "commit" if big_table and any_tracked_field else "flush" size = (count + chunk_size - 1) / chunk_size - qual = "%s %d-bucket" % (model, chunk_size) if chunk_size != 1 else model - for subids in log_progress(chunks(get_ids(), chunk_size, list), logger, qualifier=qual, size=size): + qual = "{} {:d}-bucket".format(model, chunk_size) if chunk_size != 1 else model + for subids in log_progress(chunks(ids_, chunk_size, list), logger, qualifier=qual, size=size): records = Model.browse(subids) for field_name in fields: field = records._fields[field_name]