Merge pull request #3092 from mwiencek/mbs-12239
MBS-12239: Fix "too many URLs" in incremental sitemaps
mwiencek committed Mar 8, 2024
2 parents cbbb01e + 295ac14 commit eee631e
Showing 2 changed files with 147 additions and 49 deletions.
lib/MusicBrainz/Server/Sitemap/Incremental.pm (153 changes: 115 additions & 38 deletions)
@@ -7,14 +7,17 @@ use feature 'state';

use Digest::SHA qw( sha1_hex );
use JSON qw( decode_json );
use List::AllUtils qw( partition_by );
use List::AllUtils qw( min partition_by );
use Moose;
use Sql;

use MusicBrainz::Server::Constants qw( entities_with );
use MusicBrainz::Server::Context;
use MusicBrainz::Server::Log qw( log_info );
use MusicBrainz::Server::Sitemap::Constants qw( %SITEMAP_SUFFIX_INFO );
use MusicBrainz::Server::Sitemap::Constants qw(
$MAX_SITEMAP_SIZE
%SITEMAP_SUFFIX_INFO
);
use MusicBrainz::Server::Sitemap::Builder;

extends 'MusicBrainz::Server::Sitemap::Builder';
@@ -217,52 +220,126 @@ sub post_replication_sequence {
my ($self, $c) = @_;

for my $entity_type (@{ $self->dumped_entity_types }) {
# The sitemaps.control columns will be NULL on first run, in which case
# we just select all updates for writing.
my $all_updates = $c->sql->select_list_of_hashes(
"SELECT lm.*
FROM sitemaps.${entity_type}_lastmod lm
WHERE lm.replication_sequence > COALESCE(
(SELECT overall_sitemaps_replication_sequence FROM sitemaps.control), 0)",
# In most cases, there should be no need to create batches for the
# incremental sitemaps. An exception may occur in cases where the
# sitemaps have not run for some time and many changes have
# accumulated (MBS-12239).
#
# The batching code is mostly copied from `build_one_entity` in
# `Sitemap::Overall`. However, the logic to always put the last batch
# in its own sitemap is not performed here: we're not concerned about
# which sitemap a URL starts in for an incremental dump, because the
# set of URLs in an incremental dump is inherently unstable (they
# change hourly).
#
# As mentioned previously, it's very unlikely there will be more than
# one batch for an incremental dump, anyway, due to the code that
# attempts to merge adjacent ones.

# Find the counts in each potential batch of $MAX_SITEMAP_SIZE.
my $raw_batches = $c->sql->select_list_of_hashes(
<<~"SQL",
SELECT batch, count(id) FROM (
SELECT lm.id, ceil(lm.id / ?::float) AS batch
FROM sitemaps.${entity_type}_lastmod lm
WHERE lm.replication_sequence > COALESCE(
(SELECT overall_sitemaps_replication_sequence
FROM sitemaps.control),
0
)
) q
GROUP BY batch
ORDER BY batch ASC
SQL
$MAX_SITEMAP_SIZE,
);

my %updates_by_suffix_key = partition_by {
$_->{sitemap_suffix_key}
} @{$all_updates};
my $batch = {count => 0, batches => []};
my @batches = ($batch);
for my $raw_batch (@$raw_batches) {
# Add this potential batch to the previous one if the sum will
# come out less than $MAX_SITEMAP_SIZE. Otherwise create a new
# batch and push it onto the list.
if ($batch->{count} + $raw_batch->{count} <= $MAX_SITEMAP_SIZE) {
$batch->{count} = $batch->{count} + $raw_batch->{count};
push @{$batch->{batches}}, $raw_batch->{batch};
} else {
$batch = {
count => $raw_batch->{count},
batches => [$raw_batch->{batch}],
};
push @batches, $batch;
}
}

for my $suffix_key (sort keys %updates_by_suffix_key) {
my $updates = $updates_by_suffix_key{$suffix_key};
for my $batch_info (@batches) {
my $minimum_batch_number = min(@{ $batch_info->{batches} });

# The sitemaps.control columns will be NULL on first run, in
# which case we just select all updates for writing.
my $updates = $c->sql->select_list_of_hashes(
<<~"SQL",
SELECT lm.*
FROM sitemaps.${entity_type}_lastmod lm
WHERE ceil(lm.id / ?::float) = any(?)
AND lm.replication_sequence > COALESCE(
(SELECT overall_sitemaps_replication_sequence
FROM sitemaps.control),
0
)
SQL
$MAX_SITEMAP_SIZE,
$batch_info->{batches},
);

my (@base_urls, @paginated_urls);
my $urls = {base => \@base_urls, paginated => \@paginated_urls};
my $suffix_info = $SITEMAP_SUFFIX_INFO{$entity_type}{$suffix_key};
my %updates_by_suffix_key = partition_by {
$_->{sitemap_suffix_key}
} @{$updates};

for my $suffix_key (sort keys %updates_by_suffix_key) {
my $updates = $updates_by_suffix_key{$suffix_key};

my (@base_urls, @paginated_urls);
my $urls = {
base => \@base_urls,
paginated => \@paginated_urls,
};
my $suffix_info =
$SITEMAP_SUFFIX_INFO{$entity_type}{$suffix_key};

for my $update (@{$updates}) {
my $opts = $self->create_url_opts(
$c,
$entity_type,
$update->{url},
$suffix_info,
{last_modified => $update->{last_modified}},
);

for my $update (@{$updates}) {
my $opts = $self->create_url_opts(
$c,
$entity_type,
$update->{url},
$suffix_info,
{last_modified => $update->{last_modified}},
);
if ($update->{paginated}) {
push @paginated_urls, $opts;
} else {
push @base_urls, $opts;
}
}

my %opts = %{$suffix_info};
my $filename_suffix =
$opts{filename_suffix} // $opts{suffix};

if ($update->{paginated}) {
push @paginated_urls, $opts;
if ($filename_suffix) {
$opts{filename_suffix} = "$filename_suffix-incremental";
} else {
push @base_urls, $opts;
$opts{filename_suffix} = 'incremental';
}
}

my %opts = %{$suffix_info};
my $filename_suffix = $opts{filename_suffix} // $opts{suffix};

if ($filename_suffix) {
$opts{filename_suffix} = "$filename_suffix-incremental";
} else {
$opts{filename_suffix} = 'incremental';
$self->build_one_suffix(
$entity_type,
$minimum_batch_number,
$urls,
%opts,
);
}

$self->build_one_suffix($entity_type, 1, $urls, %opts);
}
}
}
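
To make the batch-merging step in this hunk concrete, here is a minimal standalone sketch of the same adjacent-batch bundling, run against made-up counts. The $MAX_SITEMAP_SIZE name and the {batch => ..., count => ...} row shape mirror the patch, but the sample data, the use of core List::Util in place of List::AllUtils, and the output loop are illustrative assumptions, not part of the commit.

#!/usr/bin/env perl
# Standalone sketch only: merge adjacent potential batches while the
# running URL count stays at or below $MAX_SITEMAP_SIZE, as the
# incremental sitemap code above does.
use strict;
use warnings;
use List::Util qw( min );

my $MAX_SITEMAP_SIZE = 50_000;

# Hypothetical result of the GROUP BY query after a long gap between
# runs: sparse, mostly under-full batches keyed by ceil(id / size).
my $raw_batches = [
    { batch => 3,  count => 1_200  },
    { batch => 4,  count => 800    },
    { batch => 9,  count => 48_500 },
    { batch => 10, count => 2_000  },
];

my $batch = { count => 0, batches => [] };
my @batches = ($batch);
for my $raw_batch (@$raw_batches) {
    if ($batch->{count} + $raw_batch->{count} <= $MAX_SITEMAP_SIZE) {
        # Fold this potential batch into the current one.
        $batch->{count} += $raw_batch->{count};
        push @{ $batch->{batches} }, $raw_batch->{batch};
    } else {
        # Start a new merged batch and remember it.
        $batch = {
            count   => $raw_batch->{count},
            batches => [ $raw_batch->{batch} ],
        };
        push @batches, $batch;
    }
}

for my $b (@batches) {
    printf "sitemap %d covers batches %s (%d URLs)\n",
        min(@{ $b->{batches} }),
        join(', ', @{ $b->{batches} }),
        $b->{count};
}

# Prints:
#   sitemap 3 covers batches 3, 4 (2000 URLs)
#   sitemap 9 covers batches 9 (48500 URLs)
#   sitemap 10 covers batches 10 (2000 URLs)

With realistic hourly change volumes the first merged batch is normally the only one, which is why the comment above calls multiple incremental batches unlikely.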
lib/MusicBrainz/Server/Sitemap/Overall.pm (43 changes: 32 additions & 11 deletions)
@@ -264,7 +264,38 @@ sub build_one_entity {

my $sql = $c->sql;

# Find the counts in each potential batch of 50,000
# Sitemaps for a particular suffix (see `%SITEMAP_SUFFIX_INFO` in
# `Sitemap::Constants`) are divided into batches of `$MAX_SITEMAP_SIZE`.
# Each entity has a batch number associated with it, which is calculated
# as `ceil(row_id / $MAX_SITEMAP_SIZE)`.
#
# As an example, if `$MAX_SITEMAP_SIZE` were 50,000, and we were
# outputting the "base" artist URL sitemap, then artists with
# row IDs >= 1 and <= 50000 would go into batch #1
# (sitemap-artist-1.xml.gz); the artist with row ID 50001 would go into
# batch #2 (sitemap-artist-2.xml.gz).
#
# An exception to this rule is that we sometimes bundle adjacent batches:
#
# * We'll bundle a batch into the previous one if the sum of URLs they
# contain is less than or equal to `$MAX_SITEMAP_SIZE`. (There may be
# holes in the row ID space as entities are merged or removed, or as
# transactions are rolled back.) This reduces the total number of
# sitemap files and keeps them as close to `$MAX_SITEMAP_SIZE` as
# possible, for uniformity.
#
# * We exclude the last batch from this bundling so that it's always in
# its own sitemap (even if it's few enough that it could theoretically
# be part of the previous one). The goal is that each URL only ever
# starts in its actual batch number and then moves down over time
# (as entities are merged or removed).
#
# This keeps the set of URLs contained in the second-to-last batch
# more stable: we know the last batch will always grow as new entities
# are added, so there's no point in combining it with the previous one
# when splitting it off again is inevitable. Thus, the second-to-last
# sitemap won't have to be pointlessly re-crawled after that split.

my $raw_batches = $sql->select_list_of_hashes(
"SELECT batch, count(id) FROM (SELECT id, ceil(id / ?::float) AS batch FROM $entity_type) q GROUP BY batch ORDER BY batch ASC",
$MAX_SITEMAP_SIZE,
@@ -273,16 +304,6 @@
return unless @{$raw_batches};
my @batches;

# Exclude the last batch, which should always be its own sitemap.
#
# Since sitemaps do a bit of a bundling thing to reach as close to 50,000
# URLs as possible, it'd be possible that right after a rollover past
# 50,000 IDs, the new one would be folded into the otherwise-most-recent
# batch. Since the goal is that each URL only ever starts in its actual
# batch number and then moves down over time, this ensures that the last
# batch is always its own sitemap, even if it's few enough it could
# theoretically be part of the previous one.

if (scalar @$raw_batches > 1) {
my $batch = {count => 0, batches => []};
for my $raw_batch (@{ $raw_batches }[0..scalar @$raw_batches-2]) {
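
The numbering rule described in the comment above is just ceil(row_id / $MAX_SITEMAP_SIZE). A tiny sketch, using invented row IDs and POSIX::ceil as a stand-in for the SQL ceil(), shows which file an artist row would initially land in under the "base" artist suffix from the comment's example:

#!/usr/bin/env perl
# Sketch only: reproduce the batch-number calculation from the comment
# above with a few invented artist row IDs.
use strict;
use warnings;
use POSIX qw( ceil );

my $MAX_SITEMAP_SIZE = 50_000;

for my $id (1, 50_000, 50_001, 123_456) {
    my $batch = ceil($id / $MAX_SITEMAP_SIZE);
    printf "row id %6d -> batch %d (sitemap-artist-%d.xml.gz)\n",
        $id, $batch, $batch;
}

# Prints:
#   row id      1 -> batch 1 (sitemap-artist-1.xml.gz)
#   row id  50000 -> batch 1 (sitemap-artist-1.xml.gz)
#   row id  50001 -> batch 2 (sitemap-artist-2.xml.gz)
#   row id 123456 -> batch 3 (sitemap-artist-3.xml.gz)

Keeping the last batch out of the bundling then follows from the same rule: it is the only batch whose membership can grow as new rows are added, so merging it into the previous sitemap would only churn that sitemap until the inevitable split.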
