Merge pull request #3092 from mwiencek/mbs-12239
MBS-12239: Fix "too many URLs" in incremental sitemaps
mwiencek committed Mar 8, 2024
2 parents cbbb01e + 295ac14 commit eee631e
Showing 2 changed files with 147 additions and 49 deletions.
lib/MusicBrainz/Server/Sitemap/Incremental.pm (153 changes: 115 additions & 38 deletions)
@@ -7,14 +7,17 @@ use feature 'state';

use Digest::SHA qw( sha1_hex );
use JSON qw( decode_json );
use List::AllUtils qw( partition_by );
use List::AllUtils qw( min partition_by );
use Moose;
use Sql;

use MusicBrainz::Server::Constants qw( entities_with );
use MusicBrainz::Server::Context;
use MusicBrainz::Server::Log qw( log_info );
use MusicBrainz::Server::Sitemap::Constants qw( %SITEMAP_SUFFIX_INFO );
use MusicBrainz::Server::Sitemap::Constants qw(
$MAX_SITEMAP_SIZE
%SITEMAP_SUFFIX_INFO
);
use MusicBrainz::Server::Sitemap::Builder;

extends 'MusicBrainz::Server::Sitemap::Builder';
@@ -217,52 +220,126 @@ sub post_replication_sequence {
my ($self, $c) = @_;

for my $entity_type (@{ $self->dumped_entity_types }) {
# The sitemaps.control columns will be NULL on first run, in which case
# we just select all updates for writing.
my $all_updates = $c->sql->select_list_of_hashes(
"SELECT lm.*
FROM sitemaps.${entity_type}_lastmod lm
WHERE lm.replication_sequence > COALESCE(
(SELECT overall_sitemaps_replication_sequence FROM sitemaps.control), 0)",
# In most cases, there should be no need to create batches for the
# incremental sitemaps. An exception may occur in cases where the
# sitemaps have not run for some time and many changes have
# accumulated (MBS-12239).
#
# The batching code is mostly copied from `build_one_entity` in
# `Sitemap::Overall`. However, the logic to always put the last batch
# in its own sitemap is not performed here: we're not concerned about
# which sitemap a URL starts in for an incremental dump, because the
# set of URLs in an incremental dump is inherently unstable (they
# change hourly).
#
# As mentioned previously, it's very unlikely there will be more than
# one batch for an incremental dump, anyway, due to the code that
# attempts to merge adjacent ones.

# Find the counts in each potential batch of $MAX_SITEMAP_SIZE.
my $raw_batches = $c->sql->select_list_of_hashes(
<<~"SQL",
SELECT batch, count(id) FROM (
SELECT lm.id, ceil(lm.id / ?::float) AS batch
FROM sitemaps.${entity_type}_lastmod lm
WHERE lm.replication_sequence > COALESCE(
(SELECT overall_sitemaps_replication_sequence
FROM sitemaps.control),
0
)
) q
GROUP BY batch
ORDER BY batch ASC
SQL
$MAX_SITEMAP_SIZE,
);

my %updates_by_suffix_key = partition_by {
$_->{sitemap_suffix_key}
} @{$all_updates};
my $batch = {count => 0, batches => []};
my @batches = ($batch);
for my $raw_batch (@$raw_batches) {
# Add this potential batch to the previous one if the sum will
# come out less than $MAX_SITEMAP_SIZE. Otherwise create a new
# batch and push it onto the list.
if ($batch->{count} + $raw_batch->{count} <= $MAX_SITEMAP_SIZE) {
$batch->{count} = $batch->{count} + $raw_batch->{count};
push @{$batch->{batches}}, $raw_batch->{batch};
} else {
$batch = {
count => $raw_batch->{count},
batches => [$raw_batch->{batch}],
};
push @batches, $batch;
}
}

for my $suffix_key (sort keys %updates_by_suffix_key) {
my $updates = $updates_by_suffix_key{$suffix_key};
for my $batch_info (@batches) {
my $minimum_batch_number = min(@{ $batch_info->{batches} });

# The sitemaps.control columns will be NULL on first run, in
# which case we just select all updates for writing.
my $updates = $c->sql->select_list_of_hashes(
<<~"SQL",
SELECT lm.*
FROM sitemaps.${entity_type}_lastmod lm
WHERE ceil(lm.id / ?::float) = any(?)
AND lm.replication_sequence > COALESCE(
(SELECT overall_sitemaps_replication_sequence
FROM sitemaps.control),
0
)
SQL
$MAX_SITEMAP_SIZE,
$batch_info->{batches},
);

my (@base_urls, @paginated_urls);
my $urls = {base => \@base_urls, paginated => \@paginated_urls};
my $suffix_info = $SITEMAP_SUFFIX_INFO{$entity_type}{$suffix_key};
my %updates_by_suffix_key = partition_by {
$_->{sitemap_suffix_key}
} @{$updates};

for my $suffix_key (sort keys %updates_by_suffix_key) {
my $updates = $updates_by_suffix_key{$suffix_key};

my (@base_urls, @paginated_urls);
my $urls = {
base => \@base_urls,
paginated => \@paginated_urls,
};
my $suffix_info =
$SITEMAP_SUFFIX_INFO{$entity_type}{$suffix_key};

for my $update (@{$updates}) {
my $opts = $self->create_url_opts(
$c,
$entity_type,
$update->{url},
$suffix_info,
{last_modified => $update->{last_modified}},
);

for my $update (@{$updates}) {
my $opts = $self->create_url_opts(
$c,
$entity_type,
$update->{url},
$suffix_info,
{last_modified => $update->{last_modified}},
);
if ($update->{paginated}) {
push @paginated_urls, $opts;
} else {
push @base_urls, $opts;
}
}

my %opts = %{$suffix_info};
my $filename_suffix =
$opts{filename_suffix} // $opts{suffix};

if ($update->{paginated}) {
push @paginated_urls, $opts;
if ($filename_suffix) {
$opts{filename_suffix} = "$filename_suffix-incremental";
} else {
push @base_urls, $opts;
$opts{filename_suffix} = 'incremental';
}
}

my %opts = %{$suffix_info};
my $filename_suffix = $opts{filename_suffix} // $opts{suffix};

if ($filename_suffix) {
$opts{filename_suffix} = "$filename_suffix-incremental";
} else {
$opts{filename_suffix} = 'incremental';
$self->build_one_suffix(
$entity_type,
$minimum_batch_number,
$urls,
%opts,
);
}

$self->build_one_suffix($entity_type, 1, $urls, %opts);
}
}
}
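
To make the batch-merging step in this hunk concrete, here is a minimal standalone sketch of the same adjacent-batch bundling, run against made-up counts. The $MAX_SITEMAP_SIZE name and the {batch => ..., count => ...} row shape mirror the patch, but the sample data, the use of core List::Util in place of List::AllUtils, and the output loop are illustrative assumptions, not part of the commit.

#!/usr/bin/env perl
# Standalone sketch only: merge adjacent potential batches while the
# running URL count stays at or below $MAX_SITEMAP_SIZE, as the
# incremental sitemap code above does.
use strict;
use warnings;
use List::Util qw( min );

my $MAX_SITEMAP_SIZE = 50_000;

# Hypothetical result of the GROUP BY query after a long gap between
# runs: sparse, mostly under-full batches keyed by ceil(id / size).
my $raw_batches = [
    { batch => 3,  count => 1_200  },
    { batch => 4,  count => 800    },
    { batch => 9,  count => 48_500 },
    { batch => 10, count => 2_000  },
];

my $batch = { count => 0, batches => [] };
my @batches = ($batch);
for my $raw_batch (@$raw_batches) {
    if ($batch->{count} + $raw_batch->{count} <= $MAX_SITEMAP_SIZE) {
        # Fold this potential batch into the current one.
        $batch->{count} += $raw_batch->{count};
        push @{ $batch->{batches} }, $raw_batch->{batch};
    } else {
        # Start a new merged batch and remember it.
        $batch = {
            count   => $raw_batch->{count},
            batches => [ $raw_batch->{batch} ],
        };
        push @batches, $batch;
    }
}

for my $b (@batches) {
    printf "sitemap %d covers batches %s (%d URLs)\n",
        min(@{ $b->{batches} }),
        join(', ', @{ $b->{batches} }),
        $b->{count};
}

# Prints:
#   sitemap 3 covers batches 3, 4 (2000 URLs)
#   sitemap 9 covers batches 9 (48500 URLs)
#   sitemap 10 covers batches 10 (2000 URLs)

With realistic hourly change volumes the first merged batch is normally the only one, which is why the comment above calls multiple incremental batches unlikely.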
lib/MusicBrainz/Server/Sitemap/Overall.pm (43 changes: 32 additions & 11 deletions)
@@ -264,7 +264,38 @@ sub build_one_entity {

my $sql = $c->sql;

# Find the counts in each potential batch of 50,000
# Sitemaps for a particular suffix (see `%SITEMAP_SUFFIX_INFO` in
# `Sitemap::Constants`) are divided into batches of `$MAX_SITEMAP_SIZE`.
# Each entity has a batch number associated with it, which is calculated
# as `ceil(row_id / $MAX_SITEMAP_SIZE)`.
#
# As an example, if `$MAX_SITEMAP_SIZE` were 50,000, and we were
# outputting the "base" artist URL sitemap, then artists with
# row IDs >= 1 and <= 50000 would go into batch #1
# (sitemap-artist-1.xml.gz); the artist with row ID 50001 would go into
# batch #2 (sitemap-artist-2.xml.gz).
#
# An exception to this rule is that we sometimes bundle adjacent batches:
#
# * We'll bundle a batch into the previous one if the sum of URLs they
# contain is less than or equal to `$MAX_SITEMAP_SIZE`. (There may be
# holes in the row ID space as entities are merged or removed, or as
# transactions are rolled back.) This reduces the total number of
# sitemap files and keeps them as close to `$MAX_SITEMAP_SIZE` as
# possible, for uniformity.
#
# * We exclude the last batch from this bundling so that it's always in
# its own sitemap (even if it's few enough that it could theoretically
# be part of the previous one). The goal is that each URL only ever
# starts in its actual batch number and then moves down over time
# (as entities are merged or removed).
#
# This keeps the set of URLs contained in the second-to-last batch
# more stable: we know the last batch will always grow as new entities
# are added, so there's no point in combining it with the previous one
# when splitting it off again is inevitable. Thus, the second-to-last
# sitemap won't have to be pointlessly re-crawled after that split.

my $raw_batches = $sql->select_list_of_hashes(
"SELECT batch, count(id) FROM (SELECT id, ceil(id / ?::float) AS batch FROM $entity_type) q GROUP BY batch ORDER BY batch ASC",
$MAX_SITEMAP_SIZE,
@@ -273,16 +304,6 @@
return unless @{$raw_batches};
my @batches;

# Exclude the last batch, which should always be its own sitemap.
#
# Since sitemaps do a bit of a bundling thing to reach as close to 50,000
# URLs as possible, it'd be possible that right after a rollover past
# 50,000 IDs, the new one would be folded into the otherwise-most-recent
# batch. Since the goal is that each URL only ever starts in its actual
# batch number and then moves down over time, this ensures that the last
# batch is always its own sitemap, even if it's few enough it could
# theoretically be part of the previous one.

if (scalar @$raw_batches > 1) {
my $batch = {count => 0, batches => []};
for my $raw_batch (@{ $raw_batches }[0..scalar @$raw_batches-2]) {
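
The numbering rule described in the comment above is just ceil(row_id / $MAX_SITEMAP_SIZE). A tiny sketch, using invented row IDs and POSIX::ceil as a stand-in for the SQL ceil(), shows which file an artist row would initially land in under the "base" artist suffix from the comment's example:

#!/usr/bin/env perl
# Sketch only: reproduce the batch-number calculation from the comment
# above with a few invented artist row IDs.
use strict;
use warnings;
use POSIX qw( ceil );

my $MAX_SITEMAP_SIZE = 50_000;

for my $id (1, 50_000, 50_001, 123_456) {
    my $batch = ceil($id / $MAX_SITEMAP_SIZE);
    printf "row id %6d -> batch %d (sitemap-artist-%d.xml.gz)\n",
        $id, $batch, $batch;
}

# Prints:
#   row id      1 -> batch 1 (sitemap-artist-1.xml.gz)
#   row id  50000 -> batch 1 (sitemap-artist-1.xml.gz)
#   row id  50001 -> batch 2 (sitemap-artist-2.xml.gz)
#   row id 123456 -> batch 3 (sitemap-artist-3.xml.gz)

Keeping the last batch out of the bundling then follows from the same rule: it is the only batch whose membership can grow as new rows are added, so merging it into the previous sitemap would only churn that sitemap until the inevitable split.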
