MBS-12239: Fix "too many URLs" in incremental sitemaps #3092

Merged 2 commits on Mar 8, 2024
153 changes: 115 additions & 38 deletions lib/MusicBrainz/Server/Sitemap/Incremental.pm
@@ -7,14 +7,17 @@ use feature 'state';

use Digest::SHA qw( sha1_hex );
use JSON qw( decode_json );
use List::AllUtils qw( partition_by );
use List::AllUtils qw( min partition_by );
use Moose;
use Sql;

use MusicBrainz::Server::Constants qw( entities_with );
use MusicBrainz::Server::Context;
use MusicBrainz::Server::Log qw( log_info );
use MusicBrainz::Server::Sitemap::Constants qw( %SITEMAP_SUFFIX_INFO );
use MusicBrainz::Server::Sitemap::Constants qw(
$MAX_SITEMAP_SIZE
%SITEMAP_SUFFIX_INFO
);
use MusicBrainz::Server::Sitemap::Builder;

extends 'MusicBrainz::Server::Sitemap::Builder';
@@ -217,52 +220,126 @@ sub post_replication_sequence {
my ($self, $c) = @_;

for my $entity_type (@{ $self->dumped_entity_types }) {
# The sitemaps.control columns will be NULL on first run, in which case
# we just select all updates for writing.
my $all_updates = $c->sql->select_list_of_hashes(
"SELECT lm.*
FROM sitemaps.${entity_type}_lastmod lm
WHERE lm.replication_sequence > COALESCE(
(SELECT overall_sitemaps_replication_sequence FROM sitemaps.control), 0)",
# In most cases, there should be no need to create batches for the
# incremental sitemaps. An exception may occur in cases where the
# sitemaps have not run for some time and many changes have
# accumulated (MBS-12239).
#
# The batching code is mostly copied from `build_one_entity` in
# `Sitemap::Overall`. However, the logic to always put the last batch
# in its own sitemap is not performed here: we're not concerned about
# which sitemap a URL starts in for an incremental dump, because the
# set of URLs in an incremental dump is inherently unstable (they
# change hourly).
#
# As mentioned above, it's very unlikely there will be more than one
# batch for an incremental dump anyway, due to the code that merges
# adjacent ones.

# Find the counts in each potential batch of $MAX_SITEMAP_SIZE.
my $raw_batches = $c->sql->select_list_of_hashes(
<<~"SQL",
SELECT batch, count(id) FROM (
SELECT lm.id, ceil(lm.id / ?::float) AS batch
FROM sitemaps.${entity_type}_lastmod lm
WHERE lm.replication_sequence > COALESCE(
(SELECT overall_sitemaps_replication_sequence
FROM sitemaps.control),
0
)
) q
GROUP BY batch
ORDER BY batch ASC
SQL
$MAX_SITEMAP_SIZE,
);

my %updates_by_suffix_key = partition_by {
$_->{sitemap_suffix_key}
} @{$all_updates};
my $batch = {count => 0, batches => []};
my @batches = ($batch);
for my $raw_batch (@$raw_batches) {
# Merge this potential batch into the previous one if the combined
# count comes out to no more than $MAX_SITEMAP_SIZE. Otherwise,
# start a new batch and push it onto the list. (Since @batches is
# seeded with the first batch above, it's the newly created batch
# that must be pushed here.)
if ($batch->{count} + $raw_batch->{count} <= $MAX_SITEMAP_SIZE) {
$batch->{count} = $batch->{count} + $raw_batch->{count};
push @{$batch->{batches}}, $raw_batch->{batch};
} else {
$batch = {
count => $raw_batch->{count},
batches => [$raw_batch->{batch}],
};
push @batches, $batch;
}
}

for my $suffix_key (sort keys %updates_by_suffix_key) {
my $updates = $updates_by_suffix_key{$suffix_key};
for my $batch_info (@batches) {
my $minimum_batch_number = min(@{ $batch_info->{batches} });

# The sitemaps.control columns will be NULL on first run, in
# which case we just select all updates for writing.
my $updates = $c->sql->select_list_of_hashes(
<<~"SQL",
SELECT lm.*
FROM sitemaps.${entity_type}_lastmod lm
WHERE ceil(lm.id / ?::float) = any(?)
AND lm.replication_sequence > COALESCE(
(SELECT overall_sitemaps_replication_sequence
FROM sitemaps.control),
0
)
SQL
$MAX_SITEMAP_SIZE,
$batch_info->{batches},
);

my (@base_urls, @paginated_urls);
my $urls = {base => \@base_urls, paginated => \@paginated_urls};
my $suffix_info = $SITEMAP_SUFFIX_INFO{$entity_type}{$suffix_key};
my %updates_by_suffix_key = partition_by {
$_->{sitemap_suffix_key}
} @{$updates};

for my $suffix_key (sort keys %updates_by_suffix_key) {
my $updates = $updates_by_suffix_key{$suffix_key};

my (@base_urls, @paginated_urls);
my $urls = {
base => \@base_urls,
paginated => \@paginated_urls,
};
my $suffix_info =
$SITEMAP_SUFFIX_INFO{$entity_type}{$suffix_key};

for my $update (@{$updates}) {
my $opts = $self->create_url_opts(
$c,
$entity_type,
$update->{url},
$suffix_info,
{last_modified => $update->{last_modified}},
);

if ($update->{paginated}) {
push @paginated_urls, $opts;
} else {
push @base_urls, $opts;
}
}

my %opts = %{$suffix_info};
my $filename_suffix =
$opts{filename_suffix} // $opts{suffix};

if ($filename_suffix) {
$opts{filename_suffix} = "$filename_suffix-incremental";
} else {
$opts{filename_suffix} = 'incremental';
}

$self->build_one_suffix(
$entity_type,
$minimum_batch_number,
$urls,
%opts,
);
}

for my $update (@{$updates}) {
my $opts = $self->create_url_opts(
$c,
$entity_type,
$update->{url},
$suffix_info,
{last_modified => $update->{last_modified}},
);

if ($update->{paginated}) {
push @paginated_urls, $opts;
} else {
push @base_urls, $opts;
}
}

my %opts = %{$suffix_info};
my $filename_suffix = $opts{filename_suffix} // $opts{suffix};

if ($filename_suffix) {
$opts{filename_suffix} = "$filename_suffix-incremental";
} else {
$opts{filename_suffix} = 'incremental';
}

$self->build_one_suffix($entity_type, 1, $urls, %opts);
}
}
}
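For clarity, here's a minimal, runnable sketch (not part of the PR) of the batch-merging behavior in Incremental.pm above, using hypothetical batch counts; the real code reads them from the sitemaps.${entity_type}_lastmod tables and uses $MAX_SITEMAP_SIZE (50,000) from Sitemap::Constants:

#!/usr/bin/env perl
use strict;
use warnings;

my $MAX_SITEMAP_SIZE = 50_000;

# Hypothetical {batch, count} rows, as the GROUP BY query would return
# them, ordered by batch number.
my @raw_batches = (
    { batch => 1, count => 20_000 },
    { batch => 2, count => 25_000 },  # fits with batch 1 (45,000 total)
    { batch => 3, count => 30_000 },  # would overflow; starts a new bundle
    { batch => 4, count => 10_000 },  # fits with batch 3 (40,000 total)
);

# Greedily merge adjacent batches while the combined count stays within
# $MAX_SITEMAP_SIZE. Note that, unlike Sitemap::Overall, the last batch
# is eligible for merging too.
my $batch = { count => 0, batches => [] };
my @batches = ($batch);
for my $raw_batch (@raw_batches) {
    if ($batch->{count} + $raw_batch->{count} <= $MAX_SITEMAP_SIZE) {
        $batch->{count} += $raw_batch->{count};
        push @{ $batch->{batches} }, $raw_batch->{batch};
    } else {
        $batch = {
            count   => $raw_batch->{count},
            batches => [ $raw_batch->{batch} ],
        };
        push @batches, $batch;
    }
}

# Prints:
#   batches 1,2 (45000 URLs)
#   batches 3,4 (40000 URLs)
for my $b (@batches) {
    printf "batches %s (%d URLs)\n",
        join(',', @{ $b->{batches} }), $b->{count};
}

Each resulting bundle is then queried and written separately, and (as in the diff above) min(@{ $batch_info->{batches} }) supplies the $minimum_batch_number used when building the sitemap file.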
43 changes: 32 additions & 11 deletions lib/MusicBrainz/Server/Sitemap/Overall.pm
@@ -264,7 +264,38 @@ sub build_one_entity {

my $sql = $c->sql;

# Find the counts in each potential batch of 50,000
# Sitemaps for a particular suffix (see `%SITEMAP_SUFFIX_INFO` in
# `Sitemap::Constants`) are divided into batches of `$MAX_SITEMAP_SIZE`.
# Each entity has a batch number associated with it, which is calculated
# as `ceil(row_id / $MAX_SITEMAP_SIZE)`.
#
# As an example, if `$MAX_SITEMAP_SIZE` were 50,000, and we were
# outputting the "base" artist URL sitemap, then artists with
# row IDs >= 1 and <= 50000 would go into batch #1
# (sitemap-artist-1.xml.gz); the artist with row ID 50001 would go into
# batch #2 (sitemap-artist-2.xml.gz).
#
# An exception to this rule is that we sometimes bundle adjacent batches:
#
# * We'll bundle a batch into the previous one if the sum of URLs they
# contain is less than or equal to `$MAX_SITEMAP_SIZE`. (There may be
# holes in the row ID space as entities are merged or removed, or as
# transactions are rolled back.) This reduces the total number of
# sitemap files and keeps them as close to `$MAX_SITEMAP_SIZE` as
# possible, for uniformity.
#
# * We exclude the last batch from this bundling so that it's always in
# its own sitemap (even if it contains few enough URLs that it could
# theoretically be part of the previous one). The goal is that each URL
# only ever starts in its actual batch number and then moves down over
# time (as entities are merged or removed).
#
# This keeps the set of URLs contained in the second-to-last batch more
# stable: we know the last batch will always grow as new entities are
# added, so there's no point in combining it with the previous batch
# when splitting it again is inevitable. Thus, the second-to-last
# sitemap won't have to be pointlessly re-crawled after such a split.

my $raw_batches = $sql->select_list_of_hashes(
"SELECT batch, count(id) FROM (SELECT id, ceil(id / ?::float) AS batch FROM $entity_type) q GROUP BY batch ORDER BY batch ASC",
$MAX_SITEMAP_SIZE,
@@ -273,16 +304,6 @@
return unless @{$raw_batches};
my @batches;

# Exclude the last batch, which should always be its own sitemap.
#
# Since sitemaps do a bit of a bundling thing to reach as close to 50,000
# URLs as possible, it'd be possible that right after a rollover past
# 50,000 IDs, the new one would be folded into the otherwise-most-recent
# batch. Since the goal is that each URL only ever starts in its actual
# batch number and then moves down over time, this ensures that the last
# batch is always its own sitemap, even if it's few enough it could
# theoretically be part of the previous one.

if (scalar @$raw_batches > 1) {
my $batch = {count => 0, batches => []};
for my $raw_batch (@{ $raw_batches }[0..scalar @$raw_batches-2]) {
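To make the bundling rules above concrete, here's a minimal, runnable sketch (not part of the PR) applying both rules to hypothetical row IDs, with a batch size of 10 standing in for the real $MAX_SITEMAP_SIZE of 50,000:

#!/usr/bin/env perl
use strict;
use warnings;
use POSIX qw( ceil );

my $MAX = 10;  # hypothetical batch size; the real value is 50,000

# Hypothetical entity row IDs, with holes left by merges and removals.
my @ids = (1 .. 7, 12 .. 18, 21, 22, 31);

# Assign each ID its batch number, ceil(id / $MAX), and count each batch.
my %count_by_batch;
$count_by_batch{ ceil($_ / $MAX) }++ for @ids;
my @raw_batches =
    map { +{ batch => $_, count => $count_by_batch{$_} } }
    sort { $a <=> $b } keys %count_by_batch;
# => batch 1: 7 IDs, batch 2: 7 IDs, batch 3: 2 IDs, batch 4: 1 ID

# Bundle adjacent batches up to $MAX URLs each, excluding the last raw
# batch, which always becomes its own sitemap.
my @batches;
if (@raw_batches > 1) {
    my $batch = { count => 0, batches => [] };
    for my $raw_batch (@raw_batches[0 .. $#raw_batches - 1]) {
        if ($batch->{count} + $raw_batch->{count} <= $MAX) {
            $batch->{count} += $raw_batch->{count};
            push @{ $batch->{batches} }, $raw_batch->{batch};
        } else {
            push @batches, $batch;
            $batch = {
                count   => $raw_batch->{count},
                batches => [ $raw_batch->{batch} ],
            };
        }
    }
    push @batches, $batch;
}
push @batches, {
    count   => $raw_batches[-1]{count},
    batches => [ $raw_batches[-1]{batch} ],
};

# Prints:
#   batches 1 (7 URLs)
#   batches 2,3 (9 URLs)
#   batches 4 (1 URLs)
#
# Batch 4 would fit into the 9-URL bundle (9 + 1 <= 10), but stays
# separate because the last batch will keep growing as rows are added.
for my $b (@batches) {
    printf "batches %s (%d URLs)\n",
        join(',', @{ $b->{batches} }), $b->{count};
}

Each bundle becomes one sitemap file; judging from the Incremental.pm changes above, the file takes the smallest batch number in its bundle as its index (the $minimum_batch_number passed to build_one_suffix).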