MBS-13449: Reimplement transactional cache without database locks
This is intended to help resolve deadlocks (MBS-11345, MBS-12314) and
performance issues due to lock contention.

It supplants #212.

The new implementation is documented in `Data::Role::EntityCache`.
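
As a quick orientation for reviewers, here is a minimal sketch of that scheme. The key names follow the `artist:123` / `artist:recently_invalidated:123` examples in the POD, and the cache/store calls are the ones used in this diff; the surrounding variables ($c, $artist, $entity_ttl, $flag_ttl) are illustrative stand-ins only:

    # 1. Invalidation, inside the updating request's transaction (immediate):
    $c->store->set_multi(['artist:recently_invalidated:123', 1, $flag_ttl]);
    $c->cache('artist')->delete('artist:123');

    # 2. Cache addition by a concurrent reader (deferred until after its
    #    transaction commits):
    $c->cache('artist')->set_multi(['artist:123', $artist, $entity_ttl]);

    # 3. Post-addition check: if the invalidation flag still exists, the
    #    entry written in step 2 may be stale, so drop it again.
    my $flags = $c->store->get_multi('artist:recently_invalidated:123');
    $c->cache('artist')->delete('artist:123') if %$flags;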
mwiencek committed Feb 7, 2024
1 parent b5c7d4c commit 82c79f5
Showing 11 changed files with 332 additions and 460 deletions.
153 changes: 0 additions & 153 deletions entities.json

Large diffs are not rendered by default.

153 changes: 0 additions & 153 deletions entities.mjs

Large diffs are not rendered by default.

16 changes: 15 additions & 1 deletion lib/MusicBrainz/Server.pm
@@ -6,6 +6,7 @@ use namespace::autoclean;
extends 'Catalyst';

use Class::Load qw( load_class );
use Data::Dumper;
use DBDefs;
use Digest::SHA qw( sha256 );
use HTML::Entities ();
@@ -23,7 +24,7 @@ use MusicBrainz::Server::Entity::Util::JSON qw( to_json_array to_json_object );
use MusicBrainz::Server::Log qw( logger );
use MusicBrainz::Server::Validation qw( is_positive_integer );
use POSIX qw(SIGALRM);
use Scalar::Util qw( refaddr );
use Scalar::Util qw( looks_like_number refaddr );
use Sys::Hostname;
use Time::HiRes qw( clock_gettime CLOCK_REALTIME CLOCK_MONOTONIC );
use Try::Tiny;
@@ -476,6 +477,17 @@ around dispatch => sub {

my $max_request_time = DBDefs->DETERMINE_MAX_REQUEST_TIME($c->req);

if (
defined($max_request_time) &&
(!looks_like_number($max_request_time) || $max_request_time < 0)
) {
$c->log->warn(
'DETERMINE_MAX_REQUEST_TIME did not return a valid number: ' .
Dumper($max_request_time),
);
$max_request_time = undef;
}

if (defined($max_request_time) && $max_request_time > 0) {
alarm($max_request_time);

@@ -493,6 +505,8 @@ around dispatch => sub {
POSIX::sigaction(SIGALRM, $action);
}

$c->model('MB')->context->max_request_time($max_request_time);

$c->$orig(@args);

alarm(0);
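With this change, anything DETERMINE_MAX_REQUEST_TIME returns that is not a non-negative number is logged and treated as "no limit". A hedged summary of the resulting behaviour (the return values shown are hypothetical examples):

    # DETERMINE_MAX_REQUEST_TIME returns ...   resulting behaviour:
    #   60          -> alarm(60) is armed; context->max_request_time(60)
    #   0 or undef  -> no alarm; no request time limit
    #   -5 or 'abc' -> warning logged (value dumped via Data::Dumper),
    #                  then treated the same as undef (no limit)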
8 changes: 8 additions & 0 deletions lib/MusicBrainz/Server/Context.pm
@@ -121,6 +121,14 @@ sub create_script_context
return MusicBrainz::Server::Context->new(cache_manager => $cache_manager, database => 'MAINTENANCE', %args);
}

# `DBDefs::DETERMINE_MAX_REQUEST_TIME` must be called with a Catalyst request
# object. This attribute stores the result of that call so it can be accessed
# in the data layer.
has 'max_request_time' => (
is => 'rw',
isa => 'Maybe[Int]',
);

1;

=head1 COPYRIGHT AND LICENSE
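A hedged sketch of how this attribute is used end to end; both call sites appear elsewhere in this commit (Server.pm above and EntityCache.pm below), the snippet only strings them together:

    # Per request, after DETERMINE_MAX_REQUEST_TIME has been validated:
    $c->model('MB')->context->max_request_time($max_request_time);

    # Later, in Data::Role::EntityCache, when choosing a TTL for the
    # `recently_invalidated` flags:
    my $ttl = $self->c->max_request_time;
    $ttl = $RECENTLY_INVALIDATED_TTL      # fall back to 600 seconds
        if !defined($ttl) || $ttl == 0;   # undef/0 mean "no request time limit"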
208 changes: 111 additions & 97 deletions lib/MusicBrainz/Server/Data/Role/EntityCache.pm
@@ -3,34 +3,69 @@ package MusicBrainz::Server::Data::Role::EntityCache;
use DBDefs;
use Moose::Role;
use namespace::autoclean;
use List::AllUtils qw( natatime uniq );
use MusicBrainz::Server::Constants qw( %ENTITIES );
use MusicBrainz::Server::Log qw( log_debug );
use MusicBrainz::Server::Validation qw( is_database_row_id );
use List::AllUtils qw( uniq );
use Readonly;
use Time::HiRes qw( time );

requires '_type';

Readonly our $MAX_CACHE_ENTRIES => 500;

sub _cache_id {
    my ($self) = @_;

    my $type = $self->_type;
    if ($type && (my $entity_properties = $ENTITIES{$type})) {
        if (exists $entity_properties->{cache}) {
            return $entity_properties->{cache}{id};
        }
    }
}

=head1 Cache invalidation and locking

In order to prevent stale entities from being repopulated in the cache after
another process invalidates those entries (MBS-7241), we track which entity
IDs were recently invalidated in our Redis store. (In production, MBS uses
two Redis instances: one as an LRU cache for entity data, and another as a
persistent data store, mainly for login sessions.) As an example, consider
the following sequence of events:

1. Request A updates artist ID=123 and calls _delete_from_cache(123). In our
persistent Redis store, we set `artist:recently_invalidated:123` to '1'.
In our Redis cache, we delete `artist:123`. (While cache addition is
delayed until after the relevant transaction commits, cache deletion is
performed immediately.)

2. Request B starts before request A's database transaction commits, and
attempts to load artist ID=123. Since it's not in the cache, we read it
from the database, seeing the "old" version.

3. Request B's transaction commits, and it adds the stale artist to the
cache at `artist:123`.

4. Immediately after adding it to the cache, we check for the presence of an
`artist:recently_invalidated:123` key in the Redis store. If it exists,
we remove the stale `artist:123` entry we just added to the cache.

The `recently_invalidated` keys have their TTLs set to the "max request time"
determined by `DBDefs::DETERMINE_MAX_REQUEST_TIME`. The idea behind this is
that if the request times out, any database handles used in the request will
be closed and all transactions will be rolled back. In case the max request
time is undefined or 0 (indicating no limit), we set the TTL to
`$RECENTLY_INVALIDATED_TTL`, which defaults to 10 minutes.

Originally, MBS-7241 was resolved by using database-level locks
(`pg_advisory_xact_lock`). Lock contention and deadlocks (MBS-11345,
MBS-12314, ...) were a persistent issue with that approach. We also hit
limits on the number of locks being held in a single transaction (MBS-10497).

=cut

Readonly our $RECENTLY_INVALIDATED_TTL => 600;

Readonly our $MAX_CACHE_ENTRIES => 500;

has _cache_prefix => (
is => 'ro',
isa => 'Str',
lazy => 1,
default => sub { shift->_type . ':' },
);

has _recently_invalidated_prefix => (
is => 'ro',
isa => 'Str',
lazy => 1,
default => sub { shift->_cache_prefix . 'recently_invalidated:' },
);

around get_by_ids => sub {
my ($orig, $self, @ids) = @_;
@ids = grep { is_database_row_id($_) } @ids;
@@ -48,7 +83,8 @@ around get_by_ids => sub {
delete $ids{$id};
}
if (%ids) {
my $data = $self->$orig(keys %ids) || {};
my @ids_to_fetch = keys %ids;
my $data = $self->$orig(@ids_to_fetch) || {};
my @ids_to_cache = keys %$data;
foreach my $id (@ids_to_cache) {
$result{$id} = $data->{$id};
@@ -79,64 +115,62 @@ after merge => sub {
sub _create_cache_entries {
my ($self, $data, $ids) = @_;

my $cache_id = $self->_cache_id;
my $cache_prefix = $self->_cache_prefix;

my @ids = @$ids;
return unless @ids;

my $ttl = DBDefs->ENTITY_CACHE_TTL;

if (DBDefs->DB_READ_ONLY) {
# There's no point in acquiring advisory locks for caching if
# DB_READ_ONLY is set, because there should be no concurrent
# writes in that case. While it's possible to have some servers
# set in DB_READ_ONLY and others still accepting writes, that's
# never done intentionally, and there's not much we can do
# about it: READONLY connections (which are made when
# DB_READ_ONLY is set) typically go to a standby server
# different from the master. Locks are not shared between
# primary and standby servers.
return map {
[$cache_prefix . $_, $data->{$_}, ($ttl ? $ttl : ())]
} @ids;
}

my @entries;
my $it = natatime 100, @ids;
while (my @next_ids = $it->()) {
# MBS-7241: Try to acquire deletion locks on all of the cache
# keys, so that we don't repopulate those which are being
# deleted in a concurrent transaction. (This is non-blocking;
# if we fail to acquire the lock for a particular id, we skip
# the key.)
#
# MBS-10497: `id % 50` is used to limit the number of locks
# obtained per entity type per transaction to be within
# PostgreSQL's default configuration value for
# max_locks_per_transaction, 64. Note that 64 is not a hard
# limit, just an average. A transaction can have as many locks
# as will fit in the lock table.
my $locks = $self->c->sql->select_list_of_hashes(
'SELECT id, pg_try_advisory_xact_lock(?, id % 50) AS got_lock ' .
' FROM unnest(?::integer[]) AS id',
$cache_id,
\@next_ids,
);
push @entries, map {
my $id = $_->{id};
[$cache_prefix . $id, $data->{$id}, ($ttl ? $ttl : ())]
} grep { $_->{got_lock} } @$locks;
}

@entries;
return map {
[$cache_prefix . $_, $data->{$_}, ($ttl ? $ttl : ())]
} @$ids;
}

sub _add_to_cache {
my ($self, $cache, $data, $ids) = @_;

my @entries = $self->_create_cache_entries($data, $ids);
$cache->set_multi(@entries) if @entries;
return unless @entries;

my $sql = $self->c->sql;
if ($sql->is_in_transaction) {
$sql->add_post_txn_callback(sub {
$self->_add_to_cache_impl($cache, \@entries, $ids);
});
} else {
$self->_add_to_cache_impl($cache, \@entries, $ids);
}
}

sub _add_to_cache_impl {
my ($self, $cache, $entries, $ids) = @_;

$cache->set_multi(@$entries);

# Check if any entities we've just cached were recently invalidated.
# If so, delete them. This must be performed after the cache
# addition, but means there may be *very* brief intervals (between
# the `set_multi` call above and the `delete_multi` call below) where
# we return stale entities from the cache.
my $invalidated_prefix = $self->_recently_invalidated_prefix;
# Map keys like `MB:artist:recently_invalidated:123` to the entity ID
# they refer to (`123`).
my %possible_recently_invalidated_id_map = map {
my $key = $invalidated_prefix . $_;
($key => $_)
} @$ids;
# Check which of these keys actually exist in our Redis store,
# indicating which entity IDs were recently-invalidated in the cache.
my @recently_invalidated_ids = map {
$possible_recently_invalidated_id_map{$_}
} keys %{ $self->c->store->get_multi(
keys %possible_recently_invalidated_id_map,
) };
if (@recently_invalidated_ids) {
my $cache_prefix = $self->_cache_prefix;
# Delete the recently-invalidated entity IDs from the cache at
# their corresponding key locations, e.g., `MB:artist:123`.
$cache->delete_multi(
map { $cache_prefix . $_ } @recently_invalidated_ids,
);
}
}

sub _delete_from_cache {
@ids = uniq grep { defined } @ids;
return unless @ids;

my $cache_id = $self->_cache_id;

# MBS-7241: Lock cache deletions from `_create_cache_entries`
# above, so that these keys can't be repopulated until after the
# database transaction commits.
my @row_ids = uniq
# MBS-10497: Limit the number of locks obtained per cache id
# per transaction to 50; see the comment in
# `_create_cache_entries` above. This increases the chance of
# contention, but invalidating cache entries is infrequent
# enough that it shouldn't matter.
map { $_ % 50 }
grep { is_database_row_id($_) } @ids;
if (@row_ids) {
my $start_time = time;
$self->c->sql->do(
'SELECT pg_advisory_xact_lock(?, id) ' .
' FROM unnest(?::integer[]) AS id',
$cache_id,
\@row_ids,
);
my $elapsed_time = (time - $start_time);
if ($elapsed_time > 0.01) {
log_debug {
"[$start_time] _delete_from_cache: waited $elapsed_time" .
' seconds for ' . (scalar @row_ids) .
" locks with cache id $cache_id"
};
}
my $c = $self->c;
my $cache = $c->cache($self->_type);
my $cache_prefix = $self->_cache_prefix;

my $max_request_time = $c->max_request_time;
if (!defined($max_request_time) || $max_request_time == 0) {
$max_request_time = $RECENTLY_INVALIDATED_TTL;
}
my $invalidated_prefix = $self->_recently_invalidated_prefix;
my @invalidated_flags = map { $invalidated_prefix . $_ } @ids;
$c->store->set_multi(map {
[$_, 1, $max_request_time + 1]
} @invalidated_flags);

my $cache_prefix = $self->_cache_prefix;
my @keys = map { $cache_prefix . $_ } @ids;

my $cache = $self->c->cache($self->_type);
my $method = @keys > 1 ? 'delete_multi' : 'delete';
$cache->$method(@keys);
}
3 changes: 0 additions & 3 deletions lib/MusicBrainz/Server/Data/Role/GIDEntityCache.pm
@@ -32,9 +32,6 @@ around _create_cache_entries => sub {
my $prefix = $self->_cache_prefix;
my @orig_entries = $self->$orig($data, $ids);
my @entries = @orig_entries;
# Only add gid entries for entities returned from $self->$orig, which
# may be a subset of $data if any are being deleted in a concurrent
# transaction.
for my $entry (@orig_entries) {
my $entity = $entry->[1];
push @entries, [$prefix . $entity->gid, $entity->id];
