MBS-13449: Reimplement transactional cache without database locks
This is intended to help resolve deadlocks (MBS-11345, MBS-12314) and
performance issues due to lock contention.

It supplants #212.

The new implementation is documented in `Data::Role::EntityCache`.
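
As a quick orientation for reviewers, here is a minimal sketch of that scheme. The key names follow the `artist:123` / `artist:recently_invalidated:123` examples in the POD, and the cache/store calls are the ones used in this diff; the surrounding variables ($c, $artist, $entity_ttl, $flag_ttl) are illustrative stand-ins only:

    # 1. Invalidation, inside the updating request's transaction (immediate):
    $c->store->set_multi(['artist:recently_invalidated:123', 1, $flag_ttl]);
    $c->cache('artist')->delete('artist:123');

    # 2. Cache addition by a concurrent reader (deferred until after its
    #    transaction commits):
    $c->cache('artist')->set_multi(['artist:123', $artist, $entity_ttl]);

    # 3. Post-addition check: if the invalidation flag still exists, the
    #    entry written in step 2 may be stale, so drop it again.
    my $flags = $c->store->get_multi('artist:recently_invalidated:123');
    $c->cache('artist')->delete('artist:123') if %$flags;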
mwiencek committed Feb 7, 2024
1 parent b5c7d4c commit 82c79f5
Showing 11 changed files with 332 additions and 460 deletions.
153 changes: 0 additions & 153 deletions entities.json

Large diffs are not rendered by default.

153 changes: 0 additions & 153 deletions entities.mjs

Large diffs are not rendered by default.

16 changes: 15 additions & 1 deletion lib/MusicBrainz/Server.pm
@@ -6,6 +6,7 @@ use namespace::autoclean;
extends 'Catalyst';

use Class::Load qw( load_class );
use Data::Dumper;
use DBDefs;
use Digest::SHA qw( sha256 );
use HTML::Entities ();
@@ -23,7 +24,7 @@ use MusicBrainz::Server::Entity::Util::JSON qw( to_json_array to_json_object );
use MusicBrainz::Server::Log qw( logger );
use MusicBrainz::Server::Validation qw( is_positive_integer );
use POSIX qw(SIGALRM);
use Scalar::Util qw( refaddr );
use Scalar::Util qw( looks_like_number refaddr );
use Sys::Hostname;
use Time::HiRes qw( clock_gettime CLOCK_REALTIME CLOCK_MONOTONIC );
use Try::Tiny;
@@ -476,6 +477,17 @@ around dispatch => sub {

my $max_request_time = DBDefs->DETERMINE_MAX_REQUEST_TIME($c->req);

if (
defined($max_request_time) &&
(!looks_like_number($max_request_time) || $max_request_time < 0)
) {
$c->log->warn(
'DETERMINE_MAX_REQUEST_TIME did not return a valid number: ' .
Dumper($max_request_time),
);
$max_request_time = undef;
}

if (defined($max_request_time) && $max_request_time > 0) {
alarm($max_request_time);

@@ -493,6 +505,8 @@ around dispatch => sub {
POSIX::sigaction(SIGALRM, $action);
}

$c->model('MB')->context->max_request_time($max_request_time);

$c->$orig(@args);

alarm(0);
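With this change, anything DETERMINE_MAX_REQUEST_TIME returns that is not a non-negative number is logged and treated as "no limit". A hedged summary of the resulting behaviour (the return values shown are hypothetical examples):

    # DETERMINE_MAX_REQUEST_TIME returns ...   resulting behaviour:
    #   60          -> alarm(60) is armed; context->max_request_time(60)
    #   0 or undef  -> no alarm; no request time limit
    #   -5 or 'abc' -> warning logged (value dumped via Data::Dumper),
    #                  then treated the same as undef (no limit)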
8 changes: 8 additions & 0 deletions lib/MusicBrainz/Server/Context.pm
@@ -121,6 +121,14 @@ sub create_script_context
return MusicBrainz::Server::Context->new(cache_manager => $cache_manager, database => 'MAINTENANCE', %args);
}

# `DBDefs::DETERMINE_MAX_REQUEST_TIME` must be called with a Catalyst request
# object. This attribute stores the result of that call so it can be accessed
# in the data layer.
has 'max_request_time' => (
is => 'rw',
isa => 'Maybe[Int]',
);

1;

=head1 COPYRIGHT AND LICENSE
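A hedged sketch of how this attribute is used end to end; both call sites appear elsewhere in this commit (Server.pm above and EntityCache.pm below), the snippet only strings them together:

    # Per request, after DETERMINE_MAX_REQUEST_TIME has been validated:
    $c->model('MB')->context->max_request_time($max_request_time);

    # Later, in Data::Role::EntityCache, when choosing a TTL for the
    # `recently_invalidated` flags:
    my $ttl = $self->c->max_request_time;
    $ttl = $RECENTLY_INVALIDATED_TTL      # fall back to 600 seconds
        if !defined($ttl) || $ttl == 0;   # undef/0 mean "no request time limit"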
208 changes: 111 additions & 97 deletions lib/MusicBrainz/Server/Data/Role/EntityCache.pm
@@ -3,34 +3,69 @@ package MusicBrainz::Server::Data::Role::EntityCache;
use DBDefs;
use Moose::Role;
use namespace::autoclean;
use List::AllUtils qw( natatime uniq );
use MusicBrainz::Server::Constants qw( %ENTITIES );
use MusicBrainz::Server::Log qw( log_debug );
use MusicBrainz::Server::Validation qw( is_database_row_id );
use List::AllUtils qw( uniq );
use Readonly;
use Time::HiRes qw( time );

requires '_type';

Readonly our $MAX_CACHE_ENTRIES => 500;

sub _cache_id {
    my ($self) = @_;

    my $type = $self->_type;
    if ($type && (my $entity_properties = $ENTITIES{$type})) {
        if (exists $entity_properties->{cache}) {
            return $entity_properties->{cache}{id};
        }
    }
}

=head1 Cache invalidation and locking

In order to prevent stale entities from being repopulated in the cache after
another process invalidates those entries (MBS-7241), we track which entity
IDs were recently invalidated in our Redis store. (In production, MBS uses
two Redis instances: one as an LRU cache for entity data, and another as a
persistent data store, mainly for login sessions.) As an example, consider
the following sequence of events:

1. Request A updates artist ID=123 and calls _delete_from_cache(123). In our
persistent Redis store, we set `artist:recently_invalidated:123` to '1'.
In our Redis cache, we delete `artist:123`. (While cache addition is
delayed until after the relevant transaction commits, cache deletion is
performed immediately.)

2. Request B starts before request A's database transaction commits, and
attempts to load artist ID=123. Since it's not in the cache, we read it
from the database, seeing the "old" version.

3. Request B's transaction commits, and it adds the stale artist to the
cache at `artist:123`.

4. Immediately after adding it to the cache, we check for the presence of an
`artist:recently_invalidated:123` key in the Redis store. If it exists,
we remove the stale `artist:123` entry we just added to the cache.

The `recently_invalidated` keys have their TTLs set to the "max request time"
determined by `DBDefs::DETERMINE_MAX_REQUEST_TIME`. The idea behind this is
that if the request times out, any database handles used in the request will
be closed and all transactions will be rolled back. In case the max request
time is undefined or 0 (indicating no limit), we set the TTL to
`$RECENTLY_INVALIDATED_TTL`, which defaults to 10 minutes.

Originally, MBS-7241 was resolved by using database-level locks
(`pg_advisory_xact_lock`). Lock contention and deadlocks (MBS-11345,
MBS-12314, ...) were a persistent issue with that approach. We also hit
limits on the number of locks being held in a single transaction (MBS-10497).

=cut

Readonly our $RECENTLY_INVALIDATED_TTL => 600;

Readonly our $MAX_CACHE_ENTRIES => 500;

has _cache_prefix => (
is => 'ro',
isa => 'Str',
lazy => 1,
default => sub { shift->_type . ':' },
);

has _recently_invalidated_prefix => (
is => 'ro',
isa => 'Str',
lazy => 1,
default => sub { shift->_cache_prefix . 'recently_invalidated:' },
);

around get_by_ids => sub {
my ($orig, $self, @ids) = @_;
@ids = grep { is_database_row_id($_) } @ids;
@@ -48,7 +83,8 @@ around get_by_ids => sub {
delete $ids{$id};
}
if (%ids) {
my $data = $self->$orig(keys %ids) || {};
my @ids_to_fetch = keys %ids;
my $data = $self->$orig(@ids_to_fetch) || {};
my @ids_to_cache = keys %$data;
foreach my $id (@ids_to_cache) {
$result{$id} = $data->{$id};
@@ -79,64 +115,62 @@ after merge => sub {
sub _create_cache_entries {
my ($self, $data, $ids) = @_;

my $cache_id = $self->_cache_id;
my $cache_prefix = $self->_cache_prefix;

my @ids = @$ids;
return unless @ids;

my $ttl = DBDefs->ENTITY_CACHE_TTL;

if (DBDefs->DB_READ_ONLY) {
# There's no point in acquiring advisory locks for caching if
# DB_READ_ONLY is set, because there should be no concurrent
# writes in that case. While it's possible to have some servers
# set in DB_READ_ONLY and others still accepting writes, that's
# never done intentionally, and there's not much we can do
# about it: READONLY connections (which are made when
# DB_READ_ONLY is set) typically go to a standby server
# different from the master. Locks are not shared between
# primary and standby servers.
return map {
[$cache_prefix . $_, $data->{$_}, ($ttl ? $ttl : ())]
} @ids;
}

my @entries;
my $it = natatime 100, @ids;
while (my @next_ids = $it->()) {
# MBS-7241: Try to acquire deletion locks on all of the cache
# keys, so that we don't repopulate those which are being
# deleted in a concurrent transaction. (This is non-blocking;
# if we fail to acquire the lock for a particular id, we skip
# the key.)
#
# MBS-10497: `id % 50` is used to limit the number of locks
# obtained per entity type per transaction to be within
# PostgreSQL's default configuration value for
# max_locks_per_transaction, 64. Note that 64 is not a hard
# limit, just an average. A transaction can have as many locks
# as will fit in the lock table.
my $locks = $self->c->sql->select_list_of_hashes(
'SELECT id, pg_try_advisory_xact_lock(?, id % 50) AS got_lock ' .
' FROM unnest(?::integer[]) AS id',
$cache_id,
\@next_ids,
);
push @entries, map {
my $id = $_->{id};
[$cache_prefix . $id, $data->{$id}, ($ttl ? $ttl : ())]
} grep { $_->{got_lock} } @$locks;
}

@entries;
return map {
[$cache_prefix . $_, $data->{$_}, ($ttl ? $ttl : ())]
} @$ids;
}

sub _add_to_cache {
my ($self, $cache, $data, $ids) = @_;

my @entries = $self->_create_cache_entries($data, $ids);
$cache->set_multi(@entries) if @entries;
return unless @entries;

my $sql = $self->c->sql;
if ($sql->is_in_transaction) {
$sql->add_post_txn_callback(sub {
$self->_add_to_cache_impl($cache, \@entries, $ids);
});
} else {
$self->_add_to_cache_impl($cache, \@entries, $ids);
}
}

sub _add_to_cache_impl {
my ($self, $cache, $entries, $ids) = @_;

$cache->set_multi(@$entries);

# Check if any entities we've just cached were recently invalidated.
# If so, delete them. This must be performed after the cache
# addition, but means there may be *very* brief intervals (between
# the `set_multi` call above and the `delete_multi` call below) where
# we return stale entities from the cache.
my $invalidated_prefix = $self->_recently_invalidated_prefix;
# Map keys like `MB:artist:recently_invalidated:123` to the entity ID
# they refer to (`123`).
my %possible_recently_invalidated_id_map = map {
my $key = $invalidated_prefix . $_;
($key => $_)
} @$ids;
# Check which of these keys actually exist in our Redis store,
# indicating which entity IDs were recently-invalidated in the cache.
my @recently_invalidated_ids = map {
$possible_recently_invalidated_id_map{$_}
} keys %{ $self->c->store->get_multi(
keys %possible_recently_invalidated_id_map,
) };
if (@recently_invalidated_ids) {
my $cache_prefix = $self->_cache_prefix;
# Delete the recently-invalidated entity IDs from the cache at
# their corresponding key locations, e.g., `MB:artist:123`.
$cache->delete_multi(
map { $cache_prefix . $_ } @recently_invalidated_ids,
);
}
}

sub _delete_from_cache {
@ids = uniq grep { defined } @ids;
return unless @ids;

my $cache_id = $self->_cache_id;

# MBS-7241: Lock cache deletions from `_create_cache_entries`
# above, so that these keys can't be repopulated until after the
# database transaction commits.
my @row_ids = uniq
# MBS-10497: Limit the number of locks obtained per cache id
# per transaction to 50; see the comment in
# `_create_cache_entries` above. This increases the chance of
# contention, but invalidating cache entries is infrequent
# enough that it shouldn't matter.
map { $_ % 50 }
grep { is_database_row_id($_) } @ids;
if (@row_ids) {
my $start_time = time;
$self->c->sql->do(
'SELECT pg_advisory_xact_lock(?, id) ' .
' FROM unnest(?::integer[]) AS id',
$cache_id,
\@row_ids,
);
my $elapsed_time = (time - $start_time);
if ($elapsed_time > 0.01) {
log_debug {
"[$start_time] _delete_from_cache: waited $elapsed_time" .
' seconds for ' . (scalar @row_ids) .
" locks with cache id $cache_id"
};
}
my $c = $self->c;
my $cache = $c->cache($self->_type);
my $cache_prefix = $self->_cache_prefix;

my $max_request_time = $c->max_request_time;
if (!defined($max_request_time) || $max_request_time == 0) {
$max_request_time = $RECENTLY_INVALIDATED_TTL;
}
my $invalidated_prefix = $self->_recently_invalidated_prefix;
my @invalidated_flags = map { $invalidated_prefix . $_ } @ids;
$c->store->set_multi(map {
[$_, 1, $max_request_time + 1]
} @invalidated_flags);

my $cache_prefix = $self->_cache_prefix;
my @keys = map { $cache_prefix . $_ } @ids;

my $cache = $self->c->cache($self->_type);
my $method = @keys > 1 ? 'delete_multi' : 'delete';
$cache->$method(@keys);
}
3 changes: 0 additions & 3 deletions lib/MusicBrainz/Server/Data/Role/GIDEntityCache.pm
@@ -32,9 +32,6 @@ around _create_cache_entries => sub {
my $prefix = $self->_cache_prefix;
my @orig_entries = $self->$orig($data, $ids);
my @entries = @orig_entries;
# Only add gid entries for entities returned from $self->$orig, which
# may be a subset of $data if any are being deleted in a concurrent
# transaction.
for my $entry (@orig_entries) {
my $entity = $entry->[1];
push @entries, [$prefix . $entity->gid, $entity->id];
