Permalink
Browse files

Back off if master DB is down

If the monitor worker determines that the master DB is down, it sends a flag
telling workers to avoid attempting to connect to the master. If a worker is
already connected and its connection still works, it will ignore the flag. If
a worker is starting up or reconnecting to the master, it will skip the attempt.

After this commit, a timed-out master DB will no longer cause the tracker to
fail, and reads will continue to work.

It still takes almost a minute to notice that the master is gone, though, so
there is some room for improvement there.
  • Loading branch information...
1 parent d58c1b9 commit 1597f84c155d63ba8a6d7309d013d6f418925592 @dormando dormando committed Feb 27, 2012
Showing with 33 additions and 10 deletions.
  1. +3 −2 lib/MogileFS/Config.pm
  2. +12 −4 lib/MogileFS/Store.pm
  3. +18 −4 lib/MogileFS/Worker/Monitor.pm
View
@@ -281,8 +281,9 @@ sub cache_server_setting {
}
sub server_setting_cached {
- my ($class, $key) = @_;
- unless ($has_cached_settings) {
+ my ($class, $key, $fallback) = @_;
+ $fallback = 1 unless (defined $fallback);
+ if (!$has_cached_settings && $fallback) {
return MogileFS::Config->server_setting($key);
}
return $server_settings{$key};
View
@@ -171,12 +171,12 @@ sub mark_as_slave {
my $self = shift;
die "Incapable of becoming slave." unless $self->can_do_slaves;
- $self->{slave} = 1;
+ $self->{is_slave} = 1;
}
sub is_slave {
my $self = shift;
- return $self->{slave};
+ return $self->{is_slave};
}
sub _slaves_list_changed {
@@ -353,6 +353,14 @@ sub dbh {
return $self->{dbh} if $self->{dbh};
}
+ # Shortcut flag: if monitor thinks the master is down, avoid attempting to
+ # connect to it for now. If we already have a connection to the master,
+ # keep using it as above.
+ if (!$self->is_slave) {
+ my $flag = MogileFS::Config->server_setting_cached('_master_db_alive', 0);
+ return if (defined $flag && $flag == 0);;
+ }
+
eval {
local $SIG{ALRM} = sub { die "timeout\n" };
alarm($self->connect_timeout);
@@ -365,9 +373,9 @@ sub dbh {
};
alarm(0);
if ($@ eq "timeout\n") {
- confess "Failed to connect to database: timeout";
+ die "Failed to connect to database: timeout";
} elsif ($@) {
- confess "Failed to connect to database: " . DBI->errstr;
+ die "Failed to connect to database: " . DBI->errstr;
}
$self->post_dbi_connect;
$self->{handles_left} = $self->{max_handles} if $self->{max_handles};
@@ -13,6 +13,7 @@ use fields (
'prev_data', # DB data from previous run
'devutil', # Running tally of device utilization
'events', # Queue of state events
+ 'have_masterdb', # Hint flag for if the master DB is available
);
use Danga::Socket 1.56;
@@ -35,6 +36,7 @@ sub new {
device => {} };
$self->{devutil} = { cur => {}, prev => {} };
$self->{events} = [];
+ $self->{have_masterdb} = 0;
return $self;
}
@@ -46,12 +48,24 @@ sub cache_refresh {
my $self = shift;
debug("Monitor running; checking DB for updates");
- return unless $self->validate_dbh;
+ # "Fix" our local cache of this flag, so we always check the master DB.
+ MogileFS::Config->cache_server_setting('_master_db_alive', 1);
+ my $have_dbh = $self->validate_dbh;
+ if ($have_dbh && !$self->{have_masterdb}) {
+ $self->{have_masterdb} = 1;
+ $self->set_event('srvset', '_master_db_alive', { value => 1 });
+ } elsif (!$have_dbh) {
+ $self->{have_masterdb} = 0;
+ $self->set_event('srvset', '_master_db_alive', { value => 0 });
+ error("Cannot connect to master database!");
+ }
- my $db_data = $self->grab_all_data;
+ if ($have_dbh) {
+ my $db_data = $self->grab_all_data;
- # Stack diffs to ship back later
- $self->diff_data($db_data);
+ # Stack diffs to ship back later
+ $self->diff_data($db_data);
+ }
$self->send_events_to_parent;
}

0 comments on commit 1597f84

Please sign in to comment.