Skip to content

Commit

Permalink
Back off if master DB is down
Browse files Browse the repository at this point in the history
If the monitor worker determines that the master DB is down, it sends a flag
telling workers to avoid attempting to connect to the master. A worker that is
already connected, and whose connection still works, ignores the flag. A worker
that is starting up or reconnecting to the master skips the connection attempt.

After this commit, a timed-out master DB will no longer cause the tracker to
fail, and reads will continue to work.

It takes almost a minute to notice that the master is gone, though, so there is
some room for improvement there.
  • Loading branch information
dormando committed Feb 27, 2012
1 parent d58c1b9 commit 1597f84
Show file tree
Hide file tree
Showing 3 changed files with 33 additions and 10 deletions.
5 changes: 3 additions & 2 deletions lib/MogileFS/Config.pm
Expand Up @@ -281,8 +281,9 @@ sub cache_server_setting {
}

sub server_setting_cached {
my ($class, $key) = @_;
unless ($has_cached_settings) {
my ($class, $key, $fallback) = @_;
$fallback = 1 unless (defined $fallback);
if (!$has_cached_settings && $fallback) {
return MogileFS::Config->server_setting($key);
}
return $server_settings{$key};
Expand Down
16 changes: 12 additions & 4 deletions lib/MogileFS/Store.pm
Expand Up @@ -171,12 +171,12 @@ sub mark_as_slave {
my $self = shift;
die "Incapable of becoming slave." unless $self->can_do_slaves;

$self->{slave} = 1;
$self->{is_slave} = 1;
}

sub is_slave {
my $self = shift;
return $self->{slave};
return $self->{is_slave};
}

sub _slaves_list_changed {
Expand Down Expand Up @@ -353,6 +353,14 @@ sub dbh {
return $self->{dbh} if $self->{dbh};
}
# Shortcut flag: if monitor thinks the master is down, avoid attempting to
# connect to it for now. If we already have a connection to the master,
# keep using it as above.
if (!$self->is_slave) {
my $flag = MogileFS::Config->server_setting_cached('_master_db_alive', 0);
return if (defined $flag && $flag == 0);;
}
eval {
local $SIG{ALRM} = sub { die "timeout\n" };
alarm($self->connect_timeout);
Expand All @@ -365,9 +373,9 @@ sub dbh {
};
alarm(0);
if ($@ eq "timeout\n") {
confess "Failed to connect to database: timeout";
die "Failed to connect to database: timeout";
} elsif ($@) {
confess "Failed to connect to database: " . DBI->errstr;
die "Failed to connect to database: " . DBI->errstr;
}
$self->post_dbi_connect;
$self->{handles_left} = $self->{max_handles} if $self->{max_handles};
Expand Down
22 changes: 18 additions & 4 deletions lib/MogileFS/Worker/Monitor.pm
Expand Up @@ -13,6 +13,7 @@ use fields (
'prev_data', # DB data from previous run
'devutil', # Running tally of device utilization
'events', # Queue of state events
'have_masterdb', # Hint flag for if the master DB is available
);

use Danga::Socket 1.56;
Expand All @@ -35,6 +36,7 @@ sub new {
device => {} };
$self->{devutil} = { cur => {}, prev => {} };
$self->{events} = [];
$self->{have_masterdb} = 0;
return $self;
}

Expand All @@ -46,12 +48,24 @@ sub cache_refresh {
my $self = shift;

debug("Monitor running; checking DB for updates");
return unless $self->validate_dbh;
# "Fix" our local cache of this flag, so we always check the master DB.
MogileFS::Config->cache_server_setting('_master_db_alive', 1);
my $have_dbh = $self->validate_dbh;
if ($have_dbh && !$self->{have_masterdb}) {
$self->{have_masterdb} = 1;
$self->set_event('srvset', '_master_db_alive', { value => 1 });
} elsif (!$have_dbh) {
$self->{have_masterdb} = 0;
$self->set_event('srvset', '_master_db_alive', { value => 0 });
error("Cannot connect to master database!");
}

my $db_data = $self->grab_all_data;
if ($have_dbh) {
my $db_data = $self->grab_all_data;

# Stack diffs to ship back later
$self->diff_data($db_data);
# Stack diffs to ship back later
$self->diff_data($db_data);
}

$self->send_events_to_parent;
}
Expand Down

0 comments on commit 1597f84

Please sign in to comment.