Skip to content

Commit

Permalink
Merge pull request #5306 from perlpunk/scheduler-logging
Browse files Browse the repository at this point in the history
scheduler: Log statistics of rejected jobs
  • Loading branch information
mergify[bot] committed Sep 21, 2023
2 parents e134982 + 508ef3e commit 648de5c
Show file tree
Hide file tree
Showing 2 changed files with 65 additions and 14 deletions.
27 changes: 21 additions & 6 deletions lib/OpenQA/Scheduler/Model/Jobs.pm
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,11 @@ use OpenQA::Utils 'random_string';
use OpenQA::Constants qw(WEBSOCKET_API_VERSION);
use OpenQA::Schema;
use Time::HiRes 'time';
use List::Util qw(all shuffle min);
use List::Util qw(all shuffle min sum);

# How many jobs to allocate in one tick. Defaults to 80 ( set it to 0 for as much as possible)
use constant MAX_JOB_ALLOCATION => $ENV{OPENQA_SCHEDULER_MAX_JOB_ALLOCATION} // 80;
use constant MAX_REPORT_MISSING_WORKER_CLASSES => $ENV{OPENQA_SCHEDULER_MAX_REPORT_MISSING_WORKER_CLASSES} // 30;

# How much the priority should be increased (the priority value decreased) to protect a parallel cluster from starvation
use constant STARVATION_PROTECTION_PRIORITY_OFFSET => $ENV{OPENQA_SCHEDULER_STARVATION_PROTECTION_PRIORITY_OFFSET} // 1;
Expand Down Expand Up @@ -50,9 +51,21 @@ sub _allocate_jobs ($self, $free_workers) {
my $max_allocate = $limit >= 0 ? min(MAX_JOB_ALLOCATION, $limit - $running) : MAX_JOB_ALLOCATION;

# update the matching workers to the current free
for my $jobinfo (values %$scheduled_jobs) {
$jobinfo->{matching_workers} = _matching_workers($jobinfo, $free_workers);
my %rejected;
for my $id (keys %$scheduled_jobs) {
my $jobinfo = $scheduled_jobs->{$id};
$jobinfo->{matching_workers} = _matching_workers($jobinfo, $free_workers, \%rejected);
delete $scheduled_jobs->{$id} unless @{$jobinfo->{matching_workers}};
}
if (keys %rejected) {
my @rejected = sort { $rejected{$b} <=> $rejected{$a} || $a cmp $b } keys %rejected;
splice @rejected, MAX_REPORT_MISSING_WORKER_CLASSES;
my $stats = join ',', map { "$_:$rejected{$_}" } @rejected;
my $info = sprintf "Skipping %d jobs because of no free workers for requested worker classes (%s)",
sum(values %rejected), $stats;
log_debug($info);
}

# before we start looking at sorted jobs, we try to repair half scheduled clusters
# note: This can happen e.g. with workers connected to multiple web UIs or when jobs are scheduled
# non-atomically via openqa-clone-job.
Expand Down Expand Up @@ -121,7 +134,7 @@ sub _allocate_jobs ($self, $free_workers) {
if ($busy >= $max_allocate || $busy >= @$free_workers) {
my $free_worker_count = @$free_workers;
log_debug("limit reached, scheduling no additional jobs"
. "(max_running_jobs=$limit, free workers=$free_worker_count, running=$running, allocated=$busy)");
. " (max_running_jobs=$limit, free workers=$free_worker_count, running=$running, allocated=$busy)");
last;
}
}
Expand Down Expand Up @@ -303,12 +316,14 @@ sub schedule ($self) {

sub singleton { state $jobs ||= __PACKAGE__->new }

sub _matching_workers ($jobinfo, $free_workers) {
sub _matching_workers ($jobinfo, $free_workers, $rejected = {}) {
my @filtered;
my @needed = sort @{$jobinfo->{worker_classes}};
for my $worker (@$free_workers) {
my $matched_all = all { $worker->check_class($_) } @{$jobinfo->{worker_classes}};
my $matched_all = all { $worker->check_class($_) } @needed;
push(@filtered, $worker) if $matched_all;
}
$rejected->{join ',', @needed}++ unless @filtered;
return \@filtered;
}

Expand Down
52 changes: 44 additions & 8 deletions t/04-scheduler.t
Original file line number Diff line number Diff line change
Expand Up @@ -325,29 +325,65 @@ subtest 'job grab (no jobs because max_running_jobs is 0)' => sub {
$jobs->find($_->id)->delete for @jobs;
};

my @workers;
for my $wid (2 .. 10) {
is(my $id = register_worker(host => $wid), $wid, 'new worker registered');
my $worker = $workers->find($id);
$worker->set_property(WORKER_CLASS => 'qemu_x86_64');
push @workers, $worker;
}
subtest 'job grab (no jobs because max_running_jobs limit is exceeded)' => sub {
my $log_mock = Test::MockModule->new('OpenQA::Scheduler::Model::Jobs');
my $log = '';
$log_mock->redefine(log_debug => sub { $log .= "$_[0]\n" });
my @jobs;
my @workers;
for my $wid (2 .. 10) {
is(my $id = register_worker(host => $wid), $wid, 'new worker registered');
my $worker = $workers->find($id);
$worker->set_property(WORKER_CLASS => 'qemu_x86_64');
push @workers, $worker;
}
push @jobs, $jobs->create_from_settings(\%settings2) for 1 .. 10;

local OpenQA::App->singleton->config->{scheduler}->{max_running_jobs} = 5;
my $res = OpenQA::Scheduler::Model::Jobs->singleton->schedule();
is @$res, 5, 'schedule() returns 5 items';
like $log,
qr/limit reached, scheduling no additional jobs .max_running_jobs=5, free workers=10, running=0, allocated=5./,
'Log message about exceeded limit';

my $scheduled = list_jobs(state => SCHEDULED);
my $assigned = list_jobs(state => ASSIGNED);
is scalar @$assigned, 5, '5 jobs assigned';
is scalar @$scheduled, 5, '5 jobs still scheduled';
$jobs->find($_->id)->delete for @jobs;
$_->delete for @workers;
};

subtest 'job grab (statistics about rejected jobs)' => sub {
my $log_mock = Test::MockModule->new('OpenQA::Scheduler::Model::Jobs');
my @jobs;
my @workers;
my @classes = qw(atari c64 quantum);
my $scheduler = OpenQA::Scheduler::Model::Jobs->singleton;
for my $i (1 .. 10) {
my %set = %settings2;
$set{WORKER_CLASS} = $classes[$i % 3];
push @jobs, $jobs->create_from_settings(\%set);
}
my $scheduled_jobs = $scheduler->determine_scheduled_jobs;
my $free_workers = OpenQA::Scheduler::Model::Jobs::determine_free_workers();
my %rejected;
for my $jobinfo (values %$scheduled_jobs) {
$jobinfo->{matching_workers}
= OpenQA::Scheduler::Model::Jobs::_matching_workers($jobinfo, $free_workers, \%rejected);
}
my $expected = {atari => 3, c64 => 4, quantum => 3};
is_deeply \%rejected, $expected, 'Rejected worker classes statistics like expected';

my $log = '';
$log_mock->redefine(log_debug => sub { $log .= "$_[0]\n" });
$scheduler->schedule;
like $log, qr/Skipping 10 jobs because of no free workers for requested worker classes .c64:4,atari:3,quantum:3./,
'Log message about rejected jobs';
$jobs->find($_->id)->delete for @jobs;
};

$_->delete for @workers;

subtest 'job grab (successful assignment)' => sub {
$job->update({state => SCHEDULED});
$job2->update({state => SCHEDULED});
Expand Down

0 comments on commit 648de5c

Please sign in to comment.