Skip to content

Commit

Permalink
scheduler: Log statistics of rejected jobs
Browse files Browse the repository at this point in the history
We want to know how many jobs are skipped because of no available workers
matching requested WORKER_CLASS settings.

Related issues:
* https://progress.opensuse.org/issues/135578
* https://progress.opensuse.org/issues/129619
  • Loading branch information
perlpunk committed Sep 21, 2023
1 parent d2a7168 commit c39533c
Show file tree
Hide file tree
Showing 2 changed files with 64 additions and 14 deletions.
26 changes: 20 additions & 6 deletions lib/OpenQA/Scheduler/Model/Jobs.pm
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,11 @@ use OpenQA::Utils 'random_string';
use OpenQA::Constants qw(WEBSOCKET_API_VERSION);
use OpenQA::Schema;
use Time::HiRes 'time';
use List::Util qw(all shuffle min);
use List::Util qw(all shuffle min sum);

# How many jobs to allocate in one tick. Defaults to 80 ( set it to 0 for as much as possible)
use constant MAX_JOB_ALLOCATION => $ENV{OPENQA_SCHEDULER_MAX_JOB_ALLOCATION} // 80;
use constant MAX_REPORT_MISSING_WORKER_CLASSES => $ENV{OPENQA_SCHEDULER_MAX_REPORT_MISSING_WORKER_CLASSES} // 30;

# How much the priority should be increased (the priority value decreased) to protect a parallel cluster from starvation
use constant STARVATION_PROTECTION_PRIORITY_OFFSET => $ENV{OPENQA_SCHEDULER_STARVATION_PROTECTION_PRIORITY_OFFSET} // 1;
Expand Down Expand Up @@ -50,9 +51,20 @@ sub _allocate_jobs ($self, $free_workers) {
my $max_allocate = $limit >= 0 ? min(MAX_JOB_ALLOCATION, $limit - $running) : MAX_JOB_ALLOCATION;

# update the matching workers to the current free
for my $jobinfo (values %$scheduled_jobs) {
$jobinfo->{matching_workers} = _matching_workers($jobinfo, $free_workers);
my %rejected;
for my $id (keys %$scheduled_jobs) {
my $jobinfo = $scheduled_jobs->{$id};
$jobinfo->{matching_workers} = _matching_workers($jobinfo, $free_workers, \%rejected);
delete $scheduled_jobs->{$id} unless @{$jobinfo->{matching_workers}};
}
if (keys %rejected) {
my @rejected = sort { $rejected{$b} <=> $rejected{$a} || $a cmp $b } keys %rejected;
splice @rejected, MAX_REPORT_MISSING_WORKER_CLASSES;
my $stats = join ',', map { "$_:$rejected{$_}" } @rejected;
my $info = sprintf "Skipping %d jobs because of no free workers for requested worker classes (%s)", sum(values %rejected), $stats;
log_debug($info);
}

# before we start looking at sorted jobs, we try to repair half scheduled clusters
# note: This can happen e.g. with workers connected to multiple web UIs or when jobs are scheduled
# non-atomically via openqa-clone-job.
Expand Down Expand Up @@ -121,7 +133,7 @@ sub _allocate_jobs ($self, $free_workers) {
if ($busy >= $max_allocate || $busy >= @$free_workers) {
my $free_worker_count = @$free_workers;
log_debug("limit reached, scheduling no additional jobs"
. "(max_running_jobs=$limit, free workers=$free_worker_count, running=$running, allocated=$busy)");
. " (max_running_jobs=$limit, free workers=$free_worker_count, running=$running, allocated=$busy)");
last;
}
}
Expand Down Expand Up @@ -303,12 +315,14 @@ sub schedule ($self) {

sub singleton { state $jobs ||= __PACKAGE__->new }

sub _matching_workers ($jobinfo, $free_workers) {
sub _matching_workers ($jobinfo, $free_workers, $rejected = {}) {
my @filtered;
my @needed = sort @{$jobinfo->{worker_classes}};
for my $worker (@$free_workers) {
my $matched_all = all { $worker->check_class($_) } @{$jobinfo->{worker_classes}};
my $matched_all = all { $worker->check_class($_) } @needed;
push(@filtered, $worker) if $matched_all;
}
$rejected->{join ',', @needed}++ unless @filtered;
return \@filtered;
}

Expand Down
52 changes: 44 additions & 8 deletions t/04-scheduler.t
Original file line number Diff line number Diff line change
Expand Up @@ -325,29 +325,65 @@ subtest 'job grab (no jobs because max_running_jobs is 0)' => sub {
$jobs->find($_->id)->delete for @jobs;
};

my @workers;
for my $wid (2 .. 10) {
is(my $id = register_worker(host => $wid), $wid, 'new worker registered');
my $worker = $workers->find($id);
$worker->set_property(WORKER_CLASS => 'qemu_x86_64');
push @workers, $worker;
}
subtest 'job grab (no jobs because max_running_jobs limit is exceeded)' => sub {
my $log_mock = Test::MockModule->new('OpenQA::Scheduler::Model::Jobs');
my $log = '';
$log_mock->redefine(log_debug => sub { $log .= "$_[0]\n" });
my @jobs;
my @workers;
for my $wid (2 .. 10) {
is(my $id = register_worker(host => $wid), $wid, 'new worker registered');
my $worker = $workers->find($id);
$worker->set_property(WORKER_CLASS => 'qemu_x86_64');
push @workers, $worker;
}
push @jobs, $jobs->create_from_settings(\%settings2) for 1 .. 10;

local OpenQA::App->singleton->config->{scheduler}->{max_running_jobs} = 5;
my $res = OpenQA::Scheduler::Model::Jobs->singleton->schedule();
is @$res, 5, 'schedule() returns 5 items';
like $log,
qr/limit reached, scheduling no additional jobs .max_running_jobs=5, free workers=10, running=0, allocated=5./,
'Log message about exceeded limit';

my $scheduled = list_jobs(state => SCHEDULED);
my $assigned = list_jobs(state => ASSIGNED);
is scalar @$assigned, 5, '5 jobs assigned';
is scalar @$scheduled, 5, '5 jobs still scheduled';
$jobs->find($_->id)->delete for @jobs;
$_->delete for @workers;
};

subtest 'job grab (statistics about rejected jobs)' => sub {
my $log_mock = Test::MockModule->new('OpenQA::Scheduler::Model::Jobs');
my @jobs;
my @workers;
my @classes = qw(atari c64 quantum);
my $scheduler = OpenQA::Scheduler::Model::Jobs->singleton;
for my $i (1 .. 10) {
my %set = %settings2;
$set{WORKER_CLASS} = $classes[$i % 3];
push @jobs, $jobs->create_from_settings(\%set);
}
my $scheduled_jobs = $scheduler->determine_scheduled_jobs;
my $free_workers = OpenQA::Scheduler::Model::Jobs::determine_free_workers();
my %rejected;
for my $jobinfo (values %$scheduled_jobs) {
$jobinfo->{matching_workers}
= OpenQA::Scheduler::Model::Jobs::_matching_workers($jobinfo, $free_workers, \%rejected);
}
my $expected = {atari => 3, c64 => 4, quantum => 3};
is_deeply \%rejected, $expected, 'Rejected worker classes statistics like expected';

my $log = '';
$log_mock->redefine(log_debug => sub { $log .= "$_[0]\n" });
$scheduler->schedule;
like $log, qr/Skipping 10 jobs because of no free workers for requested worker classes .c64:4,atari:3,quantum:3./,
'Log message about rejected jobs';
$jobs->find($_->id)->delete for @jobs;
};

$_->delete for @workers;

subtest 'job grab (successful assignment)' => sub {
$job->update({state => SCHEDULED});
$job2->update({state => SCHEDULED});
Expand Down

0 comments on commit c39533c

Please sign in to comment.