Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

scheduler: Log statistics of rejected jobs #5306

Merged
merged 1 commit into from
Sep 21, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
27 changes: 21 additions & 6 deletions lib/OpenQA/Scheduler/Model/Jobs.pm
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,11 @@ use OpenQA::Utils 'random_string';
use OpenQA::Constants qw(WEBSOCKET_API_VERSION);
use OpenQA::Schema;
use Time::HiRes 'time';
use List::Util qw(all shuffle min);
use List::Util qw(all shuffle min sum);

# How many jobs to allocate in one tick. Defaults to 80 ( set it to 0 for as much as possible)
use constant MAX_JOB_ALLOCATION => $ENV{OPENQA_SCHEDULER_MAX_JOB_ALLOCATION} // 80;
use constant MAX_REPORT_MISSING_WORKER_CLASSES => $ENV{OPENQA_SCHEDULER_MAX_REPORT_MISSING_WORKER_CLASSES} // 30;

# How much the priority should be increased (the priority value decreased) to protect a parallel cluster from starvation
use constant STARVATION_PROTECTION_PRIORITY_OFFSET => $ENV{OPENQA_SCHEDULER_STARVATION_PROTECTION_PRIORITY_OFFSET} // 1;
Expand Down Expand Up @@ -50,9 +51,21 @@ sub _allocate_jobs ($self, $free_workers) {
my $max_allocate = $limit >= 0 ? min(MAX_JOB_ALLOCATION, $limit - $running) : MAX_JOB_ALLOCATION;

# update the matching workers to the current free
for my $jobinfo (values %$scheduled_jobs) {
$jobinfo->{matching_workers} = _matching_workers($jobinfo, $free_workers);
my %rejected;
for my $id (keys %$scheduled_jobs) {
my $jobinfo = $scheduled_jobs->{$id};
$jobinfo->{matching_workers} = _matching_workers($jobinfo, $free_workers, \%rejected);
delete $scheduled_jobs->{$id} unless @{$jobinfo->{matching_workers}};
Comment on lines +57 to +58
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just a suggestion:

Suggested change
$jobinfo->{matching_workers} = _matching_workers($jobinfo, $free_workers, \%rejected);
delete $scheduled_jobs->{$id} unless @{$jobinfo->{matching_workers}};
my $workers = $jobinfo->{matching_workers} = _matching_workers($jobinfo, $free_workers, \%rejected);
delete $scheduled_jobs->{$id} unless @$workers;

}
if (keys %rejected) {
my @rejected = sort { $rejected{$b} <=> $rejected{$a} || $a cmp $b } keys %rejected;
splice @rejected, MAX_REPORT_MISSING_WORKER_CLASSES;
my $stats = join ',', map { "$_:$rejected{$_}" } @rejected;
my $info = sprintf "Skipping %d jobs because of no free workers for requested worker classes (%s)",
sum(values %rejected), $stats;
log_debug($info);
}

# before we start looking at sorted jobs, we try to repair half scheduled clusters
# note: This can happen e.g. with workers connected to multiple web UIs or when jobs are scheduled
# non-atomically via openqa-clone-job.
Expand Down Expand Up @@ -121,7 +134,7 @@ sub _allocate_jobs ($self, $free_workers) {
if ($busy >= $max_allocate || $busy >= @$free_workers) {
my $free_worker_count = @$free_workers;
log_debug("limit reached, scheduling no additional jobs"
. "(max_running_jobs=$limit, free workers=$free_worker_count, running=$running, allocated=$busy)");
. " (max_running_jobs=$limit, free workers=$free_worker_count, running=$running, allocated=$busy)");
last;
}
}
Expand Down Expand Up @@ -303,12 +316,14 @@ sub schedule ($self) {

sub singleton { state $jobs ||= __PACKAGE__->new }

sub _matching_workers ($jobinfo, $free_workers) {
sub _matching_workers ($jobinfo, $free_workers, $rejected = {}) {
my @filtered;
my @needed = sort @{$jobinfo->{worker_classes}};
for my $worker (@$free_workers) {
my $matched_all = all { $worker->check_class($_) } @{$jobinfo->{worker_classes}};
my $matched_all = all { $worker->check_class($_) } @needed;
push(@filtered, $worker) if $matched_all;
}
$rejected->{join ',', @needed}++ unless @filtered;
return \@filtered;
}

Expand Down
52 changes: 44 additions & 8 deletions t/04-scheduler.t
Original file line number Diff line number Diff line change
Expand Up @@ -325,29 +325,65 @@ subtest 'job grab (no jobs because max_running_jobs is 0)' => sub {
$jobs->find($_->id)->delete for @jobs;
};

my @workers;
for my $wid (2 .. 10) {
is(my $id = register_worker(host => $wid), $wid, 'new worker registered');
my $worker = $workers->find($id);
$worker->set_property(WORKER_CLASS => 'qemu_x86_64');
push @workers, $worker;
}
subtest 'job grab (no jobs because max_running_jobs limit is exceeded)' => sub {
my $log_mock = Test::MockModule->new('OpenQA::Scheduler::Model::Jobs');
my $log = '';
$log_mock->redefine(log_debug => sub { $log .= "$_[0]\n" });
my @jobs;
my @workers;
for my $wid (2 .. 10) {
is(my $id = register_worker(host => $wid), $wid, 'new worker registered');
my $worker = $workers->find($id);
$worker->set_property(WORKER_CLASS => 'qemu_x86_64');
push @workers, $worker;
}
push @jobs, $jobs->create_from_settings(\%settings2) for 1 .. 10;

local OpenQA::App->singleton->config->{scheduler}->{max_running_jobs} = 5;
my $res = OpenQA::Scheduler::Model::Jobs->singleton->schedule();
is @$res, 5, 'schedule() returns 5 items';
like $log,
qr/limit reached, scheduling no additional jobs .max_running_jobs=5, free workers=10, running=0, allocated=5./,
'Log message about exceeded limit';

my $scheduled = list_jobs(state => SCHEDULED);
my $assigned = list_jobs(state => ASSIGNED);
is scalar @$assigned, 5, '5 jobs assigned';
is scalar @$scheduled, 5, '5 jobs still scheduled';
$jobs->find($_->id)->delete for @jobs;
$_->delete for @workers;
};

subtest 'job grab (statistics about rejected jobs)' => sub {
my $log_mock = Test::MockModule->new('OpenQA::Scheduler::Model::Jobs');
my @jobs;
my @workers;
my @classes = qw(atari c64 quantum);
my $scheduler = OpenQA::Scheduler::Model::Jobs->singleton;
for my $i (1 .. 10) {
my %set = %settings2;
$set{WORKER_CLASS} = $classes[$i % 3];
push @jobs, $jobs->create_from_settings(\%set);
}
my $scheduled_jobs = $scheduler->determine_scheduled_jobs;
my $free_workers = OpenQA::Scheduler::Model::Jobs::determine_free_workers();
my %rejected;
for my $jobinfo (values %$scheduled_jobs) {
$jobinfo->{matching_workers}
= OpenQA::Scheduler::Model::Jobs::_matching_workers($jobinfo, $free_workers, \%rejected);
}
my $expected = {atari => 3, c64 => 4, quantum => 3};
is_deeply \%rejected, $expected, 'Rejected worker classes statistics like expected';

my $log = '';
$log_mock->redefine(log_debug => sub { $log .= "$_[0]\n" });
$scheduler->schedule;
like $log, qr/Skipping 10 jobs because of no free workers for requested worker classes .c64:4,atari:3,quantum:3./,
'Log message about rejected jobs';
$jobs->find($_->id)->delete for @jobs;
};

$_->delete for @workers;
kalikiana marked this conversation as resolved.
Show resolved Hide resolved

subtest 'job grab (successful assignment)' => sub {
$job->update({state => SCHEDULED});
$job2->update({state => SCHEDULED});
Expand Down