Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Automatically rerun incomplete jobs that failed because of no space left #3672

Merged
merged 1 commit into from Jan 22, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions .circleci/ci-packages.txt
Expand Up @@ -244,6 +244,7 @@ perl-XML-Simple-2.24
perl-YAML-1.24
perl-YAML-LibYAML-0.82
perl-YAML-PP-0.026
perl-File-Map
psmisc-23.0
python3-appdirs-1.4.3
python3-EditorConfig-0.12.2
Expand Down
1 change: 1 addition & 0 deletions container/travis_test/Dockerfile
Expand Up @@ -129,6 +129,7 @@ RUN zypper in -y -C \
'perl(Time::ParseDate)' \
'perl(XSLoader) >= 0.24' \
'perl(XML::SemanticDiff)' \
'perl(File::Map)' \
perl-Archive-Extract \
perl-Test-Simple \
'perl(aliased)' \
Expand Down
1 change: 1 addition & 0 deletions cpanfile
Expand Up @@ -35,6 +35,7 @@ requires 'Fcntl';
requires 'File::Basename';
requires 'File::Copy';
requires 'File::Copy::Recursive';
requires 'File::Map';
requires 'File::Path';
requires 'File::Spec';
requires 'FindBin';
Expand Down
1 change: 1 addition & 0 deletions dependencies.yaml
Expand Up @@ -157,6 +157,7 @@ worker_requires:
perl(Mojo::IOLoop::ReadWriteProcess): '>= 0.26'
perl(Minion::Backend::SQLite): '>= 5.0.1'
perl(Mojo::SQLite):
perl(File::Map):
psmisc:

test_requires:
Expand Down
2 changes: 1 addition & 1 deletion dist/rpm/openQA.spec
Expand Up @@ -56,7 +56,7 @@
# The following line is generated from dependencies.yaml
%define client_requires curl git-core jq perl(Getopt::Long::Descriptive) perl(IO::Socket::SSL) >= 2.009 perl(IPC::Run) perl(JSON::Validator) perl(LWP::Protocol::https) perl(LWP::UserAgent) perl(Test::More) perl(YAML::PP) >= 0.020 perl(YAML::XS)
# The following line is generated from dependencies.yaml
%define worker_requires openQA-client optipng os-autoinst < 5 perl(Minion::Backend::SQLite) >= 5.0.1 perl(Mojo::IOLoop::ReadWriteProcess) >= 0.26 perl(Mojo::SQLite) psmisc sqlite3 >= 3.24.0
%define worker_requires openQA-client optipng os-autoinst < 5 perl(File::Map) perl(Minion::Backend::SQLite) >= 5.0.1 perl(Mojo::IOLoop::ReadWriteProcess) >= 0.26 perl(Mojo::SQLite) psmisc sqlite3 >= 3.24.0
# The following line is generated from dependencies.yaml
%define build_requires %assetpack_requires rubygem(sass)

Expand Down
2 changes: 1 addition & 1 deletion etc/openqa/openqa.ini
Expand Up @@ -73,7 +73,7 @@

## Causes jobs reported as incomplete by the worker to be cloned automatically when the
## reason matches; set to 0 to disable
#auto_clone_regex = ^cache failure:
#auto_clone_regex = ^(cache failure|terminated prematurely):

#[scm git]
# name of remote to get updates from before committing changes (e.g. origin, leave commented out to disable remote update)
Expand Down
2 changes: 1 addition & 1 deletion lib/OpenQA/Setup.pm
Expand Up @@ -54,7 +54,7 @@ sub read_config {
job_investigate_git_timeout => 20,
worker_timeout => DEFAULT_WORKER_TIMEOUT,
search_results_limit => 50000,
auto_clone_regex => '^cache failure: ',
auto_clone_regex => '^(cache failure|terminated prematurely): ',
},
rate_limits => {
search => 5,
Expand Down
13 changes: 11 additions & 2 deletions lib/OpenQA/Worker/Job.pm
Expand Up @@ -35,6 +35,7 @@ use Mojo::JSON 'decode_json';
use Mojo::File 'path';
use Try::Tiny;
use Scalar::Util 'looks_like_number';
use File::Map 'map_file';

# define attributes for public properties
has 'worker';
Expand Down Expand Up @@ -568,11 +569,19 @@ sub _format_reason {
}
}
catch {
# read autoinst-log.txt to check the reason, see poo#80334
my $msg = '';
eval {
map_file my $log_content, path($self->worker->pool_directory, 'autoinst-log.txt'), '<';
$msg = ': No space left on device' if ($log_content =~ /No space left on device/);
};
log_warning($@) if $@;

if ($reason eq WORKER_SR_DONE) {
$reason = "$reason: terminated with corrupted state file";
$reason = "$reason: terminated with corrupted state file$msg";
}
else {
$reason = "$reason: terminated prematurely with corrupted state file, see log output for details";
$reason = "terminated prematurely: Encountered corrupted state file$msg, see log output for details";
}
log_warning("Found $state_file but failed to parse the JSON: $_");
};
Expand Down
25 changes: 24 additions & 1 deletion t/24-worker-jobs.t
Expand Up @@ -443,7 +443,7 @@ subtest 'Job aborted, broken state file' => sub {
combined_like { wait_until_job_status_ok($job, 'stopped') } qr/failed to parse.*JSON/, 'warning about corrupt JSON';
is(
@{$client->sent_messages}[-1]->{reason},
'died: terminated prematurely with corrupted state file, see log output for details',
'terminated prematurely: Encountered corrupted state file, see log output for details',
'reason propagated'
) or diag explain $client->sent_messages;
combined_like {
Expand Down Expand Up @@ -1238,6 +1238,29 @@ subtest 'known images and files populated from status update' => sub {
is_deeply($job->known_files, \@fake_known_files, 'known files populated from status update');
};

# Checks that a job which stops with an unparsable state file and an
# autoinst-log.txt mentioning "No space left on device" reports that detail
# in the reason of the last message sent to the client.
subtest 'write no space left to job reason by parsing autoinst-log' => sub {
my $job = OpenQA::Worker::Job->new($worker, $client, {id => 12, URL => $engine_url});
# Replace the engine's work routine with a fake that simulates a run which
# died after the disk filled up.
$engine_mock->redefine(
engine_workit => sub {
# "foo boo" is not valid JSON, so the state file cannot be parsed later on.
$pool_directory->child('base_state.json')->spurt(qq(foo boo));
# Log line containing the "No space left on device" text the expected reason refers to.
$pool_directory->child('autoinst-log.txt')
->spurt(
'[debug] Unable to serialize fatal error: Can\'t write to file "base_state.json": No space left on device at /usr/lib/os-autoinst/bmwqemu.pm line 86.'
);
# Stop the job right away with the "died" stop reason and report an error to the caller.
$job->stop(WORKER_SR_DIED);
return {error => 'worker interrupted'};
});
# Drive the job: accept it, wait for the accepted status, start it, then
# wait until it has stopped.
$job->accept;
wait_until_job_status_ok($job, 'accepted');
$job->start;
# While stopping, a warning about the unparsable JSON state file is expected in the combined output.
combined_like { wait_until_job_status_ok($job, 'stopped') } qr/failed to parse.*JSON/, 'warning about corrupt JSON';
# The reason of the last message sent to the client must carry the
# "No space left on device" detail; on failure, dump all sent messages.
is(
@{$client->sent_messages}[-1]->{reason},
'terminated prematurely: Encountered corrupted state file: No space left on device, see log output for details',
'The job incomplete reason includes "No space left on device"'
) or diag explain $client->sent_messages;
};

subtest 'Cache setup error handling' => sub {
my $job = OpenQA::Worker::Job->new($worker, $client, {id => 12, URL => $engine_url});
$worker->settings->global_settings->{CACHEDIRECTORY} = '/var/lib/openqa/cache';
Expand Down