Skip to content

Commit

Permalink
Merge pull request #15316 from b10n1k/multi_logs
Browse files Browse the repository at this point in the history
Collect logs from parallel jobs when one node is failing
  • Loading branch information
b10n1k committed Aug 11, 2022
2 parents 6b8d908 + 276c98e commit e4e05a8
Show file tree
Hide file tree
Showing 21 changed files with 97 additions and 8 deletions.
74 changes: 72 additions & 2 deletions lib/hpcbase.pm
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ use testapi;
use utils;
use Utils::Architectures;
use version_utils 'is_sle';
use lockapi;

=head2 enable_and_start
Expand All @@ -26,14 +27,83 @@ sub enable_and_start {
}

=head2 upload_service_log

 upload_service_log($service_name [, $args]);

Write the journal of the systemd unit C<$service_name> to
F</tmp/$service_name>, print that file with C<cat> and upload it once with
C<failok =E<gt> 1>. C<$args> is an optional hash reference; its C<log_name>
entry, if any, is forwarded to C<upload_logs>.

=cut

sub upload_service_log {
    my ($self, $service_name, $args) = @_;
    # Callers may omit $args entirely.
    $args //= {};
    my $log_file = "/tmp/$service_name";
    script_run("journalctl -u $service_name -o short-precise > $log_file");
    script_run("cat $log_file");
    # Single upload; the custom name is undef when the caller gave none.
    upload_logs($log_file, failok => 1, log_name => $args->{log_name});
}

# Diagnostic commands keyed by a short name. Each entry holds the shell
# command to run (cmd) and the file name its output is stored under (logfile).
# post_run_hook iterates this table and prefixes logfile with "/tmp/<HOSTNAME>-".
our %log_files = (
loadavg => {cmd => 'cat /proc/loadavg', logfile => 'loadavg.txt'},
psaxf => {cmd => 'ps axf', logfile => 'psaxf.log'},
journal => {cmd => 'journalctl -o short-precise', logfile => 'journal.log'},
dmesg => {cmd => 'dmesg', logfile => 'dmesg.txt'}
);

=head2 destroy_test_barriers

 destroy_test_barriers();

Destroy the barriers that belong to the scenario selected by the C<HPC>
variable. The scenarios are checked in a fixed order and only the first one
whose role list contains the current C<HPC> value is handled; if no role
matches, nothing is destroyed.

=cut

sub destroy_test_barriers {
    my ($self) = @_;

    # Ordered list of [ roles, barriers ] pairs.
    my @barrier_groups = (
        [
            [qw(slurm_master slurm_master_backup slurm_slave)],
            [
                qw(
                  CLUSTER_PROVISIONED
                  SLURM_MASTER_SERVICE_ENABLED
                  SLURM_SLAVE_SERVICE_ENABLED
                  SLURM_SETUP_DONE
                  SLURM_MASTER_RUN_TESTS
                  SLURM_SETUP_DBD
                )
            ],
        ],
        [
            [qw(mrsh_master mrsh_slave)],
            [
                qw(
                  MRSH_INSTALLATION_FINISHED
                  MRSH_KEY_COPIED
                  MRSH_MUNGE_ENABLED
                  SLAVE_MRLOGIN_STARTED
                  MRSH_MASTER_DONE
                )
            ],
        ],
        [
            [qw(munge_master munge_slave)],
            [
                qw(
                  MUNGE_INSTALLATION_FINISHED
                  MUNGE_KEY_COPIED
                  MUNGE_SERVICE_ENABLED
                  MUNGE_DONE
                )
            ],
        ],
        [
            [qw(pdsh_master pdsh_slave)],
            [
                qw(
                  PDSH_INSTALLATION_FINISHED
                  PDSH_KEY_COPIED
                  PDSH_MUNGE_ENABLED
                  MRSH_SOCKET_STARTED
                  PDSH_SLAVE_DONE
                )
            ],
        ],
        [
            [qw(ganglia_server ganglia_client)],
            [
                qw(
                  GANGLIA_INSTALLED
                  GANGLIA_SERVER_DONE
                  GANGLIA_CLIENT_DONE
                  GANGLIA_GMETAD_STARTED
                  GANGLIA_GMOND_STARTED
                )
            ],
        ],
        [
            [qw(mpi_master mpi_slave)],
            [
                qw(
                  CLUSTER_PROVISIONED
                  MPI_SETUP_READY
                  MPI_BINARIES_READY
                  MPI_RUN_TEST
                )
            ],
        ],
    );

  GROUP:
    for my $group (@barrier_groups) {
        my ($roles, $barriers) = @{$group};
        for my $role (@{$roles}) {
            next unless check_var('HPC', $role);
            # First matching role wins: destroy its barriers in listed order and stop.
            barrier_destroy($_) for @{$barriers};
            last GROUP;
        }
    }
}

=head2 post_run_hook

 post_run_hook();

Switches to the C<log-console>, runs every command listed in C<%log_files>
(in sorted key order) and saves/uploads its output as
F</tmp/HOSTNAME-logfile> together with a screenshot, then uploads the
C<wicked> service journal. When C<HOSTNAME> matches C<master>,
F</var/log/zypper.log> is uploaded as well and, for the C<mpi_master> role,
F</tmp/mpi_bin.log> if that file exists.

=cut

sub post_run_hook {
    my ($self) = @_;
    select_console('log-console');
    my $hname = get_required_var('HOSTNAME');
    # The value itself is not needed here; the lookup is kept so the hook
    # keeps requiring CLUSTER_NODES exactly as before.
    get_required_var('CLUSTER_NODES');
    # Sorted so the logs are collected in the same order on every run.
    for my $name (sort keys %log_files) {
        my $entry = $log_files{$name};
        $self->save_and_upload_log($entry->{cmd}, "/tmp/$hname-" . $entry->{logfile}, {screenshot => 1});
    }
    $self->upload_service_log("wicked");
    if ($hname =~ /master/) {
        upload_logs('/var/log/zypper.log');
        # Only the MPI master produces this file, and only if the run got far enough.
        if (check_var('HPC', 'mpi_master') && script_run(qq{test -e /tmp/mpi_bin.log}) == 0) {
            upload_logs('/tmp/mpi_bin.log');
        }
    }
}

sub post_fail_hook {
my ($self) = @_;
$self->destroy_test_barriers();
$self->select_serial_terminal;
script_run("SUSEConnect --status-text");
script_run("journalctl -o short-precise > /tmp/journal.log");
Expand Down
3 changes: 2 additions & 1 deletion lib/opensusebasetest.pm
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,8 @@ Afterwards a screenshot will be created if C<$screenshot> is set.
sub save_and_upload_log {
    my ($self, $cmd, $file, $args) = @_;
    # $args is optional; treat a missing hash reference as "no options".
    $args //= {};
    # Run the command and mirror its output into $file via tee.
    script_run("$cmd | tee $file", $args->{timeout});
    # Accept both spellings of the custom upload name: 'logname' (historic
    # key of this sub) and 'log_name' (the key upload_logs itself takes).
    # An empty string is passed when neither is given.
    my $lname = $args->{logname} || $args->{log_name} || '';
    # Upload exactly once, unless the caller opted out.
    upload_logs($file, failok => 1, log_name => $lname) unless $args->{noupload};
    save_screenshot if $args->{screenshot};
}

Expand Down
2 changes: 1 addition & 1 deletion tests/hpc/barrier_init.pm
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ use utils;
sub run ($self) {
# Get number of nodes
my $nodes = get_required_var('CLUSTER_NODES');

record_info("#barriers", $nodes);
# Initialize barriers
if (check_var('HPC', 'slurm')) {
barrier_create('CLUSTER_PROVISIONED', $nodes);
Expand Down
1 change: 1 addition & 0 deletions tests/hpc/before_test.pm
Original file line number Diff line number Diff line change
Expand Up @@ -38,5 +38,6 @@ sub run ($self) {
sub test_flags ($self) {
    # Flags reported for this test module: both 'fatal' and 'milestone' are set.
    my %flags = (
        fatal => 1,
        milestone => 1,
    );
    return \%flags;
}
# Intentionally empty hook.
# NOTE(review): presumably overrides an inherited post_run_hook so that no
# logs are collected after this module - confirm against the base class.
sub post_run_hook ($self) { }

1;
1 change: 1 addition & 0 deletions tests/hpc/ganglia_client.pm
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ sub run ($self) {
}

sub post_fail_hook ($self) {
    # Failure path: destroy the test barriers, switch to the serial
    # terminal and upload the journal of each listed service.
    $self->destroy_test_barriers;
    $self->select_serial_terminal();
    for my $service ('gmond') {
        $self->upload_service_log($service);
    }
}
Expand Down
1 change: 1 addition & 0 deletions tests/hpc/ganglia_server.pm
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ sub run ($self) {
}

sub post_fail_hook ($self) {
$self->destroy_test_barriers();
$self->select_serial_terminal;
$self->upload_service_log('apache2');
$self->upload_service_log('gmond');
Expand Down
1 change: 1 addition & 0 deletions tests/hpc/hpc_master.pm
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ sub test_flags ($self) {
}

sub post_fail_hook ($self) {
$self->destroy_test_barriers();
$self->select_serial_terminal;
$self->upload_service_log('slurmd');
$self->upload_service_log('munge');
Expand Down
1 change: 1 addition & 0 deletions tests/hpc/hpc_master_backup.pm
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ sub test_flags ($self) {
}

sub post_fail_hook ($self) {
$self->destroy_test_barriers();
$self->select_serial_terminal;
$self->upload_service_log('slurmd');
$self->upload_service_log('munge');
Expand Down
5 changes: 2 additions & 3 deletions tests/hpc/mpi_master.pm
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ sub run ($self) {
bin => '/home/bernhard/bin',
hpc_lib => '/usr/lib/hpc',
);

zypper_call("in $mpi-gnu-hpc $mpi-gnu-hpc-devel python3-devel");
my $need_restart = $self->setup_scientific_module();
$self->relogin_root if $need_restart;
Expand Down Expand Up @@ -64,7 +65,6 @@ sub run ($self) {
# Python code is not compiled. *mpi_bin* is expected to be a compiled binary; if compilation was
# not invoked, fall back to the source file instead (e.g. sample_scipy.py).
$mpi_bin = ($mpi_compiler) ? $mpi_bin : $mpi_c;

barrier_wait('MPI_BINARIES_READY');
my $mpirun_s = hpc::formatter->new();

Expand Down Expand Up @@ -103,7 +103,6 @@ sub run ($self) {
} else {
assert_script_run($mpirun_s->all_nodes("$exports_path{'bin'}/$mpi_bin"), timeout => 120);
}

barrier_wait('MPI_RUN_TEST');
}

Expand All @@ -112,7 +111,7 @@ sub test_flags ($self) {
}

# Failure hook: uploads /tmp/mpi_bin.log, then destroys the test barriers and
# exports logs.
# NOTE(review): upload_logs is called without failok here; presumably a
# missing /tmp/mpi_bin.log would abort the hook before the barriers are
# destroyed - confirm whether this call should stay.
sub post_fail_hook ($self) {
upload_logs('/tmp/mpi_bin.log');
$self->destroy_test_barriers();
$self->export_logs();
}

Expand Down
5 changes: 4 additions & 1 deletion tests/hpc/mpi_slave.pm
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,9 @@ sub test_flags ($self) {
return {fatal => 1, milestone => 1};
}

# Failure hook: destroy the test barriers, then export logs.
# Defined exactly once; an additional empty definition of the same name
# would only be overridden by this one.
sub post_fail_hook ($self) {
    $self->destroy_test_barriers();
    $self->export_logs();
}

1;
1 change: 1 addition & 0 deletions tests/hpc/mrsh_master.pm
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ sub test_flags {

sub post_fail_hook {
    my $self = shift;

    # Failure path: destroy the test barriers, switch to the serial
    # terminal and upload the munge service journal.
    $self->destroy_test_barriers;
    $self->select_serial_terminal();
    $self->upload_service_log('munge');
}
Expand Down
1 change: 1 addition & 0 deletions tests/hpc/mrsh_slave.pm
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ sub test_flags ($self) {
}

sub post_fail_hook ($self) {
$self->destroy_test_barriers();
$self->select_serial_terminal;
$self->upload_service_log('munge');
$self->upload_service_log('mrshd');
Expand Down
1 change: 1 addition & 0 deletions tests/hpc/munge_master.pm
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ sub run ($self) {
}

sub post_fail_hook ($self) {
    # Failure path: destroy the test barriers, switch to the serial
    # terminal and upload the journal of each listed service.
    $self->destroy_test_barriers;
    $self->select_serial_terminal();
    for my $service ('munge') {
        $self->upload_service_log($service);
    }
}
Expand Down
1 change: 1 addition & 0 deletions tests/hpc/munge_slave.pm
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ sub run ($self) {
}

sub post_fail_hook ($self) {
    # Failure path: destroy the test barriers, switch to the serial
    # terminal and upload the journal of each listed service.
    $self->destroy_test_barriers;
    $self->select_serial_terminal();
    for my $service ('munge') {
        $self->upload_service_log($service);
    }
}
Expand Down
1 change: 1 addition & 0 deletions tests/hpc/pdsh_master.pm
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ sub test_flags ($self) {
}

sub post_fail_hook ($self) {
$self->destroy_test_barriers();
$self->select_serial_terminal;
$self->upload_service_log('mrshd');
$self->upload_service_log('munge');
Expand Down
1 change: 1 addition & 0 deletions tests/hpc/pdsh_slave.pm
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ sub test_flags ($self) {
}

sub post_fail_hook ($self) {
$self->destroy_test_barriers();
$self->select_serial_terminal;
upload_logs '/tmp/pdsh.log';
$self->upload_service_log('munge');
Expand Down
1 change: 1 addition & 0 deletions tests/hpc/slurm_db.pm
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@ sub test_flags ($self) {
}

sub post_fail_hook ($self) {
$self->destroy_test_barriers();
$self->select_serial_terminal;
$self->upload_service_log('slurmd');
$self->upload_service_log('munge');
Expand Down
1 change: 1 addition & 0 deletions tests/hpc/slurm_master.pm
Original file line number Diff line number Diff line change
Expand Up @@ -524,6 +524,7 @@ sub test_flags ($self) {
}

sub post_fail_hook ($self) {
$self->destroy_test_barriers();
$self->select_serial_terminal;
$self->upload_service_log('slurmd');
$self->upload_service_log('munge');
Expand Down
1 change: 1 addition & 0 deletions tests/hpc/slurm_master_backup.pm
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ sub test_flags ($self) {
}

sub post_fail_hook ($self) {
$self->destroy_test_barriers();
$self->select_serial_terminal;
$self->upload_service_log('slurmd');
$self->upload_service_log('munge');
Expand Down
1 change: 1 addition & 0 deletions tests/hpc/slurm_master_backup_db.pm
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ sub test_flags ($self) {
}

sub post_fail_hook ($self) {
$self->destroy_test_barriers();
$self->select_serial_terminal;
$self->upload_service_log('slurmd');
$self->upload_service_log('munge');
Expand Down
1 change: 1 addition & 0 deletions tests/hpc/slurm_slave.pm
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ sub test_flags ($self) {
}

sub post_fail_hook ($self) {
$self->destroy_test_barriers();
$self->select_serial_terminal;
$self->upload_service_log('slurmd');
$self->upload_service_log('munge');
Expand Down

0 comments on commit e4e05a8

Please sign in to comment.