From 82e675d16163da66fbb0b4c42cd3ced7bc8d16a0 Mon Sep 17 00:00:00 2001 From: Matias A Cabral Date: Thu, 12 May 2016 16:01:41 -0700 Subject: [PATCH] Update in PSM and PSM2 MTLs to detect entries created by drivers for Intel TrueScale and Intel OmniPath, and detect a link in ACTIVE state. This fix addresses the scenario reported in the OMPI users email below, which involves the former QLogic IB, now Intel TrueScale. Given the nature of the PSM/PSM2 MTLs, this fix also applies to OmniPath: https://www.open-mpi.org/community/lists/users/2016/04/29018.php --- config/ompi_check_psm.m4 | 9 ++++++ config/ompi_check_psm2.m4 | 8 +++++ ompi/mca/mtl/psm/mtl_psm_component.c | 38 ++++++++++++++++++++--- ompi/mca/mtl/psm2/mtl_psm2_component.c | 42 +++++++++++++++++++++++--- 4 files changed, 88 insertions(+), 9 deletions(-) diff --git a/config/ompi_check_psm.m4 b/config/ompi_check_psm.m4 index 2c25b5f43f..9142c75603 100644 --- a/config/ompi_check_psm.m4 +++ b/config/ompi_check_psm.m4 @@ -14,6 +14,8 @@ dnl Copyright (c) 2006 QLogic Corp. All rights reserved. dnl Copyright (c) 2009 Cisco Systems, Inc. All rights reserved. dnl Copyright (c) 2015 Research Organization for Information Science dnl and Technology (RIST). All rights reserved. +dnl Copyright (c) 2016 Intel Corporation. All rights reserved. +dnl dnl $COPYRIGHT$ dnl dnl Additional copyrights may follow @@ -65,6 +67,13 @@ AC_DEFUN([OMPI_CHECK_PSM],[ [AC_MSG_WARN([PSM driver does not currently support progress threads. Disabling BTL.]) ompi_check_psm_happy="no"]) + AS_IF([test "$ompi_check_psm_happy" = "yes"], + [AC_CHECK_HEADERS( + glob.h, + [], + [AC_MSG_WARN([glob.h not found. Can not build component.]) + ompi_check_psm_happy="no"])]) + AS_IF([test "$ompi_check_psm_happy" = "yes"], [$2], [AS_IF([test ! 
-z "$with_psm" && test "$with_psm" != "no"], diff --git a/config/ompi_check_psm2.m4 b/config/ompi_check_psm2.m4 index 15d94c9a4b..e5f0a25555 100644 --- a/config/ompi_check_psm2.m4 +++ b/config/ompi_check_psm2.m4 @@ -66,6 +66,14 @@ AC_DEFUN([OMPI_CHECK_PSM2],[ [AC_MSG_WARN([PSM2 driver does not currently support progress threads. Disabling MTL.]) ompi_check_psm2_happy="no"]) + AS_IF([test "$ompi_check_psm2_happy" = "yes"], + [AC_CHECK_HEADERS( + glob.h, + [], + [AC_MSG_WARN([glob.h not found. Can not build component.]) + ompi_check_psm2_happy="no"])]) + + AS_IF([test "$ompi_check_psm2_happy" = "yes"], [$2], [AS_IF([test ! -z "$with_psm2" && test "$with_psm2" != "no"], diff --git a/ompi/mca/mtl/psm/mtl_psm_component.c b/ompi/mca/mtl/psm/mtl_psm_component.c index b4b71b2f99..ef72bc6e34 100644 --- a/ompi/mca/mtl/psm/mtl_psm_component.c +++ b/ompi/mca/mtl/psm/mtl_psm_component.c @@ -13,7 +13,7 @@ * Copyright (c) 2006-2010 QLogic Corporation. All rights reserved. * Copyright (c) 2012-2015 Los Alamos National Security, LLC. * All rights reserved. - * Copyright (c) 2014 Intel Corporation. All rights reserved. + * Copyright (c) 2016 Intel Corporation. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -37,6 +37,7 @@ #include #include #include +#include static int param_priority; @@ -185,12 +186,41 @@ ompi_mtl_psm_component_open(void) } /* Component available only if Truescale hardware is present */ - if (0 == stat("/dev/ipath", &st)) { - return OMPI_SUCCESS; + if (0 != stat("/dev/ipath", &st)) { + return OPAL_ERR_NOT_AVAILABLE; + } + + /* Component available only if at least one qib port is ACTIVE */ + bool foundOnlineQibPort = false; + size_t i; + char portState[128]; + FILE *devFile; + glob_t globbuf; + globbuf.gl_offs = 0; + if (glob("/sys/class/infiniband/qib*/ports/*/state", + GLOB_DOOFFS, NULL, &globbuf) != 0) { + return OPAL_ERR_NOT_AVAILABLE; + } + + for (i=0;i < globbuf.gl_pathc; i++) { + devFile = fopen(globbuf.gl_pathv[i], "r"); + fgets(portState, sizeof(portState), devFile); + fclose(devFile); + + if (strstr(portState, "ACTIVE") != NULL) { + /* Found at least one ACTIVE port */ + foundOnlineQibPort = true; + break; + } } - else { + + globfree(&globbuf); + + if (!foundOnlineQibPort) { return OPAL_ERR_NOT_AVAILABLE; } + + return OMPI_SUCCESS; } static int diff --git a/ompi/mca/mtl/psm2/mtl_psm2_component.c b/ompi/mca/mtl/psm2/mtl_psm2_component.c index 3ae4151360..26bccd2204 100644 --- a/ompi/mca/mtl/psm2/mtl_psm2_component.c +++ b/ompi/mca/mtl/psm2/mtl_psm2_component.c @@ -13,7 +13,7 @@ * Copyright (c) 2006-2010 QLogic Corporation. All rights reserved. * Copyright (c) 2012-2015 Los Alamos National Security, LLC. * All rights reserved. - * Copyright (c) 2013-2015 Intel, Inc. All rights reserved + * Copyright (c) 2013-2016 Intel, Inc. 
All rights reserved * $COPYRIGHT$ * * Additional copyrights may follow @@ -37,6 +37,7 @@ #include #include #include +#include static int param_priority; @@ -101,15 +102,46 @@ ompi_mtl_psm2_component_register(void) static int ompi_mtl_psm2_component_open(void) { - struct stat st; + glob_t globbuf; + globbuf.gl_offs = 0; /* Component available only if Omni-Path hardware is present */ - if (0 == stat("/dev/hfi1", &st)) { - return OMPI_SUCCESS; + if ((glob("/dev/hfi1_[0-9]", GLOB_DOOFFS, NULL, &globbuf) != 0) && + (glob("/dev/hfi1_[0-9][0-9]", GLOB_APPEND, NULL, &globbuf) != 0)) { + return OPAL_ERR_NOT_AVAILABLE; + } + + globfree(&globbuf); + + /* Component available only if at least one hfi1 port is ACTIVE */ + bool foundOnlineHfi1Port = false; + size_t i; + char portState[128]; + FILE *devFile; + if (glob("/sys/class/infiniband/hfi1_*/ports/*/state", + GLOB_DOOFFS, NULL, &globbuf) != 0) { + return OPAL_ERR_NOT_AVAILABLE; + } + + for (i=0;i < globbuf.gl_pathc; i++) { + devFile = fopen(globbuf.gl_pathv[i], "r"); + fgets(portState, sizeof(portState), devFile); + fclose(devFile); + + if (strstr(portState, "ACTIVE") != NULL) { + /* Found at least one ACTIVE port */ + foundOnlineHfi1Port = true; + break; + } } - else { + + globfree(&globbuf); + + if (!foundOnlineHfi1Port) { return OPAL_ERR_NOT_AVAILABLE; } + + return OMPI_SUCCESS; } static int