11/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
22/*
3- * Copyright (c) 2011-2014 Los Alamos National Security, LLC. All rights
3+ * Copyright (c) 2011-2016 Los Alamos National Security, LLC. All rights
44 * reserved.
55 * Copyright (c) 2011 UT-Battelle, LLC. All rights reserved.
66 * $COPYRIGHT$
1717
1818enum mca_btl_ugni_endpoint_state_t {
1919 MCA_BTL_UGNI_EP_STATE_INIT = 0 ,
20+ MCA_BTL_UGNI_EP_STATE_START ,
2021 MCA_BTL_UGNI_EP_STATE_RDMA ,
2122 MCA_BTL_UGNI_EP_STATE_CONNECTING ,
2223 MCA_BTL_UGNI_EP_STATE_CONNECTED
@@ -30,7 +31,10 @@ typedef struct mca_btl_base_endpoint_t {
3031
3132 opal_proc_t * peer_proc ;
3233
33- opal_mutex_t lock ;
34+ /** may need to lock recursively because the modex lookup could call opal_progress
35+ * and hence re-enter our progress function. if this changes, make this mutex
36+ * non-recursive and update the constructor function to match. */
37+ opal_recursive_mutex_t lock ;
3438 mca_btl_ugni_endpoint_state_t state ;
3539
3640 opal_common_ugni_endpoint_t * common ;
@@ -48,6 +52,8 @@ typedef struct mca_btl_base_endpoint_t {
4852
4953 opal_list_t frag_wait_list ;
5054 bool wait_listed ;
55+ /** protect against race on connection */
56+ bool dg_posted ;
5157
5258 int32_t smsg_progressing ;
5359
@@ -74,7 +80,6 @@ static inline int mca_btl_ugni_init_ep (mca_btl_ugni_module_t *ugni_module,
7480
7581 endpoint -> btl = btl ;
7682 endpoint -> peer_proc = peer_proc ;
77- endpoint -> common = NULL ;
7883 endpoint -> index = opal_pointer_array_add (& ugni_module -> endpoints , endpoint );
7984
8085 * ep = endpoint ;
@@ -116,6 +121,7 @@ static inline int mca_btl_ugni_check_endpoint_state (mca_btl_ugni_endpoint_t *ep
116121 switch (ep -> state ) {
117122 case MCA_BTL_UGNI_EP_STATE_INIT :
118123 case MCA_BTL_UGNI_EP_STATE_RDMA :
124+ case MCA_BTL_UGNI_EP_STATE_START :
119125 rc = mca_btl_ugni_ep_connect_progress (ep );
120126 if (OPAL_SUCCESS != rc ) {
121127 break ;
@@ -139,7 +145,15 @@ static inline int mca_btl_ugni_ep_connect_rdma (mca_btl_base_endpoint_t *ep) {
139145 return OPAL_SUCCESS ;
140146 }
141147
142- /* get the modex info for this endpoint and setup a ugni endpoint */
148+ /* protect against re-entry from opal_progress */
149+ if (OPAL_UNLIKELY (MCA_BTL_UGNI_EP_STATE_START == ep -> state )) {
150+ return OPAL_ERR_RESOURCE_BUSY ;
151+ }
152+
153+ ep -> state = MCA_BTL_UGNI_EP_STATE_START ;
154+
155+ /* get the modex info for this endpoint and set up a ugni endpoint. this call may lead
156+ * to re-entry through opal_progress(). */
143157 rc = opal_common_ugni_endpoint_for_proc (ep -> btl -> device , ep -> peer_proc , & ep -> common );
144158 if (OPAL_SUCCESS != rc ) {
145159 assert (0 );
0 commit comments