Skip to content

Commit 90fb58d

Browse files
author
Gleb Natapov
committed
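When frags are allocated from an mpool by free_list, the frag structure itself is also
allocated from mpool memory (which is registered memory for RDMA transports). This is not a problem for small jobs, but with a large number of ranks the amount of wasted registered memory becomes significant. With this change the frag structures are allocated with malloc and only the data buffers come from the mpool. This commit was SVN r13921.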
1 parent e932d9a commit 90fb58d

23 files changed

+254
-171
lines changed

ompi/class/ompi_free_list.c

Lines changed: 54 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -20,26 +20,11 @@
2020
#include "ompi_config.h"
2121

2222
#include "ompi/class/ompi_free_list.h"
23+
#include "opal/include/opal/align.h"
2324
#include "opal/sys/cache.h"
2425
#include "opal/util/output.h"
2526
#include "ompi/mca/mpool/mpool.h"
2627

27-
static inline size_t align_to(size_t val, size_t alignment);
28-
static inline size_t align_to(size_t val, size_t alignment)
29-
{
30-
size_t mod;
31-
32-
if(0 == alignment)
33-
return val;
34-
35-
mod = val % alignment;
36-
37-
if(mod)
38-
val += (alignment - mod);
39-
40-
return val;
41-
}
42-
4328
static void ompi_free_list_construct(ompi_free_list_t* fl);
4429
static void ompi_free_list_destruct(ompi_free_list_t* fl);
4530

@@ -49,6 +34,7 @@ OBJ_CLASS_INSTANCE(ompi_free_list_t, opal_atomic_lifo_t,
4934
struct ompi_free_list_memory_t {
5035
opal_list_item_t super;
5136
mca_mpool_base_registration_t *registration;
37+
void *base_ptr;
5238
};
5339
typedef struct ompi_free_list_memory_t ompi_free_list_memory_t;
5440
static OBJ_CLASS_INSTANCE(ompi_free_list_memory_t,
@@ -69,7 +55,6 @@ static void ompi_free_list_construct(ompi_free_list_t* fl)
6955
fl->fl_num_waiting = 0;
7056
fl->fl_elem_size = sizeof(ompi_free_list_item_t);
7157
fl->fl_elem_class = OBJ_CLASS(ompi_free_list_item_t);
72-
fl->fl_header_space = 0;
7358
fl->fl_alignment = 0;
7459
fl->fl_mpool = 0;
7560
OBJ_CONSTRUCT(&(fl->fl_allocations), opal_list_t);
@@ -78,6 +63,7 @@ static void ompi_free_list_construct(ompi_free_list_t* fl)
7863
static void ompi_free_list_destruct(ompi_free_list_t* fl)
7964
{
8065
opal_list_item_t *item;
66+
ompi_free_list_memory_t *fl_mem;
8167

8268
#if 0 && OMPI_ENABLE_DEBUG
8369
if(opal_list_get_size(&fl->super) != fl->fl_num_allocated) {
@@ -87,21 +73,15 @@ static void ompi_free_list_destruct(ompi_free_list_t* fl)
8773
}
8874
#endif
8975

90-
if (NULL != fl->fl_mpool) {
91-
ompi_free_list_memory_t *fl_mem;
92-
93-
while (NULL != (item = opal_list_remove_first(&(fl->fl_allocations)))) {
94-
/* destruct the item (we constructed it), then free the memory chunk */
95-
OBJ_DESTRUCT(item);
96-
fl_mem = (ompi_free_list_memory_t*) item;
97-
fl->fl_mpool->mpool_free(fl->fl_mpool, item, fl_mem->registration);
98-
}
99-
} else {
100-
while (NULL != (item = opal_list_remove_first(&(fl->fl_allocations)))) {
101-
/* destruct the item (we constructed it), then free the memory chunk */
102-
OBJ_DESTRUCT(item);
103-
free(item);
76+
while(NULL != (item = opal_list_remove_first(&(fl->fl_allocations)))) {
77+
fl_mem = (ompi_free_list_memory_t*)item;
78+
if(fl->fl_mpool != NULL) {
79+
fl->fl_mpool->mpool_free(fl->fl_mpool, fl_mem->base_ptr,
80+
fl_mem->registration);
10481
}
82+
/* destruct the item (we constructed it), then free the memory chunk */
83+
OBJ_DESTRUCT(item);
84+
free(item);
10585
}
10686

10787
OBJ_DESTRUCT(&fl->fl_allocations);
@@ -112,14 +92,17 @@ static void ompi_free_list_destruct(ompi_free_list_t* fl)
11292
int ompi_free_list_init_ex(
11393
ompi_free_list_t *flist,
11494
size_t elem_size,
115-
size_t header_space,
11695
size_t alignment,
11796
opal_class_t* elem_class,
11897
int num_elements_to_alloc,
11998
int max_elements_to_alloc,
12099
int num_elements_per_alloc,
121100
mca_mpool_base_module_t* mpool)
122101
{
102+
/* alignment must be more than zero and power of two */
103+
if(alignment <= 1 || (alignment & (alignment - 1)))
104+
return OMPI_ERROR;
105+
123106
if(elem_size > flist->fl_elem_size)
124107
flist->fl_elem_size = elem_size;
125108
if(elem_class)
@@ -128,61 +111,75 @@ int ompi_free_list_init_ex(
128111
flist->fl_num_allocated = 0;
129112
flist->fl_num_per_alloc = num_elements_per_alloc;
130113
flist->fl_mpool = mpool;
131-
flist->fl_header_space = header_space;
132114
flist->fl_alignment = alignment;
133-
flist->fl_elem_size = align_to(flist->fl_elem_size, flist->fl_alignment);
134115
if(num_elements_to_alloc)
135116
return ompi_free_list_grow(flist, num_elements_to_alloc);
136117
return OMPI_SUCCESS;
137118
}
138119

139120
int ompi_free_list_grow(ompi_free_list_t* flist, size_t num_elements)
140121
{
141-
unsigned char* ptr;
122+
unsigned char *ptr, *mpool_alloc_ptr = NULL;
142123
ompi_free_list_memory_t *alloc_ptr;
143-
size_t i, alloc_size;
144-
mca_mpool_base_registration_t* user_out = NULL;
124+
size_t i, alloc_size, head_size, elem_size = 0;
125+
mca_mpool_base_registration_t *reg = NULL;
145126

146-
if (flist->fl_max_to_alloc > 0)
147-
if (flist->fl_num_allocated + num_elements > flist->fl_max_to_alloc)
127+
if(flist->fl_max_to_alloc > 0)
128+
if(flist->fl_num_allocated + num_elements > flist->fl_max_to_alloc)
148129
num_elements = flist->fl_max_to_alloc - flist->fl_num_allocated;
149130

150-
if (num_elements == 0)
131+
if(num_elements == 0)
151132
return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
152133

153-
alloc_size = num_elements * flist->fl_elem_size +
154-
sizeof(ompi_free_list_memory_t) + flist->fl_header_space +
134+
head_size = (NULL == flist->fl_mpool) ? flist->fl_elem_size:
135+
flist->fl_elem_class->cls_sizeof;
136+
head_size = OPAL_ALIGN(head_size, flist->fl_alignment, size_t);
137+
138+
/* calculate head allocation size */
139+
alloc_size = num_elements * head_size + sizeof(ompi_free_list_memory_t) +
155140
flist->fl_alignment;
156141

157-
if (NULL != flist->fl_mpool)
158-
alloc_ptr = (ompi_free_list_memory_t*)flist->fl_mpool->mpool_alloc(flist->fl_mpool,
159-
alloc_size, 0, MCA_MPOOL_FLAGS_CACHE_BYPASS, &user_out);
160-
else
161-
alloc_ptr = (ompi_free_list_memory_t*)malloc(alloc_size);
142+
alloc_ptr = (ompi_free_list_memory_t*)malloc(alloc_size);
162143

163144
if(NULL == alloc_ptr)
164145
return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
165146

166-
/* make the alloc_ptr a list item, save the chunk in the allocations list, and
167-
have ptr point to memory right after the list item structure */
168-
OBJ_CONSTRUCT(alloc_ptr, ompi_free_list_memory_t);
169-
opal_list_append(&(flist->fl_allocations), (opal_list_item_t*) alloc_ptr);
147+
/* allocate the rest from the mpool */
148+
if(flist->fl_mpool != NULL) {
149+
elem_size = OPAL_ALIGN(flist->fl_elem_size -
150+
flist->fl_elem_class->cls_sizeof, flist->fl_alignment, size_t);
151+
if(elem_size != 0) {
152+
mpool_alloc_ptr = flist->fl_mpool->mpool_alloc(flist->fl_mpool,
153+
num_elements * elem_size, flist->fl_alignment,
154+
MCA_MPOOL_FLAGS_CACHE_BYPASS, &reg);
155+
if(NULL == mpool_alloc_ptr) {
156+
free(alloc_ptr);
157+
return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
158+
}
159+
}
160+
}
170161

171-
alloc_ptr->registration = user_out;
162+
/* make the alloc_ptr a list item, save the chunk in the allocations list,
163+
* and have ptr point to memory right after the list item structure */
164+
OBJ_CONSTRUCT(alloc_ptr, ompi_free_list_memory_t);
165+
opal_list_append(&(flist->fl_allocations), (opal_list_item_t*)alloc_ptr);
172166

173-
ptr = (unsigned char*) alloc_ptr + sizeof(ompi_free_list_memory_t);
167+
alloc_ptr->registration = reg;
168+
alloc_ptr->base_ptr = mpool_alloc_ptr;
174169

175-
ptr = (unsigned char*)(align_to((size_t)ptr + flist->fl_header_space,
176-
flist->fl_alignment) - flist->fl_header_space);
170+
ptr = (unsigned char*)alloc_ptr + sizeof(ompi_free_list_memory_t);
171+
ptr = OPAL_ALIGN_PTR(ptr, flist->fl_alignment, unsigned char*);
177172

178173
for(i=0; i<num_elements; i++) {
179174
ompi_free_list_item_t* item = (ompi_free_list_item_t*)ptr;
180-
item->user_data = user_out;
175+
item->registration = reg;
176+
item->ptr = mpool_alloc_ptr;
181177

182178
OBJ_CONSTRUCT_INTERNAL(item, flist->fl_elem_class);
183179

184180
opal_atomic_lifo_push(&(flist->super), &(item->super));
185-
ptr += flist->fl_elem_size;
181+
ptr += head_size;
182+
mpool_alloc_ptr += elem_size;
186183
}
187184

188185
flist->fl_num_allocated += num_elements;

ompi/class/ompi_free_list.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,6 @@ struct ompi_free_list_t
4040
size_t fl_num_per_alloc;
4141
size_t fl_num_waiting;
4242
size_t fl_elem_size;
43-
size_t fl_header_space;
4443
size_t fl_alignment;
4544
opal_class_t* fl_elem_class;
4645
struct mca_mpool_base_module_t* fl_mpool;
@@ -51,10 +50,12 @@ struct ompi_free_list_t
5150
typedef struct ompi_free_list_t ompi_free_list_t;
5251
OMPI_DECLSPEC OBJ_CLASS_DECLARATION(ompi_free_list_t);
5352

53+
struct mca_mpool_base_registration_t;
5454
struct ompi_free_list_item_t
5555
{
5656
opal_list_item_t super;
57-
void* user_data;
57+
struct mca_mpool_base_registration_t *registration;
58+
void *ptr;
5859
};
5960
typedef struct ompi_free_list_item_t ompi_free_list_item_t;
6061

@@ -75,7 +76,6 @@ OMPI_DECLSPEC OBJ_CLASS_DECLARATION(ompi_free_list_item_t);
7576
OMPI_DECLSPEC int ompi_free_list_init_ex(
7677
ompi_free_list_t *free_list,
7778
size_t element_size,
78-
size_t header_size,
7979
size_t alignment,
8080
opal_class_t* element_class,
8181
int num_elements_to_alloc,
@@ -92,7 +92,7 @@ static inline int ompi_free_list_init(
9292
int num_elements_per_alloc,
9393
struct mca_mpool_base_module_t* mpool)
9494
{
95-
return ompi_free_list_init_ex(free_list, element_size, 0, CACHE_LINE_SIZE,
95+
return ompi_free_list_init_ex(free_list, element_size, CACHE_LINE_SIZE,
9696
element_class, num_elements_to_alloc, max_elements_to_alloc,
9797
num_elements_per_alloc, mpool);
9898
}

ompi/mca/btl/gm/btl_gm_frag.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ do { \
2929

3030
static void mca_btl_gm_frag_eager_constructor(mca_btl_gm_frag_t* frag)
3131
{
32-
frag->hdr = (mca_btl_base_header_t*)(frag + 1);
32+
frag->hdr = (mca_btl_base_header_t*)frag->base.super.ptr;
3333
frag->segment.seg_addr.pval = (unsigned char*)(frag->hdr + 1);
3434
frag->segment.seg_len = mca_btl_gm_module.super.btl_eager_limit - sizeof(mca_btl_base_header_t);
3535
frag->size = mca_btl_gm_component.gm_eager_frag_size;

ompi/mca/btl/mvapi/btl_mvapi_component.c

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -583,7 +583,10 @@ mca_btl_base_module_t** mca_btl_mvapi_component_init(int *num_btl_modules,
583583
2*MCA_BTL_IB_FRAG_ALIGN;
584584

585585
mvapi_btl->eager_rdma_frag_size =
586-
length & ~(2 * MCA_BTL_IB_FRAG_ALIGN - 1);
586+
(sizeof(mca_btl_mvapi_header_t) +
587+
sizeof(mca_btl_mvapi_footer_t) +
588+
mvapi_btl->super.btl_eager_limit +
589+
2*MCA_BTL_IB_FRAG_ALIGN) & ~(2 * MCA_BTL_IB_FRAG_ALIGN - 1);
587590

588591
ompi_free_list_init(&mvapi_btl->send_free_eager,
589592
length,

ompi/mca/btl/mvapi/btl_mvapi_eager_rdma.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,11 @@ extern "C" {
1818
#endif
1919

2020
struct mca_btl_mvapi_reg_t;
21+
struct mca_btl_mvapi_frag_t;
2122

2223
struct mca_btl_mvapi_eager_rdma_local_t {
2324
ompi_ptr_t base; /**< buffer for RDMAing eager messages */
25+
struct mca_btl_mvapi_frag_t *frags;
2426
struct mca_btl_mvapi_reg_t *reg;
2527
uint16_t head; /**< RDMA buffer to poll */
2628
uint16_t tail; /**< Needed for credit managment */
@@ -73,9 +75,7 @@ typedef struct mca_btl_mvapi_eager_rdma_remote_t mca_btl_mvapi_eager_rdma_remote
7375
}while (0)
7476

7577
#define MCA_BTL_MVAPI_GET_LOCAL_RDMA_FRAG(E, I) \
76-
(mca_btl_mvapi_frag_t*) \
77-
((char*)(E)->eager_rdma_local.base.pval + \
78-
(I) * (E)->endpoint_btl->eager_rdma_frag_size)
78+
(&(E)->eager_rdma_local.frags[(I)])
7979

8080
#define MCA_BTL_MVAPI_RDMA_NEXT_INDEX(I) do { \
8181
(I) = ((I) + 1) % \

ompi/mca/btl/mvapi/btl_mvapi_endpoint.c

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -165,10 +165,9 @@ static inline int mca_btl_mvapi_endpoint_post_send(
165165
#endif
166166
frag->desc.sr_desc.r_key = (VAPI_rkey_t)endpoint->eager_rdma_remote.rkey;
167167
frag->desc.sr_desc.remote_addr = (VAPI_virt_addr_t)
168-
endpoint->eager_rdma_remote.base.lval +
168+
(uintptr_t)endpoint->eager_rdma_remote.base.pval +
169169
endpoint->eager_rdma_remote.head *
170170
mvapi_btl->eager_rdma_frag_size +
171-
sizeof(mca_btl_mvapi_frag_t) +
172171
sizeof(mca_btl_mvapi_header_t) +
173172
frag->size +
174173
sizeof(mca_btl_mvapi_footer_t);
@@ -1223,25 +1222,34 @@ void mca_btl_mvapi_endpoint_connect_eager_rdma(
12231222
{
12241223
mca_btl_mvapi_module_t* mvapi_btl = endpoint->endpoint_btl;
12251224
char *buf;
1225+
mca_btl_mvapi_recv_frag_eager_t *headers_buf;
12261226
unsigned int i;
12271227

12281228
OPAL_THREAD_LOCK(&endpoint->eager_rdma_local.lock);
12291229
if (endpoint->eager_rdma_local.base.pval)
12301230
goto unlock_rdma_local;
12311231

1232+
headers_buf = (mca_btl_mvapi_recv_frag_eager_t*)
1233+
malloc(sizeof(mca_btl_mvapi_recv_frag_eager_t) *
1234+
mca_btl_mvapi_component.eager_rdma_num);
1235+
1236+
if(NULL == headers_buf)
1237+
goto unlock_rdma_local;
1238+
12321239
buf = mvapi_btl->super.btl_mpool->mpool_alloc(mvapi_btl->super.btl_mpool,
12331240
mvapi_btl->eager_rdma_frag_size *
12341241
mca_btl_mvapi_component.eager_rdma_num, 0,
12351242
MCA_MPOOL_FLAGS_CACHE_BYPASS,
12361243
(mca_mpool_base_registration_t**)&endpoint->eager_rdma_local.reg);
12371244

12381245
if(!buf)
1239-
goto unlock_rdma_local;
1246+
goto free_headers_buf;
12401247

12411248
for(i = 0; i < mca_btl_mvapi_component.eager_rdma_num; i++) {
1242-
ompi_free_list_item_t *item = (ompi_free_list_item_t *)(buf +
1243-
i*mvapi_btl->eager_rdma_frag_size);
1244-
item->user_data = (void*)endpoint->eager_rdma_local.reg;
1249+
ompi_free_list_item_t *item;
1250+
item = (ompi_free_list_item_t *)&headers_buf[i];
1251+
item->registration = (void*)endpoint->eager_rdma_local.reg;
1252+
item->ptr = buf + i * mvapi_btl->eager_rdma_frag_size;
12451253
OBJ_CONSTRUCT(item, mca_btl_mvapi_recv_frag_eager_t);
12461254
((mca_btl_mvapi_frag_t*)item)->endpoint = endpoint;
12471255
((mca_btl_mvapi_frag_t*)item)->type = MCA_BTL_MVAPI_FRAG_EAGER_RDMA;
@@ -1253,6 +1261,7 @@ void mca_btl_mvapi_endpoint_connect_eager_rdma(
12531261
goto cleanup;
12541262

12551263
endpoint->eager_rdma_local.base.pval = buf;
1264+
endpoint->eager_rdma_local.frags = headers_buf;
12561265
mvapi_btl->eager_rdma_buffers_count++;
12571266
if (mca_btl_mvapi_endpoint_send_eager_rdma(endpoint) == 0) {
12581267
OPAL_THREAD_UNLOCK(&mvapi_btl->eager_rdma_lock);
@@ -1262,13 +1271,16 @@ void mca_btl_mvapi_endpoint_connect_eager_rdma(
12621271

12631272
mvapi_btl->eager_rdma_buffers_count--;
12641273
endpoint->eager_rdma_local.base.pval = NULL;
1274+
endpoint->eager_rdma_local.frags = NULL;
12651275
orte_pointer_array_set_item(mvapi_btl->eager_rdma_buffers,
12661276
endpoint->eager_rdma_index, NULL);
12671277

12681278
cleanup:
12691279
OPAL_THREAD_UNLOCK(&mvapi_btl->eager_rdma_lock);
12701280
mvapi_btl->super.btl_mpool->mpool_free(mvapi_btl->super.btl_mpool,
12711281
buf, (mca_mpool_base_registration_t*)endpoint->eager_rdma_local.reg);
1282+
free_headers_buf:
1283+
free(headers_buf);
12721284
unlock_rdma_local:
12731285
OPAL_THREAD_UNLOCK(&endpoint->eager_rdma_local.lock);
12741286
}

ompi/mca/btl/mvapi/btl_mvapi_frag.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,8 @@
2323
static void mca_btl_mvapi_frag_common_constructor( mca_btl_mvapi_frag_t* frag)
2424
{
2525
mca_btl_mvapi_reg_t* mem_hndl =
26-
(mca_btl_mvapi_reg_t*)frag->base.super.user_data;
27-
frag->hdr = (mca_btl_mvapi_header_t*) (frag+1); /* initialize btl header to start at end of frag */
26+
(mca_btl_mvapi_reg_t*)frag->base.super.registration;
27+
frag->hdr = (mca_btl_mvapi_header_t*)frag->base.super.ptr;
2828
frag->segment.seg_addr.pval = ((unsigned char* )frag->hdr) + sizeof(mca_btl_mvapi_header_t);
2929
/* init the segment address to start after the btl header */
3030

0 commit comments

Comments
 (0)