diff --git a/configure.ac b/configure.ac index b05e33b08d7..801bffdbd57 100644 --- a/configure.ac +++ b/configure.ac @@ -3,7 +3,7 @@ # Copyright (c) 2004-2009 The Trustees of Indiana University and Indiana # University Research and Technology # Corporation. All rights reserved. -# Copyright (c) 2004-2014 The University of Tennessee and The University +# Copyright (c) 2004-2015 The University of Tennessee and The University # of Tennessee Research Foundation. All rights # reserved. # Copyright (c) 2004-2007 High Performance Computing Center Stuttgart, @@ -1460,6 +1460,7 @@ AC_CONFIG_FILES([ test/support/Makefile test/threads/Makefile test/util/Makefile + test/monitoring/Makefile ]) AC_CONFIG_FILES([contrib/dist/mofed/debian/rules], [chmod +x contrib/dist/mofed/debian/rules]) diff --git a/ompi/mca/pml/monitoring/Makefile.am b/ompi/mca/pml/monitoring/Makefile.am new file mode 100644 index 00000000000..504a2c6fd4e --- /dev/null +++ b/ompi/mca/pml/monitoring/Makefile.am @@ -0,0 +1,41 @@ +# +# Copyright (c) 2013-2015 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2013-2015 Inria. All rights reserved. 
+# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +monitoring_sources = \ + pml_monitoring.c \ + pml_monitoring.h \ + pml_monitoring_comm.c \ + pml_monitoring_comm.h \ + pml_monitoring_component.c \ + pml_monitoring_component.h \ + pml_monitoring_hdr.h \ + pml_monitoring_iprobe.c \ + pml_monitoring_irecv.c \ + pml_monitoring_isend.c \ + pml_monitoring_start.c + +if MCA_BUILD_ompi_pml_monitoring_DSO +component_noinst = +component_install = mca_pml_monitoring.la +else +component_noinst = libmca_pml_monitoring.la +component_install = +endif + +mcacomponentdir = $(pkglibdir) +mcacomponent_LTLIBRARIES = $(component_install) +mca_pml_monitoring_la_SOURCES = $(monitoring_sources) +mca_pml_monitoring_la_LDFLAGS = -module -avoid-version + +noinst_LTLIBRARIES = $(component_noinst) +libmca_pml_monitoring_la_SOURCES = $(monitoring_sources) +libmca_pml_monitoring_la_LDFLAGS = -module -avoid-version diff --git a/ompi/mca/pml/monitoring/README b/ompi/mca/pml/monitoring/README new file mode 100644 index 00000000000..8361027d658 --- /dev/null +++ b/ompi/mca/pml/monitoring/README @@ -0,0 +1,181 @@ + + Copyright (c) 2013-2015 The University of Tennessee and The University + of Tennessee Research Foundation. All rights + reserved. + Copyright (c) 2013-2015 Inria. All rights reserved. + $COPYRIGHT$ + + Additional copyrights may follow + + $HEADER$ + +=========================================================================== + +Low level communication monitoring interface in Open MPI + +Introduction +------------ +This interface traces and monitors all messages sent by MPI before they go to the +communication channels. At that level all communications are point-to-point communications: +collectives are already decomposed into send and receive calls. + +The monitoring is stored internally by each process and output on stderr at the end of the +application (during MPI_Finalize()). 
+ + +Enabling the monitoring +----------------------- +To enable the monitoring add --mca pml_monitoring_enable x to the mpirun command line. +If x = 1 it monitors internal and external tags indifferently and aggregates everything. +If x = 2 it monitors internal tags and external tags separately. +If x = 0 the monitoring is disabled. +Other values of x are not supported. + +Internal tags are tags < 0. They are used to tag sends and receives coming from +collective operations or from protocol communications. + +External tags are tags >=0. They are used by the application in point-to-point communication. + +Therefore, distinguishing external and internal tags helps to distinguish between point-to-point +and other communication (mainly collectives). + +Output format +------------- +The output of the monitoring looks like (with --mca pml_monitoring_enable 2): +I 0 1 108 bytes 27 msgs sent +E 0 1 1012 bytes 30 msgs sent +E 0 2 23052 bytes 61 msgs sent +I 1 2 104 bytes 26 msgs sent +I 1 3 208 bytes 52 msgs sent +E 1 0 860 bytes 24 msgs sent +E 1 3 2552 bytes 56 msgs sent +I 2 3 104 bytes 26 msgs sent +E 2 0 22804 bytes 49 msgs sent +E 2 3 860 bytes 24 msgs sent +I 3 0 104 bytes 26 msgs sent +I 3 1 204 bytes 51 msgs sent +E 3 1 2304 bytes 44 msgs sent +E 3 2 860 bytes 24 msgs sent + +Where: + - the first column distinguishes internal (I) and external (E) tags. + - the second column is the sender rank + - the third column is the receiver rank + - the fourth column is the number of bytes sent + - the last column is the number of messages. + +In this example process 0 has sent 27 messages to process 1 using point-to-point calls +for 108 bytes and 30 messages with collectives and protocol related communication +for 1012 bytes to process 1. + +If the monitoring was called with --mca pml_monitoring_enable 1 everything is aggregated +under the internal tags. 
With the above example, you have: +I 0 1 1120 bytes 57 msgs sent +I 0 2 23052 bytes 61 msgs sent +I 1 0 860 bytes 24 msgs sent +I 1 2 104 bytes 26 msgs sent +I 1 3 2760 bytes 108 msgs sent +I 2 0 22804 bytes 49 msgs sent +I 2 3 964 bytes 50 msgs sent +I 3 0 104 bytes 26 msgs sent +I 3 1 2508 bytes 95 msgs sent +I 3 2 860 bytes 24 msgs sent + +Monitoring phases +----------------- +If one wants to monitor phases of the application, it is possible to flush the monitoring +at the application level. In this case all the monitoring since the last flush is stored +by every process in a file. + +An example of how to flush such monitoring is given in test/monitoring/monitoring_test.c + +Moreover, all the different flushed phases are aggregated at runtime and output at the end +of the application as described above. + +Example +------- +A working example is given in test/monitoring/monitoring_test.c +It features MPI_COMM_WORLD monitoring, sub-communicator monitoring, collective and +point-to-point communication monitoring and phases monitoring. + +To compile: +> make monitoring_test + +Helper scripts +-------------- +Two perl scripts are provided in test/monitoring +- aggregate_profile.pl is for aggregating monitoring phases of different processes + This script aggregates the profiles generated by the flush_monitoring function. + The files need to be in the given format: name_<phase_id>_<process_id> + They are then aggregated by phases. + If one needs the profile of all the phases, one can concatenate the different files, + or use the output of the monitoring system done at MPI_Finalize. + In the example it should be called as: + ./aggregate_profile.pl prof/phase to generate + prof/phase_1.prof + prof/phase_2.prof + +- profile2mat.pl is for transforming the monitoring output into a communication matrix. + Takes a profile file and aggregates all the recorded communications into matrices. 
+ It generates a matrix for the number of messages (msg), + for the total bytes transmitted (size) and + the average number of bytes per message (avg) + + The output matrix is symmetric + +Do not forget to enable the execution right to these scripts. + +For instance, the provided examples store phases output in ./prof + +If you type: +> mpirun -np 4 --mca pml_monitoring_enable 2 ./monitoring_test +you should have the following output +Proc 3 flushing monitoring to: ./prof/phase_1_3.prof +Proc 0 flushing monitoring to: ./prof/phase_1_0.prof +Proc 2 flushing monitoring to: ./prof/phase_1_2.prof +Proc 1 flushing monitoring to: ./prof/phase_1_1.prof +Proc 1 flushing monitoring to: ./prof/phase_2_1.prof +Proc 3 flushing monitoring to: ./prof/phase_2_3.prof +Proc 0 flushing monitoring to: ./prof/phase_2_0.prof +Proc 2 flushing monitoring to: ./prof/phase_2_2.prof +I 2 3 104 bytes 26 msgs sent +E 2 0 22804 bytes 49 msgs sent +E 2 3 860 bytes 24 msgs sent +I 3 0 104 bytes 26 msgs sent +I 3 1 204 bytes 51 msgs sent +E 3 1 2304 bytes 44 msgs sent +E 3 2 860 bytes 24 msgs sent +I 0 1 108 bytes 27 msgs sent +E 0 1 1012 bytes 30 msgs sent +E 0 2 23052 bytes 61 msgs sent +I 1 2 104 bytes 26 msgs sent +I 1 3 208 bytes 52 msgs sent +E 1 0 860 bytes 24 msgs sent +E 1 3 2552 bytes 56 msgs sent + +You can parse the phases with: +> ./aggregate_profile.pl prof/phase +Building prof/phase_1.prof +Building prof/phase_2.prof + +And you can build the different communication matrices of phase 1 with: +> ./profile2mat.pl prof/phase_1.prof +prof/phase_1.prof -> all +prof/phase_1_size_all.mat +prof/phase_1_msg_all.mat +prof/phase_1_avg_all.mat + +prof/phase_1.prof -> external +prof/phase_1_size_external.mat +prof/phase_1_msg_external.mat +prof/phase_1_avg_external.mat + +prof/phase_1.prof -> internal +prof/phase_1_size_internal.mat +prof/phase_1_msg_internal.mat +prof/phase_1_avg_internal.mat + +Credit +------ +Designed by George Bosilca and +Emmanuel Jeannot diff --git 
a/ompi/mca/pml/monitoring/pml_monitoring.c b/ompi/mca/pml/monitoring/pml_monitoring.c new file mode 100644 index 00000000000..0f47e7cd395 --- /dev/null +++ b/ompi/mca/pml/monitoring/pml_monitoring.c @@ -0,0 +1,237 @@ +/* + * Copyright (c) 2013-2015 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2013-2015 Inria. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include +#include +#include "opal/class/opal_hash_table.h" +typedef struct _transtlator_t{ + int *ranks; + int size; +} translator_t; + + +void initialize_monitoring( void ); +void monitor_send_data(int world_rank, size_t data_size, int tag); +void output_monitoring( void ); +void finalize_monitoring( void ); +int filter_monitoring( void ); /* returns 1 if we distinguish positive (point-to-point) and negative (collective and meta messages) tags*/ +int ompi_mca_pml_monitoring_flush(char* filename); + + +MPI_Group group_world; + +/* array for stroring monitoring data*/ +uint64_t* sent_data = NULL; +uint64_t* messages_count = NULL; +uint64_t* filtered_sent_data = NULL; +uint64_t* filtered_messages_count = NULL; +uint64_t* all_sent_data = NULL; +uint64_t* all_messages_count = NULL; +uint64_t* all_filtered_sent_data = NULL; +uint64_t* all_filtered_messages_count = NULL; + +int init_done = 0; +int nbprocs = -1; +int my_rank = -1; +opal_hash_table_t *translation_ht = NULL; + + +mca_pml_monitoring_module_t mca_pml_monitoring = { + mca_pml_monitoring_add_procs, + mca_pml_monitoring_del_procs, + mca_pml_monitoring_enable, + NULL, + mca_pml_monitoring_add_comm, + mca_pml_monitoring_del_comm, + mca_pml_monitoring_irecv_init, + mca_pml_monitoring_irecv, + mca_pml_monitoring_recv, + mca_pml_monitoring_isend_init, + mca_pml_monitoring_isend, + mca_pml_monitoring_send, + mca_pml_monitoring_iprobe, + mca_pml_monitoring_probe, + mca_pml_monitoring_start, + mca_pml_monitoring_improbe, + 
mca_pml_monitoring_mprobe, + mca_pml_monitoring_imrecv, + mca_pml_monitoring_mrecv, + mca_pml_monitoring_dump, + NULL, + 65535, + INT_MAX +}; + +int mca_pml_monitoring_add_procs(struct ompi_proc_t **procs, + size_t nprocs) +{ + /** + * Create the monitoring hashtable only for my MPI_COMM_WORLD. We choose + * to ignore by now all other processes. + */ + if(NULL == translation_ht) { + size_t i; + uint64_t key; + + nbprocs = nprocs; + + translation_ht = OBJ_NEW(opal_hash_table_t); + opal_hash_table_init(translation_ht, 2048); + + + for( i = 0; i < nprocs; i++ ) { + /* rank : ompi_proc_local_proc in procs */ + if( procs[i] == ompi_proc_local_proc) + my_rank = i; + key = *((uint64_t*)&(procs[i]->super.proc_name)); + /* store the rank (in COMM_WORLD) of the process + with its name (a uniq opal ID) as key in the hash table*/ + opal_hash_table_set_value_uint64(translation_ht, + key, + (void*)(uintptr_t)i); + } + } + return pml_selected_module.pml_add_procs(procs, nprocs); +} + + +int mca_pml_monitoring_del_procs(struct ompi_proc_t **procs, + size_t nprocs) +{ + return pml_selected_module.pml_del_procs(procs, nprocs); +} + +int mca_pml_monitoring_dump(struct ompi_communicator_t* comm, + int verbose) +{ + return pml_selected_module.pml_dump(comm, verbose); +} + + +void finalize_monitoring( void ){ + + if(filter_monitoring()){ + free(filtered_sent_data); + free(filtered_messages_count); + } + + free(sent_data); + free(messages_count); + opal_hash_table_remove_all( translation_ht ); + free(translation_ht); + +} +void initialize_monitoring( void ){ + + sent_data = (uint64_t*)calloc(nbprocs, sizeof(uint64_t)); + messages_count = (uint64_t*) calloc(nbprocs, sizeof(uint64_t)); + all_sent_data = (uint64_t*)calloc(nbprocs, sizeof(uint64_t)); + all_messages_count = (uint64_t*) calloc(nbprocs, sizeof(uint64_t)); + + if(filter_monitoring()){ + filtered_sent_data = (uint64_t*)calloc(nbprocs, sizeof(uint64_t)); + filtered_messages_count = (uint64_t*) calloc(nbprocs, sizeof(uint64_t)); + 
all_filtered_sent_data = (uint64_t*)calloc(nbprocs, sizeof(uint64_t)); + all_filtered_messages_count = (uint64_t*) calloc(nbprocs, sizeof(uint64_t)); + } + + init_done = 1; +} + + + +void monitor_send_data(int world_rank, size_t data_size, int tag){ + + if ( !init_done ) + initialize_monitoring(); + + /* distinguishses positive and negative tags if requested */ + if((tag<0) && (filter_monitoring())){ + filtered_sent_data[world_rank] += data_size; + filtered_messages_count[world_rank]++; + }else{ /* if filtered monitoring is not activated data is aggregated indifferently */ + sent_data[world_rank] += data_size; + messages_count[world_rank]++; + } + /*printf("%d Send dest = %d(%d:comm_world=%d), size = %ld ajouté dans : %d\n",my_rank, dest_rank, comm->c_my_rank, MPI_COMM_WORLD->c_my_rank, data_size, rank); fflush(stdout);*/ + + +} + +void output_monitoring( void ){ + int i; + for (i = 0 ; i < nbprocs ; i++) { + all_sent_data[i] += sent_data[i]; + all_messages_count[i] += messages_count[i]; + if(all_sent_data[i] > 0) { + fprintf(stderr, "I\t%d\t%d\t" PRIu64 " bytes\t" PRIu64 " msgs sent\n", my_rank, i, all_sent_data[i], all_messages_count[i]); fflush(stderr); + } + } + + if(filter_monitoring()){ + for (i = 0 ; i < nbprocs ; i++) { + all_filtered_sent_data[i] += filtered_sent_data[i]; + all_filtered_messages_count[i] += filtered_messages_count[i]; + if(all_filtered_sent_data[i] > 0) { + fprintf(stderr, "E\t%d\t%d\t" PRIu64 " bytes\t" PRIu64 " msgs sent\n", my_rank, i, all_filtered_sent_data[i], all_filtered_messages_count[i]); fflush(stderr); + } + } + } +} + + +/* + Flushes the monitoring into filename + Useful for phases (see exmple in test/monitoring) +*/ + +int ompi_mca_pml_monitoring_flush(char* filename) { + FILE *pf; + int i; + + + pf = fopen(filename, "w"); + + if(!pf) + return -1; + + fprintf(stderr,"Proc %d flushing monitoring to: %s\n", my_rank, filename); + + for (i = 0 ; i < nbprocs ; i++) { + if(sent_data[i] > 0) { + fprintf(pf, "I\t%d\t%d\t" PRIu64 " 
bytes\t" PRIu64 " msgs sent\n", my_rank, i, sent_data[i], messages_count[i]); fflush(pf); + /* aggregate data in general array*/ + all_sent_data[i] += sent_data[i]; + all_messages_count[i] += messages_count[i]; + /* reset phase array */ + messages_count[i] = 0; + sent_data[i] = 0; + } + } + + if(filter_monitoring()){ + for (i = 0 ; i < nbprocs ; i++) { + if(filtered_sent_data[i] > 0) { + fprintf(pf, "E\t%d\t%d\t" PRIu64 " bytes\t" PRIu64 " msgs sent\n", my_rank, i, filtered_sent_data[i], filtered_messages_count[i]); fflush(pf); + /* aggregate data in general array*/ + all_filtered_sent_data[i] += filtered_sent_data[i]; + all_filtered_messages_count[i] += filtered_messages_count[i]; + /* reset phase array */ + filtered_messages_count[i] = 0; + filtered_sent_data[i] = 0; + } + } + } + + fclose(pf); + return 0; +} diff --git a/ompi/mca/pml/monitoring/pml_monitoring.h b/ompi/mca/pml/monitoring/pml_monitoring.h new file mode 100644 index 00000000000..dbae8e1eee9 --- /dev/null +++ b/ompi/mca/pml/monitoring/pml_monitoring.h @@ -0,0 +1,141 @@ +/* + * Copyright (c) 2013-2015 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2013-2015 Inria. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef MCA_PML_MONITORING_H +#define MCA_PML_MONITORING_H + +BEGIN_C_DECLS + +#include +#include +#include +#include +#include + +typedef mca_pml_base_module_t mca_pml_monitoring_module_t; + +extern mca_pml_base_component_t pml_selected_component; +extern mca_pml_base_module_t pml_selected_module; +extern mca_pml_monitoring_module_t mca_pml_monitoring; +OMPI_DECLSPEC extern mca_pml_base_component_2_0_0_t mca_pml_monitoring_component; + +/* + * PML interface functions. 
+ */
+
+extern int mca_pml_monitoring_add_comm(struct ompi_communicator_t* comm);
+
+extern int mca_pml_monitoring_del_comm(struct ompi_communicator_t* comm);
+
+extern int mca_pml_monitoring_add_procs(struct ompi_proc_t **procs,
+                                        size_t nprocs);
+
+extern int mca_pml_monitoring_del_procs(struct ompi_proc_t **procs,
+                                        size_t nprocs);
+
+extern int mca_pml_monitoring_enable(bool enable);
+
+extern int mca_pml_monitoring_iprobe(int dst,
+                                     int tag,
+                                     struct ompi_communicator_t* comm,
+                                     int *matched,
+                                     ompi_status_public_t* status );
+
+extern int mca_pml_monitoring_probe(int dst,
+                                    int tag,
+                                    struct ompi_communicator_t* comm,
+                                    ompi_status_public_t* status );
+
+extern int mca_pml_monitoring_improbe(int dst,
+                                      int tag,
+                                      struct ompi_communicator_t* comm,
+                                      int *matched,
+                                      struct ompi_message_t **message,
+                                      ompi_status_public_t* status );
+
+extern int mca_pml_monitoring_mprobe(int dst,
+                                     int tag,
+                                     struct ompi_communicator_t* comm,
+                                     struct ompi_message_t **message,
+                                     ompi_status_public_t* status );
+
+extern int mca_pml_monitoring_isend_init(void *buf,
+                                         size_t count,
+                                         ompi_datatype_t *datatype,
+                                         int dst,
+                                         int tag,
+                                         mca_pml_base_send_mode_t mode,
+                                         struct ompi_communicator_t* comm,
+                                         struct ompi_request_t **request);
+
+extern int mca_pml_monitoring_isend(void *buf,
+                                    size_t count,
+                                    ompi_datatype_t *datatype,
+                                    int dst,
+                                    int tag,
+                                    mca_pml_base_send_mode_t mode,
+                                    struct ompi_communicator_t* comm,
+                                    struct ompi_request_t **request);
+
+extern int mca_pml_monitoring_send(void *buf,
+                                   size_t count,
+                                   ompi_datatype_t *datatype,
+                                   int dst,
+                                   int tag,
+                                   mca_pml_base_send_mode_t mode,
+                                   struct ompi_communicator_t* comm);
+
+extern int mca_pml_monitoring_irecv_init(void *buf,
+                                         size_t count,
+                                         ompi_datatype_t *datatype,
+                                         int src,
+                                         int tag,
+                                         struct ompi_communicator_t* comm,
+                                         struct ompi_request_t **request);
+
+extern int mca_pml_monitoring_irecv(void *buf,
+                                    size_t count,
+                                    ompi_datatype_t *datatype,
+                                    int src,
+                                    int tag,
+                                    struct ompi_communicator_t* comm,
+                                    struct ompi_request_t **request);
+
+extern int mca_pml_monitoring_recv(void *buf,
+                                   size_t count,
+                                   ompi_datatype_t *datatype,
+                                   int src,
+                                   int tag,
+                                   struct ompi_communicator_t* comm,
+                                   ompi_status_public_t* status);
+
+extern int mca_pml_monitoring_imrecv(void *buf,
+                                     size_t count,
+                                     ompi_datatype_t *datatype,
+                                     struct ompi_message_t **message,
+                                     struct ompi_request_t **request);
+
+extern int mca_pml_monitoring_mrecv(void *buf,
+                                    size_t count,
+                                    ompi_datatype_t *datatype,
+                                    struct ompi_message_t **message,
+                                    ompi_status_public_t* status);
+
+extern int mca_pml_monitoring_dump(struct ompi_communicator_t* comm,
+                                   int verbose);
+
+extern int mca_pml_monitoring_start(size_t count,
+                                    ompi_request_t** requests);
+
+END_C_DECLS
+
+#endif /* MCA_PML_MONITORING_H */
diff --git a/ompi/mca/pml/monitoring/pml_monitoring_comm.c b/ompi/mca/pml/monitoring/pml_monitoring_comm.c
new file mode 100644
index 00000000000..047a15bfd30
--- /dev/null
+++ b/ompi/mca/pml/monitoring/pml_monitoring_comm.c
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2013-2015 The University of Tennessee and The University
+ * of Tennessee Research Foundation. All rights
+ * reserved.
+ * Copyright (c) 2013-2015 Inria. All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include
+#include
+
+extern void output_monitoring( void ); /* NOTE(review): declared but not referenced in this file */
+
+/* Nothing is recorded per communicator: both calls below are forwarded unchanged to the selected PML. */
+int mca_pml_monitoring_add_comm(struct ompi_communicator_t* comm)
+{
+    return pml_selected_module.pml_add_comm(comm);
+}
+
+int mca_pml_monitoring_del_comm(struct ompi_communicator_t* comm)
+{
+    return pml_selected_module.pml_del_comm(comm);
+}
diff --git a/ompi/mca/pml/monitoring/pml_monitoring_component.c b/ompi/mca/pml/monitoring/pml_monitoring_component.c
new file mode 100644
index 00000000000..491f5f9f782
--- /dev/null
+++ b/ompi/mca/pml/monitoring/pml_monitoring_component.c
@@ -0,0 +1,155 @@
+/*
+ * Copyright (c) 2013-2015 The University of Tennessee and The University
+ * of Tennessee Research Foundation.
All rights
+ * reserved.
+ * Copyright (c) 2013-2015 Inria. All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include
+#include
+#include
+#include
+#include
+
+static int mca_pml_monitoring_enabled = 0; /* storage of the "enable" variable registered in component_register */
+static int mca_pml_monitoring_active = 0; /* 1 while the interception layer is installed (set in close, cleared in finish) */
+mca_pml_base_component_t pml_selected_component; /* copy of the component that was selected before the interception */
+mca_pml_base_module_t pml_selected_module; /* copy of its module: every wrapper forwards to it */
+
+extern void output_monitoring( void );
+extern void finalize_monitoring( void );
+extern int ompi_mca_pml_monitoring_flush(char* filename);
+int filter_monitoring( void );
+
+
+/* Return 1 if the separation between internal tags and external tags is
+ * enabled, i.e. when mca_pml_monitoring_enabled is 2, and 0 otherwise.
+ */
+int filter_monitoring( void )
+{
+    if (mca_pml_monitoring_enabled == 2)
+        return 1;
+    else
+        return 0;
+}
+
+static unsigned long hidden_fct = (unsigned long)((void*)ompi_mca_pml_monitoring_flush); /* address of the flush function, published below as the "flush" variable */
+int mca_pml_monitoring_enable(bool enable)
+{
+    /* If we reach this point we were successful at hijacking the interface of
+     * the real PML, and we are now correctly interleaved between the upper
+     * layer and the real PML. The "flush" variable exposes hidden_fct.
+     */
+    mca_base_component_var_register(&mca_pml_monitoring_component.pmlm_version, "flush",
+                                    "Hidden argument to provide the flush function pointer",
+                                    MCA_BASE_VAR_TYPE_UNSIGNED_LONG, NULL, 0, 0,
+                                    OPAL_INFO_LVL_1,
+                                    MCA_BASE_VAR_SCOPE_CONSTANT,
+                                    &hidden_fct);
+    return pml_selected_module.pml_enable(enable);
+}
+
+static int mca_pml_monitoring_component_open(void)
+{
+    if( mca_pml_monitoring_enabled ) {
+        opal_pointer_array_add(&mca_pml_base_pml,
+                               strdup(mca_pml_monitoring_component.pmlm_version.mca_component_name));
+    }
+    return OMPI_SUCCESS;
+}
+
+static int mca_pml_monitoring_component_close(void) /* installs the interception layer the first time it runs with monitoring enabled */
+{
+    if( mca_pml_monitoring_enabled ) {
+        if( !mca_pml_monitoring_active ) {
+            /* Save a copy of the selected PML */
+            pml_selected_component = mca_pml_base_selected_component;
+            pml_selected_module = mca_pml;
+            /* And now install the interception layer */
+            mca_pml_base_selected_component = mca_pml_monitoring_component;
+            mca_pml = mca_pml_monitoring;
+            mca_pml.pml_progress = pml_selected_module.pml_progress;
+            /* Bump my ref count up to avoid getting released too early */
+            mca_base_component_repository_retain_component(mca_pml_monitoring_component.pmlm_version.mca_type_name,
+                                                           mca_pml_monitoring_component.pmlm_version.mca_component_name);
+            mca_pml_monitoring_active = 1;
+        }
+    }
+    return OMPI_SUCCESS;
+}
+
+static mca_pml_base_module_t*
+mca_pml_monitoring_component_init(int* priority,
+                                  bool enable_progress_threads,
+                                  bool enable_mpi_threads)
+{
+    if( mca_pml_monitoring_enabled ) {
+        *priority = 0; /* I'm up but don't select me */
+        return &mca_pml_monitoring;
+    }
+    return NULL;
+}
+
+static int mca_pml_monitoring_component_finish(void)
+{
+    if( mca_pml_monitoring_enabled && mca_pml_monitoring_active ) {
+        /* It is over... Output what has been monitored*/
+        output_monitoring();
+        /* Free internal data structure */
+        finalize_monitoring();
+        /* Mark the interception as removed before handing over to the original PML */
+        mca_pml_monitoring_active = 0;
+        mca_pml_monitoring_enabled = 0;
+        /* Restore the original PML */
+        mca_pml_base_selected_component = pml_selected_component;
+        mca_pml = pml_selected_module;
+        /* Redirect the close call to the original PML */
+        pml_selected_component.pmlm_finalize();
+        /**
+         * We should never release the last ref on the current component or face forever punishment.
+         */
+        /* mca_base_component_repository_release(&mca_pml_monitoring_component.pmlm_version); */
+    }
+    return OMPI_SUCCESS;
+}
+
+static int mca_pml_monitoring_component_register(void)
+{
+    (void)mca_base_component_var_register(&mca_pml_monitoring_component.pmlm_version, "enable",
+                                          "Enable the monitoring at the PML level. This value should be different than 0 in order for the monitoring to be enabled (default disable)", MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
+                                          OPAL_INFO_LVL_9,
+                                          MCA_BASE_VAR_SCOPE_READONLY, &mca_pml_monitoring_enabled);
+    return OMPI_SUCCESS;
+}
+
+mca_pml_base_component_2_0_0_t mca_pml_monitoring_component = {
+
+    /* First, the mca_base_component_t struct containing meta
+       information about the component itself */
+
+    {
+        MCA_PML_BASE_VERSION_2_0_0,
+
+        "monitoring", /* MCA component name */
+        OMPI_MAJOR_VERSION, /* MCA component major version */
+        OMPI_MINOR_VERSION, /* MCA component minor version */
+        OMPI_RELEASE_VERSION, /* MCA component release version */
+        mca_pml_monitoring_component_open, /* component open */
+        mca_pml_monitoring_component_close, /* component close */
+        NULL,
+        mca_pml_monitoring_component_register
+    },
+    {
+        /* The component is checkpoint ready */
+        MCA_BASE_METADATA_PARAM_CHECKPOINT
+    },
+
+    mca_pml_monitoring_component_init, /* component init */
+    mca_pml_monitoring_component_finish /* component finalize */
+
+};
+
diff --git a/ompi/mca/pml/monitoring/pml_monitoring_iprobe.c
b/ompi/mca/pml/monitoring/pml_monitoring_iprobe.c
new file mode 100644
index 00000000000..ec34cb5d27c
--- /dev/null
+++ b/ompi/mca/pml/monitoring/pml_monitoring_iprobe.c
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2013-2015 The University of Tennessee and The University
+ * of Tennessee Research Foundation. All rights
+ * reserved.
+ * Copyright (c) 2013-2015 Inria. All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include
+#include
+
+/* EJ: nothing to do here. Nothing is recorded for probes: each function below
+ * only forwards its arguments to the matching entry of the selected PML. */
+
+int mca_pml_monitoring_iprobe( int dst,
+                               int tag,
+                               struct ompi_communicator_t* comm,
+                               int *matched,
+                               ompi_status_public_t* status )
+{
+    return pml_selected_module.pml_iprobe(dst, tag, comm,
+                                          matched, status);
+}
+
+int mca_pml_monitoring_probe( int dst,
+                              int tag,
+                              struct ompi_communicator_t* comm,
+                              ompi_status_public_t* status )
+{
+    return pml_selected_module.pml_probe(dst, tag, comm, status);
+}
+
+int mca_pml_monitoring_improbe(int dst,
+                               int tag,
+                               struct ompi_communicator_t* comm,
+                               int *matched,
+                               struct ompi_message_t **message,
+                               ompi_status_public_t* status)
+{
+    return pml_selected_module.pml_improbe(dst, tag, comm,
+                                           matched, message, status);
+}
+
+
+int mca_pml_monitoring_mprobe(int dst,
+                              int tag,
+                              struct ompi_communicator_t* comm,
+                              struct ompi_message_t **message,
+                              ompi_status_public_t* status)
+{
+    return pml_selected_module.pml_mprobe(dst, tag, comm, message, status);
+}
+
diff --git a/ompi/mca/pml/monitoring/pml_monitoring_irecv.c b/ompi/mca/pml/monitoring/pml_monitoring_irecv.c
new file mode 100644
index 00000000000..91b247c7c53
--- /dev/null
+++ b/ompi/mca/pml/monitoring/pml_monitoring_irecv.c
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2013-2015 The University of Tennessee and The University
+ * of Tennessee Research Foundation. All rights
+ * reserved.
+ * Copyright (c) 2013-2015 Inria. All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include
+#include
+
+/* EJ: logging is done on the sender. Nothing to do here: each receive entry
+ * point below only forwards its arguments to the selected PML. */
+
+int mca_pml_monitoring_irecv_init(void *buf,
+                                  size_t count,
+                                  ompi_datatype_t *datatype,
+                                  int src,
+                                  int tag,
+                                  struct ompi_communicator_t* comm,
+                                  struct ompi_request_t **request)
+{
+    return pml_selected_module.pml_irecv_init(buf, count, datatype,
+                                              src, tag, comm, request);
+}
+
+
+int mca_pml_monitoring_irecv(void *buf,
+                             size_t count,
+                             ompi_datatype_t *datatype,
+                             int src,
+                             int tag,
+                             struct ompi_communicator_t* comm,
+                             struct ompi_request_t **request)
+{
+    return pml_selected_module.pml_irecv(buf, count, datatype,
+                                         src, tag, comm, request);
+}
+
+
+int mca_pml_monitoring_recv(void *buf,
+                            size_t count,
+                            ompi_datatype_t *datatype,
+                            int src,
+                            int tag,
+                            struct ompi_communicator_t* comm,
+                            ompi_status_public_t* status)
+{
+    return pml_selected_module.pml_recv(buf, count, datatype,
+                                        src, tag, comm, status);
+}
+
+
+int mca_pml_monitoring_imrecv(void *buf,
+                              size_t count,
+                              ompi_datatype_t *datatype,
+                              struct ompi_message_t **message,
+                              struct ompi_request_t **request)
+{
+    return pml_selected_module.pml_imrecv(buf, count, datatype,
+                                          message, request);
+}
+
+
+int mca_pml_monitoring_mrecv(void *buf,
+                             size_t count,
+                             ompi_datatype_t *datatype,
+                             struct ompi_message_t **message,
+                             ompi_status_public_t* status)
+
+{
+    return pml_selected_module.pml_mrecv(buf, count, datatype,
+                                         message, status);
+}
+
+
diff --git a/ompi/mca/pml/monitoring/pml_monitoring_isend.c b/ompi/mca/pml/monitoring/pml_monitoring_isend.c
new file mode 100644
index 00000000000..b8fc4f18a8e
--- /dev/null
+++ b/ompi/mca/pml/monitoring/pml_monitoring_isend.c
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2013-2015 The University of Tennessee and The University
+ * of Tennessee Research Foundation. All rights
+ * reserved.
+ * Copyright (c) 2013-2015 Inria. All rights reserved.
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include +#include + +extern void monitor_send_data(int dest_rank, size_t data_size, int tag); +extern opal_hash_table_t *get_hashtable(void); +extern opal_hash_table_t *translation_ht; + +int mca_pml_monitoring_isend_init(void *buf, + size_t count, + ompi_datatype_t *datatype, + int dst, + int tag, + mca_pml_base_send_mode_t mode, + struct ompi_communicator_t* comm, + struct ompi_request_t **request) +{ + return pml_selected_module.pml_isend_init(buf, count, datatype, + dst, tag, mode, comm, request); +} + +int mca_pml_monitoring_isend(void *buf, + size_t count, + ompi_datatype_t *datatype, + int dst, + int tag, + mca_pml_base_send_mode_t mode, + struct ompi_communicator_t* comm, + struct ompi_request_t **request) +{ + + /* find the processor of teh destination */ + ompi_proc_t *proc = ompi_group_get_proc_ptr(comm->c_remote_group, dst); + int world_rank; + + /* find its name*/ + uint64_t key = *((uint64_t*)&(proc->super.proc_name)); + /** + * If this fails the destination is not part of my MPI_COM_WORLD + * Lookup its name in the rank hastable to get its MPI_COMM_WORLD rank + */ + if(OPAL_SUCCESS == opal_hash_table_get_value_uint64(translation_ht, key, (void *)&world_rank)) { + size_t type_size, data_size; + ompi_datatype_type_size(datatype, &type_size); + data_size = count*type_size; + monitor_send_data(world_rank, data_size, tag); + } + + return pml_selected_module.pml_isend(buf, count, datatype, + dst, tag, mode, comm, request); +} + +int mca_pml_monitoring_send(void *buf, + size_t count, + ompi_datatype_t *datatype, + int dst, + int tag, + mca_pml_base_send_mode_t mode, + struct ompi_communicator_t* comm) +{ + + ompi_proc_t *proc = ompi_group_get_proc_ptr(comm->c_remote_group, dst); + int world_rank; + uint64_t key = *((uint64_t*) &(proc->super.proc_name)); + + /** + * If this fails the destination is not part of my MPI_COM_WORLD + */ + if(OPAL_SUCCESS == 
opal_hash_table_get_value_uint64(translation_ht, key, (void *)&world_rank)) { + size_t type_size, data_size; + ompi_datatype_type_size(datatype, &type_size); + data_size = count*type_size; + monitor_send_data(world_rank, data_size, tag); + } + + + return pml_selected_module.pml_send(buf, count, datatype, + dst, tag, mode, comm); +} + diff --git a/ompi/mca/pml/monitoring/pml_monitoring_start.c b/ompi/mca/pml/monitoring/pml_monitoring_start.c new file mode 100644 index 00000000000..5b503977e79 --- /dev/null +++ b/ompi/mca/pml/monitoring/pml_monitoring_start.c @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2013-2015 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2013-2015 Inria. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include +#include +#include +#include + +extern void monitor_send_data(int dest_rank, size_t data_size, int tag); +extern opal_hash_table_t *translation_ht; + + +/* manage persistant requests*/ +int mca_pml_monitoring_start(size_t count, + ompi_request_t** requests) +{ + size_t i; + + for( i = 0; i < count; i++ ) { + mca_pml_base_request_t *pml_request = (mca_pml_base_request_t*)requests[i]; + ompi_proc_t *proc; + int world_rank; + + if(NULL == pml_request) { + continue; + } + if(OMPI_REQUEST_PML != requests[i]->req_type) { + continue; + } + if(MCA_PML_REQUEST_SEND != pml_request->req_type) { + continue; + } + + proc = ompi_group_get_proc_ptr(pml_request->req_comm->c_remote_group, pml_request->req_peer); + uint64_t key = *((uint64_t*) &(proc->super.proc_name)); + + + /** + * If this fails the destination is not part of my MPI_COM_WORLD + */ + if(OPAL_SUCCESS == opal_hash_table_get_value_uint64(translation_ht, key, (void *)&world_rank)) { + size_t type_size, data_size; + ompi_datatype_type_size(pml_request->req_datatype, &type_size); + data_size = pml_request->req_count * type_size; + monitor_send_data(world_rank, 
data_size, 1); + } + } + return pml_selected_module.pml_start(count, requests); +} + diff --git a/ompi/mca/topo/treematch/Makefile.am b/ompi/mca/topo/treematch/Makefile.am new file mode 100644 index 00000000000..6019a786e8d --- /dev/null +++ b/ompi/mca/topo/treematch/Makefile.am @@ -0,0 +1,62 @@ +# +# Copyright (c) 2011-2015 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2011-2015 INRIA. All rights reserved. +# Copyright (c) 2011-2015 Université Bordeaux 1 +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +if topo_treematch_local +extra_treematch_files = treematch/tm_bucket.h \ + treematch/tm_hwloc.h treematch/tm_mapping.h \ + treematch/tm_timings.h treematch/tm_tree.h \ + treematch/tm_kpartitioning.h treematch/uthash.h\ + treematch/IntConstantInitializedVector.h \ + treematch/tm_mt.h \ + treematch/tm_thread_pool.h treematch/tm_verbose.h \ + treematch/tm_malloc.h \ + treematch/IntConstantInitializedVector.c \ + treematch/tm_mt.c \ + treematch/tm_thread_pool.c treematch/tm_verbose.c \ + treematch/tm_malloc.c \ + treematch/tm_mapping.c treematch/tm_timings.c \ + treematch/tm_bucket.c treematch/tm_tree.c \ + treematch/tm_hwloc.c treematch/tm_kpartitioning.c +endif + +sources = \ + topo_treematch.h \ + topo_treematch_module.c \ + topo_treematch_component.c \ + topo_treematch_dist_graph_create.c $(extra_treematch_files) + +# Make the output library in this directory, and name it either +# mca__.la (for DSO builds) or libmca__.la +# (for static builds). 
+
+if MCA_BUILD_ompi_topo_treematch_DSO
+lib =
+lib_sources =
+component = mca_topo_treematch.la
+component_sources = $(sources)
+else
+lib = libmca_topo_treematch.la
+lib_sources = $(sources)
+component =
+component_sources =
+endif
+
+mcacomponentdir = $(pkglibdir)
+mcacomponent_LTLIBRARIES = $(component)
+mca_topo_treematch_la_SOURCES = $(component_sources)
+mca_topo_treematch_la_LDFLAGS = -module -avoid-version
+
+noinst_LTLIBRARIES = $(lib)
+libmca_topo_treematch_la_SOURCES = $(lib_sources)
+libmca_topo_treematch_la_LDFLAGS = -module -avoid-version
+
diff --git a/ompi/mca/topo/treematch/configure.m4 b/ompi/mca/topo/treematch/configure.m4
new file mode 100644
index 00000000000..d2141329036
--- /dev/null
+++ b/ompi/mca/topo/treematch/configure.m4
@@ -0,0 +1,100 @@
+# -*- shell-script -*-
+#
+# Copyright (c) 2011-2015 The University of Tennessee and The University
+#                         of Tennessee Research Foundation. All rights
+#                         reserved.
+# Copyright (c) 2011-2015 INRIA. All rights reserved.
+# Copyright (c) 2011-2015 Universite Bordeaux 1
+# $COPYRIGHT$
+#
+# Additional copyrights may follow
+#
+# $HEADER$
+#
+
+# MCA_ompi_topo_treematch_CONFIG([action-if-can-compile],
+#                                [action-if-cant-compile])
+# -------------------------------------------
+# Decide whether the treematch topo component can be built.
+#
+# --with-treematch defaults to "yes".  With "yes" and no
+# --with-treematch-include, the TreeMatch sources bundled with this
+# component are used and the topo_treematch_local conditional is true.
+# With --with-treematch=DIR, headers default to DIR/include and libraries
+# to DIR/lib unless --with-treematch-include / --with-treematch-libdir
+# say otherwise, and the library is probed with OPAL_CHECK_PACKAGE.
+# If tm_tree.h is not found in the header directory, configure aborts.
+#
+# NOTE(review): opal_check_treematch_dir is assigned in this macro but
+# never read by it; it may have been meant to be ompi_check_treematch_dir.
+# Confirm before renaming.
+AC_DEFUN([MCA_ompi_topo_treematch_CONFIG], [
+    AC_REQUIRE([MCA_opal_hwloc_CONFIG_REQUIRE])
+
+    AC_ARG_WITH([treematch],
+                [AC_HELP_STRING([--with-treematch(=DIR)],
+                                [Build TreeMatch topology support, optionally adding DIR/include, DIR/lib, and DIR/lib64 to the search path for headers and libraries])],
+                [],
+                [with_treematch=yes])
+    AC_ARG_WITH([treematch-include],
+                [AC_HELP_STRING([--with-treematch-include(=DIR)],
+                                ["Search for TreeMatch headers in DIR"])])
+    AC_ARG_WITH([treematch-libdir],
+                [AC_HELP_STRING([--with-treematch-libdir(=DIR)],
+                                ["Search for TreeMatch libraries in DIR"])])
+
+    treematch_files_local="no"
+    ompi_check_treematch_dir=$srcdir
+    ompi_check_treematch_libdir=""
+    ompi_check_treematch_happy="no"
+
+    AS_IF([test "x$with_treematch" != xno],
+          [AC_MSG_CHECKING([TreeMatch headers])
+           AS_IF([test "x$with_treematch_include" = x],
+                 [AS_IF([test "x$with_treematch" = xyes],
+                        [treematch_files_local="yes"
+                         with_treematch_include=$OMPI_TOP_SRCDIR/ompi/mca/topo/treematch/treematch],
+                        [with_treematch_include=$with_treematch/include])])
+           AS_IF([test -f $with_treematch_include/tm_tree.h],
+                 [AS_IF([test "x$with_treematch" = xyes],
+                        [AC_MSG_RESULT([in the source])],
+                        [AC_MSG_RESULT([user provided])])
+                  opal_check_treematch_dir=$with_treematch_include
+                  ompi_check_treematch_happy="yes"],
+                 [AC_MSG_ERROR([missing tm_tree.h (${with_treematch}:${with_treematch_include})])])])
+
+    AS_IF([test "$ompi_check_treematch_happy" = "yes"],
+          [AC_MSG_CHECKING([TreeMatch library])
+           OPAL_CHECK_WITHDIR([treematch], [$with_treematch_include], [tm_tree.h])
+           AS_IF([test "x$with_treematch_libdir" = x],
+                 [AS_IF([test "x$with_treematch" != xyes],
+                        [with_treematch_libdir=$with_treematch/lib],
+                        [with_treematch_libdir=$OMPI_TOP_SRCDIR/ompi/mca/topo/treematch/treematch])])
+           AS_IF([test "x$treematch_files_local" = xno],
+                 [OPAL_CHECK_WITHDIR([treematch-libdir], [$with_treematch_libdir], [libtreematch.*])
+                  AS_IF([test "x$with_treematch" != xno -a "x$with_treematch" != xyes],
+                        [AS_IF([test ! -z "$with_treematch" -a "$with_treematch" != "yes"],
+                               [ompi_check_treematch_dir="$with_treematch"])
+                         AS_IF([test ! -z "$with_treematch_libdir" -a "$with_treematch_libdir" != "yes"],
+                               [ompi_check_treematch_libdir="$with_treematch_libdir"])
+                         OPAL_CHECK_PACKAGE([topo_treematch],
+                                            [tm_tree.h],
+                                            [treematch],
+                                            [build_tree],
+                                            [],
+                                            [$with_treematch_include],
+                                            [$with_treematch_libdir],
+                                            [ompi_check_treematch_happy="yes"],
+                                            [ompi_check_treematch_happy="no"])],
+                        [ompi_check_treematch_happy="no"])])])
+
+    AS_IF([test "$ompi_check_treematch_happy" = "yes"],
+          [$1],
+          [AS_IF([test ! -z "$with_treematch" -a "$with_treematch" != "no"],
+                 [AC_MSG_ERROR([TreeMatch support requested but not found. Aborting])])
+           $2])
+
+    AC_CONFIG_FILES([ompi/mca/topo/treematch/Makefile])
+    AM_CONDITIONAL(topo_treematch_local,
+                   [test "x$treematch_files_local" = "xyes"])
+])
diff --git a/ompi/mca/topo/treematch/topo_treematch.h b/ompi/mca/topo/treematch/topo_treematch.h
new file mode 100644
index 00000000000..42d12c54f3a
--- /dev/null
+++ b/ompi/mca/topo/treematch/topo_treematch.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2011-2015 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation. All rights
+ *                         reserved.
+ * Copyright (c) 2011-2015 INRIA. All rights reserved.
+ * Copyright (c) 2011-2015 Bordeaux Polytechnic Institute
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#ifndef MCA_TOPO_UNTIY_H
+#define MCA_TOPO_UNTIY_H
+
+#include "ompi_config.h"
+#include "ompi/mca/topo/topo.h"
+
+/*
+ * ******************************************************************
+ * ******** functions which provide MCA interface compliance ********
+ * ******************************************************************
+ * These functions are:
+ * - mca_topo_treematch_module_open
+ * - mca_topo_treematch_module_close
+ * - mca_topo_treematch_module_query
+ * - mca_topo_treematch_module_finalize
+ * These functions are always found on the mca_topo_treematch_module
+ * structure. They are the "meta" functions to ensure smooth op.
+ * ******************************************************************
+ */
+BEGIN_C_DECLS
+
+/*
+ * Public component instance
+ */
+typedef struct mca_topo_treematch_component_2_2_0_t {
+    mca_topo_base_component_2_2_0_t super;
+
+    int reorder_mode; /* value of the "reorder_mode" MCA variable; 0 selects centralized reordering */
+} mca_topo_treematch_component_2_2_0_t;
+
+OMPI_MODULE_DECLSPEC extern mca_topo_treematch_component_2_2_0_t
+    mca_topo_treematch_component;
+
+/*
+ * A unique module class for the module so that we can both cache
+ * module-specific information on the module and have a
+ * module-specific constructor and destructor.
+ */
+typedef struct {
+    /* base topo module; comm_query() hands out a pointer to this member */
+    mca_topo_base_module_t super;
+
+    /* Modules can add their own information here */
+} mca_topo_treematch_module_t;
+
+OBJ_CLASS_DECLARATION(mca_topo_treematch_module_t);
+
+
+/*
+ * Module functions
+ */
+
+/*
+ * Distributed-graph communicator creation for this component.
+ * The graph description (n, nodes, degrees, targets, weights) is first
+ * distributed with mca_topo_base_dist_graph_distribute().  When `reorder`
+ * is zero, *newcomm is created from comm_old's local group as is;
+ * otherwise a rank reordering is attempted.
+ * Returns OMPI_SUCCESS or the error code of the failing step.
+ */
+int mca_topo_treematch_dist_graph_create(mca_topo_base_module_t* module,
+                                         ompi_communicator_t *comm_old,
+                                         int n, int nodes[],
+                                         int degrees[], int targets[],
+                                         int weights[],
+                                         struct ompi_info_t *info, int reorder,
+                                         ompi_communicator_t **newcomm);
+/*
+ * ******************************************************************
+ * ************ functions implemented in this module end ************
+ * ******************************************************************
+ */
+
+END_C_DECLS
+
+#endif /* MCA_TOPO_UNTIY_H */
diff --git a/ompi/mca/topo/treematch/topo_treematch_component.c b/ompi/mca/topo/treematch/topo_treematch_component.c
new file mode 100644
index 00000000000..221efd0a7ee
--- /dev/null
+++ b/ompi/mca/topo/treematch/topo_treematch_component.c
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2011-2015 The University of Tennessee and The University
+ *                         of Tennessee Research Foundation. All rights
+ *                         reserved.
+ * Copyright (c) 2011-2015 INRIA. All rights reserved.
+ * Copyright (c) 2011-2015 Université Bordeaux 1 + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include "ompi_config.h" +#include "ompi/mca/topo/treematch/topo_treematch.h" + +/* + * Public string showing the topo treematch module version number + */ +const char *mca_topo_treematch_component_version_string = + "Open MPI treematch topology MCA component version" OMPI_VERSION; + +/* + * Local funtions + */ +static int init_query(bool enable_progress_threads, bool enable_mpi_threads); +static struct mca_topo_base_module_t * +comm_query(const ompi_communicator_t *comm, int *priority, uint32_t type); +static int mca_topo_treematch_component_register(void); + +/* + * Public component structure + */ +mca_topo_treematch_component_2_2_0_t mca_topo_treematch_component = + { + { + { + MCA_TOPO_BASE_VERSION_2_2_0, + + "treematch", + OMPI_MAJOR_VERSION, + OMPI_MINOR_VERSION, + OMPI_RELEASE_VERSION, + NULL, /* component open */ + NULL, /* component close */ + NULL, /* component query */ + mca_topo_treematch_component_register, /* component register */ + }, + + { + /* The component is checkpoint ready */ + MCA_BASE_METADATA_PARAM_CHECKPOINT + }, + + init_query, + comm_query + }, + 0 /* reorder: by default centralized */ + }; + + +static int init_query(bool enable_progress_threads, bool enable_mpi_threads) +{ + if(NULL == opal_hwloc_topology) { + return OPAL_ERR_NOT_SUPPORTED; + } + return OMPI_SUCCESS; +} + + +static struct mca_topo_base_module_t * +comm_query(const ompi_communicator_t *comm, int *priority, uint32_t type) +{ + mca_topo_treematch_module_t *treematch; + + if( OMPI_COMM_DIST_GRAPH != type ) { + return NULL; + } + treematch = OBJ_NEW(mca_topo_treematch_module_t); + if (NULL == treematch) { + return NULL; + } + treematch->super.topo.dist_graph.dist_graph_create = mca_topo_treematch_dist_graph_create; + + /* This component has very low priority -- it's an treematch, after + all! 
*/ + *priority = 42; + treematch->super.type = OMPI_COMM_DIST_GRAPH; + return &(treematch->super); +} + +static int mca_topo_treematch_component_register(void) +{ + (void)mca_base_component_var_register(&mca_topo_treematch_component.super.topoc_version, + "reorder_mode", "If set the reordering will be done in a partially distributed way (default=0). If partially-distributed only local knowledge will be used, possibly leading to less accurate reordering.", MCA_BASE_VAR_TYPE_INT, + NULL, 0, 0, OPAL_INFO_LVL_2, + MCA_BASE_VAR_SCOPE_READONLY, &mca_topo_treematch_component.reorder_mode); + return OMPI_SUCCESS; +} + diff --git a/ompi/mca/topo/treematch/topo_treematch_dist_graph_create.c b/ompi/mca/topo/treematch/topo_treematch_dist_graph_create.c new file mode 100644 index 00000000000..c275053b16b --- /dev/null +++ b/ompi/mca/topo/treematch/topo_treematch_dist_graph_create.c @@ -0,0 +1,907 @@ +/* + * Copyright (c) 2011-2015 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2011-2015 INRIA. All rights reserved. 
+ * Copyright (c) 2012-2015 Bordeaux Poytechnic Institute
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "ompi_config.h"
+
+#include "opal/constants.h"
+#if defined(OPAL_HAVE_HWLOC)
+#include "opal/mca/hwloc/hwloc.h"
+#endif /* defined(OPAL_HAVE_HWLOC) */
+
+#include "ompi/mca/topo/treematch/topo_treematch.h"
+#include "ompi/mca/topo/treematch/treematch/tm_mapping.h"
+#include "ompi/mca/topo/base/base.h"
+
+#include "ompi/communicator/communicator.h"
+#include "ompi/info/info.h"
+
+#include "ompi/mca/pml/pml.h"
+
+#include "opal/mca/dstore/dstore.h"
+
+/*
+ * Free local_pattern and return ERR from the enclosing function.
+ * Expects a variable named local_pattern in scope at the point of use.
+ * NOTE(review): the definition already ends with ';', so "ERR_EXIT(x);"
+ * expands to two statements and cannot be used as the un-braced body of
+ * an if that is followed by an else.
+ */
+#define ERR_EXIT(ERR) \
+    do { free(local_pattern); \
+         return (ERR); } \
+    while(0);
+
+/*
+ * Release nodes_roots, local_procs and the hwloc bitmap `set`, then jump
+ * to the `fallback` label.  Expects those three variables and the label
+ * in scope at the point of use.  The same trailing-';' caveat as for
+ * ERR_EXIT applies.
+ */
+#define FALLBACK() \
+    do { free(nodes_roots); \
+         free(local_procs); \
+         hwloc_bitmap_free(set); \
+         goto fallback; } \
+    while(0);
+
+#define MY_STRING_SIZE 64
+/*#define __DEBUG__ 1 */
+
+
+/*
+ * Find out whether any node runs more processes than it has objects.
+ *
+ * The process whose rank equals local_procs[0] flags its node when
+ * num_objs_in_node < num_procs_in_node.  Rank 0 receives one flag (tag
+ * 111) from nodes_roots[i] for every i in 1..num_nodes-1, adds them to
+ * its own flag and broadcasts the sum over comm_old; every other process
+ * with rank == local_procs[0] sends its flag to rank 0.  nodes_roots is
+ * only dereferenced on rank 0.
+ *
+ * Returns the broadcast sum (0 when no node is oversubscribed), or the
+ * error code of the first failing communication call.
+ *
+ * NOTE(review): error codes and the count share the return value, so a
+ * caller that only tests for non-zero treats a failure as oversubscription.
+ * NOTE(review): the early returns on rank 0 leave reqs and oversub
+ * allocated, and the calloc results are not checked.
+ */
+static int check_oversubscribing(int rank,
+                                 int num_nodes,
+                                 int num_objs_in_node,
+                                 int num_procs_in_node,
+                                 int *nodes_roots,
+                                 int *local_procs,
+                                 ompi_communicator_t *comm_old)
+{
+    int oversubscribed = 0;
+    int local_oversub = 0;
+    int err;
+
+    /* only the first local process of a node evaluates its node */
+    if (rank == local_procs[0])
+        if(num_objs_in_node < num_procs_in_node)
+            local_oversub = 1;
+
+    if (rank == 0) {
+        MPI_Request *reqs = (MPI_Request *)calloc(num_nodes-1, sizeof(MPI_Request));
+        int *oversub = (int *)calloc(num_nodes, sizeof(int));
+        int i;
+
+        /* slot 0 is rank 0's own node; slots 1.. are filled by the receives */
+        oversub[0] = local_oversub;
+        for(i = 1; i < num_nodes; i++)
+            if (OMPI_SUCCESS != ( err = MCA_PML_CALL(irecv(&oversub[i], 1, MPI_INT,
+                                                           nodes_roots[i], 111, comm_old, &reqs[i-1]))))
+                return err;
+
+        if (OMPI_SUCCESS != ( err = ompi_request_wait_all(num_nodes-1,
+                                                          reqs, MPI_STATUSES_IGNORE)))
+            return err;
+
+        for(i = 0; i < num_nodes; i++)
+            oversubscribed += oversub[i];
+
+        free(oversub);
+        free(reqs);
+    } else {
+        if (rank == local_procs[0])
+            if (OMPI_SUCCESS != (err = MCA_PML_CALL(send(&local_oversub, 1, MPI_INT, 0,
+                                                         111, MCA_PML_BASE_SEND_STANDARD, comm_old))))
+                return err;
+    }
+
+    /* every process of comm_old takes part in the broadcast from rank 0 */
+    if (OMPI_SUCCESS != (err = comm_old->c_coll.coll_bcast(&oversubscribed, 1,
+                                                           MPI_INT, 0, comm_old,
+                                                           comm_old->c_coll.coll_bcast_module)))
+        return err;
+
+    return oversubscribed;
+}
+
+int mca_topo_treematch_dist_graph_create(mca_topo_base_module_t* topo_module,
+                                         ompi_communicator_t *comm_old,
+                                         int n, int nodes[],
+                                         int degrees[], int targets[],
+                                         int weights[],
+                                         struct ompi_info_t *info, int reorder,
+                                         ompi_communicator_t **newcomm)
+{
+    int err;
+
+    if (OMPI_SUCCESS != (err = mca_topo_base_dist_graph_distribute(topo_module, comm_old,n,nodes,
+                                                                   degrees,targets,weights,
+                                                                   &(topo_module->mtc.dist_graph))))
+        return err;
+
+    if(!reorder) { /* No reorder. Create a new communicator, then */
+                   /* jump out to attach the dist_graph and return */
+    fallback:
+
+        if( OMPI_SUCCESS == (err = ompi_comm_create(comm_old,
+                                                    comm_old->c_local_group,
+                                                    newcomm))){
+            /* Attach the dist_graph to the newly created communicator */
+            (*newcomm)->c_flags |= OMPI_COMM_DIST_GRAPH;
+            (*newcomm)->c_topo = topo_module;
+            (*newcomm)->c_topo->reorder = reorder;
+        }
+        return err;
+    } else { /* reorder == yes */
+        mca_topo_base_comm_dist_graph_2_2_0_t *topo = NULL;
+        ompi_proc_t *proc = NULL;
+        MPI_Request *reqs = NULL;
+        hwloc_cpuset_t set;
+        hwloc_obj_t object,root_obj;
+        hwloc_obj_t *tracker = NULL;
+        double *local_pattern = NULL;
+        int *vpids, *colors = NULL;
+        int *local_procs = NULL;
+        int *nodes_roots = NULL;
+        int *localrank_to_objnum = NULL;
+        int depth, effective_depth, obj_rank = -1;
+        int num_objs_in_node = 0;
+        int num_pus_in_node = 0;
+        int numlevels = 0;
+        int num_nodes = 0;
+        int num_procs_in_node = 0;
+        int rank, size;
+        int hwloc_err;
+        int oversubscribing_objs = 0;
+        int i, j, idx;
+        uint32_t val, *pval;
+
+        topo = topo_module->mtc.dist_graph;
+        rank = ompi_comm_rank(comm_old);
+        size = ompi_comm_size(comm_old);
+
+#ifdef __DEBUG__
+        fprintf(stdout,"Process rank is : %i\n",rank);
+#endif
+        /* Determine the number of local procs */
+        /* and the number of ext procs */
+        for(i =
0 ; i < size ; i++){ + proc = ompi_group_peer_lookup(comm_old->c_local_group, i); + if (( i == rank ) || + (OPAL_PROC_ON_LOCAL_NODE(proc->super.proc_flags))) + num_procs_in_node++; + } + + /* Get the ranks of the local procs in comm_old */ + local_procs = (int *)malloc(num_procs_in_node * sizeof(int)); + for(i = idx = 0 ; i < size ; i++){ + proc = ompi_group_peer_lookup(comm_old->c_local_group, i); + if (( i == rank ) || + (OPAL_PROC_ON_LOCAL_NODE(proc->super.proc_flags))) + local_procs[idx++] = i; + } + + vpids = (int *)malloc(size * sizeof(int)); + colors = (int *)malloc(size * sizeof(int)); + for(i = 0; i < size ; i++) { + proc = ompi_group_peer_lookup(comm_old->c_local_group, i); + pval = &val; + OPAL_MODEX_RECV_VALUE(err, OPAL_DSTORE_NODEID, &(proc->super), &pval, OPAL_UINT32); + if( OPAL_SUCCESS != err ) { + opal_output(0, "Unable to extract peer %s nodeid from the modex.\n", + OMPI_NAME_PRINT(&(proc->super))); + vpids[i] = colors[i] = -1; + continue; + } + vpids[i] = colors[i] = (int)val; + } + +#ifdef __DEBUG__ + fprintf(stdout,"Process rank (2) is : %i \n",rank); + if ( 0 == rank ){ + fprintf(stdout,"local_procs : "); + for(i = 0; i < num_procs_in_node ; i++) + fprintf(stdout," [%i:%i] ",i,local_procs[i]); + fprintf(stdout,"\n"); + + fprintf(stdout,"Vpids : "); + for(i = 0; i < size ; i++) + fprintf(stdout," [%i:%i] ",i,vpids[i]); + fprintf(stdout,"\n"); + } +#endif + /* clean-up dupes in the array */ + for(i = 0; i < size ; i++) + if ( -1 == vpids[i] ) + continue; + else + for(j = i+1 ; j < size ; j++) + if( vpids[j] != -1 ) + if( vpids[i] == vpids[j] ) + vpids[j] = -1; + /* compute number of nodes */ + for(i = 0; i < size ; i++) + if( vpids[i] != -1 ) + num_nodes++; + /* compute local roots ranks in comm_old */ + /* Only the global root needs to do this */ + if(0 == rank) { + nodes_roots = (int *)calloc(num_nodes,sizeof(int)); + for(i = idx = 0; i < size ; i++) + if( vpids[i] != -1 ) + nodes_roots[idx++] = i; +#ifdef __DEBUG__ + fprintf(stdout,"num nodes 
is %i\n",num_nodes); + fprintf(stdout,"Root nodes are :\n"); + for(i = 0; i < num_nodes ; i++) + fprintf(stdout," [root %i : %i] ",i,nodes_roots[i]); + fprintf(stdout,"\n"); +#endif + } + free(vpids); + + /* Then, we need to know if the processes are bound */ + /* We make the hypothesis that all processes are in */ + /* the same state : all bound or none bound */ + hwloc_err = hwloc_topology_init(&opal_hwloc_topology); + if (-1 == hwloc_err) goto fallback; + hwloc_err = hwloc_topology_load(opal_hwloc_topology); + if (-1 == hwloc_err) goto fallback; + root_obj = hwloc_get_root_obj(opal_hwloc_topology); + if (NULL == root_obj) goto fallback; + + /* if cpubind returns an error, it will be full anyway */ + set = hwloc_bitmap_alloc_full(); + hwloc_get_cpubind(opal_hwloc_topology,set,0); + num_pus_in_node = hwloc_get_nbobjs_by_type(opal_hwloc_topology, HWLOC_OBJ_PU); + + if(hwloc_bitmap_isincluded(root_obj->cpuset,set)){ + /* processes are not bound on the machine */ +#ifdef __DEBUG__ + if (0 == rank) + fprintf(stdout,">>>>>>>>>>>>> Process Not bound <<<<<<<<<<<<<<<\n"); +#endif /* __DEBUG__ */ + + /* we try to bind to cores or above objects if enough are present */ + /* Not sure that cores are present in ALL nodes */ + depth = hwloc_get_type_or_above_depth(opal_hwloc_topology,HWLOC_OBJ_CORE); + num_objs_in_node = hwloc_get_nbobjs_by_depth(opal_hwloc_topology,depth); + + /* Check for oversubscribing */ + oversubscribing_objs = check_oversubscribing(rank,num_nodes, + num_objs_in_node,num_procs_in_node, + nodes_roots,local_procs,comm_old); + if(oversubscribing_objs) { +#ifdef __DEBUG__ + fprintf(stdout,"Oversubscribing OBJ/CORES resources => Trying to use PUs \n"); +#endif + int oversubscribed_pus = check_oversubscribing(rank,num_nodes, + num_pus_in_node,num_procs_in_node, + nodes_roots,local_procs,comm_old); + if (oversubscribed_pus){ +#ifdef __DEBUG__ + fprintf(stdout,"Oversubscribing PUs resources => Rank Reordering Impossible \n"); +#endif + FALLBACK(); + } else { + 
obj_rank = ompi_process_info.my_local_rank%num_pus_in_node; + effective_depth = hwloc_topology_get_depth(opal_hwloc_topology) - 1; + num_objs_in_node = num_pus_in_node; +#ifdef __DEBUG__ + fprintf(stdout,"Process not bound : binding on PU#%i \n",obj_rank); +#endif + } + } else { + obj_rank = ompi_process_info.my_local_rank%num_objs_in_node; + effective_depth = depth; + object = hwloc_get_obj_by_depth(opal_hwloc_topology,effective_depth,obj_rank); + if( NULL == object) FALLBACK(); + + hwloc_bitmap_copy(set,object->cpuset); + hwloc_bitmap_singlify(set); /* we don't want the process to move */ + hwloc_err = hwloc_set_cpubind(opal_hwloc_topology,set,0); + if( -1 == hwloc_err) FALLBACK(); +#ifdef __DEBUG__ + fprintf(stdout,"Process not bound : binding on OBJ#%i \n",obj_rank); +#endif + } + } else { /* the processes are already bound */ + object = hwloc_get_obj_covering_cpuset(opal_hwloc_topology,set); + obj_rank = object->logical_index; + effective_depth = object->depth; + num_objs_in_node = hwloc_get_nbobjs_by_depth(opal_hwloc_topology, effective_depth); + + /* Check for oversubscribing */ + oversubscribing_objs = check_oversubscribing(rank,num_nodes, + num_objs_in_node,num_procs_in_node, + nodes_roots,local_procs,comm_old); + if(oversubscribing_objs) { +#ifdef __DEBUG__ + fprintf(stdout,"Oversubscribing OBJ/CORES resources => Rank Reordering Impossible\n"); +#endif + FALLBACK(); + } +#ifdef __DEBUG__ + fprintf(stdout,"Process %i bound on OBJ #%i \n",rank,obj_rank); + fprintf(stdout,"=====> Num obj in node : %i | num pus in node : %i\n",num_objs_in_node,num_pus_in_node); +#endif + } + + reqs = (MPI_Request *)calloc(num_procs_in_node-1,sizeof(MPI_Request)); + if( rank == local_procs[0] ) { + /* we need to find the right elements of the hierarchy */ + /* and remove the unneeded elements */ + /* Only local masters need to do this */ + int array_size = effective_depth + 1; + int *myhierarchy = (int *)calloc(array_size,sizeof(int)); + + for (i = 0; i < array_size ; i++) + 
myhierarchy[i] = hwloc_get_nbobjs_by_depth(opal_hwloc_topology,i); + + numlevels = 1; + for (i = 1; i < array_size; i++) + if ((myhierarchy[i] != 0) && (myhierarchy[i] != myhierarchy[i-1])) + numlevels++; + + tracker = (hwloc_obj_t *)calloc(numlevels,sizeof(hwloc_obj_t)); + idx = 0; + tracker[idx++] = root_obj; + i = 1; + while (i < array_size){ + if ( myhierarchy[i] != myhierarchy[i-1]) { + j = i; + while(myhierarchy[j] == myhierarchy[i]) + if (++j > effective_depth) + break; + tracker[idx++] = hwloc_get_obj_by_depth(opal_hwloc_topology,j-1,0); + i = j; + } else i++; + } + free(myhierarchy); + +#ifdef __DEBUG__ + fprintf(stdout,">>>>>>>>>>>>>>>>>>>>> Effective depth is : %i (total depth %i)| num_levels %i\n", + effective_depth,hwloc_topology_get_depth(opal_hwloc_topology),numlevels); + for(i = 0 ; i < numlevels ; i++) + fprintf(stdout,"tracker[%i] : arity %i | depth %i\n",i,tracker[i]->arity,tracker[i]->depth); +#endif + /* get the obj number */ + localrank_to_objnum = (int *)calloc(num_procs_in_node,sizeof(int)); + localrank_to_objnum[0] = obj_rank; + + for(i = 1; i < num_procs_in_node; i++) { + if (OMPI_SUCCESS != ( err = MCA_PML_CALL(irecv(&localrank_to_objnum[i],1,MPI_INT, + local_procs[i],111, comm_old,&reqs[i-1])))) + return err; + } + if (OMPI_SUCCESS != ( err = ompi_request_wait_all(num_procs_in_node-1, + reqs,MPI_STATUSES_IGNORE))) + return err; + } else { + /* sending my core number to my local master on the node */ + if (OMPI_SUCCESS != (err = MCA_PML_CALL(send(&obj_rank, 1, MPI_INT, local_procs[0], + 111, MCA_PML_BASE_SEND_STANDARD, comm_old)))) + return err; + } + free(reqs); + + /* Centralized Reordering */ + if (0 == mca_topo_treematch_component.reorder_mode) { + int *k = NULL; + int *obj_mapping = NULL; + int newrank = -1; + int num_objs_total = 0; + + /* Gather comm pattern + * If weights have been provided take them in account. Otherwise rely + * solely on HWLOC information. 
+ */ + if(0 == rank) { + + fprintf(stderr,"========== Centralized Reordering ========= \n"); + + local_pattern = (double *)calloc(size*size,sizeof(double)); + if( true == topo->weighted ) { + for(i = 0; i < topo->indegree ; i++) + local_pattern[topo->in[i]] += topo->inw[i]; + for(i = 0; i < topo->outdegree ; i++) + local_pattern[topo->out[i]] += topo->outw[i]; + if (OMPI_SUCCESS != (err = comm_old->c_coll.coll_gather(MPI_IN_PLACE, size, MPI_DOUBLE, + local_pattern, size, MPI_DOUBLE, + 0, comm_old, + comm_old->c_coll.coll_gather_module))) + return err; + } + } else { + local_pattern = (double *)calloc(size,sizeof(double)); + if( true == topo->weighted ) { + for(i = 0; i < topo->indegree ; i++) + local_pattern[topo->in[i]] += topo->inw[i]; + for(i = 0; i < topo->outdegree ; i++) + local_pattern[topo->out[i]] += topo->outw[i]; + if (OMPI_SUCCESS != (err = comm_old->c_coll.coll_gather(local_pattern, size, MPI_DOUBLE, + NULL,0,0, + 0, comm_old, + comm_old->c_coll.coll_gather_module))) + return err; + } + } + + if( rank == local_procs[0]) { + tm_topology_t *tm_topology = NULL; + tm_topology_t *tm_opt_topology = NULL; + int *obj_to_rank_in_comm = NULL; + int *hierarchies = NULL; + int hierarchy[MAX_LEVELS+1]; + int min; + + /* create a table that derives the rank in comm_old from the object number */ + obj_to_rank_in_comm = (int *)malloc(num_objs_in_node*sizeof(int)); + for(i = 0 ; i < num_objs_in_node ; i++) + obj_to_rank_in_comm[i] = -1; + for(i = 0 ; i < num_objs_in_node ; i++) { + object = hwloc_get_obj_by_depth(opal_hwloc_topology,effective_depth,i); + for( j = 0; j < num_procs_in_node ; j++ ) + if(localrank_to_objnum[j] == (int)(object->logical_index)) + break; + if(j == num_procs_in_node) + obj_to_rank_in_comm[i] = -1; + else { + int k; + for(k = 0; k < size ; k++) + if (k == local_procs[j]) + break; + obj_to_rank_in_comm[i] = k; + } + } + + /* the global master gathers info from local_masters */ + if ( 0 == rank ) { + if ( num_nodes > 1 ) { + int *objs_per_node = 
NULL ; + int *displs = NULL; + + objs_per_node = (int *)calloc(num_nodes,sizeof(int)); + reqs = (MPI_Request *)calloc(num_nodes-1,sizeof(MPI_Request)); + objs_per_node[0] = num_objs_in_node; + for(i = 1; i < num_nodes ; i++) + if (OMPI_SUCCESS != ( err = MCA_PML_CALL(irecv(objs_per_node + i, 1, MPI_INT, + nodes_roots[i],111,comm_old,&reqs[i-1])))) + ERR_EXIT(err); + + if (OMPI_SUCCESS != ( err = ompi_request_wait_all(num_nodes - 1, + reqs,MPI_STATUSES_IGNORE))) + ERR_EXIT(err); + + for(i = 0; i < num_nodes; i++) + num_objs_total += objs_per_node[i]; + obj_mapping = (int *)calloc(num_objs_total,sizeof(int)); + displs = (int *)calloc(num_objs_total,sizeof(int)); + displs[0] = 0; + for(i = 1; i < num_nodes ; i++) + displs[i] = displs[i-1] + objs_per_node[i]; + + memset(reqs,0,(num_nodes-1)*sizeof(MPI_Request)); + memcpy(obj_mapping,obj_to_rank_in_comm,objs_per_node[0]*sizeof(int)); + for(i = 1; i < num_nodes ; i++) + if (OMPI_SUCCESS != ( err = MCA_PML_CALL(irecv(obj_mapping + displs[i], objs_per_node[i], MPI_INT, + nodes_roots[i],111,comm_old,&reqs[i-1])))) + ERR_EXIT(err); + if (OMPI_SUCCESS != ( err = ompi_request_wait_all(num_nodes - 1, + reqs,MPI_STATUSES_IGNORE))) + ERR_EXIT(err); + free(displs); + free(objs_per_node); + } else { + /* if num_nodes == 1, then it's easy to get the obj mapping */ + num_objs_total = num_objs_in_node; + obj_mapping = (int *)calloc(num_objs_total,sizeof(int)); + memcpy(obj_mapping,obj_to_rank_in_comm,num_objs_total*sizeof(int)); + } + +#ifdef __DEBUG__ + fprintf(stdout,"Obj mapping : "); + for(i = 0 ; i < num_objs_total ; i++) + fprintf(stdout," [%i:%i] ",i,obj_mapping[i]); + fprintf(stdout,"\n"); +#endif + } else { + if ( num_nodes > 1 ) { + if (OMPI_SUCCESS != (err = MCA_PML_CALL(send(&num_objs_in_node, 1, MPI_INT, + 0, 111, MCA_PML_BASE_SEND_STANDARD, comm_old)))) + ERR_EXIT(err); + if (OMPI_SUCCESS != (err = MCA_PML_CALL(send(obj_to_rank_in_comm, num_objs_in_node, MPI_INT, + 0, 111, MCA_PML_BASE_SEND_STANDARD, comm_old)))) + 
ERR_EXIT(err); + } + } + + free(obj_to_rank_in_comm); + + for(i = 0 ; i < (MAX_LEVELS+1) ; i++) + hierarchy[i] = -1; + hierarchy[0] = numlevels; + + assert(numlevels < MAX_LEVELS); + + for(i = 0 ; i < hierarchy[0] ; i++) + hierarchy[i+1] = tracker[i]->arity; + + if( 0 == rank ) { + hierarchies = (int *)malloc(num_nodes*(MAX_LEVELS+1)*sizeof(int)); + for(i = 0 ; i < num_nodes*(MAX_LEVELS+1) ; i++) + hierarchies[i] = -1; + } + + /* gather hierarchies iff more than 1 node! */ + if ( num_nodes > 1 ) { + if(rank != 0) { + if (OMPI_SUCCESS != (err = MCA_PML_CALL(send(hierarchy,(MAX_LEVELS+1), MPI_INT, 0, + 111, MCA_PML_BASE_SEND_STANDARD, comm_old)))) + ERR_EXIT(err); + } else { + memset(reqs,0,(num_nodes-1)*sizeof(MPI_Request)); + for(i = 1; i < num_nodes ; i++) + if (OMPI_SUCCESS != ( err = MCA_PML_CALL(irecv(hierarchies+i*(MAX_LEVELS+1),(MAX_LEVELS+1),MPI_INT, + nodes_roots[i],111,comm_old,&reqs[i-1])))){ + free(hierarchies); + ERR_EXIT(err); + } + if (OMPI_SUCCESS != ( err = ompi_request_wait_all(num_nodes - 1, + reqs,MPI_STATUSES_IGNORE))) { + free(hierarchies); + ERR_EXIT(err); + } + free(reqs); + } + } + + if ( 0 == rank ) { + tree_t *comm_tree = NULL; + double **comm_pattern = NULL; + int *matching = NULL; + + memcpy(hierarchies,hierarchy,(MAX_LEVELS+1)*sizeof(int)); +#ifdef __DEBUG__ + fprintf(stdout,"hierarchies : "); + for(i = 0 ; i < num_nodes*(MAX_LEVELS+1) ; i++) + fprintf(stdout," [%i] ",hierarchies[i]); + fprintf(stdout,"\n"); +#endif + tm_topology = (tm_topology_t *)malloc(sizeof(tm_topology_t)); + tm_topology->nb_levels = hierarchies[0]; + + /* extract min depth */ + for(i = 1 ; i < num_nodes ; i++) + if (hierarchies[i*(MAX_LEVELS+1)] < tm_topology->nb_levels) + tm_topology->nb_levels = hierarchies[i*(MAX_LEVELS+1)]; + /* Crush levels in hierarchies too long (ie > tm_topology->nb_levels)*/ + for(i = 0; i < num_nodes ; i++) { + int *base_ptr = hierarchies + i*(MAX_LEVELS+1) ; + int suppl = *base_ptr - tm_topology->nb_levels; + for(j = 1 ; j <= suppl ; 
j++) + *(base_ptr + tm_topology->nb_levels) *= *(base_ptr + tm_topology->nb_levels + j); + } + if( num_nodes > 1){ + /* We aggregate all topos => +1 level!*/ + tm_topology->nb_levels += 1; + tm_topology->arity = (int *)calloc(tm_topology->nb_levels,sizeof(int)); + tm_topology->arity[0] = num_nodes; + for(i = 0; i < (tm_topology->nb_levels - 1); i++) { + min = *(hierarchies + 1 + i); + for(j = 1; j < num_nodes ; j++) + if( hierarchies[j*(MAX_LEVELS+1) + 1 + i] < min) + min = hierarchies[j*(MAX_LEVELS+1) + 1 + i]; + tm_topology->arity[i+1] = min; + } + }else{ + tm_topology->arity = (int *)calloc(tm_topology->nb_levels,sizeof(int)); + for(i = 0; i < tm_topology->nb_levels; i++) + tm_topology->arity[i] = hierarchies[i+1]; + } + free(hierarchies); + + /* compute the number of processing elements */ + tm_topology->nb_nodes = (int *)calloc(tm_topology->nb_levels,sizeof(int)); + tm_topology->nb_nodes[0] = 1; + for(i = 1 ; i < tm_topology->nb_levels; i++) + tm_topology->nb_nodes[i] = tm_topology->nb_nodes[i-1]*tm_topology->arity[i-1]; + + comm_pattern = (double **)malloc(size*sizeof(double *)); + for(i = 0 ; i < size ; i++) + comm_pattern[i] = local_pattern + i*size; + /* matrix needs to be symmetric */ + for( i = 0 ; i < size ; i++) + for(j = i ; j < size ; j++) { + comm_pattern[i][j] += comm_pattern[j][i]; + comm_pattern[j][i] = comm_pattern[i][j]; + } + for( i = 0 ; i < size ; i++) + for(j = 0 ; j < size ; j++) + comm_pattern[i][j] /= 2; +#ifdef __DEBUG__ + fprintf(stdout,"==== COMM PATTERN ====\n"); + for( i = 0 ; i < size ; i++){ + for(j = 0 ; j < size ; j++) + fprintf(stdout," %f ",comm_pattern[i][j]); + fprintf(stdout,"\n"); + } +#endif + /* Build process id tab */ + tm_topology->node_id = (int **)calloc(tm_topology->nb_levels,sizeof(int*)); + for(i = 0; i < tm_topology->nb_levels ; i++) { + tm_topology->node_id[i] = (int *)calloc(tm_topology->nb_nodes[i],sizeof(int)); + for (j = 0; j < tm_topology->nb_nodes[i] ; j++) + tm_topology->node_id[i][j] = obj_mapping[j]; + 
} + +#ifdef __DEBUG__ + for(i = 0; i < tm_topology->nb_levels ; i++) { + fprintf(stdout,"tm topo node_id for level [%i] : ",i); + for(j = 0 ; j < tm_topology->nb_nodes[i] ; j++) + fprintf(stdout," [%i:%i] ",j,obj_mapping[j]); + fprintf(stdout,"\n"); + } + display_topology(tm_topology); +#endif + k = (int *)calloc(num_objs_total,sizeof(int)); + matching = (int *)calloc(size,sizeof(int)); + + tm_opt_topology = optimize_topology(tm_topology); + comm_tree = build_tree_from_topology(tm_opt_topology,comm_pattern,size,NULL,NULL); + map_topology_simple(tm_opt_topology,comm_tree,matching,size,k); + +#ifdef __DEBUG__ + + fprintf(stdout,"====> nb levels : %i\n",tm_topology->nb_levels); + fprintf(stdout,"Rank permutation sigma/k : "); + for(i = 0 ; i < num_objs_total ; i++) + fprintf(stdout," [%i:%i] ",i,k[i]); + fprintf(stdout,"\n"); + + fprintf(stdout,"Matching : "); + for(i = 0 ; i < size ; i++) + fprintf(stdout," [%i:%i] ",i,matching[i]); + fprintf(stdout,"\n"); +#endif + free(comm_pattern); + free(comm_tree); + free(matching); + free(obj_mapping); + for(i = 0 ; i < tm_topology->nb_levels ; i++) + free(tm_topology->node_id[i]); + free(tm_topology->node_id); + free(tm_topology->nb_nodes); + free(tm_topology->arity); + free(tm_topology); + FREE_topology(tm_opt_topology); + } + } + + /* Todo : Bcast + group creation */ + /* scatter the ranks */ + if (OMPI_SUCCESS != (err = comm_old->c_coll.coll_scatter(k, 1, MPI_INT, + &newrank, 1, MPI_INT, + 0, comm_old,comm_old->c_coll.coll_scatter_module))) + ERR_EXIT(err); + + if ( 0 == rank ) + free(k); + + /* this needs to be optimized but will do for now */ + if (OMPI_SUCCESS != (err = ompi_comm_split(comm_old, 0, newrank,newcomm, false))) + ERR_EXIT(err); + /* end of TODO */ + + /* Attach the dist_graph to the newly created communicator */ + (*newcomm)->c_flags |= OMPI_COMM_DIST_GRAPH; + (*newcomm)->c_topo = topo_module; + (*newcomm)->c_topo->reorder = reorder; + } else { /* partially distributed reordering */ + ompi_communicator_t 
*localcomm = NULL; + int *matching = (int *)calloc(num_procs_in_node,sizeof(int)); + int *lrank_to_grank = (int *)calloc(num_procs_in_node,sizeof(int)); + int *grank_to_lrank = (int *)calloc(size,sizeof(int)); + hwloc_obj_t object; + opal_hwloc_locality_t locality; + char set_as_string[64]; + opal_value_t kv; + + if (OMPI_SUCCESS != (err = ompi_comm_split(comm_old,colors[rank],ompi_process_info.my_local_rank,&localcomm, false))) + return err; + + for(i = 0 ; i < num_procs_in_node ; i++) + lrank_to_grank[i] = -1; + lrank_to_grank[ompi_process_info.my_local_rank] = rank; + + for(i = 0 ; i < size ; i++) + grank_to_lrank[i] = -1; + + if (OMPI_SUCCESS != (err = localcomm->c_coll.coll_allgather(&rank,1,MPI_INT, + lrank_to_grank,1,MPI_INT, + localcomm, + localcomm->c_coll.coll_allgather_module))) + return err; + + for(i = 0 ; i < num_procs_in_node ; i++) + grank_to_lrank[lrank_to_grank[i]] = i; + + if (rank == local_procs[0]){ + tm_topology_t *tm_topology = NULL; + tm_topology_t *tm_opt_topology = NULL; + tree_t *comm_tree = NULL; + double **comm_pattern = NULL; + +#ifdef __DEBUG__ + fprintf(stderr,"========== Partially Distributed Reordering ========= \n"); +#endif + + local_pattern = (double *)calloc(num_procs_in_node*num_procs_in_node,sizeof(double)); + for(i = 0 ; i < num_procs_in_node*num_procs_in_node ; i++) + local_pattern[i] = 0.0; + + if( true == topo->weighted ) { + for(i = 0; i < topo->indegree ; i++) + if (grank_to_lrank[topo->in[i]] != -1) + local_pattern[grank_to_lrank[topo->in[i]]] += topo->inw[i]; + for(i = 0; i < topo->outdegree ; i++) + if (grank_to_lrank[topo->out[i]] != -1) + local_pattern[grank_to_lrank[topo->out[i]]] += topo->outw[i]; + if (OMPI_SUCCESS != (err = localcomm->c_coll.coll_gather(MPI_IN_PLACE, num_procs_in_node, MPI_DOUBLE, + local_pattern, num_procs_in_node, MPI_DOUBLE, + 0,localcomm, + localcomm->c_coll.coll_gather_module))) + ERR_EXIT(err); + } + + comm_pattern = (double **)malloc(num_procs_in_node*sizeof(double *)); + for(i = 0 ; i < 
num_procs_in_node ; i++){ + comm_pattern[i] = (double *)calloc(num_procs_in_node,sizeof(double)); + memcpy((void *)comm_pattern[i],(void *)(local_pattern + i*num_procs_in_node),num_procs_in_node*sizeof(double)); + } + /* Matrix needs to be symmetric */ + for( i = 0 ; i < num_procs_in_node ; i++) + for(j = i ; j < num_procs_in_node ; j++){ + comm_pattern[i][j] += comm_pattern[j][i]; + comm_pattern[j][i] = comm_pattern[i][j]; + } + for( i = 0 ; i < num_procs_in_node ; i++) + for(j = 0 ; j < num_procs_in_node ; j++) + comm_pattern[i][j] /= 2; + +#ifdef __DEBUG__ + fprintf(stdout,"========== COMM PATTERN ============= \n"); + for(i = 0 ; i < num_procs_in_node ; i++){ + fprintf(stdout," %i : ",i); + for(j = 0; j < num_procs_in_node ; j++) + fprintf(stdout," %f ",comm_pattern[i][j]); + fprintf(stdout,"\n"); + } + fprintf(stdout,"======================= \n"); +#endif + + tm_topology = (tm_topology_t *)malloc(sizeof(tm_topology_t)); + tm_topology->nb_levels = numlevels; + tm_topology->arity = (int *)calloc(tm_topology->nb_levels,sizeof(int)); + tm_topology->nb_nodes = (int *)calloc(tm_topology->nb_levels,sizeof(int)); + tm_topology->node_id = (int **)malloc(tm_topology->nb_levels*sizeof(int *)); + for(i = 0 ; i < tm_topology->nb_levels ; i++){ + int nb_objs = hwloc_get_nbobjs_by_depth(opal_hwloc_topology,tracker[i]->depth); + tm_topology->nb_nodes[i] = nb_objs; + tm_topology->node_id[i] = (int*)malloc(sizeof(int)*nb_objs); + tm_topology->arity[i] = tracker[i]->arity; + for(j = 0 ; j < nb_objs ; j++) + tm_topology->node_id[i][j] = -1; + for(j = 0 ; j < nb_objs ; j++) + if ( j < num_procs_in_node ) + tm_topology->node_id[i][j] = localrank_to_objnum[j]; + } + +#ifdef __DEBUG__ + fprintf(stdout,"Levels in topo : %i | num procs in node : %i\n",tm_topology->nb_levels,num_procs_in_node); + for(i = 0; i < tm_topology->nb_levels ; i++){ + fprintf(stdout,"Nb objs for level %i : %i | arity %i\n ",i,tm_topology->nb_nodes[i],tm_topology->arity[i]); + for(j = 0; j < 
tm_topology->nb_nodes[i] ; j++) + fprintf(stdout,"Obj id : %i |",tm_topology->node_id[i][j]); + fprintf(stdout,"\n"); + } + display_topology(tm_topology); +#endif + + tm_opt_topology = optimize_topology(tm_topology); + comm_tree = build_tree_from_topology(tm_opt_topology,comm_pattern,num_procs_in_node,NULL,NULL); + map_topology_simple(tm_opt_topology,comm_tree,matching,num_procs_in_node,NULL); + +#ifdef __DEBUG__ + + fprintf(stdout,"Matching :"); + for(i = 0 ; i < num_procs_in_node ; i++) + fprintf(stdout," %i ",matching[i]); + fprintf(stdout,"\n"); +#endif + for(i = 0 ; i < num_procs_in_node ; i++) + free(comm_pattern[i]); + free(comm_pattern); + for(i = 0; i < tm_topology->nb_levels ; i++) + free(tm_topology->node_id[i]); + free(tm_topology->node_id); + free(tm_topology->nb_nodes); + free(tm_topology->arity); + free(tm_topology); + FREE_topology(tm_opt_topology); + } else { + local_pattern = (double *)calloc(num_procs_in_node,sizeof(double)); + for(i = 0 ; i < num_procs_in_node ; i++) + local_pattern[i] = 0.0; + + if( true == topo->weighted ) { + for(i = 0; i < topo->indegree ; i++) + if (grank_to_lrank[topo->in[i]] != -1) + local_pattern[grank_to_lrank[topo->in[i]]] += topo->inw[i]; + for(i = 0; i < topo->outdegree ; i++) + if (grank_to_lrank[topo->out[i]] != -1) + local_pattern[grank_to_lrank[topo->out[i]]] += topo->outw[i]; + if (OMPI_SUCCESS != (err = localcomm->c_coll.coll_gather(local_pattern, num_procs_in_node, MPI_DOUBLE, + NULL,0,0, + 0,localcomm, + localcomm->c_coll.coll_gather_module))) + ERR_EXIT(err); + } + } + + if (OMPI_SUCCESS != (err = localcomm->c_coll.coll_bcast(matching, num_procs_in_node, + MPI_INT,0,localcomm, + localcomm->c_coll.coll_bcast_module))) + ERR_EXIT(err); + + object = hwloc_get_obj_by_depth(opal_hwloc_topology, + effective_depth,matching[ompi_process_info.my_local_rank]); + if( NULL == object) goto fallback; + hwloc_bitmap_copy(set,object->cpuset); + hwloc_bitmap_singlify(set); + hwloc_err = 
hwloc_set_cpubind(opal_hwloc_topology,set,0); + if( -1 == hwloc_err) goto fallback; + + /* Report new binding to ORTE/OPAL */ + /* hwloc_bitmap_list_asprintf(&orte_process_info.cpuset,set); */ + err = hwloc_bitmap_snprintf (set_as_string,64,set); + +#ifdef __DEBUG__ + fprintf(stdout,"Bitmap str size : %i\n",err); +#endif + + OBJ_CONSTRUCT(&kv, opal_value_t); + kv.key = strdup(OPAL_DSTORE_CPUSET); + kv.type = OPAL_STRING; + kv.data.string = strdup(set_as_string); + + (void)opal_dstore.store(opal_dstore_internal, (opal_process_name_t*)ORTE_PROC_MY_NAME, &kv); + OBJ_DESTRUCT(&kv); + + locality = opal_hwloc_base_get_relative_locality(opal_hwloc_topology, + orte_process_info.cpuset,set_as_string); + OBJ_CONSTRUCT(&kv, opal_value_t); + kv.key = strdup(OPAL_DSTORE_LOCALITY); + kv.type = OPAL_UINT16; + kv.data.uint16 = locality; + (void)opal_dstore.store(opal_dstore_internal, (opal_process_name_t*)ORTE_PROC_MY_NAME, &kv); + OBJ_DESTRUCT(&kv); + + if( OMPI_SUCCESS != (err = ompi_comm_create(comm_old, + comm_old->c_local_group, + newcomm))){ + ERR_EXIT(err); + } else { + /* Attach the dist_graph to the newly created communicator */ + (*newcomm)->c_flags |= OMPI_COMM_DIST_GRAPH; + (*newcomm)->c_topo = topo_module; + (*newcomm)->c_topo->reorder = reorder; + } + free(matching); + free(grank_to_lrank); + free(lrank_to_grank); + } /* distributed reordering end */ + + if(rank == local_procs[0]) + free(tracker); + free(nodes_roots); + free(local_procs); + free(local_pattern); + free(localrank_to_objnum); + free(colors); + hwloc_bitmap_free(set); + } /* reorder == yes */ + return err; +} diff --git a/ompi/mca/topo/treematch/topo_treematch_module.c b/ompi/mca/topo/treematch/topo_treematch_module.c new file mode 100644 index 00000000000..55999ae0bbd --- /dev/null +++ b/ompi/mca/topo/treematch/topo_treematch_module.c @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2011-2015 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. 
+ * Copyright (c) 2011-2015 INRIA. All rights reserved.
+ * Copyright (c) 2011-2015 Université Bordeaux 1
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+#include "ompi_config.h"
+
+#include <string.h>
+
+#include "mpi.h"
+#include "ompi/communicator/communicator.h"
+#include "ompi/mca/topo/topo.h"
+#include "ompi/mca/topo/base/base.h"
+#include "ompi/mca/topo/treematch/topo_treematch.h"
+
+/*
+ * Local functions
+ */
+static void treematch_module_constructor(mca_topo_treematch_module_t *u);
+static void treematch_module_destructor(mca_topo_treematch_module_t *u);
+
+OBJ_CLASS_INSTANCE(mca_topo_treematch_module_t, mca_topo_base_module_t,
+                   treematch_module_constructor, treematch_module_destructor);
+
+/* Constructor: zero the topo member of the embedded base module (u->super). */
+static void treematch_module_constructor(mca_topo_treematch_module_t *u)
+{
+    mca_topo_base_module_t *m = &(u->super);
+
+    memset(&m->topo, 0, sizeof(m->topo));   /* needs <string.h> */
+}
+
+/* Destructor: currently a no-op; the constructor above allocates nothing. */
+static void treematch_module_destructor(mca_topo_treematch_module_t *u)
+{
+    /* Do whatever is necessary to clean up / destroy the module */
+}
diff --git a/ompi/mca/topo/treematch/treematch/COPYING b/ompi/mca/topo/treematch/treematch/COPYING
new file mode 100644
index 00000000000..af43e1da03e
--- /dev/null
+++ b/ompi/mca/topo/treematch/treematch/COPYING
@@ -0,0 +1,8 @@
+Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+
+ 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+ 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
+ 3. Neither the name of Inria nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
diff --git a/ompi/mca/topo/treematch/treematch/IntConstantInitializedVector.c b/ompi/mca/topo/treematch/treematch/IntConstantInitializedVector.c
new file mode 100644
index 00000000000..00ee56a1610
--- /dev/null
+++ b/ompi/mca/topo/treematch/treematch/IntConstantInitializedVector.c
@@ -0,0 +1,99 @@
+#include <stdlib.h>
+#include <stdio.h>   /* NOTE(review): header name was lost in extraction; confirm */
+#include "IntConstantInitializedVector.h"
+
+/*
+ * Integer vector whose slots all read as init_value until first written,
+ * without the storage ever being cleared.  Slot i counts as written only
+ * when from[i] indexes one of the first `top` entries of to[] and that
+ * entry points back at i, so whatever malloc() left in from[] can never
+ * pass the test by accident.
+ */
+
+/* Return 1 if slot i was written by intCIV_set(), 0 otherwise.  A NULL
+ * vector or an index outside [0, size) is reported as "not written". */
+int intCIV_isInitialized(int_CIVector * v, int i)
+{
+    if(v == NULL || i < 0 || i >= v->size)
+        return 0;
+    if(v->top == 0)
+        return 0;
+    if(v->from[i] >= 0 && v->from[i] < v->top && v->to[v->from[i]] == i)
+        return 1;
+    return 0;
+}
+
+/* Set v up to hold `size` ints that all read as init_value.  The arrays are
+ * left uninitialised on purpose.  If size is not positive or an allocation
+ * fails, v is left empty (size 0): intCIV_set() and intCIV_get() then
+ * return -1 instead of dereferencing a NULL array. */
+void intCIV_init(int_CIVector * v, int size, int init_value)
+{
+    if(v == NULL)
+        return;
+    v->init_value = init_value;
+    v->size = 0;
+    v->top = 0;
+    v->to = NULL;
+    v->from = NULL;
+    v->vec = NULL;
+    if(size <= 0)
+        return;
+    v->to = malloc(sizeof(int)*(size_t)size);
+    v->from = malloc(sizeof(int)*(size_t)size);
+    v->vec = malloc(sizeof(int)*(size_t)size);
+    if(v->to == NULL || v->from == NULL || v->vec == NULL){
+        intCIV_exit(v);
+        return;
+    }
+    v->size = size;
+}
+
+/* Release the arrays owned by v and leave it empty, so a second call (or a
+ * later intCIV_set()/intCIV_get()) is harmless. */
+void intCIV_exit(int_CIVector * v)
+{
+    if(v == NULL)
+        return;
+    free(v->to);
+    free(v->from);
+    free(v->vec);
+    v->to = NULL;
+    v->from = NULL;
+    v->vec = NULL;
+    v->size = 0;
+    v->top = 0;
+}
+
+/* Store val in slot i.  Returns 0 on success, -1 if v is NULL or i is out
+ * of range.  The first write to a slot records it in from[]/to[]. */
+int intCIV_set(int_CIVector * v, int i, int val)
+{
+    if(v == NULL)
+        return -1;
+    if(i < 0 || i >= v->size)
+        return -1;
+    if(!intCIV_isInitialized(v,i))
+    {
+        v->from[i] = v->top;
+        v->to[v->top] = i;
+        v->top++;
+    }
+    v->vec[i] = val;
+    return 0;
+}
+
+/* Return the value of slot i, or init_value if it was never written.
+ * Returns -1 if v is NULL or i is out of range; callers cannot tell that
+ * apart from a stored -1. */
+int intCIV_get(int_CIVector * v, int i)
+{
+    if(v == NULL)
+        return -1;
+    if(i < 0 || i >= v->size)
+        return -1;
+    if(intCIV_isInitialized(v,i))
+        return v->vec[i];
+    return v->init_value;
+}
+
diff --git a/ompi/mca/topo/treematch/treematch/IntConstantInitializedVector.h b/ompi/mca/topo/treematch/treematch/IntConstantInitializedVector.h
new file mode 100644
index 00000000000..1b237b1b0ee
--- /dev/null
+++ b/ompi/mca/topo/treematch/treematch/IntConstantInitializedVector.h
@@ -0,0 +1,19 @@
+#ifndef INTEGER_CONSTANT_INITIALIZED_VECTOR
+#define INTEGER_CONSTANT_INITIALIZED_VECTOR
+
+/* Vector of `size` ints that read as init_value until first written.
+ * `top` counts the slots written so far; `to` and `from` record which
+ * slots those are, and `vec` holds the values. */
+typedef struct int_CIVector_
+{
+    int init_value, size, top, *to, *from, *vec;
+} int_CIVector;
+
+int intCIV_isInitialized(int_CIVector * v, int i);
+void intCIV_init(int_CIVector * v, int size, int init_value);
+void intCIV_exit(int_CIVector * v);
+int intCIV_set(int_CIVector * v, int i, int val);
+int intCIV_get(int_CIVector * v, int i);
+
+
+#endif /*INTEGER_CONSTANT_INITIALIZED_VECTOR*/
diff --git a/ompi/mca/topo/treematch/treematch/LICENSE b/ompi/mca/topo/treematch/treematch/LICENSE
new file mode 100644
index 00000000000..3ad4deaa3c8
--- /dev/null
+++ b/ompi/mca/topo/treematch/treematch/LICENSE
@@ -0,0 +1,515 @@
+
+CeCILL-B FREE SOFTWARE LICENSE AGREEMENT
+
+
+    Notice
+
+This Agreement is a Free Software license agreement that is the result
+of discussions between its authors in order to ensure compliance with
+the two main principles guiding its drafting:
+
+    * firstly, compliance with the principles governing the distribution
+      of Free Software: access to source code, broad rights granted to
+      users,
+    * secondly, the election of a governing law, French law, with which
+      it is conformant, both as regards the law of torts and
+      intellectual property law, and the protection that it offers to
+      both authors and holders of the economic rights over software.
+ +The authors of the CeCILL-B (for Ce[a] C[nrs] I[nria] L[ogiciel] L[ibre]) +license are: + +Commissariat à l'Energie Atomique - CEA, a public scientific, technical +and industrial research establishment, having its principal place of +business at 25 rue Leblanc, immeuble Le Ponant D, 75015 Paris, France. + +Centre National de la Recherche Scientifique - CNRS, a public scientific +and technological establishment, having its principal place of business +at 3 rue Michel-Ange, 75794 Paris cedex 16, France. + +Institut National de Recherche en Informatique et en Automatique - +INRIA, a public scientific and technological establishment, having its +principal place of business at Domaine de Voluceau, Rocquencourt, BP +105, 78153 Le Chesnay cedex, France. + + + Preamble + +This Agreement is an open source software license intended to give users +significant freedom to modify and redistribute the software licensed +hereunder. + +The exercising of this freedom is conditional upon a strong obligation +of giving credits for everybody that distributes a software +incorporating a software ruled by the current license so as all +contributions to be properly identified and acknowledged. + +In consideration of access to the source code and the rights to copy, +modify and redistribute granted by the license, users are provided only +with a limited warranty and the software's author, the holder of the +economic rights, and the successive licensors only have limited liability. + +In this respect, the risks associated with loading, using, modifying +and/or developing or reproducing the software by the user are brought to +the user's attention, given its Free Software status, which may make it +complicated to use, with the result that its use is reserved for +developers and experienced professionals having in-depth computer +knowledge. 
Users are therefore encouraged to load and test the +suitability of the software as regards their requirements in conditions +enabling the security of their systems and/or data to be ensured and, +more generally, to use and operate it in the same conditions of +security. This Agreement may be freely reproduced and published, +provided it is not altered, and that no provisions are either added or +removed herefrom. + +This Agreement may apply to any or all software for which the holder of +the economic rights decides to submit the use thereof to its provisions. + + + Article 1 - DEFINITIONS + +For the purpose of this Agreement, when the following expressions +commence with a capital letter, they shall have the following meaning: + +Agreement: means this license agreement, and its possible subsequent +versions and annexes. + +Software: means the software in its Object Code and/or Source Code form +and, where applicable, its documentation, "as is" when the Licensee +accepts the Agreement. + +Initial Software: means the Software in its Source Code and possibly its +Object Code form and, where applicable, its documentation, "as is" when +it is first distributed under the terms and conditions of the Agreement. + +Modified Software: means the Software modified by at least one +Contribution. + +Source Code: means all the Software's instructions and program lines to +which access is required so as to modify the Software. + +Object Code: means the binary files originating from the compilation of +the Source Code. + +Holder: means the holder(s) of the economic rights over the Initial +Software. + +Licensee: means the Software user(s) having accepted the Agreement. + +Contributor: means a Licensee having made at least one Contribution. + +Licensor: means the Holder, or any other individual or legal entity, who +distributes the Software under the Agreement. 
+ +Contribution: means any or all modifications, corrections, translations, +adaptations and/or new functions integrated into the Software by any or +all Contributors, as well as any or all Internal Modules. + +Module: means a set of sources files including their documentation that +enables supplementary functions or services in addition to those offered +by the Software. + +External Module: means any or all Modules, not derived from the +Software, so that this Module and the Software run in separate address +spaces, with one calling the other when they are run. + +Internal Module: means any or all Module, connected to the Software so +that they both execute in the same address space. + +Parties: mean both the Licensee and the Licensor. + +These expressions may be used both in singular and plural form. + + + Article 2 - PURPOSE + +The purpose of the Agreement is the grant by the Licensor to the +Licensee of a non-exclusive, transferable and worldwide license for the +Software as set forth in Article 5 hereinafter for the whole term of the +protection granted by the rights over said Software. + + + Article 3 - ACCEPTANCE + +3.1 The Licensee shall be deemed as having accepted the terms and +conditions of this Agreement upon the occurrence of the first of the +following events: + + * (i) loading the Software by any or all means, notably, by + downloading from a remote server, or by loading from a physical + medium; + * (ii) the first time the Licensee exercises any of the rights + granted hereunder. + +3.2 One copy of the Agreement, containing a notice relating to the +characteristics of the Software, to the limited warranty, and to the +fact that its use is restricted to experienced users has been provided +to the Licensee prior to its acceptance as set forth in Article 3.1 +hereinabove, and the Licensee hereby acknowledges that it has read and +understood it. 
+ + + Article 4 - EFFECTIVE DATE AND TERM + + + 4.1 EFFECTIVE DATE + +The Agreement shall become effective on the date when it is accepted by +the Licensee as set forth in Article 3.1. + + + 4.2 TERM + +The Agreement shall remain in force for the entire legal term of +protection of the economic rights over the Software. + + + Article 5 - SCOPE OF RIGHTS GRANTED + +The Licensor hereby grants to the Licensee, who accepts, the following +rights over the Software for any or all use, and for the term of the +Agreement, on the basis of the terms and conditions set forth hereinafter. + +Besides, if the Licensor owns or comes to own one or more patents +protecting all or part of the functions of the Software or of its +components, the Licensor undertakes not to enforce the rights granted by +these patents against successive Licensees using, exploiting or +modifying the Software. If these patents are transferred, the Licensor +undertakes to have the transferees subscribe to the obligations set +forth in this paragraph. + + + 5.1 RIGHT OF USE + +The Licensee is authorized to use the Software, without any limitation +as to its fields of application, with it being hereinafter specified +that this comprises: + + 1. permanent or temporary reproduction of all or part of the Software + by any or all means and in any or all form. + + 2. loading, displaying, running, or storing the Software on any or + all medium. + + 3. entitlement to observe, study or test its operation so as to + determine the ideas and principles behind any or all constituent + elements of said Software. This shall apply when the Licensee + carries out any or all loading, displaying, running, transmission + or storage operation as regards the Software, that it is entitled + to carry out hereunder. 
+ + + 5.2 ENTITLEMENT TO MAKE CONTRIBUTIONS + +The right to make Contributions includes the right to translate, adapt, +arrange, or make any or all modifications to the Software, and the right +to reproduce the resulting software. + +The Licensee is authorized to make any or all Contributions to the +Software provided that it includes an explicit notice that it is the +author of said Contribution and indicates the date of the creation thereof. + + + 5.3 RIGHT OF DISTRIBUTION + +In particular, the right of distribution includes the right to publish, +transmit and communicate the Software to the general public on any or +all medium, and by any or all means, and the right to market, either in +consideration of a fee, or free of charge, one or more copies of the +Software by any means. + +The Licensee is further authorized to distribute copies of the modified +or unmodified Software to third parties according to the terms and +conditions set forth hereinafter. + + + 5.3.1 DISTRIBUTION OF SOFTWARE WITHOUT MODIFICATION + +The Licensee is authorized to distribute true copies of the Software in +Source Code or Object Code form, provided that said distribution +complies with all the provisions of the Agreement and is accompanied by: + + 1. a copy of the Agreement, + + 2. a notice relating to the limitation of both the Licensor's + warranty and liability as set forth in Articles 8 and 9, + +and that, in the event that only the Object Code of the Software is +redistributed, the Licensee allows effective access to the full Source +Code of the Software at a minimum during the entire period of its +distribution of the Software, it being understood that the additional +cost of acquiring the Source Code shall not exceed the cost of +transferring the data. 
+ + + 5.3.2 DISTRIBUTION OF MODIFIED SOFTWARE + +If the Licensee makes any Contribution to the Software, the resulting +Modified Software may be distributed under a license agreement other +than this Agreement subject to compliance with the provisions of Article +5.3.4. + + + 5.3.3 DISTRIBUTION OF EXTERNAL MODULES + +When the Licensee has developed an External Module, the terms and +conditions of this Agreement do not apply to said External Module, that +may be distributed under a separate license agreement. + + + 5.3.4 CREDITS + +Any Licensee who may distribute a Modified Software hereby expressly +agrees to: + + 1. indicate in the related documentation that it is based on the + Software licensed hereunder, and reproduce the intellectual + property notice for the Software, + + 2. ensure that written indications of the Software intended use, + intellectual property notice and license hereunder are included in + easily accessible format from the Modified Software interface, + + 3. mention, on a freely accessible website describing the Modified + Software, at least throughout the distribution term thereof, that + it is based on the Software licensed hereunder, and reproduce the + Software intellectual property notice, + + 4. where it is distributed to a third party that may distribute a + Modified Software without having to make its source code + available, make its best efforts to ensure that said third party + agrees to comply with the obligations set forth in this Article . + +If the Software, whether or not modified, is distributed with an +External Module designed for use in connection with the Software, the +Licensee shall submit said External Module to the foregoing obligations. + + + 5.3.5 COMPATIBILITY WITH THE CeCILL AND CeCILL-C LICENSES + +Where a Modified Software contains a Contribution subject to the CeCILL +license, the provisions set forth in Article 5.3.4 shall be optional. + +A Modified Software may be distributed under the CeCILL-C license. 
In +such a case the provisions set forth in Article 5.3.4 shall be optional. + + + Article 6 - INTELLECTUAL PROPERTY + + + 6.1 OVER THE INITIAL SOFTWARE + +The Holder owns the economic rights over the Initial Software. Any or +all use of the Initial Software is subject to compliance with the terms +and conditions under which the Holder has elected to distribute its work +and no one shall be entitled to modify the terms and conditions for the +distribution of said Initial Software. + +The Holder undertakes that the Initial Software will remain ruled at +least by this Agreement, for the duration set forth in Article 4.2. + + + 6.2 OVER THE CONTRIBUTIONS + +The Licensee who develops a Contribution is the owner of the +intellectual property rights over this Contribution as defined by +applicable law. + + + 6.3 OVER THE EXTERNAL MODULES + +The Licensee who develops an External Module is the owner of the +intellectual property rights over this External Module as defined by +applicable law and is free to choose the type of agreement that shall +govern its distribution. + + + 6.4 JOINT PROVISIONS + +The Licensee expressly undertakes: + + 1. not to remove, or modify, in any manner, the intellectual property + notices attached to the Software; + + 2. to reproduce said notices, in an identical manner, in the copies + of the Software modified or not. + +The Licensee undertakes not to directly or indirectly infringe the +intellectual property rights of the Holder and/or Contributors on the +Software and to take, where applicable, vis-à-vis its staff, any and all +measures required to ensure respect of said intellectual property rights +of the Holder and/or Contributors. + + + Article 7 - RELATED SERVICES + +7.1 Under no circumstances shall the Agreement oblige the Licensor to +provide technical assistance or maintenance services for the Software. + +However, the Licensor is entitled to offer this type of services. 
The +terms and conditions of such technical assistance, and/or such +maintenance, shall be set forth in a separate instrument. Only the +Licensor offering said maintenance and/or technical assistance services +shall incur liability therefor. + +7.2 Similarly, any Licensor is entitled to offer to its licensees, under +its sole responsibility, a warranty, that shall only be binding upon +itself, for the redistribution of the Software and/or the Modified +Software, under terms and conditions that it is free to decide. Said +warranty, and the financial terms and conditions of its application, +shall be subject of a separate instrument executed between the Licensor +and the Licensee. + + + Article 8 - LIABILITY + +8.1 Subject to the provisions of Article 8.2, the Licensee shall be +entitled to claim compensation for any direct loss it may have suffered +from the Software as a result of a fault on the part of the relevant +Licensor, subject to providing evidence thereof. + +8.2 The Licensor's liability is limited to the commitments made under +this Agreement and shall not be incurred as a result of in particular: +(i) loss due the Licensee's total or partial failure to fulfill its +obligations, (ii) direct or consequential loss that is suffered by the +Licensee due to the use or performance of the Software, and (iii) more +generally, any consequential loss. In particular the Parties expressly +agree that any or all pecuniary or business loss (i.e. loss of data, +loss of profits, operating loss, loss of customers or orders, +opportunity cost, any disturbance to business activities) or any or all +legal proceedings instituted against the Licensee by a third party, +shall constitute consequential loss and shall not provide entitlement to +any or all compensation from the Licensor. 
+ + + Article 9 - WARRANTY + +9.1 The Licensee acknowledges that the scientific and technical +state-of-the-art when the Software was distributed did not enable all +possible uses to be tested and verified, nor for the presence of +possible defects to be detected. In this respect, the Licensee's +attention has been drawn to the risks associated with loading, using, +modifying and/or developing and reproducing the Software which are +reserved for experienced users. + +The Licensee shall be responsible for verifying, by any or all means, +the suitability of the product for its requirements, its good working +order, and for ensuring that it shall not cause damage to either persons +or properties. + +9.2 The Licensor hereby represents, in good faith, that it is entitled +to grant all the rights over the Software (including in particular the +rights set forth in Article 5). + +9.3 The Licensee acknowledges that the Software is supplied "as is" by +the Licensor without any other express or tacit warranty, other than +that provided for in Article 9.2 and, in particular, without any warranty +as to its commercial value, its secured, safe, innovative or relevant +nature. + +Specifically, the Licensor does not warrant that the Software is free +from any error, that it will operate without interruption, that it will +be compatible with the Licensee's own equipment and software +configuration, nor that it will meet the Licensee's requirements. + +9.4 The Licensor does not either expressly or tacitly warrant that the +Software does not infringe any third party intellectual property right +relating to a patent, software or any other property right. Therefore, +the Licensor disclaims any and all liability towards the Licensee +arising out of any or all proceedings for infringement that may be +instituted in respect of the use, modification and redistribution of the +Software. 
Nevertheless, should such proceedings be instituted against +the Licensee, the Licensor shall provide it with technical and legal +assistance for its defense. Such technical and legal assistance shall be +decided on a case-by-case basis between the relevant Licensor and the +Licensee pursuant to a memorandum of understanding. The Licensor +disclaims any and all liability as regards the Licensee's use of the +name of the Software. No warranty is given as regards the existence of +prior rights over the name of the Software or as regards the existence +of a trademark. + + + Article 10 - TERMINATION + +10.1 In the event of a breach by the Licensee of its obligations +hereunder, the Licensor may automatically terminate this Agreement +thirty (30) days after notice has been sent to the Licensee and has +remained ineffective. + +10.2 A Licensee whose Agreement is terminated shall no longer be +authorized to use, modify or distribute the Software. However, any +licenses that it may have granted prior to termination of the Agreement +shall remain valid subject to their having been granted in compliance +with the terms and conditions hereof. + + + Article 11 - MISCELLANEOUS + + + 11.1 EXCUSABLE EVENTS + +Neither Party shall be liable for any or all delay, or failure to +perform the Agreement, that may be attributable to an event of force +majeure, an act of God or an outside cause, such as defective +functioning or interruptions of the electricity or telecommunications +networks, network paralysis following a virus attack, intervention by +government authorities, natural disasters, water damage, earthquakes, +fire, explosions, strikes and labor unrest, war, etc. + +11.2 Any failure by either Party, on one or more occasions, to invoke +one or more of the provisions hereof, shall under no circumstances be +interpreted as being a waiver by the interested Party of its right to +invoke said provision(s) subsequently. 
+ +11.3 The Agreement cancels and replaces any or all previous agreements, +whether written or oral, between the Parties and having the same +purpose, and constitutes the entirety of the agreement between said +Parties concerning said purpose. No supplement or modification to the +terms and conditions hereof shall be effective as between the Parties +unless it is made in writing and signed by their duly authorized +representatives. + +11.4 In the event that one or more of the provisions hereof were to +conflict with a current or future applicable act or legislative text, +said act or legislative text shall prevail, and the Parties shall make +the necessary amendments so as to comply with said act or legislative +text. All other provisions shall remain effective. Similarly, invalidity +of a provision of the Agreement, for any reason whatsoever, shall not +cause the Agreement as a whole to be invalid. + + + 11.5 LANGUAGE + +The Agreement is drafted in both French and English and both versions +are deemed authentic. + + + Article 12 - NEW VERSIONS OF THE AGREEMENT + +12.1 Any person is authorized to duplicate and distribute copies of this +Agreement. + +12.2 So as to ensure coherence, the wording of this Agreement is +protected and may only be modified by the authors of the License, who +reserve the right to periodically publish updates or new versions of the +Agreement, each with a separate number. These subsequent versions may +address new issues encountered by Free Software. + +12.3 Any Software distributed under a given version of the Agreement may +only be subsequently distributed under the same version of the Agreement +or a subsequent version. + + + Article 13 - GOVERNING LAW AND JURISDICTION + +13.1 The Agreement is governed by French law. The Parties agree to +endeavor to seek an amicable solution to any disagreements or disputes +that may arise during the performance of the Agreement. 
+ +13.2 Failing an amicable solution within two (2) months as from their +occurrence, and unless emergency proceedings are necessary, the +disagreements or disputes shall be referred to the Paris Courts having +jurisdiction, by the more diligent Party. + + +Version 1.0 dated 2006-09-05. diff --git a/ompi/mca/topo/treematch/treematch/tgt_map.c b/ompi/mca/topo/treematch/treematch/tgt_map.c new file mode 100644 index 00000000000..ea0a35542ad --- /dev/null +++ b/ompi/mca/topo/treematch/treematch/tgt_map.c @@ -0,0 +1,56 @@ +#include +#include +#include +//#include "tm_hwloc.h" +#include "tm_tree.h" +#include "tm_mapping.h" +#include "tm_timings.h" + + + +int main(int argc, char**argv){; + tree_t *comm_tree=NULL; + double **comm,**arch; + tm_topology_t *topology; + int nb_processes,nb_cores; + int *sol,*k; + if(argc<3){ + fprintf(stderr,"Usage: %s \n",argv[0]); + return -1; + } + + topology=tgt_to_tm(argv[1],&arch); + optimize_topology(&topology); + nb_processes=build_comm(argv[2],&comm); + sol=(int*)MALLOC(sizeof(int)*nb_processes); + + nb_cores=nb_processing_units(topology); + k=(int*)MALLOC(sizeof(int)*nb_cores); + // TreeMatchMapping(nb_processes,nb_cores,comm,sol); + + if(nb_processes>nb_cores){ + fprintf(stderr,"Error: to many processes (%d) for this topology (%d nodes)\n",nb_processes,nb_cores); + exit(-1); + } + TIC; + comm_tree=build_tree_from_topology(topology,comm,nb_processes,NULL,NULL); + map_topology_simple(topology,comm_tree,sol,k); + double duration=TOC; + printf("mapping duration: %f\n",duration); + printf("TreeMatch: "); + print_sol_inv(nb_processes,sol,comm,arch); + //print_1D_tab(k,nb_cores); +// display_other_heuristics(topology,nb_processes,comm,arch); + + //display_tab(arch,nb_cores); + + FREE_topology(topology); + //FREE_tree(comm_tree); + FREE(sol); + FREE(comm); + FREE(arch); + + + + return 0; +} diff --git a/ompi/mca/topo/treematch/treematch/tgt_to_mat.c b/ompi/mca/topo/treematch/treematch/tgt_to_mat.c new file mode 100644 index 
/*
 * Returns the number of significant bits of val: 0 for 0, and
 * floor(log2(val)) + 1 for positive values (1 -> 1, 8 -> 4, 1023 -> 10).
 *
 * The shift is done on an unsigned copy: right-shifting a negative int is
 * implementation-defined and, with an arithmetic shift, never reaches 0,
 * which made the original loop spin forever on negative input.  Negative
 * values now yield the full bit width of unsigned int.
 */
static int ilog2(int val)
{
  unsigned int bits = (unsigned int)val;
  int width = 0;

  while( bits != 0 ){
    bits >>= 1;
    width++;
  }
  return width;
}
FREE_bucket(bucket_t *); +void FREE_tab_bucket(bucket_t **,int); +void FREE_bucket_list(bucket_list_t); +void partial_update_val (int nb_args, void **args); + +int tab_cmp(const void* x1,const void* x2) +{ + int *e1 = NULL,*e2 = NULL,i1,i2,j1,j2; + double **tab = NULL; + bucket_list_t bl; + + bl = global_bl; + + e1 = ((int *)x1); + e2 = ((int *)x2); + + tab = bl->tab; + + i1 = e1[0]; + j1 = e1[1]; + i2 = e2[0]; + j2 = e2[1]; + + if(tab[i1][j1]==tab[i2][j2]){ + if(i1==i2){ + return (j1 > j2) ? -1 : 1; + }else{ + return (i1 > i2) ? -1 : 1; + } + } + return (tab[i1][j1] > tab[i2][j2]) ? -1 : 1; +} + + +int old_bucket_id(int i,int j,bucket_list_t bucket_list) +{ + double *pivot = NULL,val; + int n,sup,inf,p; + + pivot = bucket_list->pivot; + n = bucket_list->nb_buckets; + val = bucket_list->tab[i][j]; + + inf = -1; + sup = n; + + while( (sup - inf) > 1){ + p = (sup + inf)/2; + /* printf("%f [%d,%d,%d]=%f\n",val,inf,p,sup,pivot[p]); */ + if( val < pivot[p] ){ + inf = p; + if( inf == sup ) + inf--; + } else { + sup = p; + if( sup == inf ) + sup++; + } + } + /*exit(-1);*/ + return sup; +} + +int bucket_id(int i,int j,bucket_list_t bucket_list) +{ + double *pivot_tree = NULL,val; + int p,k; + + pivot_tree = bucket_list->pivot_tree; + val = bucket_list->tab[i][j]; + + + p = 1; + for( k = 0 ; k < bucket_list->max_depth ; k++){ + if( val > pivot_tree[p] ) + p = p*2; + else + p = p*2 + 1; + } + + return (int)pivot_tree[p]; +} + +void display_bucket(bucket_t *b) +{ + printf("\tb.bucket=%p\n",(void *)b->bucket); + printf("\tb.bucket_len=%d\n",(int)b->bucket_len); + printf("\tb.nb_elem=%d\n",(int)b->nb_elem); +} + +void check_bucket(bucket_t *b,double **tab,double inf, double sup) +{ + int i,j,k; + for( k = 0 ; k < b->nb_elem ; k++ ){ + i = b->bucket[k].i; + j = b->bucket[k].j; + if((tab[i][j] < inf) || (tab[i][j] > sup)){ + if(verbose_level >= CRITICAL) + printf("[%d] (%d,%d):%f not in [%f,%f]\n",k,i,j,tab[i][j],inf,sup); + exit(-1); + } + } +} + +void 
/*
 * Walks every bucket of bucket_list, prints it when the verbosity is at
 * least DEBUG, and checks with check_bucket() that all its elements lie in
 * the value interval [inf,sup] delimited by the neighbouring pivots.
 *
 * The first bucket has no upper pivot (sup = DBL_MAX) and the last one has
 * no lower pivot (inf = 0).  The bounds are selected before indexing the
 * pivot array: the original read pivot[-1] and pivot[nb_buckets-1]
 * unconditionally and only overrode the values afterwards.
 */
void display_bucket_list(bucket_list_t bucket_list)
{
  int i;
  int last = bucket_list->nb_buckets - 1;
  double inf,sup;

  for(i = 0 ; i < bucket_list->nb_buckets ; i++){
    sup = (i == 0)    ? DBL_MAX : bucket_list->pivot[i-1];
    inf = (i == last) ? 0       : bucket_list->pivot[i];

    if(verbose_level >= DEBUG){
      printf("Bucket %d:\n",i);
      display_bucket(bucket_list->bucket_tab[i]);
      printf("\n");
    }
    check_bucket(bucket_list->bucket_tab[i],bucket_list->tab,inf,sup);
  }
}
/*
 * Returns 1 when val is a positive power of two (1, 2, 4, ...), 0 otherwise
 * (including zero and negative values).
 *
 * A power of two has exactly one bit set, so clearing the lowest set bit
 * with val & (val - 1) must give 0.  This replaces a loop that kept
 * doubling a signed counter until it overflowed, which is undefined
 * behaviour in C.
 */
int is_power_of_2(int val)
{
  return (val > 0) && ((val & (val - 1)) == 0);
}
Paramater nb_buckets is: %d and should be a power of 2\n",nb_buckets); + exit(-1); + } + + bucket_list = (bucket_list_t)MALLOC(sizeof(_bucket_list_t)); + bucket_list->tab = tab; + bucket_list->N = N; + + n = pow(nb_buckets,2); + if(verbose_level >= INFO) + printf("N=%d, n=%d\n",N,n); + sample = (int*)MALLOC(2*sizeof(int)*n); + + for( k = 0 ; k < n ; k++ ){ + i = genrand_int32()%(N-2)+1; + if( i == N-2 ) + j = N-1; + else + j = genrand_int32()%(N-i-2)+i+1; + if(verbose_level >= DEBUG) + printf("i=%d, j=%d\n",i,j); + assert( i != j ); + assert( i < j ); + assert( i < N ); + assert( j < N ); + sample[2*k] = i; + sample[2*k+1] = j; + } + + /* printf("k=%d\n",k); */ + global_bl = bucket_list; + qsort(sample,n,2*sizeof(int),tab_cmp); + + if(verbose_level >= DEBUG) + for(k=0;kpivot = pivot; + bucket_list->nb_buckets = nb_buckets; + built_pivot_tree(bucket_list); + + bucket_list->bucket_tab = (bucket_t**)MALLOC(nb_buckets*sizeof(bucket_t*)); + for( i = 0 ; i < nb_buckets ; i++ ) + bucket_list->bucket_tab[i] = (bucket_t*)CALLOC(1,sizeof(bucket_t)); + + fill_buckets(bucket_list); + + /* display_bucket_list(bucket_list); */ + + bucket_list->cur_bucket = 0; + bucket_list->bucket_indice = 0; + + FREE(sample); + + *bl = bucket_list; +} + +void next_bucket_elem(bucket_list_t bucket_list,int *i,int *j) +{ + bucket_t *bucket = bucket_list->bucket_tab[bucket_list->cur_bucket]; + + /* display_bucket_list(bucket_list); + printf("nb_elem: %d, indice: %d, bucket_id: %d\n",(int)bucket->nb_elem,bucket_list->bucket_indice,bucket_list->cur_bucket); + */ + while( bucket->nb_elem <= bucket_list->bucket_indice ){ + bucket_list->bucket_indice = 0; + bucket_list->cur_bucket++; + bucket = bucket_list->bucket_tab[bucket_list->cur_bucket]; + if(verbose_level >= DEBUG){ + printf("### From bucket %d to bucket %d\n",bucket_list->cur_bucket-1,bucket_list->cur_bucket); + printf("nb_elem: %d, indice: %d, bucket_id: %d\n",(int)bucket->nb_elem,bucket_list->bucket_indice,bucket_list->cur_bucket); + } + } + 
+ if(!bucket->sorted){ + global_bl = bucket_list; + qsort(bucket->bucket,bucket->nb_elem,2*sizeof(int),tab_cmp); + bucket->sorted = 1; + } + + *i = bucket->bucket[bucket_list->bucket_indice].i; + *j = bucket->bucket[bucket_list->bucket_indice].j; + bucket_list->bucket_indice++; +} + + +int add_edge_3(tree_t *tab_node, tree_t *parent,int i,int j,int *nb_groups) +{ + /* printf("%d <-> %d ?\n",tab_node[i].id,tab_node[j].id); */ + if((!tab_node[i].parent) && (!tab_node[j].parent)){ + if(parent){ + parent->child[0] = &tab_node[i]; + parent->child[1] = &tab_node[j]; + tab_node[i].parent = parent; + tab_node[j].parent = parent; + + if(verbose_level >= DEBUG) + printf("%d: %d-%d\n",*nb_groups,parent->child[0]->id,parent->child[1]->id); + + return 1; + } + return 0; + } + + if( tab_node[i].parent && (!tab_node[j].parent) ){ + parent = tab_node[i].parent; + if(!parent->child[2]){ + parent->child[2] = &tab_node[j]; + tab_node[j].parent = parent; + + if(verbose_level >= DEBUG) + printf("%d: %d-%d-%d\n",*nb_groups,parent->child[0]->id,parent->child[1]->id,parent->child[2]->id); + + (*nb_groups)++; + } + return 0; + } + + if(tab_node[j].parent && (!tab_node[i].parent)){ + parent = tab_node[j].parent; + if(!parent->child[2]){ + parent->child[2] = &tab_node[i]; + tab_node[i].parent = parent; + + if(verbose_level >= DEBUG) + printf("%d: %d-%d-%d\n",*nb_groups,parent->child[0]->id,parent->child[1]->id,parent->child[2]->id); + + (*nb_groups)++; + } + return 0; + } + + return 0; +} + +int try_add_edge(tree_t *tab_node, tree_t *parent,int arity,int i,int j,int *nb_groups) +{ + assert( i != j ); + + switch(arity){ + case 2: + if(tab_node[i].parent) + return 0; + if(tab_node[j].parent) + return 0; + + parent->child[0] = &tab_node[i]; + parent->child[1] = &tab_node[j]; + tab_node[i].parent = parent; + tab_node[j].parent = parent; + + (*nb_groups)++; + + return 1; + case 3: + return add_edge_3(tab_node,parent,i,j,nb_groups); + default: + if(verbose_level >= ERROR) + fprintf(stderr,"Cannot 
handle arity %d\n",parent->arity); + exit(-1); + } +} + +void FREE_bucket(bucket_t *bucket) +{ + FREE(bucket->bucket); + FREE(bucket); +} + +void FREE_tab_bucket(bucket_t **bucket_tab,int N) +{ + int i; + for( i = 0 ; i < N ; i++ ) + FREE_bucket(bucket_tab[i]); + FREE(bucket_tab); +} + +void FREE_bucket_list(bucket_list_t bucket_list) +{ + /* Do not FREE the tab field it is used elsewhere */ + FREE_tab_bucket(bucket_list->bucket_tab,bucket_list->nb_buckets); + FREE(bucket_list->pivot); + FREE(bucket_list->pivot_tree); + FREE(bucket_list); +} + +void partial_update_val (int nb_args, void **args){ + int inf = *(int*)args[0]; + int sup = *(int*)args[1]; + affinity_mat_t *aff_mat = (affinity_mat_t*)args[2]; + tree_t *new_tab_node = (tree_t*)args[3]; + double *res=(double*)args[4]; + int l; + + if(nb_args != 6){ + if(verbose_level >= ERROR) + fprintf(stderr,"Wrong number of args in %s: %d\n",__FUNCTION__, nb_args); + exit(-1); + } + + for( l = inf ; l < sup ; l++ ){ + update_val(aff_mat,&new_tab_node[l]); + *res += new_tab_node[l].val; + } +} + +void bucket_grouping(affinity_mat_t *aff_mat,tree_t *tab_node, tree_t *new_tab_node, + int arity,int M) +{ + bucket_list_t bucket_list; + double duration,val = 0; + int l,i,j,nb_groups; + double gr1_1=0; + double gr1_2=0; + double gr1, gr2, gr3; + int N = aff_mat->order; + double **mat = aff_mat->mat; + + verbose_level = get_verbose_level(); + if(verbose_level >= INFO ) + printf("starting sort of N=%d elements\n",N); + + TIC; + partial_sort(&bucket_list,mat,N); + duration = TOC; + if(verbose_level >= INFO) + printf("Partial sorting=%fs\n",duration); + if(verbose_level >= DEBUG) + display_pivots(bucket_list); + + TIC; + l = 0; + i = 0; + nb_groups = 0; + + + TIC; + if(verbose_level >= INFO){ + while( l < M ){ + TIC; + next_bucket_elem(bucket_list,&i,&j); + if(verbose_level >= DEBUG) + printf("elem[%d][%d]=%f ",i,j,mat[i][j]); + gr1_1 += TOC; + TIC; + if(try_add_edge(tab_node,&new_tab_node[l],arity,i,j,&nb_groups)){ + l++; + } + 
gr1_2 += TOC; + } + }else{ + while( l < M ){ + next_bucket_elem(bucket_list,&i,&j); + if(try_add_edge(tab_node,&new_tab_node[l],arity,i,j,&nb_groups)){ + l++; + } + } + } + + gr1=TOC; + if(verbose_level >= INFO) + printf("Grouping phase 1=%fs (%fs+%fs) \n",gr1, gr1_1, gr1_2); + + if(verbose_level >= DEBUG) + printf("l=%d,nb_groups=%d\n",l,nb_groups); + + TIC; + while( nb_groups < M ){ + next_bucket_elem(bucket_list,&i,&j); + try_add_edge(tab_node,NULL,arity,i,j,&nb_groups); + } + + gr2=TOC; + if(verbose_level >= INFO) + printf("Grouping phase 2=%fs\n",gr2); + + if(verbose_level >= DEBUG) + printf("l=%d,nb_groups=%d\n",l,nb_groups); + + TIC; + + + if(M>512){ /* perform this part in parallel*/ + int id; + int nb_threads; + work_t **works; + int *inf; + int *sup; + double *tab_val; + + nb_threads = get_nb_threads(); + works = (work_t**)MALLOC(sizeof(work_t*)*nb_threads); + inf = (int*)MALLOC(sizeof(int)*nb_threads); + sup = (int*)MALLOC(sizeof(int)*nb_threads); + tab_val = (double*)CALLOC(nb_threads,sizeof(double)); + for(id=0;id= DEBUG) + printf("Executing %p\n",(void *)works[id]); + + submit_work( works[id], id); + } + + for(id=0;idargs); + } + + + FREE(inf); + FREE(sup); + FREE(tab_val); + FREE(works); + }else{ + for( l = 0 ; l < M ; l++ ){ + + update_val(aff_mat,&new_tab_node[l]); + val += new_tab_node[l].val; + } + } + gr3=TOC; + if(verbose_level >= INFO) + printf("Grouping phase 3=%fs\n",gr3); + /* printf("val=%f\n",val);exit(-1); */ + + duration = TOC; + if(verbose_level >= INFO) + printf("Grouping =%fs\n",duration); + + if(verbose_level >= DEBUG){ + printf("Bucket: %d, indice:%d\n",bucket_list->cur_bucket,bucket_list->bucket_indice); + printf("val=%f\n",val); + } + FREE_bucket_list(bucket_list); + + /* exit(-1); */ + /* display_grouping(new_tab_node,M,arity,val); */ +} diff --git a/ompi/mca/topo/treematch/treematch/tm_bucket.h b/ompi/mca/topo/treematch/treematch/tm_bucket.h new file mode 100644 index 00000000000..17e70603983 --- /dev/null +++ 
b/ompi/mca/topo/treematch/treematch/tm_bucket.h @@ -0,0 +1,34 @@ +#ifndef __BUCKET_H__ +#define __BUCKET_H__ + +typedef struct{ + int i; + int j; +}coord; + +typedef struct{ + coord * bucket; /* store i,j */ + int bucket_len; /* allocated size in the heap */ + int nb_elem; /* number of usefull elements (nb_elem should be lower than bucket_len) */ + int sorted; +}bucket_t; + +typedef struct{ + bucket_t **bucket_tab; + int nb_buckets; + double **tab; + int N;/* length of tab */ + /* For iterating over the buckets */ + int cur_bucket; + int bucket_indice; + double *pivot; + double *pivot_tree; + int max_depth; +}_bucket_list_t; + +typedef _bucket_list_t *bucket_list_t; + +void bucket_grouping(affinity_mat_t *aff_mat,tree_t *tab_node, tree_t *new_tab_node, + int arity,int M); +int try_add_edge(tree_t *tab_node, tree_t *parent,int arity,int i,int j,int *nb_groups); +#endif diff --git a/ompi/mca/topo/treematch/treematch/tm_hwloc.c b/ompi/mca/topo/treematch/treematch/tm_hwloc.c new file mode 100644 index 00000000000..0cc39a915fc --- /dev/null +++ b/ompi/mca/topo/treematch/treematch/tm_hwloc.c @@ -0,0 +1,280 @@ +#include +#include +#include "tm_tree.h" +#include "tm_mapping.h" +#include +#include "tm_verbose.h" + + +double ** tm_topology_to_arch(tm_topology_t *topology,double *cost); +tm_topology_t * tgt_to_tm(char *filename,double **pcost); +int topo_nb_proc(hwloc_topology_t topology,int N); +double ** topology_to_arch(hwloc_topology_t topology); +int symetric(hwloc_topology_t topology); +tm_topology_t* hwloc_to_tm(char *filename,double **pcost); +tm_topology_t* get_local_topo_with_hwloc(void); + + + + +/* transform a tgt scotch file into a topology file*/ +tm_topology_t * tgt_to_tm(char *filename, double **pcost) +{ + tm_topology_t *topology = NULL; + FILE *pf = NULL; + char line[1024]; + char *s = NULL; + double *cost = NULL; + int i; + + + + pf = fopen(filename,"r"); + if(!pf){ + if(get_verbose_level() >= CRITICAL) + fprintf(stderr,"Cannot open %s\n",filename); + 
exit(-1); + } + + if(get_verbose_level() >= INFO) + printf("Reading TGT file: %s\n",filename); + + + fgets(line,1024,pf); + + s = strstr(line,"tleaf"); + if(!s){ + if(get_verbose_level() >= CRITICAL) + fprintf(stderr,"Syntax error! %s is not a tleaf file\n",filename); + exit(-1); + } + + s += 5; + while(isspace(*s)) + s++; + + topology = (tm_topology_t*)MALLOC(sizeof(tm_topology_t)); + topology->nb_levels = atoi(strtok(s," "))+1; + topology->arity = (int*)MALLOC(sizeof(int)*topology->nb_levels); + cost = (double*)CALLOC(topology->nb_levels,sizeof(double)); + + for( i = 0 ; i < topology->nb_levels-1 ; i++ ){ + topology->arity[i] = atoi(strtok(NULL," ")); + cost[i] = atoi(strtok(NULL," ")); + } + + topology->arity[topology->nb_levels-1] = 0; + /* cost[topology->nb_levels-1]=0; */ + + /*aggregate costs*/ + for( i = topology->nb_levels-2 ; i >= 0 ; i-- ) + cost[i] += cost[i+1]; + + build_synthetic_proc_id(topology); + + *pcost = cost; + /* FREE(cost); */ + /* + topology->arity[0]=nb_proc; + topology->nb_levels=decompose((int)ceil((1.0*nb_obj)/nb_proc),1,topology->arity); + printf("levels=%d\n",topology->nb_levels); + */ + if(get_verbose_level() >= INFO) + printf("Topology built from %s!\n",filename); + + + return topology; +} + +int topo_nb_proc(hwloc_topology_t topology,int N) +{ + hwloc_obj_t *objs = NULL; + int nb_proc; + + objs = (hwloc_obj_t*)MALLOC(sizeof(hwloc_obj_t)*N); + objs[0] = hwloc_get_next_obj_by_type(topology,HWLOC_OBJ_PU,NULL); + nb_proc = 1 + hwloc_get_closest_objs(topology,objs[0],objs+1,N-1); + FREE(objs); + return nb_proc; +} + + +double ** topology_to_arch(hwloc_topology_t topology) +{ + int nb_proc,i,j; + hwloc_obj_t obj_proc1,obj_proc2,obj_res; + double **arch = NULL; + + nb_proc = hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_PU); + arch = (double**)MALLOC(sizeof(double*)*nb_proc); + for( i = 0 ; i < nb_proc ; i++ ){ + obj_proc1 = hwloc_get_obj_by_type(topology,HWLOC_OBJ_PU,i); + arch[obj_proc1->os_index] = 
(double*)MALLOC(sizeof(double)*nb_proc); + for( j = 0 ; j < nb_proc ; j++ ){ + obj_proc2 = hwloc_get_obj_by_type(topology,HWLOC_OBJ_PU,j); + obj_res = hwloc_get_common_ancestor_obj(topology,obj_proc1,obj_proc2); + /* printf("arch[%d][%d] <- %ld\n",obj_proc1->os_index,obj_proc2->os_index,*((long int*)(obj_res->userdatab))); */ + arch[obj_proc1->os_index][obj_proc2->os_index]=speed(obj_res->depth+1); + } + } + return arch; +} + +int symetric(hwloc_topology_t topology) +{ + int depth,i,topodepth = hwloc_topology_get_depth(topology); + unsigned int arity; + hwloc_obj_t obj; + for ( depth = 0; depth < topodepth-1 ; depth++ ) { + int N = hwloc_get_nbobjs_by_depth(topology, depth); + obj = hwloc_get_next_obj_by_depth (topology,depth,NULL); + arity = obj->arity; + + /* printf("Depth=%d, N=%d, Arity:%d\n",depth,N,arity); */ + for (i = 1; i < N; i++ ){ + obj = hwloc_get_next_obj_by_depth (topology,depth,obj); + if( obj->arity != arity){ + /* printf("[%d]: obj->arity=%d, arity=%d\n",i,obj->arity,arity); */ + return 0; + } + } + } + return 1; +} + +tm_topology_t* hwloc_to_tm(char *filename,double **pcost) +{ + hwloc_topology_t topology; + tm_topology_t *res = NULL; + hwloc_obj_t *objs = NULL; + unsigned topodepth,depth; + int nb_nodes,i; + double *cost; + int err; + + /* Build the topology */ + hwloc_topology_init(&topology); + err = hwloc_topology_set_xml(topology,filename); + if(err == -1){ + if(get_verbose_level() >= CRITICAL) + fprintf(stderr,"Error: %s is a bad xml topology file!\n",filename); + exit(-1); + } + + hwloc_topology_ignore_all_keep_structure(topology); + hwloc_topology_load(topology); + + + /* Test if symetric */ + if(!symetric(topology)){ + if(get_verbose_level() >= CRITICAL) + fprintf(stderr,"%s not symetric!\n",filename); + exit(-1); + } + + /* work on depth */ + topodepth = hwloc_topology_get_depth(topology); + + res = (tm_topology_t*)MALLOC(sizeof(tm_topology_t)); + res->nb_levels = topodepth; + res->node_id = (int**)MALLOC(sizeof(int*)*res->nb_levels); + 
res->nb_nodes = (int*)MALLOC(sizeof(int)*res->nb_levels); + res->arity = (int*)MALLOC(sizeof(int)*res->nb_levels); + + if(get_verbose_level() >= INFO) + printf("topodepth = %d\n",topodepth); + + /* Build TreeMatch topology */ + for( depth = 0 ; depth < topodepth ; depth++ ){ + nb_nodes = hwloc_get_nbobjs_by_depth(topology, depth); + res->nb_nodes[depth] = nb_nodes; + res->node_id[depth] = (int*)MALLOC(sizeof(int)*nb_nodes); + + objs = (hwloc_obj_t*)MALLOC(sizeof(hwloc_obj_t)*nb_nodes); + objs[0] = hwloc_get_next_obj_by_depth(topology,depth,NULL); + hwloc_get_closest_objs(topology,objs[0],objs+1,nb_nodes-1); + res->arity[depth] = objs[0]->arity; + + if(get_verbose_level() >= INFO) + printf("%d(%d):",res->arity[depth],nb_nodes); + + /* Build process id tab */ + for (i = 0; i < nb_nodes; i++){ + res->node_id[depth][i] = objs[i]->os_index; + /* if(depth==topodepth-1) */ + } + FREE(objs); + } + + cost = (double*)CALLOC(res->nb_levels,sizeof(double)); + for(i=0; inb_levels; i++){ + cost[i] = speed(i); + } + + *pcost = cost; + + + /* Destroy topology object. 
*/ + hwloc_topology_destroy(topology); + if(get_verbose_level() >= INFO) + printf("\n"); + return res; +} + +tm_topology_t* get_local_topo_with_hwloc(void) +{ + hwloc_topology_t topology; + tm_topology_t *res = NULL; + hwloc_obj_t *objs = NULL; + unsigned topodepth,depth; + int nb_nodes,i; + + /* Build the topology */ + hwloc_topology_init(&topology); + hwloc_topology_ignore_all_keep_structure(topology); + hwloc_topology_load(topology); + + /* Test if symetric */ + if(!symetric(topology)){ + if(get_verbose_level() >= CRITICAL) + fprintf(stderr,"Local toplogy not symetric!\n"); + exit(-1); + } + + /* work on depth */ + topodepth = hwloc_topology_get_depth(topology); + + res = (tm_topology_t*)MALLOC(sizeof(tm_topology_t)); + res->nb_levels = topodepth; + res->node_id = (int**)MALLOC(sizeof(int*)*res->nb_levels); + res->nb_nodes = (int*)MALLOC(sizeof(int)*res->nb_levels); + res->arity = (int*)MALLOC(sizeof(int)*res->nb_levels); + + /* Build TreeMatch topology */ + for( depth = 0 ; depth < topodepth ; depth++ ){ + nb_nodes = hwloc_get_nbobjs_by_depth(topology, depth); + res->nb_nodes[depth] = nb_nodes; + res->node_id[depth] = (int*)MALLOC(sizeof(int)*nb_nodes); + + objs = (hwloc_obj_t*)MALLOC(sizeof(hwloc_obj_t)*nb_nodes); + objs[0] = hwloc_get_next_obj_by_depth(topology,depth,NULL); + hwloc_get_closest_objs(topology,objs[0],objs+1,nb_nodes-1); + res->arity[depth] = objs[0]->arity; + + /* printf("%d:",res->arity[depth]); */ + + /* Build process id tab */ + for (i = 0; i < nb_nodes; i++){ + res->node_id[depth][i] = objs[i]->os_index; + /* if(depth==topodepth-1) */ + } + FREE(objs); + } + + /* Destroy HWLOC topology object. 
*/ + hwloc_topology_destroy(topology); + + /* printf("\n"); */ + return res; +} + diff --git a/ompi/mca/topo/treematch/treematch/tm_hwloc.h b/ompi/mca/topo/treematch/treematch/tm_hwloc.h new file mode 100644 index 00000000000..090687f6503 --- /dev/null +++ b/ompi/mca/topo/treematch/treematch/tm_hwloc.h @@ -0,0 +1,7 @@ +#include +#include "tm_tree.h" + +void hwloc_topology_tag(hwloc_topology_t topology); +tm_topology_t* hwloc_to_tm(char *filename,double **pcost); +tm_topology_t * tgt_to_tm(char *filename,double **pcost); +tm_topology_t* get_local_topo_with_hwloc(void); diff --git a/ompi/mca/topo/treematch/treematch/tm_kpartitioning.c b/ompi/mca/topo/treematch/treematch/tm_kpartitioning.c new file mode 100644 index 00000000000..d0770f6cff3 --- /dev/null +++ b/ompi/mca/topo/treematch/treematch/tm_kpartitioning.c @@ -0,0 +1,505 @@ +#include "tm_mapping.h" +#include "tm_mt.h" +#include "tm_kpartitioning.h" +#include +#include + +#define USE_KL_KPART 0 +#if USE_KL_KPART +#include "k-partitioning.h" +#endif /* USE_KL_KPART */ +#define KL_KPART_GREEDY_TRIALS 0 + +static int verbose_level = ERROR; + +#define MAX_TRIALS 10 +#define USE_KL_STRATEGY 1 + + +#define MIN(a,b) ((a)<(b)?(a):(b)) + + +int fill_tab(int **,int *,int,int,int,int); +void complete_com_mat(double ***,int,int); +void complete_obj_weight(double **,int,int); + +void allocate_vertex(int,int *,com_mat_t *,int,int *,int); +double eval_cost(int *, com_mat_t *); +int *kpartition_greedy(int, com_mat_t *,int,int *,int); +constraint_t *split_constraints (int *,int,int,tm_topology_t *,int); +com_mat_t **split_com_mat(com_mat_t *,int,int,int *); +int **split_vertices(int *,int,int,int *); +void FREE_tab_com_mat(com_mat_t **,int); +void FREE_tab_local_vertices(int **,int); +void FREE_const_tab(constraint_t *,int); +void kpartition_build_level_topology(tree_t *,com_mat_t *,int,int,tm_topology_t *, + int *,int *,int,double *,double *); + + + +void allocate_vertex(int u, int *res, com_mat_t *com_mat, int n, int *size, int 
max_size) +{ + int i,best_part=0; + double cost, best_cost = -1; + + /*printf("\n"); + print_1D_tab(res,n);*/ + if(u>=com_mat->n){ + for( i = 0 ; i < n ; i++) + if (( res[i] != -1 ) && ( size[res[i]] < max_size )){ + best_part = res[i]; + break; + } + }else{ + for( i = 0 ; i < n ; i++){ + if (( res[i] != -1 ) && ( size[res[i]] < max_size )){ + cost = (((i)n)) ?com_mat->comm[u][i]:0; + if (( cost > best_cost)){ + best_cost = cost; + best_part = res[i]; + } + } + } + } + /* printf("size[%d]: %d\n",best_part, size[best_part]);*/ + /* printf("putting(%.2f): %d -> %d\n",best_cost, u, best_part); */ + + res[u] = best_part; + size[best_part]++; +} + +double eval_cost(int *partition, com_mat_t *com_mat) +{ + double cost = 0; + int i,j; + + for( i = 0 ; i < com_mat->n ; i++ ) + for( j = i+1 ; j < com_mat->n ; j++ ) + if(partition[i] != partition[j]) + cost += com_mat->comm[i][j]; + + return cost; +} + +int *kpartition_greedy(int k, com_mat_t *com_mat, int n, int *constraints, int nb_constraints) +{ + int *res = NULL, *best_res=NULL, *size = NULL; + int i,j,nb_trials; + int max_size, max_val; + double cost, best_cost = -1; + int start, end; + int dumb_id, nb_dumb; + + + + + for( nb_trials = 0 ; nb_trials < MAX_TRIALS ; nb_trials++ ){ + res = (int *)MALLOC(sizeof(int)*n); + for ( i = 0 ; i < n ; i ++ ) + res[i] = -1; + + size = (int *)CALLOC(k,sizeof(int)); + max_size = n/k; + + /*printf("Constraints: ");print_1D_tab(constraints,nb_constraints);*/ + + /* put "dumb" vertices in the correct partition if there are any*/ + if (nb_constraints){ + start = 0; + dumb_id = n-1; + for( i = 0 ; i < k ; i ++){ + max_val = (i+1)* (n/k); + end = start; + while( end < nb_constraints){ + if(constraints[end] >= max_val) + break; + end++; + } + /* now end - start is the number of constarints for the ith subtree + hence the number of dumb vertices is the differences between the + number of leaves of the subtree (n/k) and the number of constraints + */ + nb_dumb = n/k - (end-start); + 
/*printf("max_val: %d, nb_dumb=%d, start=%d, end=%d, size=%d\n",max_val, nb_dumb, start, end, n/k);*/ + + /* dumb vertices are the one with highest indices: + put them in the ith partitions*/ + for( j = 0; j < nb_dumb; j ++ ){ + res[dumb_id] = i; + dumb_id--; + } + /* increase the size of the ith partition accordingly*/ + size[i] += nb_dumb; + start=end; + } + } + /*printf("After dumb vertices mapping: ");print_1D_tab(res,n);*/ + + /* choose k initial "true" vertices at random and put them in a different partition */ + for ( i = 0 ; i < k ; i ++ ){ + /* if the partition is full of dumb vertices go to next partition*/ + if(size[i] >= max_size) + continue; + /* find a vertex not allready partitionned*/ + do{ + /* call the mersenne twister PRNG of tm_mt.c*/ + j = genrand_int32() % n; + } while ( res[j] != -1 ); + /* allocate and update size of partition*/ + res[j] = i; + /* printf("random: %d -> %d\n",j,i); */ + size[i]++; + } + + /* allocate each unaloacted vertices in the partition that maximize the communication*/ + for( i = 0 ; i < n ; i ++) + if( res[i] == -1) + allocate_vertex(i, res, com_mat, n, size, max_size); + + cost = eval_cost(res,com_mat); + /*print_1D_tab(res,n); + printf("cost=%.2f\n",cost);*/ + if((cost= ERROR) + fprintf(stderr,"Error: Cannot partition %d elements in %d parts\n",n,k); + return NULL; + } + + /* if(USE_KL_KPART) */ + /* res = kPartitioning(comm, n, k, constraints, nb_constraints, KL_KPART_GREEDY_TRIALS); */ + /* else */ + res = kpartition_greedy(k, com_mat, n, constraints, nb_constraints); + + return res; +} + +constraint_t *split_constraints (int *constraints, int nb_constraints, int k, tm_topology_t *topology, int depth) +{ + constraint_t *const_tab = NULL; + int nb_leaves, start, end; + int i; + + const_tab = (constraint_t *)CALLOC(k,sizeof(constraint_t)); + + /* nb_leaves is the number of leaves of the current subtree + this will help to detremine where to split constraints and how to shift values + */ + nb_leaves = 
compute_nb_leaves_from_level( depth + 1, topology ); + +/* split the constraints into k sub-constraints + each sub-contraints 'i' contains constraints of value in [i*nb_leaves,(i+1)*nb_leaves[ + */ + start = 0; + for( i = 0; i < k; i++ ){ + /*returns the indice in contsraints that contains the smallest value not copied + end is used to compute the number of copied elements (end-size) and is used as the next staring indices*/ + end = fill_tab(&(const_tab[i].constraints), constraints, nb_constraints,start, (i+1) * nb_leaves, i * nb_leaves); + const_tab[i].length = end-start; + const_tab[i].id = i; + start = end; + } + + return const_tab; +} + + +com_mat_t **split_com_mat(com_mat_t *com_mat, int n, int k, int *partition) +{ + com_mat_t **res = NULL, *sub_com_mat; + double **sub_mat = NULL; + int *perm = NULL; + int cur_part, i, ii, j, jj, m = n/k, s; + + res = (com_mat_t**)MALLOC(k*sizeof(com_mat_t *)); + + + if(verbose_level >= DEBUG){ + printf("Partition: "); print_1D_tab(partition,n); + display_tab(com_mat->comm,com_mat->n); + } + + perm = (int*)MALLOC(sizeof(int)*m); + for( cur_part = 0 ; cur_part < k ; cur_part ++ ){ + + /* build perm such that submat[i][j] correspond to com_mat[perm[i]][perm[j]] according to the partition*/ + s = 0; + for( j = 0; j < com_mat->n; j ++) /* check only non zero element of of com_mat*/ + if ( partition[j] == cur_part ) + perm[s++] = j; + + /* s is now the size of the non zero sub matrix for this partition*/ + /* built a sub-matrix for partition cur_part*/ + sub_mat = (double **) MALLOC(sizeof(double *) * s); + for( i = 0 ; i < s ; i++) + sub_mat[i] = (double *) MALLOC(sizeof(double ) * s); + + /* build the sub_mat corresponding to the partiion cur_part*/ + for ( i = 0 ; i < s ; i ++){ + ii = perm[i]; + for( j = i ; j < s ; j ++){ + jj = perm[j]; + sub_mat[i][j] = com_mat->comm[ii][jj]; + sub_mat[j][i] = sub_mat[i][j]; + } + } + + sub_com_mat = (com_mat_t *)malloc(sizeof(com_mat_t)); + sub_com_mat -> n = s; + sub_com_mat -> comm = 
sub_mat; + + + /* printf("\n\npartition:%d\n",cur_part);display_tab(sub_mat,m);*/ + + /* assign the sub_mat to the result*/ + res[cur_part] = sub_com_mat; + } + + FREE(perm); + + return res; +} + +int **split_vertices( int *vertices, int n, int k, int *partition) +{ + int **res = NULL, *sub_vertices = NULL; + int m = n/k; + int i, j, cur_part; + + /*allocate resuts*/ + res = (int**) MALLOC(sizeof(int*) * k); + + + if(verbose_level >= DEBUG){ + printf("Partition: ");print_1D_tab(partition,n); + printf("Vertices id: ");print_1D_tab(vertices,n); + } + + /*split the vertices tab of the partition cur_part to the sub_vertices tab*/ + for( cur_part = 0; cur_part < k ; cur_part ++){ + sub_vertices = (int*) MALLOC(sizeof(int) * m); + i = 0; + for( j = 0; j < n; j ++) + if ( partition[j] == cur_part ) + sub_vertices[i++] = vertices[j]; + res[cur_part] = sub_vertices; + if(verbose_level >= DEBUG){ + printf("partition %d: ",cur_part);print_1D_tab(sub_vertices,m); + } + } + /*exit(-1);*/ + return res; +} + +void FREE_tab_com_mat(com_mat_t **mat,int k) +{ + int i,j; + if( !mat ) + return; + + for ( i = 0 ; i < k ; i ++){ + for ( j = 0 ; j < mat[i]->n ; j ++) + FREE( mat[i]->comm[j] ); + FREE( mat[i]->comm ); + } + FREE(mat); +} + +void FREE_tab_local_vertices(int **mat, int k) +{ + int i; /* m=n/k; */ + if( !mat ) + return; + + for ( i = 0 ; i < k ; i ++){ + FREE( mat[i] ); + } + FREE(mat); +} + + +void FREE_const_tab(constraint_t *const_tab, int k) +{ + int i; + + if( !const_tab ) + return; + + for(i = 0; i < k; i++){ + if(const_tab[i].length) + FREE(const_tab[i].constraints); + } + + FREE(const_tab); +} + +void kpartition_build_level_topology(tree_t *cur_node, com_mat_t *com_mat, int N, int depth, + tm_topology_t *topology, int *local_vertices, + int *constraints, int nb_constraints, + double *obj_weight, double *comm_speed) +{ + com_mat_t **tab_com_mat = NULL; /* table of comunication matrix. 
We will have k of such comunication matrix, one for each subtree */ + int k = topology->arity[depth]; + tree_t **tab_child = NULL; + int *partition = NULL; + int **tab_local_vertices = NULL; + constraint_t *const_tab = NULL; + int i; + verbose_level = get_verbose_level(); + + /* if we are at the bottom of the tree set cur_node + and return*/ + if ( depth == topology->nb_levels - 1 ){ + if(verbose_level>=DEBUG) + printf("id : %d, com_mat= %p\n",local_vertices[0], (void *)com_mat->comm); + set_node(cur_node,NULL, 0, NULL, local_vertices[0], 0, NULL, depth); + return; + } + + + /* partition the com_matrix in k partitions*/ + partition = kpartition(topology->arity[depth], com_mat, N, constraints, nb_constraints); + + /* split the communication matrix in k parts according to the partition just found above */ + tab_com_mat = split_com_mat( com_mat, N, k, partition); + + /* split the local vertices in k parts according to the partition just found above */ + tab_local_vertices = split_vertices( local_vertices, N, k, partition); + + /* construct a tab of constraints of size k: one for each partitions*/ + const_tab = split_constraints (constraints, nb_constraints, k, topology, depth); + + /* create the table of k nodes of the resulting sub-tree */ + tab_child = (tree_t **) CALLOC (k,sizeof(tree_t)); + for( i = 0 ; i < k ; i++){ + tab_child[i] = (tree_t *) MALLOC(sizeof(tree_t)); + } + + /* for each child, proceeed recursively*/ + for( i = 0 ; i < k ; i++){ + tab_child[i]->id = i; + kpartition_build_level_topology ( tab_child[i], tab_com_mat[i], N/k, depth + 1, + topology, tab_local_vertices[i], + const_tab[i].constraints, const_tab[i].length, + obj_weight, comm_speed); + tab_child[i]->parent = cur_node; + } + + /* link the node with its child */ + set_node( cur_node, tab_child, k, NULL, cur_node->id, 0, NULL, depth); + + /* FREE local data*/ + FREE(partition); + FREE_tab_com_mat(tab_com_mat,k); + FREE_tab_local_vertices(tab_local_vertices,k); + FREE_const_tab(const_tab,k); 
+} + + +tree_t *kpartition_build_tree_from_topology(tm_topology_t *topology,double **comm,int N, int *constraints, int nb_constraints, double *obj_weight, double *com_speed) +{ + int depth,i, K; + tree_t *root = NULL; + int *local_vertices = NULL; + int nb_cores; + com_mat_t com_mat; + + verbose_level = get_verbose_level(); + + if(verbose_level>=INFO) + printf("Number of constraints: %d\n", nb_constraints); + printf("Number of constraints: %d, N=%d\n", nb_constraints, N); + + nb_cores=nb_processing_units(topology); + + if((constraints == NULL) && (nb_constraints != 0)){ + if(verbose_level>=ERROR) + fprintf(stderr,"size of constraint table not zero while constraint tab is NULL\n"); + return NULL; + } + + if((constraints != NULL) && (nb_constraints > nb_cores)){ + if(verbose_level>=ERROR) + fprintf(stderr,"size of constraint table (%d) is greater than the number of cores (%d)\n", nb_constraints, nb_cores); + return NULL; + } + + depth = 0; + + /* if we have more cores than processes add new dumb process to the com matrix*/ + if((K=nb_cores - N)>0){ + /* add K element to the object weight*/ + complete_obj_weight(&obj_weight,N,K); + /* display_tab(tab,N+K);*/ + } else if( K < 0){ + if(verbose_level>=ERROR) + fprintf(stderr,"Not enough cores!\n"); + return NULL; + } + + com_mat.comm = comm; + com_mat.n = N; + + /* + local_vertices is the array of vertices that can be used + the min(N,nb_contraints) 1st element are number from 0 to N + the last ones have value -1 + the value of this array will be used to number the leaves of the tree_t tree + that start at "root" + + min(N,nb_contraints) is used to takle the case where thre is less processes than constraints + + */ + + local_vertices = (int*) MALLOC (sizeof(int) * (K+N)); + + for( i = 0 ; i < MIN(N,nb_constraints) ; i++) + local_vertices[i] = i; + for( i = MIN(N,nb_constraints) ;i < N + K ; i++) + local_vertices[i] = -1; + + /* we assume all objects have the same arity*/ + /* assign the root of the tree*/ + root = 
(tree_t*) MALLOC (sizeof(tree_t)); + + + + /*build the tree downward from the root*/ + kpartition_build_level_topology(root, &com_mat, N+K, depth, topology, local_vertices, + constraints, nb_constraints, obj_weight, com_speed); + + /*print_1D_tab(local_vertices,K+N);*/ + if(verbose_level>=INFO) + printf("Build (bottom-up) tree done!\n"); + + + + FREE(local_vertices); + + + /* tell the system it is a constraint tree, this is usefull for freeing pointers*/ + root->constraint = 1; + + return root; +} + + diff --git a/ompi/mca/topo/treematch/treematch/tm_kpartitioning.h b/ompi/mca/topo/treematch/treematch/tm_kpartitioning.h new file mode 100644 index 00000000000..58cf6af6ffc --- /dev/null +++ b/ompi/mca/topo/treematch/treematch/tm_kpartitioning.h @@ -0,0 +1,9 @@ +typedef struct _com_mat_t{ + double **comm; + int n; /*comm is of size n by n the other element are zeroes*/ + +} com_mat_t; + + +int *kpartition(int, com_mat_t*, int, int *, int); +tree_t * kpartition_build_tree_from_topology(tm_topology_t *topology,double **com_mat,int N, int *constraints, int nb_constraints, double *obj_weight, double *com_speed); diff --git a/ompi/mca/topo/treematch/treematch/tm_malloc.c b/ompi/mca/topo/treematch/treematch/tm_malloc.c new file mode 100644 index 00000000000..9351215863e --- /dev/null +++ b/ompi/mca/topo/treematch/treematch/tm_malloc.c @@ -0,0 +1,157 @@ +#include "uthash.h" +#include +#include "tm_verbose.h" +#include "tm_malloc.h" + +#define EXTRA_BYTE 100 + +typedef signed char byte; + + +/* static int verbose_level = ERROR;*/ + +typedef struct _hash_t { + void *key; /* we'll use this field as the key */ + size_t size; + UT_hash_handle hh; /* makes this structure hashable */ +}hash_t; + +static hash_t *size_hash = NULL; +static char extra_data[EXTRA_BYTE]; + +static void save_size(void *ptr, size_t size); +static size_t retreive_size(void *someaddr); +static void init_extra_data(void); + +void save_size(void *ptr, size_t size) { + hash_t *elem; + elem = (hash_t*) 
malloc(sizeof(hash_t)); + elem -> key = ptr; + elem -> size = size; + if(get_verbose_level() >= DEBUG) + printf("Storing (%p,%ld)\n",ptr,size); + HASH_ADD_PTR( size_hash, key, elem ); +} + + +size_t retreive_size(void *someaddr){ + size_t res; + hash_t *elem = NULL; + HASH_FIND_PTR(size_hash, &someaddr, elem); + if(!elem){ + fprintf(stderr,"cannot find ptr %p to free!\n",someaddr); + return 0; + } + + res = elem->size; + if(get_verbose_level()>=DEBUG) + printf("Retreiving (%p,%ld)\n",someaddr, res); + + HASH_DEL( size_hash, elem); + return res; +} + +void my_mem_check(void){ + hash_t *s; + int nb_errors = 0; + for(s=size_hash; s != NULL; s=s->hh.next) { + if(get_verbose_level()>=ERROR) + printf("pointer %p of size %ld has not been freed!\n", s->key, s->size); + nb_errors ++; + } + + if(get_verbose_level() >= INFO) + printf ("Number of errors in managing memory: %d\n",nb_errors); +} + +void init_extra_data(void){ + static int done = 0; + int i; + + if(done) + return; + + srandom(0); + + for( i = 0 ; i < EXTRA_BYTE; i++) + extra_data[i] = (char) random() % 256; + + done = 1; +} + + +void *my_malloc(size_t size, char *file, int line){ + byte *ptr; + init_extra_data(); + + size+=2*EXTRA_BYTE; + ptr = malloc(size); + + if(get_verbose_level()>=DEBUG) + printf("my_malloc of size %ld: %p (%s: %d)\n",size-2*EXTRA_BYTE,ptr,file,line); + + save_size(ptr,size); + + memcpy(ptr, extra_data, EXTRA_BYTE); + memcpy(ptr + size - EXTRA_BYTE, extra_data, EXTRA_BYTE); + + + if(get_verbose_level()>=DEBUG) + printf("my_malloc returning: %p\n",ptr+EXTRA_BYTE); + + return (void *)(ptr + EXTRA_BYTE); +} + +void *my_calloc(size_t count, size_t size, char *file, int line){ + byte *ptr; + size_t full_size; + + init_extra_data(); + + full_size = count * size + 2 * EXTRA_BYTE; + + ptr = malloc(full_size); + bzero(ptr,full_size); + save_size(ptr, full_size); + + if(get_verbose_level()>=DEBUG) + printf("my_calloc of size %ld: %p (%s: %d)\n",full_size-2*EXTRA_BYTE,ptr, file, line); + + + 
memcpy(ptr, extra_data, EXTRA_BYTE); + memcpy(ptr + full_size - EXTRA_BYTE, extra_data, EXTRA_BYTE); + + if(get_verbose_level()>=DEBUG) + printf("my_calloc returning: %p\n",ptr+EXTRA_BYTE); + + return (void *)(ptr+EXTRA_BYTE); +} + +void my_free(void *ptr){ + byte *original_ptr = ((byte *)ptr) - EXTRA_BYTE; + size_t size; + + if(!ptr) + return; + + size = retreive_size(original_ptr); + + if((bcmp(original_ptr ,extra_data, EXTRA_BYTE)) && ((get_verbose_level()>=ERROR))){ + fprintf(stderr,"cannot find special string ***before*** %p!\n",ptr); + fprintf(stderr,"memory is probably corrupted here!\n"); + } + + if((bcmp(original_ptr + size -EXTRA_BYTE ,extra_data, EXTRA_BYTE)) && ((get_verbose_level()>=ERROR))){ + fprintf(stderr,"cannot find special string ***after*** %p!\n",ptr); + fprintf(stderr,"memory is probably corrupted here!\n"); + } + + if(get_verbose_level()>=DEBUG) + printf("my_free freeing: %p\n",original_ptr); + + + free(original_ptr); +} + + + diff --git a/ompi/mca/topo/treematch/treematch/tm_malloc.h b/ompi/mca/topo/treematch/treematch/tm_malloc.h new file mode 100644 index 00000000000..c4038d90be7 --- /dev/null +++ b/ompi/mca/topo/treematch/treematch/tm_malloc.h @@ -0,0 +1,5 @@ +#include +void *my_malloc(size_t size, char *, int); +void *my_calloc(size_t count, size_t size, char *, int); +void my_free(void *ptr); +void my_mem_check(void); diff --git a/ompi/mca/topo/treematch/treematch/tm_mapping.c b/ompi/mca/topo/treematch/treematch/tm_mapping.c new file mode 100644 index 00000000000..1f664ad3375 --- /dev/null +++ b/ompi/mca/topo/treematch/treematch/tm_mapping.c @@ -0,0 +1,1368 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "tm_mt.h" +#include "tm_mapping.h" +#include "tm_timings.h" +#include "tm_tree.h" + +#ifdef _WIN32 +#include +#include +#endif + +#define TEST_ERROR(n) do{ \ + if( (n) != 0 ){ \ + fprintf(stderr,"Error %d Line %d\n",n,__LINE__); \ + exit(-1);} \ + }while(0) + +#define LINE_SIZE (1000000) + 
+typedef struct { + int val; + long key; +} hash_t; + + +typedef struct { + double val; + int key1; + int key2; +} hash2_t; + +int distance(tm_topology_t *topology,int i, int j); +int nb_lines(char *); +void init_comm(char *,int,double **);void map_Packed(tm_topology_t *,int,int *); +void map_RR(int ,int *,int *); +int hash_asc(const void*,const void*); +int *generate_random_sol(tm_topology_t *,int,int,int); +double eval_sol(int *,int,double **,double **); +double eval_sol_inv(int *,int,double **,double **); +void exchange(int *,int,int); +double gain_exchange(int *,int,int,double,int,double **,double **); +void select_max(int *,int *,double **,int,int *); +void compute_gain(int *,int,double **,double **,double **); +void map_MPIPP(tm_topology_t *,int,int,int *,double **,double **); +void depth_first(tree_t *,int *,int *); +int nb_leaves(tree_t *); +void map_topology(tm_topology_t *,tree_t *,int,int,int *,int,int *); +int int_cmp(const void*,const void*); +int decompose(int,int,int *); +tree_t *build_synthetic_topology_old(int *,int,int,int); +void update_comm_speed(double **,int,int); +void topology_numbering(tm_topology_t *,int **,int *); +void topology_arity(tm_topology_t *,int **,int *); +void optimize_arity(int **,int *,int); +int get_indice(int *,int,int); +int fill_tab(int **,int *,int,int,int,int); +void update_canonical(int *,int,int,int); +int constraint_dsc(const void*,const void*); +void display_contsraint_tab(constraint_t *,int); +void update_perm(int *,int,constraint_t *,int,int); +void recursive_canonicalization(int,tm_topology_t *,int *,int *,int *,int,int); +void FREE_topology(tm_topology_t *); + + +int distance(tm_topology_t *topology,int i, int j) +{ + int level = topology->nb_levels; + int arity; + int f_i = i,f_j = j; + + do{ + level--; + arity = topology->arity[level]; + if( arity == 0 ) + arity = 1; + f_i = f_i/arity; + f_j = f_j/arity; + } while(f_i!=f_j); + + /* printf("(%d,%d):%d\n",i,j,level);*/ + /* exit(-1); */ + return level; +} + +int 
/*
 * Release a topology: the node-id array of every level, then the
 * three per-level tables (node_id, nb_nodes, arity) and finally the
 * topology descriptor itself.
 */
void FREE_topology(tm_topology_t *topology)
{
  int level = 0;

  /* per-level node id arrays first, they hang off node_id */
  while( level < topology->nb_levels ){
    FREE(topology->node_id[level]);
    level++;
  }

  FREE(topology->node_id);
  FREE(topology->nb_nodes);
  FREE(topology->arity);
  FREE(topology);
}
/*
 * Round-robin placement.
 *
 * N           : number of entries to fill
 * Value       : output array, Value[i] receives the placement of i
 * constraints : optional array of at least N entries; when non-NULL
 *               its first N values are copied, otherwise entry i is
 *               simply placed on i
 */
void map_RR(int N,int *Value, int *constraints)
{
  int rank;

  if(constraints){
    /* constrained case: follow the given list in order */
    for( rank = 0 ; rank < N ; rank++ )
      Value[rank] = constraints[rank];
    return;
  }

  /* unconstrained case: identity placement */
  for( rank = 0 ; rank < N ; rank++ )
    Value[rank] = rank;
}
/*
 * Cost of a placement: for every unordered pair (src,dst) of the N
 * entities, divide the communication volume comm[src][dst] by the
 * architecture value arch[sol[src]][sol[dst]] of the places they are
 * mapped to, and accumulate.
 *
 * sol  : sol[e] is the place assigned to entity e
 * N    : number of entities
 * comm : communication matrix (upper triangle is read)
 * arch : architecture matrix indexed by place
 *
 * Returns the accumulated value (0 when N < 2).
 */
double eval_sol(int *sol,int N,double **comm, double **arch)
{
  double total = 0;
  int src, dst;

  for ( src = 0 ; src < N ; src++ ){
    for ( dst = src+1 ; dst < N ; dst++ ){
      const double volume = comm[src][dst];
      const double link   = arch[sol[src]][sol[dst]];

      total += volume/link;
    }
  }

  return total;
}
gain_exchange(sol,i,j,eval1,N,comm,arch); +} + + + +/* Randomized Algorithm of +Hu Chen, Wenguang Chen, Jian Huang ,Bob Robert,and H.Kuhn. Mpipp: an automatic profile-guided +parallel process placement toolset for smp clusters and multiclusters. In +Gregory K. Egan and Yoichi Muraoka, editors, ICS, pages 353-360. ACM, 2006. + */ + +void map_MPIPP(tm_topology_t *topology,int nb_seed,int N,int *Value,double **comm, double **arch) +{ + int *sol = NULL; + int *state = NULL; + double **gain = NULL; + int **history = NULL; + double *temp = NULL; + int i,j,t,l=0,m=0,seed=0; + double max,sum,best_eval,eval; + + gain = (double**)MALLOC(sizeof(double*)*N); + history = (int**)MALLOC(sizeof(int*)*N); + for( i = 0 ; i < N ; i++){ + gain[i] = (double*)MALLOC(sizeof(double)*N); + history[i] = (int*)MALLOC(sizeof(int)*3); + } + + state = (int*)MALLOC(sizeof(int)*N); + temp = (double*)MALLOC(sizeof(double)*N); + + sol = generate_random_sol(topology,N,topology->nb_levels-1,seed++); + for( i = 0 ; i < N ; i++) + Value[i] = sol[i]; + + best_eval = DBL_MAX; + while(seed <= nb_seed){ + do{ + for( i = 0 ; i < N ; i++ ){ + state[i] = 0; + /* printf("%d ",sol[i]); */ + } + /* printf("\n"); */ + compute_gain(sol,N,gain,comm,arch); + /* + display_tab(gain,N); + exit(-1); + */ + for( i = 0 ; i < N/2 ; i++ ){ + select_max(&l,&m,gain,N,state); + /* printf("%d: %d <=> %d : %f\n",i,l,m,gain[l][m]); */ + state[l] = 1; + state[m] = 1; + exchange(sol,l,m); + history[i][1] = l; + history[i][2] = m; + temp[i] = gain[l][m]; + compute_gain(sol,N,gain,comm,arch); + } + + t = -1; + max = 0; + sum = 0; + for(i = 0 ; i < N/2 ; i++ ){ + sum += temp[i]; + if( sum > max ){ + max = sum; + t = i; + } + } + /*for(j=0;j<=t;j++) + printf("exchanging: %d with %d for gain: %f\n",history[j][1],history[j][2],temp[j]); */ + for( j = t+1 ; j < N/2 ; j++ ){ + exchange(sol,history[j][1],history[j][2]); + /* printf("Undoing: %d with %d for gain: %f\n",history[j][1],history[j][2],temp[j]); */ + } + /* printf("max=%f\n",max); 
*/ + + /*for(i=0;i 0 ); + + sol=generate_random_sol(topology,N,topology->nb_levels-1,seed++); + } +} + +/* void map_tree(tree_t* t1,tree_t *t2) */ +/* { */ + /* double x1,x2; + if((!t1->left)&&(!t1->right)){ + printf ("%d -> %d\n",t1->id,t2->id); + Value[t2->id]=t1->id; + return; + } + x1=t2->right->val/t1->right->val+t2->left->val/t1->left->val; + x2=t2->left->val/t1->right->val+t2->right->val/t1->left->val; + if(x1left,t2->left); + map_tree(t1->right,t2->right); + }else{ + map_tree(t1->right,t2->left); + map_tree(t1->left,t2->right); + }*/ +/* } */ + +void depth_first(tree_t *comm_tree, int *proc_list,int *i) +{ + int j; + if(!comm_tree->child){ + proc_list[(*i)++] = comm_tree->id; + return; + } + + for( j = 0 ; j < comm_tree->arity ; j++ ) + depth_first(comm_tree->child[j],proc_list,i); +} + +int nb_leaves(tree_t *comm_tree) +{ + int j,n=0; + + if(!comm_tree->child) + return 1; + + for( j = 0 ; j < comm_tree->arity ; j++) + n += nb_leaves(comm_tree->child[j]); + + return n; +} + + +/*Map topology to cores: + sigma_i is such that process i is mapped on core sigma_i + k_i is such that core i exectutes process k_i + + size of sigma is the number of process "nb_processes" + size of k is the number of cores/nodes "topology->nb_nodes[level]" + + We must have numbe of process<=number of cores + + k_i =-1 if no process is mapped on core i +*/ + +void map_topology(tm_topology_t *topology,tree_t *comm_tree,int nb_compute_units, + int level,int *sigma, int nb_processes, int *k) +{ + int *nodes_id = NULL; + int *proc_list = NULL; + int i,N,M,block_size; + unsigned int vl = get_verbose_level(); + + M = nb_leaves(comm_tree); + nodes_id = topology->node_id[level]; + N = topology->nb_nodes[level]; + + if(vl >= INFO){ + printf("nb_leaves=%d\n",M); + printf("level=%d, nodes_id=%p, N=%d\n",level,(void *)nodes_id,N); + printf("N=%d,nb_compute_units=%d\n",N,nb_compute_units); + } + + /* The number of node at level "level" in the tree should be equal to the number of processors*/ + 
assert(N==nb_compute_units); + + proc_list = (int*)MALLOC(sizeof(int)*M); + i = 0; + depth_first(comm_tree,proc_list,&i); + + if(vl >= DEBUG) + for(i=0;i= INFO) + printf("M=%d, N=%d, BS=%d\n",M,N,block_size); + for( i = 0 ; i < nb_processing_units(topology) ; i++ ) + k[i] = -1; + + for( i = 0 ; i < M ; i++ ) + if(proc_list[i] != -1){ + if(vl >= DEBUG) + printf ("%d->%d\n",proc_list[i],nodes_id[i/block_size]); + + if( proc_list[i] < nb_processes ){ + sigma[proc_list[i]] = nodes_id[i/block_size]; + k[nodes_id[i/block_size]] = proc_list[i]; + } + } + }else{ + if(vl >= INFO) + printf("M=%d, N=%d, BS=%d\n",M,N,block_size); + for( i = 0 ; i < M ; i++ ) + if(proc_list[i] != -1){ + if(vl >= DEBUG) + printf ("%d->%d\n",proc_list[i],nodes_id[i/block_size]); + if( proc_list[i] < nb_processes ) + sigma[proc_list[i]] = nodes_id[i/block_size]; + } + } + + if(vl >= DEBUG){ + printf("k: "); + for( i = 0 ; i < nb_processing_units(topology) ; i++ ) + printf("%d ",k[i]); + printf("\n"); + } + + + FREE(proc_list); +} + +void map_topology_simple(tm_topology_t *topology,tree_t *comm_tree, int *sigma, int nb_processes, int *k) +{ + map_topology(topology,comm_tree,topology->nb_nodes[topology->nb_levels-1], + topology->nb_levels-1,sigma,nb_processes,k); +} + +int int_cmp(const void* x1,const void* x2) +{ + int *e1 = NULL,*e2= NULL; + + e1 = ((int *)x1); + e2 = ((int *)x2); + + return ((*e1) > (*e2)) ? 
-1 : 1; +} + + +int decompose(int n,int optimize,int *tab) +{ + int primes[6] = {2,3,5,7,11,0}; + int i = 0,j = 1,flag = 2; + unsigned int vl = get_verbose_level(); + + while( primes[i] && (n!=1) ){ + /* printf("[%d] before=%d\n",primes[i],n); */ + if( flag && optimize && (n%primes[i]!= 0) ){ + n += primes[i] - n%primes[i]; + flag--; + i = 0; + continue; + } + /* printf("after=%d\n",n); */ + if( n%primes[i] == 0 ){ + tab[j++] = primes[i]; + n /= primes[i]; + }else{ + i++; + flag = 1; + } + } + if( n != 1 ) + tab[j++] = n; + + qsort(tab+1,j-1,sizeof(int),int_cmp); + + if(vl >= DEBUG){ + for( i = 0 ; i < j ; i++ ) + printf("%d:",tab[i]); + printf("\n"); + } + + tab[j] = 0; + + return (j+1); +} + + +tree_t *build_synthetic_topology_old(int *synt_tab,int id,int depth,int nb_levels) +{ + tree_t *res = NULL,**child = NULL; + int arity = synt_tab[0]; + int val,i; + + res = (tree_t*)MALLOC(sizeof(tree_t)); + val = 0; + if(depth >= nb_levels) + child = NULL; + else{ + child = (tree_t**)MALLOC(sizeof(tree_t*)*arity); + for( i = 0 ; i < arity ; i++ ){ + child[i] = build_synthetic_topology_old(synt_tab+1,i,depth+1,nb_levels); + child[i]->parent = res; + val += child[i]->val; + } + } + set_node(res,child,arity,NULL,id,val+speed(depth),child[0],depth); + return res; +} + +void display_topology(tm_topology_t *topology) +{ + int i,j; + + for( i = 0 ; i < topology->nb_levels ; i++ ){ + printf("%d: ",i); + for( j = 0 ; j < topology->nb_nodes[i] ; j++) + printf("%d ",topology->node_id[i][j]); + printf("\n"); + } +} + +/* + Build a synthetic balanced topology + + arity : array of arity of the first nb_level (of size nb_levels-1) + core_numbering: numbering of the core by the system. 
Array of size nb_core_per_node + + nb_core_per_nodes: number of cores of a given node + + The numbering of the cores is done in round robin fashion after a width traversal of the topology + */ + +tm_topology_t *build_synthetic_topology(int *arity, int nb_levels, int *core_numbering, int nb_core_per_nodes) +{ + tm_topology_t *topology = NULL; + int i,j,n = 1; + + topology = (tm_topology_t*)MALLOC(sizeof(tm_topology_t)); + topology->arity = (int*)MALLOC(sizeof(int)*nb_levels); + memcpy(topology->arity,arity,sizeof(int)*nb_levels); + topology->nb_levels = nb_levels; + + topology->node_id = (int**)MALLOC(sizeof(int*)*topology->nb_levels); + topology->nb_nodes = (int*)MALLOC(sizeof(int)*topology->nb_levels); + + for( i = 0 ; i < topology->nb_levels ; i++ ){ + topology->nb_nodes[i] = n; + topology->node_id[i] = (int*)MALLOC(sizeof(int)*n); + if( i < topology->nb_levels-1) + for( j = 0 ; j < n ; j++ ) + topology->node_id[i][j] = j; + else + for( j = 0 ; j < n ; j++ ) + topology->node_id[i][j] = core_numbering[j%nb_core_per_nodes] + (nb_core_per_nodes)*(j/nb_core_per_nodes); + + n *= topology->arity[i]; + } + return topology; +} + + +void build_synthetic_proc_id(tm_topology_t *topology) +{ + int i; + size_t j,n = 1; + + topology->node_id = (int**)MALLOC(sizeof(int*)*topology->nb_levels); + topology->nb_nodes = (int*)MALLOC(sizeof(int)*topology->nb_levels); + + for( i = 0 ; i < topology->nb_levels ; i++ ){ + /* printf("n= %lld, arity := %d\n",n, topology->arity[i]); */ + topology->nb_nodes[i] = n; + topology->node_id[i] = (int*)MALLOC(sizeof(long int)*n); + if ( !topology->node_id[i] ){ + if(get_verbose_level() >= CRITICAL) + fprintf(stderr,"Cannot allocate level %d (of size %ld) of the topology\n", i, (unsigned long int)n); + exit(-1); + } + for( j = 0 ; j < n ; j++ ) + topology->node_id[i][j] = j; + n *= topology->arity[i]; + } +} + +void update_comm_speed(double **comm_speed,int old_size,int new_size) +{ + double *old_tab = NULL,*new_tab= NULL; + int i; + unsigned int vl 
= get_verbose_level(); + + if(vl >= DEBUG) + printf("comm speed [%p]: ",(void *)*comm_speed); + + old_tab = *comm_speed; + new_tab = (double*)MALLOC(sizeof(double)*new_size); + *comm_speed = new_tab; + + for( i = 0 ; i < new_size ; i++ ){ + if( i < old_size) + new_tab[i] = old_tab[i]; + else + new_tab[i] = new_tab[i-1]; + + if(vl >= DEBUG) + printf("%f ",new_tab[i]); + } + if(vl >= DEBUG) + printf("\n"); +} + + +/* d: size of comm_speed */ +void TreeMatchMapping(int nb_obj, int nb_proc, double **comm_mat, double *obj_weight, double * comm_speed, int d, int *sol) +{ + tree_t *comm_tree = NULL; + tm_topology_t *topology= NULL; + double duration; + int i; + unsigned int vl = get_verbose_level(); + + TIC; + + for( i = 0 ; i < nb_obj ; i++ ){ + sol[i] = i; + /* printf("%f ",obj_weight[i]); */ + } + /* + printf("\n"); + return; + */ + + topology = (tm_topology_t*)MALLOC(sizeof(tm_topology_t)); + topology->arity = (int*)MALLOC(sizeof(int)*MAX_LEVELS); + topology->arity[0] = nb_proc; + topology->nb_levels = decompose((int)ceil((1.0*nb_obj)/nb_proc),1,topology->arity); + if(vl >= INFO) + printf("Topology nb levels=%d\n",topology->nb_levels); + build_synthetic_proc_id(topology); + + if(topology->nb_levels > d) + update_comm_speed(&comm_speed,d,topology->nb_levels); + + /* + exit(-1); + topology_to_arch(topology); + + display_tab(arch,hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_PROC)); + display_tab(arch,96); + exit(-1); + int nb_core=topo_nb_proc(topology,1000); + + display_tab(comm_mat,N); + */ + + TIC; + comm_tree = build_tree_from_topology(topology,comm_mat,nb_obj,obj_weight,comm_speed); + if(vl >= INFO) + printf("Tree building time=%f\n",TOC); + TIC; + map_topology(topology,comm_tree,nb_proc,1,sol,nb_obj,NULL); + if(vl >= INFO) + printf("Topology mapping time=%f\n",TOC); + + if(topology->nb_levels > d) + FREE(comm_speed); + + FREE_topology(topology); + FREE_tree(comm_tree); + + duration=TOC; + if(vl >= INFO) + printf("-------------- Mapping done in %.4fs!\n",duration); 
+} + +void display_other_heuristics(tm_topology_t *topology,int N,double **comm,int TGT_flag, int *constraints, double *cost) +{ + int *sol = NULL; + + sol = (int*)MALLOC(sizeof(int)*N); + + map_Packed(topology,N,sol); + printf("Packed: "); + if (TGT_flag == 1) + print_sol_inv(N,sol,comm,cost, topology); + else + print_sol(N,sol,comm,cost, topology); + + map_RR(N,sol,constraints); + printf("RR: "); + if (TGT_flag == 1) + print_sol_inv(N,sol,comm, cost, topology); + else + print_sol(N,sol,comm, cost, topology); + +/* double duration; */ +/* CLOCK_T time1,time0; */ +/* CLOCK(time0); */ +/* map_MPIPP(topology,1,N,sol,comm,arch); */ +/* CLOCK(time1); */ +/* duration=CLOCK_DIFF(time1,time0); */ +/* printf("MPIPP-1-D:%f\n",duration); */ +/* printf("MPIPP-1: "); */ +/* if (TGT_flag == 1) */ +/* print_sol_inv(N,sol,comm,arch); */ +/* else */ +/* print_sol(N,sol,comm,arch); */ + +/* CLOCK(time0); */ +/* map_MPIPP(topology,5,N,sol,comm,arch); */ +/* CLOCK(time1); */ +/* duration=CLOCK_DIFF(time1,time0); */ +/* printf("MPIPP-5-D:%f\n",duration); */ +/* printf("MPIPP-5: "); */ +/* if (TGT_flag == 1) */ +/* print_sol_inv(N,sol,comm,arch); */ +/* else */ +/* print_sol(N,sol,comm,arch); */ + + FREE(sol); +} + +void topology_numbering(tm_topology_t *topology,int **numbering,int *nb_nodes) +{ + int nb_levels; + unsigned int vl = get_verbose_level(); + + nb_levels = topology->nb_levels; + *nb_nodes = topology->nb_nodes[nb_levels-1]; + if(vl >= INFO) + printf("nb_nodes=%d\n",*nb_nodes); + *numbering = (int*)MALLOC(sizeof(int)*(*nb_nodes)); + memcpy(*numbering,topology->node_id[nb_levels-1],sizeof(int)*(*nb_nodes)); +} + +void topology_arity(tm_topology_t *topology,int **arity,int *nb_levels) +{ + *nb_levels = topology->nb_levels; + *arity = (int*)MALLOC(sizeof(int)*(*nb_levels)); + memcpy(*arity,topology->arity,sizeof(int)*(*nb_levels)); +} + +void optimize_arity(int **arity, int *nb_levels,int n) +{ + int a,i; + int *new_arity = NULL; + + if( n < 0 ) + return; + /* 
printf("n=%d\tnb_levels=%d\n",n,*nb_levels); */ + /* for(i=0;i<*nb_levels;i++) */ + /* printf("%d:",(*arity)[i]); */ + /* printf("\n"); */ + /* if(n==(*nb_levels)-3) */ + /* exit(-1); */ + a = (*arity)[n]; + if( (a%3 == 0) && (a > 3) ){ + /* + check if the a rity of level n devides 3 + If this is the case: + Add a level + */ + (*nb_levels)++; + /* Build a new arity array */ + new_arity = (int*)MALLOC(sizeof(int)*(*nb_levels)); + /* Copy the begining if the old array */ + for( i = 0 ; i < n ; i++) + new_arity[i] = (*arity)[i]; + /* set the nth level to arity 3 */ + new_arity[n] = 3; + /* printf("a=%d\n",a); */ + /* Set the (n+1) level to arity a/3 */ + new_arity[n+1] = a/3; + /* Copy the end of the array */ + for( i = n+2 ; i < *nb_levels ; i++) + new_arity[i] = (*arity)[i-1]; + FREE(*arity); + /* if a/3 =3 then go to the next level */ + if(new_arity[n+1] == 3) + optimize_arity(&new_arity,nb_levels,n); + else /* continue to this level (remember we just add a new level */ + optimize_arity(&new_arity,nb_levels,n+1); + *arity=new_arity; + }else if( (a%2==0) && (a>2) ){/* same as above but for arity == 2 instead of 3 */ + (*nb_levels)++; + new_arity = (int*)MALLOC(sizeof(int)*(*nb_levels)); + for( i = 0 ; i < n ; i++ ) + new_arity[i] = (*arity)[i]; + new_arity[n] = 2; + /* printf("a=%d\n",a); */ + new_arity[n+1] = a/2; + for( i = n+2 ; i < *nb_levels ; i++ ) + new_arity[i] = (*arity)[i-1]; + FREE(*arity); + if(new_arity[n+1] == 2) + optimize_arity(&new_arity,nb_levels,n); + else + optimize_arity(&new_arity,nb_levels,n+1); + *arity = new_arity; + }else /* if nothing works go to next level. 
*/ + optimize_arity(arity,nb_levels,n-1); +} + + + +tm_topology_t *optimize_topology(tm_topology_t *topology){ + int *arity = NULL,nb_levels; + int *numbering = NULL,nb_nodes; + tm_topology_t *new_topo; + + topology_arity(topology,&arity,&nb_levels); + /* printf("nb_levels=%d\n",nb_levels); */ + /* for(i=0;inb_levels-1) + res *= topology->arity[depth++]; + + return res; +} + + + +/* return the indice of the greatest element of tab slower than val + tab needs to be sorted in increasing order*/ +int get_indice(int *tab, int n, int val) +{ + int i = 0, j = n-1, k; + + if( tab[n-1] < val ) + return n-1; + + while( i != j){ + k = (i+j)/2; + if( (tab[k]= max_val) + break; + end++; + } + + /* if none return */ + if( start == end ){ + *new_tab = NULL; + return end; + } + + /* allocate the result*/ + res = (int*) MALLOC (sizeof(int)*(end-start)); + + /* copy and shift*/ + j = 0; + for( i = start ; i < end ; i++ ){ + res[j] = tab[i] - shift; + j++; + } + + /* set the pointer passed in parameter and return */ + *new_tab = res; + return end; +} + + +/* + update the table of canonical values by adding the shift from start to end + This is required because value in canonical are initialized by 0 + When we know which part of this table belong to which subtree we can update the values +*/ +void update_canonical(int *canonical,int start, int end, int shift) +{ + int i; + for( i = start ; i < end ; i++ ) + canonical[i] += shift; +} + + + +/* function to sort constraint_t* tab using qsort*/ +int constraint_dsc(const void* x1,const void* x2) +{ + constraint_t *e1 = NULL,*e2 = NULL; + + e1 = ((constraint_t*)x1); + e2 = ((constraint_t*)x2); + + return (e1->length > e2->length) ? 
-1 : 1; +} + + +/* display function: for each of the n entries of const_tab, print its index and pass its constraints array and length to print_1D_tab */ +void display_contsraint_tab(constraint_t *const_tab, int n) +{ + int i; + for( i = 0; i < n; i++ ){ + printf("tab %d:",i);print_1D_tab(const_tab[i].constraints, const_tab[i].length); + } /* the braces matter: without them print_1D_tab ran only once, after the loop, on const_tab[n] */ +} + +/* + We shift perm in new_perm and then copy back + perm is decomposed in m part of size 'size' + + in part k of new_perm we copy part constratint[k].id +*/ + +void update_perm(int *perm, int n, constraint_t *const_tab, int m, int size) +{ + int k; + int *new_perm = NULL; + + if( n <= 1 ) + return; + + new_perm = (int*)MALLOC(sizeof(int)*n); + + for ( k = 0 ; k < m ; k++ ) + memcpy(new_perm+k*size,perm+const_tab[k].id*size,size*sizeof(int)); + + memcpy(perm,new_perm,n*sizeof(int)); + /*printf("perm:");print_1D_tab(perm,n);*/ + + FREE(new_perm); +} + + + +/* we are at a given subtree of depth depth of the topology + the mapping constraints are in the table constraints of size n + The value of constraints are between 0 and the number of leaves-1 of the current subtree + + Canonical is the output of the function and is a just a renumbering of constraints in the canonical way + perm is a way to go from canonical[i] to the corresponding constraints[k]: perm[canonical[i]]=constraints[k] +*/ + +void recursive_canonicalization(int depth, tm_topology_t *topology, int *constraints, int *canonical, int *perm, int n, int m) +{ + constraint_t *const_tab = NULL; + int nb_leaves,nb_subtrees; + int k, prec, start, end; + + /* if there is no constraints stop and return*/ + if( !constraints ){ + assert( n == 0 ); + return; + } + + /* if we are at teh bottom of the tree set canonical to the 0 value: it will be shifted by update_canonical + and return*/ + if ( depth == topology->nb_levels ){ + assert( n==1 ); + canonical[0] = 0; + return; + } + + /* compute in how many subtrees we need to devide the curret one*/ + nb_subtrees = topology->arity[depth]; + /* construct a tab of constraints of this size*/ + const_tab = (constraint_t *) MALLOC( nb_subtrees * 
sizeof(constraint_t) ); + + /*printf("tab (%d):",nb_subtrees,n);print_1D_tab(constraints,n);*/ + /* nb_leaves is the number of leaves of the current subtree + this will help to detremine where to split constraints and how to shift values + */ + nb_leaves = compute_nb_leaves_from_level( depth + 1, topology ); + + /* split the constraints into nb_subtrees sub-constraints + each sub-contraints k contains constraints of value in [k*nb_leaves,(k+1)*nb_leaves[ + */ + start = 0; + for(k = 0; k < nb_subtrees; k++){ + /*returns the indice in contsraints that contains the smallest value not copied + end is used to compute the number of copied elements (end-size) and is used as the next staring indices*/ + end=fill_tab(&(const_tab[k].constraints), constraints, n,start, (k+1) * nb_leaves, k * nb_leaves); + const_tab[k].length = end-start; + const_tab[k].id = k; + start = end; + } + + /* sort constraint tab such that subtrees with the largest number of + constraints are put on the left and managed first, this how we canonize subtrees*/ + qsort(const_tab, nb_subtrees, sizeof(constraint_t), constraint_dsc); + /*display_contsraint_tab(const_tab,nb_subtrees);*/ + + /* update perm such taht we can backtrack the changes between constraints and caononical + To go from canonical[i] to the corresponding constraints[k] perm is such that perm[canonical[i]]=constraints[k]*/ + update_perm(perm, m, const_tab, nb_subtrees, nb_leaves); + + /* recursively call each subtree*/ + prec = 0; + for(k = 0; k < nb_subtrees; k++){ + /* the tricky part is here : we send only a subtab of canonical that will be updated recursively + This will greatly simplify the merging*/ + recursive_canonicalization(depth+1, topology, const_tab[k].constraints, canonical+prec, perm+k*nb_leaves, + const_tab[k].length, nb_leaves); + prec += const_tab[k].length; + } + + /* merging consist only in shifting the right part of canonical*/ + start = const_tab[0].length; + for( k = 1; k < nb_subtrees ; k++){ + 
update_canonical(canonical, start, start+const_tab[k].length, k * nb_leaves); + start += const_tab[k].length; + } + + /* FREE local subconstraints*/ + for( k = 0; k < nb_subtrees; k++ ) + if(const_tab[k].length) + FREE(const_tab[k].constraints); + + FREE(const_tab); +} + +/* + shuffle the constraints such that for each node there are more constraints on the left subtree than on the right subtree + + This is required to avoid handling permutations. On a 2:2:2:2 tree, if the + contraints are (0,1,3), it is equivalent to (0,1,2) The canonical form is the + second one. This help to handle the case (0,6,7,9,11,13,14,15) which are + symetric constaints and for which the canonical form is (0,1,2,4,6,8,9,12)) + + + + We store in *perm the way to go from the canonical form to the original constraints. + perm is a way to go from canonical[i] to the corresponding constraints[k]: perm[canonical[i]]=constraints[k] + */ +void canonize_constraints(tm_topology_t *topology, int *constraints, int **canonical, int n, int **perm, int *m) +{ + int *p = NULL, *c = NULL; + int i; + unsigned int vl = get_verbose_level(); + + *m = compute_nb_leaves_from_level(0,topology); + + p = (int*) MALLOC(sizeof(int)*(*m)); + for( i = 0 ; i < *m ; i++ ) + p[i] = i; + + c = (int*) MALLOC(sizeof(int)*n); + + if(vl>=DEBUG){ + printf("constraints:"); + print_1D_tab(constraints, n); + } + + recursive_canonicalization(0, topology, constraints, c, p, n, *m); + + if(vl>=DEBUG){ + printf("canonical:"); + print_1D_tab(c, n); + printf("perm:"); + print_1D_tab(p, *m); + } + + *perm = p; + *canonical = c; +} diff --git a/ompi/mca/topo/treematch/treematch/tm_mapping.h b/ompi/mca/topo/treematch/treematch/tm_mapping.h new file mode 100644 index 00000000000..0068184b567 --- /dev/null +++ b/ompi/mca/topo/treematch/treematch/tm_mapping.h @@ -0,0 +1,43 @@ +#include "tm_tree.h" +#include "tm_hwloc.h" +#include "tm_timings.h" +#include "tm_verbose.h" + +int build_comm(char *filename,double ***pcomm); +void 
TreeMatchMapping(int nb_obj, int nb_proc,double **comm_mat, double * obj_weigth, double *com_speed, int d, int *sol); + +/*Map topology to cores: + sigma_i is such that process i is mapped on core sigma_i + k_i is such that core i exectutes process k_i + + size of sigma is the number of process (nb_objs) + size of k is the number of cores/nodes (nb_proc) + + We must have numbe of process<=number of cores + + k_i =-1 if no process is mapped on core i +*/ +void map_topology_simple(tm_topology_t *topology,tree_t *comm_tree, int *sigma, int nb_processes, int *k); + +int nb_processing_units(tm_topology_t *topology); +void free_topology(tm_topology_t *topology); +void display_other_heuristics(tm_topology_t *topology,int N,double **comm,int TGT_flag, int *constraints, double *cost); +void print_1D_tab(int *tab,int N); +void build_synthetic_proc_id(tm_topology_t *topology); +void display_topology(tm_topology_t *topology); +tm_topology_t *build_synthetic_topology(int *arity, int nb_levels, int *core_numbering, int nb_core_per_node); +tm_topology_t *optimize_topology(tm_topology_t *topology); +double print_sol_inv(int N,int *Value,double **comm, double *cost, tm_topology_t *topology); +double print_sol(int N,int *Value,double **comm, double *cost, tm_topology_t *topology); +int build_binding_constraints(char *filename, int **ptab); +void canonize_constraints(tm_topology_t *topology, int *constraints, int **canonical, int n, int **perm, int *m); +int compute_nb_leaves_from_level(int depth,tm_topology_t *topology); +void FREE_topology(tm_topology_t *); + + +/* use to split a constaint into subconstraint according the tree*/ +typedef struct _constraint{ + int *constraints; /* the subconstraints*/ + int length; /*length of *constraints*/ + int id; /* id of the corresponding subtree*/ +}constraint_t; diff --git a/ompi/mca/topo/treematch/treematch/tm_mt.c b/ompi/mca/topo/treematch/treematch/tm_mt.c new file mode 100644 index 00000000000..55c8f7bd905 --- /dev/null +++ 
b/ompi/mca/topo/treematch/treematch/tm_mt.c @@ -0,0 +1,198 @@ +/* + A C-program for MT19937, with improved initialization 2002/1/26. + + This is an optimized version that amortizes the shift/reload cost, + by Eric Landry 2004-03-15. + + Before using, initialize the state by using init_genrand(seed) or + init_by_array(init_key, key_length). + + Copyright (C) 1997--2004, Makoto Matsumoto, Takuji Nishimura, and + Eric Landry; All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + + 3. The names of its contributors may not be used to endorse or + promote products derived from this software without specific + prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + Any feedback is very welcome. 
+ http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/emt.html + email: m-mat @ math.sci.hiroshima-u.ac.jp (remove space) + + Reference: M. Matsumoto and T. Nishimura, "Mersenne Twister: + A 623-Dimensionally Equidistributed Uniform Pseudo-Random Number + Generator", ACM Transactions on Modeling and Computer Simulation, + Vol. 8, No. 1, January 1998, pp 3--30. +*/ + +#include "tm_mt.h" + +/* Period parameters */ +#define N 624 +#define M 397 +#define MATRIX_A 0x9908b0dfUL /* constant vector a */ +#define UPPER_MASK 0x80000000UL /* most significant w-r bits */ +#define LOWER_MASK 0x7fffffffUL /* least significant r bits */ + +static unsigned long x[N]; /* the array for the state vector */ +static unsigned long *p0, *p1, *pm; + +/* + initialize with a seed + + See Knuth TAOCP Vol2. 3rd Ed. P.106 for multiplier. + + In the previous versions, MSBs of the seed affect only MSBs of + the state. + + 2002-01-09 modified by Makoto Matsumoto +*/ +void +init_genrand(unsigned long s) +{ + int i; + + x[0] = s & 0xffffffffUL; + for (i = 1; i < N; ++i) { + x[i] = (1812433253UL * (x[i - 1] ^ (x[i - 1] >> 30)) + i) + & 0xffffffffUL; /* for >32 bit machines */ + } + p0 = x; + p1 = x + 1; + pm = x + M; +} + +/* + initialize by an array with array-length + + init_key is the array for initializing keys + + key_length is its length + + 2004-02-26 slight change for C++ +*/ +void +init_by_array(unsigned long init_key[], int key_length) +{ + int i, j, k; + + init_genrand(19650218UL); + i = 1; + j = 0; + for (k = (N > key_length ? 
N : key_length); k; --k) { + /* non linear */ + x[i] = ((x[i] ^ ((x[i - 1] ^ (x[i - 1] >> 30)) * 1664525UL)) + + init_key[j] + j) & 0xffffffffUL; /* for WORDSIZE > 32 machines */ + if (++i >= N) { + x[0] = x[N - 1]; + i = 1; + } + if (++j >= key_length) { + j = 0; + } + } + for (k = N - 1; k; --k) { + /* non linear */ + x[i] = ((x[i] ^ ((x[i - 1] ^ (x[i - 1] >> 30)) * 1566083941UL)) - i) + & 0xffffffffUL; /* for WORDSIZE > 32 machines */ + if (++i >= N) { + x[0] = x[N - 1]; + i = 1; + } + } + x[0] = 0x80000000UL; /* MSB is 1; assuring non-zero initial array */ +} + +/* generates a random number on the interval [0,0xffffffff] */ +unsigned long +genrand_int32(void) +{ + unsigned long y; + + if (!p0) { + /* Default seed */ + init_genrand(5489UL); + } + /* Twisted feedback */ + y = *p0 = *pm++ ^ (((*p0 & UPPER_MASK) | (*p1 & LOWER_MASK)) >> 1) + ^ (-(*p1 & 1) & MATRIX_A); + p0 = p1++; + if (pm == x + N) { + pm = x; + } + if (p1 == x + N) { + p1 = x; + } + /* Temper */ + y ^= y >> 11; + y ^= y << 7 & 0x9d2c5680UL; + y ^= y << 15 & 0xefc60000UL; + y ^= y >> 18; + return y; +} + +/* generates a random number on the interval [0,0x7fffffff] */ +long +genrand_int31(void) +{ + return (long) (genrand_int32() >> 1); +} + +/* generates a random number on the real interval [0,1] */ +double +genrand_real1(void) +{ + return genrand_int32() * (1.0 / 4294967295.0); + /* divided by 2^32-1 */ +} + +/* generates a random number on the real interval [0,1) */ +double +genrand_real2(void) +{ + return genrand_int32() * (1.0 / 4294967296.0); + /* divided by 2^32 */ +} + +/* generates a random number on the real interval (0,1) */ +double +genrand_real3(void) +{ + return (((double) genrand_int32()) + 0.5) * (1.0 / 4294967296.0); + /* divided by 2^32 */ +} + +/* generates a 53-bit random number on the real interval [0,1) */ +double +genrand_res53(void) +{ + unsigned long a = genrand_int32() >> 5, b = genrand_int32() >> 6; + + return (a * 67108864.0 + b) * (1.0 / 9007199254740992.0); +} + +/* 
2002-01-09 These real versions are due to Isaku Wada */ + diff --git a/ompi/mca/topo/treematch/treematch/tm_mt.h b/ompi/mca/topo/treematch/treematch/tm_mt.h new file mode 100644 index 00000000000..260067d514d --- /dev/null +++ b/ompi/mca/topo/treematch/treematch/tm_mt.h @@ -0,0 +1,11 @@ +void init_genrand(unsigned long s); +void init_by_array(unsigned long init_key[], int key_length); + +/* generates a random number on the interval [0,0xffffffff] */ +unsigned long genrand_int32(void); +/* generates a random number on the interval [0,0x7fffffff] */ +long genrand_int31(void); +double genrand_real1(void); +double genrand_real2(void); +double genrand_real3(void); +double genrand_res53(void); diff --git a/ompi/mca/topo/treematch/treematch/tm_thread_pool.c b/ompi/mca/topo/treematch/treematch/tm_thread_pool.c new file mode 100644 index 00000000000..e4fbc22ad2a --- /dev/null +++ b/ompi/mca/topo/treematch/treematch/tm_thread_pool.c @@ -0,0 +1,349 @@ +#include +#include "tm_thread_pool.h" +#include "tm_verbose.h" +#include +#include "tm_verbose.h" +#include "tm_tree.h" +#include + +static int verbose_level = ERROR; +static thread_pool_t *pool = NULL; + +static thread_pool_t *get_thread_pool(void); +static void execute_work(work_t *work); +static int bind_myself_to_core(hwloc_topology_t topology, int id); +static void *thread_loop(void *arg); +static void add_work(pthread_mutex_t *list_lock, pthread_cond_t *cond_var, work_t *working_list, work_t *work); +static thread_pool_t *create_threads(void); + +static void f1 (int nb_args, void **args); +static void f2 (int nb_args, void **args); +static void destroy_work(work_t *work); + + +void f1 (int nb_args, void **args){ + int a, b; + a = *(int*)args[0]; + b = *(int*)args[1]; + printf("nb_args=%d, a=%d, b=%d\n",nb_args,a,b); +} + + +void f2 (int nb_args, void **args){ + int n, *tab; + int *res; + int i,j; + n = *(int*)args[0]; + tab = (int*)args[1]; + res=(int*)args[2]; + + for(j=0;j<1000000;j++){ + *res=0; + for (i=0;itask(work->nb_args, work->args); +} + +int 
bind_myself_to_core(hwloc_topology_t topology, int id){ + hwloc_cpuset_t cpuset; + hwloc_obj_t obj; + char *str; + int binding_res; + int depth = hwloc_topology_get_depth(topology); + /* printf("depth=%d\n",depth); */ + + /* Get my core. */ + obj = hwloc_get_obj_by_depth(topology, depth-1, id); + if (obj) { + /* Get a copy of its cpuset that we may modify. */ + cpuset = hwloc_bitmap_dup(obj->cpuset); + + /* Get only one logical processor (in case the core is + SMT/hyperthreaded). */ + hwloc_bitmap_singlify(cpuset); + + + /*hwloc_bitmap_asprintf(&str, cpuset); + printf("Binding thread %d to cpuset %s\n", id,str); + FREE(str); + */ + + /* And try to bind ourself there. */ + binding_res = hwloc_set_cpubind(topology, cpuset, HWLOC_CPUBIND_THREAD); + if (binding_res == -1){ + int error = errno; + hwloc_bitmap_asprintf(&str, obj->cpuset); + if(verbose_level>=WARNING) + fprintf(stderr,"%d Couldn't bind to cpuset %s: %s\n", id, str, strerror(error)); + FREE(str); + return 0; + } + /* FREE our cpuset copy */ + hwloc_bitmap_free(cpuset); + return 1; + }else{ + if(verbose_level>=WARNING) + fprintf(stderr,"No valid object for core id %d!\n",id); + return 0; + } +} + + + + +void *thread_loop(void *arg){ + local_thread_t *local=(local_thread_t*)arg; + int id = local->id; + hwloc_topology_t topology= local->topology; + work_t *start_working_list = local ->working_list; + pthread_cond_t *cond_var = local->cond_var; + pthread_mutex_t *list_lock = local->list_lock; + work_t *work; + int *ret = (int *)MALLOC(sizeof(int)); + + bind_myself_to_core(topology,id); + + + + while(1){ + pthread_mutex_lock(list_lock); + while(start_working_list->next == NULL) { + pthread_cond_wait(cond_var, list_lock); + } + + work = start_working_list->next; + start_working_list->next = work-> next; + pthread_mutex_unlock(list_lock); + + if(!work->task){ + *ret = 0; + pthread_exit(ret); + } + + execute_work(work); + pthread_mutex_lock(&work->mutex); + work->done=1; + pthread_mutex_unlock(&work->mutex); + 
pthread_cond_signal(&work->work_done); + } + +} + +void add_work(pthread_mutex_t *list_lock, pthread_cond_t *cond_var, work_t *working_list, work_t *work){ + + work_t *elem = working_list; + pthread_mutex_lock(list_lock); + while(elem->next!=NULL){ + elem=elem->next; + } + elem->next=work; + work -> next = NULL; + work -> done = 0; + pthread_cond_signal(cond_var); + pthread_mutex_unlock(list_lock); +} + + +void wait_work_completion(work_t *work){ + pthread_mutex_lock(&work->mutex); + while(!work->done) + pthread_cond_wait(&work->work_done, &work->mutex); + +} + + +int submit_work(work_t *work, int thread_id){ + if( (thread_id>=0) && (thread_id< pool->nb_threads)){ + add_work(&pool->list_lock[thread_id], &pool->cond_var[thread_id], &pool->working_list[thread_id], work); + return 1; + } + return 0; +} + +thread_pool_t *create_threads(){ + hwloc_topology_t topology; + int i; + local_thread_t *local; + int nb_cores; + int depth; + + verbose_level = get_verbose_level(); + + + /*Get number of cores: set 1 thread per core*/ + /* Allocate and initialize topology object. */ + hwloc_topology_init(&topology); + /* Only keep relevant levels + hwloc_topology_ignore_all_keep_structure(topology);*/ + /* Perform the topology detection. 
*/ + hwloc_topology_load(topology); + depth = hwloc_topology_get_depth(topology); + if (depth == -1 ) { + if(verbose_level>=CRITICAL) + fprintf(stderr,"Error: topology with unknown depth\n"); + exit(-1); + } + + + + /* at depth 'depth' it is necessary a PU/core where we can execute things*/ + nb_cores = hwloc_get_nbobjs_by_depth(topology, depth-1); + + pool = (thread_pool_t*) MALLOC(sizeof(thread_pool_t)); + pool -> topology = topology; + pool -> nb_threads = nb_cores; + pool -> thread_list = (pthread_t*)MALLOC(sizeof(pthread_t)*nb_cores); + pool -> working_list = (work_t*)CALLOC(nb_cores,sizeof(work_t)); + pool -> cond_var = (pthread_cond_t*)MALLOC(sizeof(pthread_cond_t)*nb_cores); + pool -> list_lock = (pthread_mutex_t*)MALLOC(sizeof(pthread_mutex_t)*nb_cores); + + local=(local_thread_t*)MALLOC(sizeof(local_thread_t)*nb_cores); + pool->local = local; + + for (i=0;iworking_list[i]; + pthread_cond_init(pool->cond_var +i, NULL); + local[i].cond_var = pool->cond_var +i; + pthread_mutex_init(pool->list_lock +i, NULL); + local[i].list_lock = pool->list_lock+i; + if (pthread_create (pool->thread_list+i, NULL, thread_loop, local+i) < 0) { + if(verbose_level>=CRITICAL) + fprintf(stderr, "pthread_create error for exec thread %d\n",i); + return NULL; + } + } + return pool; +} + +thread_pool_t *get_thread_pool(){; + if (pool == NULL) + return create_threads(); + + return pool; +} + +void terminate_thread_pool(){ + int id; + int *ret=NULL; + work_t work; + + if(pool){ + work.task=NULL; + for (id=0;idnb_threads;id++){ + submit_work(&work,id); + } + + + for (id=0;idnb_threads;id++){ + pthread_join(pool->thread_list[id],(void **) &ret); + pthread_cond_destroy(pool->cond_var +id); + pthread_mutex_destroy(pool->list_lock +id); + if (pool->working_list[id].next != NULL) + if(verbose_level >= WARNING) + fprintf(stderr,"Working list of thread %d not empty!\n",id); + } + + hwloc_topology_destroy(pool->topology); + FREE(pool -> thread_list); + FREE(pool -> working_list); + FREE(pool -> 
cond_var); + FREE(pool -> list_lock); + FREE(pool -> local); + FREE(pool); + pool = NULL; + } +} + + + + +int get_nb_threads(){ + pool = get_thread_pool(); + return pool -> nb_threads; +} + + +work_t *create_work(int nb_args, void **args, void (*task) (int, void **)){ + work_t *work; + work = MALLOC(sizeof(work_t)); + work -> nb_args = nb_args; + work -> args = args; + work -> task = task; + work -> done = 0; + pthread_cond_init (&work->work_done, NULL); + pthread_mutex_init(&work->mutex, NULL); + if( verbose_level >= DEBUG) + printf("work %p created\n",(void *)work); + return work; +} + + +void destroy_work(work_t *work){ + pthread_cond_destroy(&work->work_done); + pthread_mutex_destroy(&work->mutex); + FREE(work); +} + + +int test_main(void){ + + int a=3, c; + int b=-5; + void *args1[3]; + void *args2[3]; + int tab[100]; + int i,res; + work_t *work1,*work2,*work3,*work4; + int nb_threads = get_nb_threads(); + + + printf("nb_threads= %d\n", nb_threads); + + + args1[0] = &a; + args1[1] = &b; + work1 = create_work(2,args1,f1); + + + for (i=0;i<100;i++) + tab[i]=i; + + c=100; + args2[0] = &c; + args2[1] = tab; + args2[2] = &res; + + work2 = create_work(3, args2, f2); + work3 = create_work(4, args2, f2); + work4 = create_work(5, args2, f2); + + submit_work(work1,0); + submit_work(work2,1); + submit_work(work3,1); + submit_work(work4,1); + + + + terminate_thread_pool(); + wait_work_completion(work1); + wait_work_completion(work2); + wait_work_completion(work3); + wait_work_completion(work4); + + printf("res=%d\n",res); + + destroy_work(work1); + destroy_work(work2); + destroy_work(work3); + destroy_work(work4); + return 0; +} diff --git a/ompi/mca/topo/treematch/treematch/tm_thread_pool.h b/ompi/mca/topo/treematch/treematch/tm_thread_pool.h new file mode 100644 index 00000000000..36fd80d17f8 --- /dev/null +++ b/ompi/mca/topo/treematch/treematch/tm_thread_pool.h @@ -0,0 +1,45 @@ +#ifndef THREAD_POOL_H +#define THREAD_POOL_H + +#include +#include + + +typedef struct 
_work_t{ + int nb_args; + void (*task)(int nb_args, void **args); + void **args; + struct _work_t *next; + pthread_cond_t work_done; + pthread_mutex_t mutex; + int done; +}work_t; + +typedef struct { + int id; + hwloc_topology_t topology; + work_t *working_list; + pthread_cond_t *cond_var; + pthread_mutex_t *list_lock; +}local_thread_t; + + +typedef struct _thread_pool_t{ + int nb_threads; + pthread_t *thread_list; + work_t *working_list; + pthread_cond_t *cond_var; + pthread_mutex_t *list_lock; + local_thread_t *local; + hwloc_topology_t topology; +}thread_pool_t; + +int get_nb_threads(void); +int submit_work(work_t *work, int thread_id); +void wait_work_completion(work_t *work); +void terminate_thread_pool(void); +work_t *create_work(int nb_args, void **args, void (int, void **)); +int test_main(void); + + +#endif /* THREAD_POOL_H */ diff --git a/ompi/mca/topo/treematch/treematch/tm_timings.c b/ompi/mca/topo/treematch/treematch/tm_timings.c new file mode 100644 index 00000000000..8f00865eba9 --- /dev/null +++ b/ompi/mca/topo/treematch/treematch/tm_timings.c @@ -0,0 +1,32 @@ +#include "tm_timings.h" + +static CLOCK_T time_tab[MAX_CLOCK]; +static int clock_num = -1; + +void get_time(void) +{ + clock_num++; + + if(clock_num>MAX_CLOCK-1) + return; + + CLOCK(time_tab[clock_num]); +} +double time_diff(void) +{ + CLOCK_T t2,t1; + + if(clock_num>MAX_CLOCK-1){ + clock_num--; + return -1.0; + } + + if(clock_num < 0){ + return -1.0; + } + + CLOCK(t2); + t1=time_tab[clock_num--]; + + return CLOCK_DIFF(t2,t1); +} diff --git a/ompi/mca/topo/treematch/treematch/tm_timings.h b/ompi/mca/topo/treematch/treematch/tm_timings.h new file mode 100644 index 00000000000..250ee5c1459 --- /dev/null +++ b/ompi/mca/topo/treematch/treematch/tm_timings.h @@ -0,0 +1,47 @@ + +#ifndef TIMINGS_H +#define TIMINGS_H +#include + +#ifndef _WIN32 +#include +#else +#include +#endif +#include +#include + +#define MAX_CLOCK 1000 + +#ifndef _WIN32 +typedef struct timeval CLOCK_T; + + +#define CLOCK(c) 
gettimeofday(&c,(struct timezone *)NULL) +#define CLOCK_DIFF(c1,c2) \ +((double)(c1.tv_sec-c2.tv_sec)+(double)(c1.tv_usec-c2.tv_usec)/1e+6) +#define CLOCK_DISPLAY(c) fprintf(stderr,"%d.%d",(int)c.tv_sec,(int)c.tv_usec) + +#else /* for windows */ + +#ifdef __CYGWIN__ +typedef struct timeb CLOCK_T; +#else +typedef struct _timeb CLOCK_T; +#endif + +#define CLOCK(c) _ftime(&c) +#define CLOCK_DIFF(c1,c2) \ +((double)(c1.time-c2.time)+(double)(c1.millitm-c2.millitm)/1e+3) +#define CLOCK_DISPLAY(c) fprintf(stderr,"%d.%d",(int)c.time,(int)c.millitm*1e+3) + +#endif + +double time_diff(void); +void get_time(void); + +#define TIC get_time() +#define TOC time_diff() + +#endif /*TIMINGS_H*/ + diff --git a/ompi/mca/topo/treematch/treematch/tm_tree.c b/ompi/mca/topo/treematch/treematch/tm_tree.c new file mode 100644 index 00000000000..0d39a8e3b25 --- /dev/null +++ b/ompi/mca/topo/treematch/treematch/tm_tree.c @@ -0,0 +1,1648 @@ +#include +#include +#include +#include +#include +#include "tm_tree.h" +#include "tm_timings.h" +#include "tm_bucket.h" +#include "tm_kpartitioning.h" +#include "tm_mapping.h" +#include "tm_verbose.h" +#include "tm_thread_pool.h" + + +#define MIN(a,b) ((a)<(b)?(a):(b)) +#define MAX(a,b) ((a)>(b)?(a):(b)) + +#ifndef __CHARMC__ +#define __CHARMC__ 0 +#endif + +#if __CHARMC__ +#include "converse.h" +#else +static int ilog2(int val) +{ + int i = 0; + for( ; val != 0; val >>= 1, i++ ); + return i; +} +#define CmiLog2(VAL) ilog2((int)(VAL)) +#endif + + + +static int verbose_level = ERROR; + + +void FREE_list_child(tree_t *); +void FREE_tab_child(tree_t *); +unsigned long int choose (long,long); +void display_node(tree_t *); +void clone_tree(tree_t *,tree_t *); +double *aggregate_obj_weight(tree_t *,double *,int); +affinity_mat_t *aggregate_com_mat(tree_t *,affinity_mat_t *,int); +double eval_grouping(affinity_mat_t *,tree_t **,int); +group_list_t *new_group_list(tree_t **,double,group_list_t *); +void add_to_list(group_list_t *,tree_t **,int,double); +void 
list_all_possible_groups(affinity_mat_t *,tree_t *,int,int,int,tree_t **,group_list_t *); +int independent_groups(group_list_t **,int,group_list_t *,int); +void display_selection (group_list_t**,int,int,double); +void display_grouping (tree_t *,int,int,double); +int recurs_select_independent_groups(group_list_t **,int,int,int,int, + int,double,double *,group_list_t **,group_list_t **); +int test_independent_groups(group_list_t **,int,int,int,int,int,double,double *, + group_list_t **,group_list_t **); +void delete_group_list(group_list_t *); +int group_list_id(const void*,const void*); +int group_list_asc(const void*,const void*); +int group_list_dsc(const void*,const void*); +int weighted_degree_asc(const void*,const void*); +int weighted_degree_dsc(const void*,const void*); +int select_independent_groups(group_list_t **,int,int,int,double *,group_list_t **,int,double); +int select_independent_groups_by_largest_index(group_list_t **,int,int,int,double *, + group_list_t **,int,double); +void list_to_tab(group_list_t *,group_list_t **,int); +void display_tab_group(group_list_t **,int,int); +int independent_tab(tree_t **,tree_t **,int); +void compute_weighted_degree(group_list_t **,int,int); +void group(affinity_mat_t *,tree_t *,tree_t *,int,int,int,double *,tree_t **); +void fast_group(affinity_mat_t *,tree_t *,tree_t *,int,int,int,double *,tree_t **, int *, int); +int adjacency_asc(const void*,const void*); +int adjacency_dsc(const void*,const void*); + void super_fast_grouping(affinity_mat_t *,tree_t *,tree_t *,int, int); +affinity_mat_t *build_cost_matrix(affinity_mat_t *,double *,double); +void group_nodes(affinity_mat_t *,tree_t *,tree_t *,int ,int,double*,double); +void fast_grouping(affinity_mat_t *,tree_t *,tree_t *,int,int,long int); +void complete_aff_mat(affinity_mat_t **,int,int); +void complete_obj_weight(double **,int,int); +void create_dumb_tree(tree_t *,int,tm_topology_t *); +void complete_tab_node(tree_t **,int,int,int,tm_topology_t *); +void 
/*
 * Compute the binomial coefficient C(n,k).
 *
 * n, k: usual binomial parameters. k <= 0 yields 1 (empty product) and
 *       k > n >= 0 yields 0, as before.
 *
 * Returns C(n,k), saturated to the largest unsigned long when the value does
 * not fit.
 *
 * The product is built as C(n,i+1) = C(n,i)*(n-i)/(i+1), so every
 * intermediate value is an integer and stays exact while it fits a double
 * mantissa. The final value is rounded to nearest instead of truncated:
 * group_nodes() asserts that this count equals the number of groups it
 * enumerates, so an off-by-one caused by rounding error would abort.
 * Converting an out-of-range double to an integer is undefined, hence the
 * explicit saturation (group_nodes() only needs "larger than 30000" then).
 */
unsigned long int choose (long n,long k)
{
  const unsigned long int max_val = ~0UL;
  double res = 1;
  long i;

  for( i = 0 ; i < k ; i++ )
    res = res * (double)(n-i) / (double)(i+1);

  /* zero, negative (n < 0) or NaN: nothing to count */
  if( !(res > 0) )
    return 0;

  if( res >= (double)max_val )
    return max_val;

  return (unsigned long int)(res + 0.5);
}
depth) +{ + static int uniq = 0; + node->child = child; + node->arity = arity; + node->tab_child = tab_child; + node->parent = parent; + node->id = id; + node->val = val; + node->uniq = uniq++; + node->depth= depth; + node->dumb = 0; +} + +void display_node(tree_t *node) +{ + if (verbose_level >= DEBUG) + printf("child : %p\narity : %d\nparent : %p\nid : %d\nval : %f\nuniq : %d\n\n", + (void *)(node->child),node->arity,(void *)(node->parent),node->id,node->val,node->uniq); +} + +void clone_tree(tree_t *new,tree_t *old) +{ + int i; + new->child = old->child; + new->parent = old->parent; + new->tab_child = old->tab_child; + new->val = old->val; + new->arity = old->arity; + new->depth = old->depth; + new->id = old->id; + new->uniq = old->uniq; + new->dumb = old->dumb; + for( i = 0 ; i < new->arity ; i++ ) + new->child[i]->parent = new; +} + + +double *aggregate_obj_weight(tree_t *new_tab_node, double *tab, int M) +{ + int i,i1,id1; + double *res = NULL; + + if(!tab) + return NULL; + + res = (double*)MALLOC(M*sizeof(double)); + + for( i = 0 ; i < M ; i++ ){ + res[i] = 0.0; + for( i1 = 0 ; i1 < new_tab_node[i].arity ; i1++ ){ + id1 = new_tab_node[i].child[i1]->id; + res[i] += tab[id1]; + } + } + return res; +} + + + +void partial_aggregate_aff_mat (int nb_args, void **args){ + int inf = *(int*)args[0]; + int sup = *(int*)args[1]; + double **old_mat = (double**)args[2]; + tree_t *tab_node = (tree_t*)args[3]; + int M = *(int*)args[4]; + double **mat = (double**)args[5]; + double *sum_row = (double*)args[6]; + int i,j,i1,j1; + int id1, id2; + + + if(nb_args != 6){ + if(verbose_level >= ERROR) + fprintf(stderr,"Wrong number of args in %s: %d\n",__FUNCTION__, nb_args); + exit(-1); + } + + if(verbose_level >= INFO) + printf("Aggregate in parallel (%d-%d)\n",inf,sup-1); + + for( i = inf ; i < sup ; i++ ) + for( j = 0 ; j < M ; j++ ){ + if(i != j){ + for( i1 = 0 ; i1 < tab_node[i].arity ; i1++ ){ + id1 = tab_node[i].child[i1]->id; + for( j1 = 0 ; j1 < tab_node[j].arity ; j1++ 
){ + id2 = tab_node[j].child[j1]->id; + mat[i][j] += old_mat[id1][id2]; + /* printf("mat[%d][%d]+=old_mat[%d][%d]=%f\n",i,j,id1,id2,old_mat[id1][id2]);*/ + } + sum_row[i] += mat[i][j]; + } + } + } +} + + +affinity_mat_t *aggregate_aff_mat(tree_t *tab_node, affinity_mat_t *aff_mat, int M) +{ + int i,j,i1,j1,id1,id2; + double **new_mat = NULL, **old_mat = aff_mat->mat; + double *sum_row = NULL; + + new_mat = (double**)MALLOC(M*sizeof(double*)); + for( i = 0 ; i < M ; i++ ) + new_mat[i] = (double*)CALLOC((M),sizeof(double)); + + sum_row = (double*)CALLOC(M,sizeof(double)); + + if(M>512){ /* perform this part in parallel*/ + int id; + int nb_threads; + work_t **works; + int *inf; + int *sup; + + nb_threads = MIN(M/512,get_nb_threads()); + works = (work_t**)MALLOC(sizeof(work_t*)*nb_threads); + inf = (int*)MALLOC(sizeof(int)*nb_threads); + sup = (int*)MALLOC(sizeof(int)*nb_threads); + for(id=0;id= DEBUG) + printf("Executing %p\n",(void *)works[id]); + + submit_work( works[id], id); + } + + for(id=0;idargs); + } + + + FREE(inf); + FREE(sup); + FREE(works); + + }else{ + for( i = 0 ; i < M ; i++ ) + for( j = 0 ; j < M ; j++ ){ + if(i != j){ + for( i1 = 0 ; i1 < tab_node[i].arity ; i1++ ){ + id1 = tab_node[i].child[i1]->id; + for( j1 = 0 ; j1 < tab_node[j].arity ; j1++ ){ + id2 = tab_node[j].child[j1]->id; + new_mat[i][j] += old_mat[id1][id2]; + /* printf("mat[%d][%d]+=old_mat[%d][%d]=%f\n",i,j,id1,id2,old_mat[id1][id2]);*/ + } + sum_row[i] += new_mat[i][j]; + } + } + } + } + return new_affinity_mat(new_mat,sum_row,M); +} + +void FREE_tab_double(double**tab,int N) +{ + int i; + for( i = 0 ; i < N ; i++ ) + FREE(tab[i]); + FREE(tab); +} + +void FREE_tab_int(int**tab,int N) +{ + int i; + for( i = 0 ; i < N ; i++ ) + FREE(tab[i]); + FREE(tab); +} + +void display_tab(double **tab,int N) +{ + int i,j; + double line,total = 0; + + + for( i = 0 ; i < N ; i++ ){ + line = 0; + for( j = 0 ; j < N ; j++ ){ + printf("%g ",tab[i][j]); + line += tab[i][j]; + } + total += line; + /* 
printf(": %g",line);*/ + printf("\n"); + } + /* printf("Total: %.2f\n",total);*/ +} + + +double eval_grouping(affinity_mat_t *aff_mat,tree_t **cur_group,int arity) +{ + double res = 0; + int i,j,id,id1,id2; + double **mat = aff_mat->mat; + double * sum_row = aff_mat -> sum_row; + + /*display_tab(tab,N);*/ + + for( i = 0 ; i < arity ; i++ ){ + id = cur_group[i]->id; + res += sum_row[id]; + } + + for( i = 0 ; i < arity ; i++ ){ + id1 = cur_group[i]->id; + for( j = 0 ; j < arity ; j++ ){ + id2 = cur_group[j]->id; + /*printf("res-=tab[%d][%d]=%f\n",id1,id2,tab[id1][id2]);*/ + res -= mat[id1][id2]; + } + } + /*printf(" = %f\n",res);*/ + return res; +} + + +group_list_t *new_group_list(tree_t **tab,double val,group_list_t *next) +{ + group_list_t *res = NULL; + + res = (group_list_t *)MALLOC(sizeof(group_list_t)); + res->tab = tab; + res->val = val; + res->next = next; + res->sum_neighbour = 0; + return res; +} + + +void add_to_list(group_list_t *list,tree_t **cur_group, int arity, double val) +{ + group_list_t *elem = NULL; + tree_t **tab = NULL; + int i; + + tab=(tree_t **)MALLOC(sizeof(tree_t *)*arity); + + for( i = 0 ; i < arity ; i++ ){ + tab[i] = cur_group[i]; + if(verbose_level>=INFO) + printf("cur_group[%d]=%d ",i,cur_group[i]->id); + } + if(verbose_level>=INFO) + printf(": %f\n",val); + + /*printf("\n");*/ + elem = new_group_list(tab,val,list->next); + list->next = elem; + list->val++; +} + + +void list_all_possible_groups(affinity_mat_t *aff_mat,tree_t *tab_node,int id,int arity, int depth, + tree_t **cur_group, group_list_t *list) +{ + double val; + int i; + int N = aff_mat->order; + + if(depth == arity){ + val = eval_grouping(aff_mat,cur_group,arity); + add_to_list(list,cur_group,arity,val); + return; + }else if( (N+depth) >= (arity+id) ){ + /*}else if(1){*/ + for( i = id ; i < N ; i++ ){ + if(tab_node[i].parent) + continue; + cur_group[depth] = &tab_node[i]; + if(verbose_level>=INFO) + printf("%d<-%d\n",depth,i); + 
list_all_possible_groups(aff_mat,tab_node,i+1,arity,depth+1,cur_group,list); + } + } +} + +void update_val(affinity_mat_t *aff_mat,tree_t *parent) +{ + /* int i; */ + + parent->val = eval_grouping(aff_mat,parent->child,parent->arity); + /*printf("connecting: ");*/ + /*for( i = 0 ; i < parent->arity ; i++ ){ */ + /*printf("%d ",parent->child[i]->id);*/ + /* if(parent->child[i]->parent!=parent){ + parent->child[i]->parent=parent; + }else{ + fprintf(stderr,"redundant operation!\n"); + exit(-1); + }*/ + /* } */ + /*printf(": %f\n",parent->val);*/ +} + +int independent_groups(group_list_t **selection,int d,group_list_t *elem,int arity) +{ + int i,j,k; + + if(d == 0) + return 1; + + for( i = 0 ; i < arity ; i++ ) + for( j = 0 ; j < d ; j++ ) + for( k = 0 ; k < arity ; k++ ) + if(elem->tab[i]->id == selection[j]->tab[k]->id) + return 0; + return 1; +} + +void display_selection (group_list_t** selection,int M,int arity,double val) +{ + int i,j; + + if(verbose_leveltab[j]->id); + printf("-- "); + } + printf(":%f\n",val); +} + +void display_grouping (tree_t *father,int M,int arity,double val) +{ + int i,j; + + if(verbose_level < INFO) + return; + + printf("Grouping : "); + for( i = 0 ; i < M ; i++ ){ + for( j = 0 ; j < arity ; j++ ) + printf("%d ",father[i].child[j]->id); + printf("-- "); + } + printf(":%f\n",val); +} + + +int recurs_select_independent_groups(group_list_t **tab,int i,int n,int arity,int d,int M,double val,double *best_val,group_list_t **selection,group_list_t **best_selection) +{ + group_list_t *elem = NULL; + /* + if(val>=*best_val) + return 0; + */ + + if( d == M ){ + if(verbose_level>=INFO) + display_selection(selection,M,arity,val); + if( val < *best_val ){ + *best_val = val; + for( i = 0 ; i < M ; i++ ) + best_selection[i] = selection[i]; + return 1; + } + return 0; + } + + while( i < n ){ + elem = tab[i]; + if(independent_groups(selection,d,elem,arity)){ + if(verbose_level>=INFO) + printf("%d: %d\n",d,i); + selection[d] = elem; + val += elem->val; + 
return recurs_select_independent_groups(tab,i+1,n,arity,d+1,M,val,best_val,selection,best_selection); + } + i++; + } + return 0; +} + + +int test_independent_groups(group_list_t **tab,int i,int n,int arity,int d,int M,double val,double *best_val,group_list_t **selection,group_list_t **best_selection) +{ + group_list_t *elem = NULL; + + if( d == M ){ + /*display_selection(selection,M,arity,val);*/ + return 1; + } + + while( i < n ){ + elem = tab[i]; + if(independent_groups(selection,d,elem,arity)){ + /*printf("%d: %d\n",d,i);*/ + selection[d] = elem; + val += elem->val; + return recurs_select_independent_groups(tab,i+1,n,arity,d+1,M,val,best_val,selection,best_selection); + } + i++; + } + return 0; +} + +void delete_group_list(group_list_t *list) +{ + if(list){ + delete_group_list(list->next); + FREE(list->tab); + FREE(list); + } +} + +int group_list_id(const void* x1,const void* x2) +{ + group_list_t *e1 = NULL,*e2= NULL; + + e1 = *((group_list_t**)x1); + e2 = *((group_list_t**)x2); + + return (e1->tab[0]->id < e2->tab[0]->id) ? - 1 : 1; +} + +int group_list_asc(const void* x1,const void* x2) +{ + group_list_t *e1 = NULL,*e2 = NULL; + + e1 = *((group_list_t**)x1); + e2 = *((group_list_t**)x2); + + return (e1->val < e2->val) ? - 1 : 1; +} + +int group_list_dsc(const void* x1,const void* x2) +{ + group_list_t *e1 = NULL,*e2 = NULL; + + e1 = *((group_list_t**)x1); + e2 = *((group_list_t**)x2); + + return (e1->val > e2->val) ? -1 : 1; +} + +int weighted_degree_asc(const void* x1,const void* x2) +{ + group_list_t *e1= NULL,*e2 = NULL; + + e1 = *((group_list_t**)x1); + e2 = *((group_list_t**)x2); + + return (e1->wg > e2->wg) ? 1 : -1; +} + +int weighted_degree_dsc(const void* x1,const void* x2) +{ + group_list_t *e1 = NULL,*e2 = NULL; + + e1 = *((group_list_t**)x1); + e2 = *((group_list_t**)x2); + + return (e1->wg > e2->wg) ? 
- 1 : 1; +} + +int select_independent_groups(group_list_t **tab_group,int n,int arity,int M,double *best_val, + group_list_t **best_selection,int bound,double max_duration) +{ + int i,j; + group_list_t **selection = NULL; + double val,duration; + CLOCK_T time1,time0; + + if(verbose_level>=INFO){ + for(i=0;itab[j]->id); + } + printf(" : %f\n",tab_group[i]->val); + } + } + + + + selection = (group_list_t **)MALLOC(sizeof(group_list_t*)*M); + CLOCK(time0); + for( i = 0 ; i < MIN(bound,n) ; i++ ){ + /* if(!(i%100)) {printf("%d/%d ",i, MIN(bound,n)); fflush(stdout);} */ + selection[0] = tab_group[i]; + val = tab_group[i]->val; + recurs_select_independent_groups(tab_group,i+1,n,arity,1,M,val,best_val,selection,best_selection); + if((!(i%5)) && (max_duration>0)){ + CLOCK(time1); + duration = CLOCK_DIFF(time1,time0); + if(duration>max_duration){ + FREE(selection); + return 1; + } + } + } + FREE(selection); + + + if(verbose_level>=INFO) + display_selection(best_selection,M,arity,*best_val); + return 0; +} + +int select_independent_groups_by_largest_index(group_list_t **tab_group,int n,int arity,int M,double *best_val,group_list_t **best_selection,int bound,double max_duration) +{ + int i,dec,nb_groups=0; + group_list_t **selection = NULL; + double val,duration; + CLOCK_T time1,time0; + + selection = (group_list_t **)MALLOC(sizeof(group_list_t*)*M); + CLOCK(time0); + + dec = MAX(n/10000,2); + for( i = n-1 ; i >= 0 ; i -= dec*dec){ + selection[0] = tab_group[i]; + val = tab_group[i]->val; + nb_groups += test_independent_groups(tab_group,i+1,n,arity,1,M,val,best_val,selection,best_selection); + if(verbose_level>=DEBUG) + printf("%d:%d\n",i,nb_groups); + + if(nb_groups >= bound){ + FREE(selection); + return 0; + } + if((!(i%5)) && (max_duration>0)){ + CLOCK(time1); + duration=CLOCK_DIFF(time1,time0); + if(duration>max_duration){ + FREE(selection); + return 1; + } + } + } + + FREE(selection); + return 0; +} + +void list_to_tab(group_list_t *list,group_list_t **tab,int n) +{ + 
int i; + for( i = 0 ; i < n ; i++ ){ + if(!list){ + if(verbose_level>=CRITICAL) + fprintf(stderr,"Error not enough elements. Only %d on %d\n",i,n); + exit(-1); + } + tab[n-i-1] = list; + list = list->next; + } + if(list){ + if(verbose_level>=DEBUG) + fprintf(stderr,"Error too many elements\n"); + exit(-1); + } +} + +void display_tab_group(group_list_t **tab, int n,int arity) +{ + int i,j; + if(verbose_leveltab[j]->id); + printf(": %.2f %.2f\n",tab[i]->val,tab[i]->wg); + } +} + +int independent_tab(tree_t **tab1,tree_t **tab2,int n) +{ + int i = 0,j = 0; + + while( (iid == tab2[j]->id) + return 0; + else if(tab1[i]->id > tab2[j]->id) + j++; + else + i++; + } + return 1; +} + +void compute_weighted_degree(group_list_t **tab, int n,int arity) +{ + int i,j; + for( i = 0 ; i < n ; i++) + tab[i]->sum_neighbour = 0; + for( i = 0 ; i < n ; i++ ){ + /*printf("%d/%d=%f%%\n",i,n,(100.0*i)/n);*/ + for( j = i+1 ; j < n ; j++ ) + /*if(!independent_groups(&tab[i],1,tab[j],arity)){*/ + if(!independent_tab(tab[i]->tab,tab[j]->tab,arity)){ + tab[i]->sum_neighbour += tab[j]->val; + tab[j]->sum_neighbour += tab[i]->val; + } + + tab[i]->wg = tab[i]->sum_neighbour/tab[i]->val; + if(tab[i]->sum_neighbour == 0) + tab[i]->wg = 0; + /*printf("%d:%f/%f=%f\n",i,tab[i]->sum_neighbour,tab[i]->val,tab[i]->wg);*/ + } +} + +/* + Very slow: explore all possibilities + aff_mat : the affiity matrix at the considered level (used to evaluate a grouping) + tab_node: array of the node to group + parent: node to which attached the computed group + id: current considered node of tab_node + arity: number of children of parent (i.e.) 
size of the group to compute + best_val: current value of th grouping + cur_group: current grouping + */ +void group(affinity_mat_t *aff_mat,tree_t *tab_node,tree_t *parent,int id,int arity, int n,double *best_val,tree_t **cur_group) +{ + + int N = aff_mat->order; + double val; + int i; + + /*if we have found enough noide in the group*/ + if( n == arity){ + /* evaluate this group*/ + val = eval_grouping(aff_mat,cur_group,arity); + /* If we improve compared to previous grouping: uodate the children of parent accordingly */ + if( val < *best_val ){ + *best_val = val; + for( i = 0 ; i < arity ; i++ ) + parent->child[i] = cur_group[i]; + parent->arity = arity; + } + return; + } + + /* + If we need more node in the group + Continue to explore avilable nodes + */ + for( i = id+1 ; i < N ; i++ ){ + /* If this node is allready in a group: skip it*/ + if(tab_node[i].parent) + continue; + /*Otherwise, add it to the group at place n*/ + cur_group[n] = &tab_node[i]; + /* + printf("%d<-%d\n",n,i); + recursively add the next element to this group + */ + group(aff_mat,tab_node,parent,i,arity,n+1,best_val,cur_group); + } +} + +/* + aff_mat : the affiity matrix at the considered level (used to evaluate a grouping) + tab_node: array of the node to group + parent: node to which attached the computed group + id: current considered node of tab_node + arity: number of children of parent (i.e.) size of the group to compute + best_val: current value of th grouping + cur_group: current grouping + N: size of tab and tab_node. i.e. 
number of nodes at the considered level + */ +void fast_group(affinity_mat_t *aff_mat,tree_t *tab_node,tree_t *parent,int id,int arity, int n, + double *best_val,tree_t **cur_group, int *nb_groups,int max_groups) +{ + double val; + int i; + int N = aff_mat->order; + + /*printf("Max groups=%d\n",max_groups);*/ + + /*if we have found enough node in the group*/ + if( n == arity ){ + (*nb_groups)++; + /*evaluate this group*/ + val = eval_grouping(aff_mat,cur_group,arity); + /* If we improve compared to previous grouping: uodate the children of parent accordingly*/ + if( val < *best_val ){ + *best_val = val; + for( i = 0 ; i < arity ; i++ ) + parent->child[i] = cur_group[i]; + + parent->arity = arity; + } + return; + } + + /* + If we need more node in the group + Continue to explore avilable nodes + */ + for( i = id+1 ; i < N ; i++ ){ + /* If this node is allready in a group: skip it*/ + if(tab_node[i].parent) + continue; + /*Otherwise, add it to the group at place n */ + cur_group[n] = &tab_node[i]; + /* + printf("%d<-%d %d/%d\n",n,i,*nb_groups,max_groups); + exit(-1); + recursively add the next element to this group + */ + fast_group(aff_mat,tab_node,parent,i,arity,n+1,best_val,cur_group,nb_groups,max_groups); + if(*nb_groups > max_groups) + return; + } +} + + +void fast_grouping(affinity_mat_t *aff_mat,tree_t *tab_node, tree_t *new_tab_node, int arity, int M,long int k) +{ + tree_t **cur_group = NULL; + int l,i,nb_groups; + double best_val,val=0; + + cur_group = (tree_t**)MALLOC(sizeof(tree_t*)*arity); + for( l = 0 ; l < M ; l++ ){ + best_val = DBL_MAX; + nb_groups = 0; + /*printf("k%d/%d, k=%ld\n",l,M,k);*/ + /* select the best greedy grouping among the 10 first one*/ + /*fast_group(tab,tab_node,&new_tab_node[l],-1,arity,0,&best_val,cur_group,N,&nb_groups,MAX(2,(int)(50-log2(k))-M/10));*/ + fast_group(aff_mat,tab_node,&new_tab_node[l],-1,arity,0,&best_val,cur_group,&nb_groups,MAX(1,(int)(50-CmiLog2(k))-M/10)); + val += best_val; + for( i = 0 ; i < 
new_tab_node[l].arity ; i++ ) + new_tab_node[l].child[i]->parent=&new_tab_node[l]; + update_val(aff_mat,&new_tab_node[l]); + } + + FREE(cur_group); + + if(verbose_level>=INFO) + printf("val=%f\n",val); + /*exit(-1);*/ + + if(verbose_level>=INFO) + display_grouping(new_tab_node,M,arity,val); + +} + + +int adjacency_asc(const void* x1,const void* x2) +{ + adjacency_t *e1 = NULL,*e2 = NULL; + + e1 = ((adjacency_t*)x1); + e2 = ((adjacency_t*)x2); + + return (e1->val < e2->val) ? - 1 : 1; +} + +int adjacency_dsc(const void* x1,const void* x2) +{ + adjacency_t *e1 = NULL,*e2 = NULL; + + e1 = ((adjacency_t*)x1); + e2 = ((adjacency_t*)x2); + + + return (e1->val > e2->val) ? -1 : 1; +} + +void super_fast_grouping(affinity_mat_t *aff_mat,tree_t *tab_node, tree_t *new_tab_node, int arity, int M) +{ + double val = 0,duration; + adjacency_t *graph; + int i,j,e,l,nb_groups; + int N = aff_mat->order; + double **mat = aff_mat->mat; + + assert( 2 == arity); + + TIC; + graph = (adjacency_t*)MALLOC(sizeof(adjacency_t)*((N*N-N)/2)); + e = 0; + for( i = 0 ; i < N ; i++ ) + for( j = i+1 ; j < N ; j++){ + graph[e].i = i; + graph[e].j = j; + graph[e].val = mat[i][j]; + e++; + } + + duration = TOC; + if(verbose_level>=DEBUG) + printf("linearization=%fs\n",duration); + + + assert( e == (N*N-N)/2); + TIC; + qsort(graph,e,sizeof(adjacency_t),adjacency_dsc); + duration = TOC; + if(verbose_level>=DEBUG) + printf("sorting=%fs\n",duration); + + TIC; + +TIC; + l = 0; + nb_groups = 0; + for( i = 0 ; (i < e) && (l < M) ; i++ ) + if(try_add_edge(tab_node,&new_tab_node[l],arity,graph[i].i,graph[i].j,&nb_groups)) + l++; + + for( l = 0 ; l < M ; l++ ){ + update_val(aff_mat,&new_tab_node[l]); + val += new_tab_node[l].val; + } + + duration = TOC; + if(verbose_level>=DEBUG) + printf("Grouping=%fs\n",duration); + + + if(verbose_level>=DEBUG) + printf("val=%f\n",val); + + + display_grouping(new_tab_node,M,arity,val); + +} + + +affinity_mat_t *build_cost_matrix(affinity_mat_t *aff_mat, double* obj_weight, 
double comm_speed) +{ + double **mat = NULL, *sum_row; + double **old_mat; + double avg; + int i,j,N; + + if(!obj_weight) + return aff_mat; + + N = aff_mat->order; + old_mat = aff_mat -> mat; + + mat = (double**)MALLOC(N*sizeof(double*)); + for( i = 0 ; i < N ; i++ ) + mat[i] = (double*)MALLOC(N*sizeof(double)); + + sum_row = (double*)CALLOC(N,sizeof(double)); + + + + avg = 0; + for( i = 0 ; i < N ; i++ ) + avg += obj_weight[i]; + avg /= N; + + + if(verbose_level>=DEBUG) + printf("avg=%f\n",avg); + + for( i = 0 ; i < N ; i++ ) + for( j = 0 ; j < N ; j++){ + if( i == j ) + mat[i][j] = 0; + else{ + mat[i][j] = 1e-4*old_mat[i][j]/comm_speed-fabs(avg-(obj_weight[i]+obj_weight[j])/2); + sum_row[i] += mat[i][j]; + } + } + return new_affinity_mat(mat,sum_row,N); + +} + + +/* + aff_mat: affinity matrix at the considered level (use to evaluate a grouping) + tab_node: array of the node to group + new_tab_node: array of nodes at the next level (the parents of the node in tab_node once the grouping will be done). + arity: number of children of parent (i.e.) size of the group to compute + M: size of new_tab_node (i.e) the number of parents +*/ +void group_nodes(affinity_mat_t *aff_mat,tree_t *tab_node, tree_t *new_tab_node, int arity, int M, double* obj_weigth, double comm_speed) +{ + + /* + N: size of tab and tab_node. i.e. 
number of nodes at the considered level + Hence we have: M*arity=N + */ + int N = aff_mat -> order; + tree_t **cur_group = NULL; + int j,l; + unsigned int n; + unsigned long int k; + group_list_t list,**best_selection = NULL,**tab_group = NULL; + double best_val,last_best; + int timeout; + affinity_mat_t *cost_mat = NULL; /*cost matrix taking into account the communiocation cost but also the weight of the object*/ + double duration; + + TIC; + + /* might return aff_mat (if obj_weight==NULL): do not FREE this tab in this case*/ + cost_mat = build_cost_matrix(aff_mat,obj_weigth,comm_speed); + + k = choose(N,arity); + if(verbose_level>=INFO) + printf("Number of groups:%ld\n",k); + + /* Todo: check if the depth is a criteria for speeding up the computation*/ + /* if(k>30000||depth>5){*/ + if( k > 30000 ){ + + double duration; + + TIC; + if( arity <= 2 ){ + /*super_fast_grouping(tab,tab_node,new_tab_node,arity,N,M,k);*/ + if(verbose_level >= INFO ) + printf("Bucket Grouping...\n"); + bucket_grouping(cost_mat,tab_node,new_tab_node,arity,M); + }else{ + if(verbose_level >= INFO) + printf("Fast Grouping...\n"); + fast_grouping(cost_mat,tab_node,new_tab_node,arity,M,k); + } + + duration = TOC; + if(verbose_level>=INFO) + printf("Fast grouping duration=%f\n",duration); + + if(verbose_level>=DEBUG) + display_grouping(new_tab_node,M,arity,-1); + + }else{ + if(verbose_level>=INFO) + printf("Grouping nodes...\n"); + list.next = NULL; + list.val = 0; /*number of elements in the list*/ + cur_group = (tree_t**)MALLOC(sizeof(tree_t*)*arity); + best_selection = (group_list_t **)MALLOC(sizeof(group_list_t*)*M); + + list_all_possible_groups(cost_mat,tab_node,0,arity,0,cur_group,&list); + n = (int)list.val; + assert( n == k ); + tab_group = (group_list_t**)MALLOC(sizeof(group_list_t*)*n); + list_to_tab(list.next,tab_group,n); + if(verbose_level>=INFO) + printf("List to tab done\n"); + + best_val = DBL_MAX; + + /* perform the pack mapping fist*/ + /* timeout = 
select_independent_groups(tab_group,n,arity,M,&best_val,best_selection,1,0.1); */ + timeout = select_independent_groups(tab_group,n,arity,M,&best_val,best_selection,1,100); + if(verbose_level>=INFO) + if(timeout) + printf("Packed mapping timeout!\n"); + /* give this mapping an exra credit (in general MPI application are made such that + neighbour process communicates more than distant ones) */ + best_val /= 1.001; + /* best_val *= 1.001; */ + if(verbose_level>=INFO) + printf("Packing computed\n"); + + /* perform a mapping trying to use group that cost less first*/ + qsort(tab_group,n,sizeof(group_list_t*),group_list_asc); + last_best = best_val; + timeout = select_independent_groups(tab_group,n,arity,M,&best_val,best_selection,10,0.1); + /* timeout = select_independent_groups(tab_group,n,arity,M,&best_val,best_selection,n,0); */ + if(verbose_level>=INFO){ + if(timeout){ + printf("Cost less first timeout!\n"); + }else if(last_best>best_val){ + printf("Cost less first Impoved solution\n"); + } + printf("----\n"); + } + /* perform a mapping trying to minimize the use of groups that cost a lot */ + qsort(tab_group,n,sizeof(group_list_t*),group_list_dsc); + last_best=best_val; + timeout=select_independent_groups_by_largest_index(tab_group,n,arity,M,&best_val,best_selection,10,0.1); + if(verbose_level>=DEBUG){ + if(timeout) + printf("Cost most last timeout!\n"); + else if(last_best>best_val) + printf("Cost most last impoved solution\n"); + } + if( n < 10000 ){ + /* perform a mapping in the weighted degree order */ + + + if(verbose_level>=INFO) + printf("----WG----\n"); + + compute_weighted_degree(tab_group,n,arity); + + if(verbose_level>=INFO) + printf("Weigted degree computed\n"); + + qsort(tab_group,n,sizeof(group_list_t*),weighted_degree_dsc); + /* display_tab_group(tab_group,n,arity);*/ + last_best = best_val; + timeout = select_independent_groups(tab_group,n,arity,M,&best_val,best_selection,10,0.1); + /* timeout = 
select_independent_groups(tab_group,n,arity,M,&best_val,best_selection,n,0); */ + + if(verbose_level>=DEBUG){ + if(timeout) + printf("WG timeout!\n"); + else if(last_best>best_val) + printf("WG impoved solution\n"); + } + } + + qsort(best_selection,M,sizeof(group_list_t*),group_list_id); + + for( l = 0 ; l < M ; l++ ){ + for( j = 0 ; j < arity ; j++ ){ + new_tab_node[l].child[j] = best_selection[l]->tab[j]; + new_tab_node[l].child[j]->parent = &new_tab_node[l]; + } + new_tab_node[l].arity = arity; + + /* printf("arity=%d\n",new_tab_node[l].arity); */ + update_val(cost_mat,&new_tab_node[l]); + } + + delete_group_list((&list)->next); + FREE(best_selection); + FREE(tab_group); + FREE(cur_group); + } + + if(cost_mat != aff_mat){ + FREE_tab_double(cost_mat->mat,N); + FREE(cost_mat->sum_row); + FREE(cost_mat); + } + + duration = TOC; + + if(verbose_level>=INFO) + display_grouping(new_tab_node,M,arity,-1); + + + if(verbose_level>=INFO) + printf("Grouping done in %.4fs!\n",duration); +} + +void complete_aff_mat(affinity_mat_t **aff_mat ,int N, int K) +{ + double **old_mat = NULL,**new_mat = NULL; double *sum_row; + int M,i; + + old_mat = (*aff_mat) -> mat; + + M = N+K; + new_mat = (double**)MALLOC(M*sizeof(double*)); + for( i = 0 ; i < M ; i++ ) + new_mat[i] = (double*)CALLOC((M),sizeof(double)); + + sum_row = (double*) CALLOC(M,sizeof(double)); + + for( i = 0 ; i < N ; i++ ){ + memcpy(new_mat[i],old_mat[i],N*sizeof(double)); + sum_row[i] = (*aff_mat)->sum_row[i]; + } + + *aff_mat = new_affinity_mat(new_mat,sum_row,M); +} + +void complete_obj_weight(double **tab,int N, int K) +{ + double *old_tab = NULL,*new_tab = NULL,avg; + int M,i; + + old_tab = *tab; + + if(!old_tab) + return; + + avg = 0; + for( i = 0 ; i < N ; i++ ) + avg += old_tab[i]; + avg /= N; + + M = N+K; + new_tab = (double*)MALLOC(M*sizeof(double)); + + *tab = new_tab; + for( i = 0 ; i < M ; i++ ) + if(i < N) + new_tab[i] = old_tab[i]; + else + new_tab[i] = avg; +} + +void create_dumb_tree(tree_t *node,int 
depth,tm_topology_t *topology) +{ + tree_t **list_child = NULL; + int arity,i; + + if( depth == topology->nb_levels-1) { + set_node(node,NULL,0,NULL,-1,0,NULL,depth); + return; + } + + arity = topology->arity[depth]; + assert(arity>0); + list_child = (tree_t**)CALLOC(arity,sizeof(tree_t*)); + for( i = 0 ; i < arity ; i++ ){ + list_child[i] = (tree_t*)MALLOC(sizeof(tree_t)); + create_dumb_tree(list_child[i],depth+1,topology); + list_child[i]->parent = node; + list_child[i]->dumb = 1; + } + + set_node(node,list_child,arity,NULL,-1,0,list_child[0], depth); +} + +void complete_tab_node(tree_t **tab,int N, int K,int depth,tm_topology_t *topology) +{ + tree_t *old_tab = NULL,*new_tab = NULL; + int M,i; + + if( K == 0 ) + return; + + old_tab = *tab; + + M = N+K; + new_tab = (tree_t*)MALLOC(M*sizeof(tree_t)); + + *tab = new_tab; + for( i = 0 ; i < M ; i++ ) + if(i < N) + clone_tree(&new_tab[i],&old_tab[i]); + else{ + create_dumb_tree(&new_tab[i],depth,topology); + new_tab[i].id = i; + } + + /* do not suppress tab if you are at the depth-most level it will be used at the mapping stage */ + FREE(old_tab); +} + +void set_deb_tab_child(tree_t *tree, tree_t *child,int depth) +{ + /* printf("depth=%d\t%p\t%p\n",depth,child,tree);*/ + if( depth > 0 ) + set_deb_tab_child(tree->tab_child,child,depth-1); + else + tree->tab_child=child; +} + +/* +Build the tree of the matching. It is a bottom up algorithm: it starts from the bottom of the tree on proceed by decreasing the depth +It groups nodes of the matrix tab and link these groups to the nodes of the under level. +Then it calls recursively the function to prefrom the grouping at the above level. + +tab_node: array of nodes of the under level. +aff_mat: local affinity matrix +arity: arity of the nodes of the above level. +depth: current depth of the algorithm +toplogy: description of the hardware topology. 
+constraints: set of constraints: core ids where to bind the processes +*/ +tree_t *build_level_topology(tree_t *tab_node, affinity_mat_t *aff_mat,int arity,int depth,tm_topology_t *topology, + double *obj_weight, double *comm_speed) +{ + + /* N: number of nodes. Order of com_mat, size of obj_weight */ + int N=aff_mat->order ; + int i,K=0,M; /*M = N/Arity: number the groups*/ + tree_t *new_tab_node = NULL; /*array of node for this level (of size M): there will be linked to the nodes of tab_nodes*/ + affinity_mat_t * new_aff_mat= NULL; /*New communication matrix (after grouyping nodes together)*/ + tree_t *res = NULL; /*resulting tree*/ + int completed = 0; + double speed; /* communication speed at this level*/ + double *new_obj_weight = NULL; + double duration; + + if( 0 == depth ){ + if((1 == N) && (0 == depth)) + return &tab_node[0]; + else { + if(verbose_level >= CRITICAL) + fprintf(stderr,"Error: matrix size: %d and depth:%d (should be 1 and -1 respectively)\n",N,depth); + exit(-1); + } + } + + /* If the number of nodes does not divide the arity: we add K nodes */ + if( N%arity != 0 ){ + TIC; + K = arity*((N/arity)+1)-N; + /*printf("****N=%d arity=%d K=%d\n",N,arity,K); */ + /*display_tab(tab,N);*/ + /* add K rows and columns to comm_matrix*/ + complete_aff_mat(&aff_mat,N,K); + /* add K element to the object weight*/ + complete_obj_weight(&obj_weight,N,K); + /*display_tab(tab,N+K);*/ + /* add a dumb tree to the K new "virtual nodes"*/ + complete_tab_node(&tab_node,N,K,depth,topology); + completed = 1; /*flag this addition*/ + N += K; /*increase the number of nodes accordingly*/ + duration = TOC; + if(verbose_level >= INFO) + fprintf(stderr,"Completing matrix duration= %fs\n ", duration); + } /*display_tab(tab,N);*/ + + M = N/arity; + if(verbose_level >= INFO) + printf("Depth=%d\tnb_nodes=%d\tnb_groups=%d\tsize of groups(arity)=%d\n",depth,N,M,arity); + + TIC; + /*create the new nodes*/ + new_tab_node = (tree_t*)MALLOC(sizeof(tree_t)*M); + /*intitialize each 
node*/ + for( i = 0 ; i < M ; i++ ){ + tree_t **list_child = NULL; + list_child = (tree_t**)CALLOC(arity,sizeof(tree_t*)); + set_node(&new_tab_node[i],list_child,arity,NULL,i,0,tab_node,depth); + } + duration = TOC; + if(verbose_level >= INFO) + printf("New nodes creation= %fs\n ", duration); + + /*Core of the algorithm: perfrom the grouping*/ + if(comm_speed) + speed = comm_speed[depth]; + else + speed = -1; + group_nodes(aff_mat, tab_node, new_tab_node, arity, M, obj_weight, speed); + + TIC; + /*based on that grouping aggregate the communication matrix*/ + new_aff_mat = aggregate_aff_mat(new_tab_node,aff_mat,M); + duration = TOC; + if(verbose_level >= INFO) + printf("Aggregate_com_mat= %fs\n", duration); + TIC; + + + /*based on that grouping aggregate the object weight matrix*/ + new_obj_weight = aggregate_obj_weight(new_tab_node,obj_weight,M); + duration = TOC; + if(verbose_level >= INFO) + printf("Aggregate obj_weight= %fs\n ", duration); + + /* set ID of virtual nodes to -1*/ + for( i = N-K ; i < N ; i++ ) + tab_node[i].id = -1; + /* + for(i=0;i 0) + arity = topology->arity[depth-1]; + else + arity = 1; + /* assume all objects have the same arity*/ + res = build_level_topology(new_tab_node, new_aff_mat, arity, depth,topology, new_obj_weight, comm_speed); + + set_deb_tab_child(res,tab_node,depth); + + /* if we have extended the matrix with zero, free the data here as they are local to this recursive step only*/ + if(completed){ + FREE_tab_double(aff_mat->mat,aff_mat->order); + FREE(aff_mat->sum_row); + FREE(aff_mat); + FREE(obj_weight); + } + FREE_tab_double(new_aff_mat->mat,new_aff_mat->order); + FREE(new_aff_mat->sum_row); + FREE(new_aff_mat); + FREE(new_obj_weight); + + FREE(new_obj_weight); + + return res; +} + +double speed(int depth) +{ + /* + Bertha values + double tab[5]={21,9,4.5,2.5,0.001}; + double tab[5]={1,1,1,1,1}; + double tab[6]={100000,10000,1000,500,100,10}; + */ + double tab[11] = {1024,512,256,128,64,32,16,8,4,2,1}; + + return 
1.0/tab[depth]; + /* + return 10*log(depth+2); + return (depth+1); + return (long int)pow(100,depth); + */ +} + + + +/* check the leaf numbering of the topology + this number must be between 0 and n-1 (the number of leaves) + teh number must all be different + However if a given leaf number is -1, it means that this + leaf cannot bee used for the mapping + + The function returns the number of constraints (leaves that can be used) + and their numbers (in increasing order) in the array pointed by contraints + +*/ + +int check_constraints(tm_topology_t *topology, int **constraints) +{ + int j,i,n = nb_processing_units(topology); + int *tab_constraints = NULL, nb_constraints = 0; + int *tab_node = NULL; + int *count = NULL; + + /* tab_node: array of core numbers. + tab_node[i]=-1 if this core is forbiden + numbering is such that + 0<=tab_node[i]node_id[topology->nb_levels-1]; + + /* "count" counts the number of cores of a given number. + count[i]: number of cores of number i. + 0<=count[i]<=1 + */ + count = (int *)CALLOC(n,sizeof(int)); + for( i = 0 ; i < n ; i++ ) + if (tab_node[i] != -1){ + if( (tab_node[i] >= 0) && (tab_node[i] < n)){ + /* In the remaining, we assume that the core numbering is logical from 0 to n + so if tab_node[i]!=-1 this mean sthat we have to use core number i*/ + count[i]++; + nb_constraints++; + }else{ + if(verbose_level >= ERROR) + fprintf(stderr, "*** Error: Core numbering not between 0 and %d: tab_node[%d]=%d\n", n , i, tab_node[i]); + *constraints = NULL; + return 0; + } + } + + if(nb_constraints == 0){ + FREE(count); + *constraints = NULL; + return 0; + } + + tab_constraints = (int*) MALLOC(sizeof(int)*nb_constraints); + + /* we can now use the "counting sort" to sort the constraint tab in increasing order in linear time*/ + j = 0; + for( i = 0 ; i < n ; i++ ) + if(count[i]) + tab_constraints[j++] = i; + + /* if the constraint_tab is not full, this means that some count[i]>1*/ + if( j != nb_constraints ){ + if(verbose_level >= ERROR) + 
fprintf(stderr,"*** Error: Duplicate numbering: j=%d, nb_constraints= %d\n",j, nb_constraints); + FREE(tab_constraints); + FREE(count); + *constraints = NULL; + return 0; + } + + /* FREE local variables, assign result, return result*/ + FREE(count); + *constraints = tab_constraints; + return nb_constraints; +} + +affinity_mat_t * build_affinity_mat(double **mat, int order){ + int i,j; + double *sum_row = (double*) CALLOC (order, sizeof(double)); + + for (i=0 ; inb_levels; + for( i = 0 ; i < N ; i++ ) + set_node(&tab_node[i],NULL,0,NULL,i,0,NULL,depth); + + aff_mat = build_affinity_mat(com_mat,N); + + if(verbose_level >= INFO) + printf("nb_levels=%d\n",depth); + /* assume all objects have the same arity*/ + res = build_level_topology(tab_node, aff_mat , topology->arity[depth-2], depth-1, topology, obj_weight, comm_speed); + if(verbose_level >= INFO) + printf("Build (top down) tree done!\n"); + + /* tell the system it is not a constraint tree, this is usefull for freeing pointers*/ + res->constraint = 0; + FREE(aff_mat -> sum_row); + FREE(aff_mat); + + return res; +} + + + + +tree_t * build_tree_from_topology(tm_topology_t *topology, double **com_mat, int N, double *obj_weight, double *com_speed) +{ + int *constraints = NULL, nb_constraints; + tree_t * result; + + verbose_level = get_verbose_level(); + + nb_constraints = check_constraints (topology, &constraints); + + printf("nb_constraints = %d, N= %d; nb_processing units = %d\n",nb_constraints, N, nb_processing_units(topology)); + + if(N>nb_constraints){ + if(verbose_level >= CRITICAL){ + printf("Error : More processes (%d) than number of constraints (%d)!\n",N ,nb_constraints); + } + exit(-1); + } + + if(verbose_level >= INFO){ + printf("Com matrix size: %d\n",N); + printf("nb_constraints: %d\n",nb_constraints); + } + + if(nb_constraints == nb_processing_units(topology)) + { + nb_constraints = 0; + FREE(constraints); + } + + if(nb_constraints){ + if(verbose_level >= INFO){ + printf("Partitionning with 
constraints\n"); + } + result = kpartition_build_tree_from_topology(topology, com_mat, N, constraints, nb_constraints, obj_weight, com_speed); + FREE(constraints); + return result; + } + else{ + if(verbose_level >= INFO){ + printf("Partitionning without constraints\n"); + } + return bottom_up_build_tree_from_topology(topology, com_mat, N, obj_weight, com_speed); + } +} diff --git a/ompi/mca/topo/treematch/treematch/tm_tree.h b/ompi/mca/topo/treematch/treematch/tm_tree.h new file mode 100644 index 00000000000..342a61bd4f7 --- /dev/null +++ b/ompi/mca/topo/treematch/treematch/tm_tree.h @@ -0,0 +1,94 @@ +#ifndef __TREE_H__ +#define __TREE_H__ +#include + + +typedef struct _node_info_t{ + int submit_date; + int job_id; + int finish_date; +} job_info_t; + +typedef struct _tree_t{ + int constraint; /* tells if the tree has been constructed with constraints on the nodes or not. usefull for freeing it. needs to be set on the root only*/ + struct _tree_t **child; + struct _tree_t *parent; + struct _tree_t *tab_child; /*the pointer to be freed*/ + double val; + int arity; + int depth; + int id; + int uniq; + int dumb; /* 1 if the node belongs to a dumb tree: hence has to be freed separately*/ + job_info_t *job_info; +}tree_t; + +/* Maximum number of levels in the tree*/ +#define MAX_LEVELS 100 + +typedef struct { + int *arity; /* arity of the nodes of each level*/ + int nb_levels; /*number of levels of the tree. 
Levels are numbered from top to bottom starting at 0*/ + int *nb_nodes; /*nb of nodes of each level*/ + int *nb_free_nodes; /*nb of available nodes of each level*/ + int **node_id; /*ID of the nodes of the tree for each level*/ + int **free_nodes; /*ID of the nodes of the tree for each level*/ +}tm_topology_t; + + +typedef struct { + double ** mat; + double * sum_row; + int order; +} affinity_mat_t; + + + +tree_t * build_tree(double **tab,int N); +tree_t * build_tree_from_topology(tm_topology_t *topology,double **tab,int N, double *obj_weight, double *comm_speed); +void map_tree(tree_t *,tree_t*); +void display_tab(double **tab,int N); +double speed(int depth); +void set_node(tree_t *node,tree_t ** child, int arity,tree_t *parent,int id,double val,tree_t *deb_tab_child, int depth); +void free_constraint_tree(tree_t *tree); +void free_tree(tree_t *tree); +void free_tab_double(double**tab,int N); +void free_tab_int(int**tab,int N); +void update_val(affinity_mat_t *aff_mat,tree_t *parent); +void FREE_tree(tree_t *tree); +void FREE_tab_double(double**,int); + +typedef struct _group_list_t{ + struct _group_list_t *next; + tree_t **tab; + double val; + double sum_neighbour; + double wg; +}group_list_t; + + +typedef struct{ + int i; + int j; + double val; +}adjacency_t; + + + +/* for debugging malloc */ +/* #define __DEBUG_MY_MALLOC__ */ +#undef __DEBUG_MY_MALLOC__ +#ifdef __DEBUG_MY_MALLOC__ +#include "tm_malloc.h" +#define MALLOC(x) my_malloc(x,__FILE__,__LINE__) +#define CALLOC(x,y) my_calloc(x,y,__FILE__,__LINE__) +#define FREE my_free +#define MEM_CHECK my_mem_check +#else +#define MALLOC malloc +#define CALLOC calloc +#define FREE free +#define MEM_CHECK my_mem_check +#endif + +#endif diff --git a/ompi/mca/topo/treematch/treematch/tm_verbose.c b/ompi/mca/topo/treematch/treematch/tm_verbose.c new file mode 100644 index 00000000000..9ff83191215 --- /dev/null +++ b/ompi/mca/topo/treematch/treematch/tm_verbose.c @@ -0,0 +1,11 @@ +#include "tm_verbose.h" +static unsigned 
int verbose_level = ERROR; + +void set_verbose_level(unsigned int level){ + verbose_level = level; +} + + +unsigned int get_verbose_level(){ + return verbose_level; +} diff --git a/ompi/mca/topo/treematch/treematch/tm_verbose.h b/ompi/mca/topo/treematch/treematch/tm_verbose.h new file mode 100644 index 00000000000..eafb0942f4e --- /dev/null +++ b/ompi/mca/topo/treematch/treematch/tm_verbose.h @@ -0,0 +1,11 @@ +#define NONE 0 +#define CRITICAL 1 +#define ERROR 2 +#define WARNING 3 +#define INFO 4 +#define DEBUG 5 + +void set_verbose_level(unsigned int level); +unsigned int get_verbose_level(void); + + diff --git a/ompi/mca/topo/treematch/treematch/uthash.h b/ompi/mca/topo/treematch/treematch/uthash.h new file mode 100644 index 00000000000..7b98cad5cc9 --- /dev/null +++ b/ompi/mca/topo/treematch/treematch/uthash.h @@ -0,0 +1,905 @@ +/* +Copyright (c) 2003-2011, Troy D. Hanson http://uthash.sourceforge.net +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER +OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#ifndef UTHASH_H +#define UTHASH_H + +#include /* memcmp,strlen */ +#include /* ptrdiff_t */ +#include /* exit() */ + +/* These macros use decltype or the earlier __typeof GNU extension. + As decltype is only available in newer compilers (VS2010 or gcc 4.3+ + when compiling c++ source) this code uses whatever method is needed + or, for VS2008 where neither is available, uses casting workarounds. */ +#ifdef _MSC_VER /* MS compiler */ +#if _MSC_VER >= 1600 && defined(__cplusplus) /* VS2010 or newer in C++ mode */ +#define DECLTYPE(x) (decltype(x)) +#else /* VS2008 or older (or VS2010 in C mode) */ +#define NO_DECLTYPE +#define DECLTYPE(x) +#endif +#else /* GNU, Sun and other compilers */ +#define DECLTYPE(x) (__typeof(x)) +#endif + +#ifdef NO_DECLTYPE +#define DECLTYPE_ASSIGN(dst,src) \ +do { \ + char **_da_dst = (char**)(&(dst)); \ + *_da_dst = (char*)(src); \ +} while(0) +#else +#define DECLTYPE_ASSIGN(dst,src) \ +do { \ + (dst) = DECLTYPE(dst)(src); \ +} while(0) +#endif + +/* a number of the hash function use uint32_t which isn't defined on win32 */ +#ifdef _MSC_VER +typedef unsigned int uint32_t; +typedef unsigned char uint8_t; +#else +#include /* uint32_t */ +#endif + +#define UTHASH_VERSION 1.9.4 + +#define uthash_fatal(msg) exit(-1) /* fatal error (out of memory,etc) */ +#define uthash_malloc(sz) malloc(sz) /* malloc fcn */ +#define uthash_free(ptr,sz) free(ptr) /* free fcn */ + +#define uthash_noexpand_fyi(tbl) /* can be defined to log noexpand */ +#define uthash_expand_fyi(tbl) /* can be defined to log expands */ + +/* initial number of buckets */ +#define HASH_INITIAL_NUM_BUCKETS 32 /* initial number of buckets */ +#define HASH_INITIAL_NUM_BUCKETS_LOG2 5 /* lg2 of initial number of buckets */ +#define HASH_BKT_CAPACITY_THRESH 10 /* expand when bucket count reaches */ + +/* calculate the element whose hash handle address is hhe */ +#define ELMT_FROM_HH(tbl,hhp) ((void*)(((char*)(hhp)) - ((tbl)->hho))) + +#define HASH_FIND(hh,head,keyptr,keylen,out) \ 
+do { \ + unsigned _hf_bkt,_hf_hashv; \ + out=NULL; \ + if (head) { \ + HASH_FCN(keyptr,keylen, (head)->hh.tbl->num_buckets, _hf_hashv, _hf_bkt); \ + if (HASH_BLOOM_TEST((head)->hh.tbl, _hf_hashv)) { \ + HASH_FIND_IN_BKT((head)->hh.tbl, hh, (head)->hh.tbl->buckets[ _hf_bkt ], \ + keyptr,keylen,out); \ + } \ + } \ +} while (0) + +#ifdef HASH_BLOOM +#define HASH_BLOOM_BITLEN (1ULL << HASH_BLOOM) +#define HASH_BLOOM_BYTELEN (HASH_BLOOM_BITLEN/8) + ((HASH_BLOOM_BITLEN%8) ? 1:0) +#define HASH_BLOOM_MAKE(tbl) \ +do { \ + (tbl)->bloom_nbits = HASH_BLOOM; \ + (tbl)->bloom_bv = (uint8_t*)uthash_malloc(HASH_BLOOM_BYTELEN); \ + if (!((tbl)->bloom_bv)) { uthash_fatal( "out of memory"); } \ + memset((tbl)->bloom_bv, 0, HASH_BLOOM_BYTELEN); \ + (tbl)->bloom_sig = HASH_BLOOM_SIGNATURE; \ +} while (0); + +#define HASH_BLOOM_FREE(tbl) \ +do { \ + uthash_free((tbl)->bloom_bv, HASH_BLOOM_BYTELEN); \ +} while (0); + +#define HASH_BLOOM_BITSET(bv,idx) (bv[(idx)/8] |= (1U << ((idx)%8))) +#define HASH_BLOOM_BITTEST(bv,idx) (bv[(idx)/8] & (1U << ((idx)%8))) + +#define HASH_BLOOM_ADD(tbl,hashv) \ + HASH_BLOOM_BITSET((tbl)->bloom_bv, (hashv & (uint32_t)((1ULL << (tbl)->bloom_nbits) - 1))) + +#define HASH_BLOOM_TEST(tbl,hashv) \ + HASH_BLOOM_BITTEST((tbl)->bloom_bv, (hashv & (uint32_t)((1ULL << (tbl)->bloom_nbits) - 1))) + +#else +#define HASH_BLOOM_MAKE(tbl) +#define HASH_BLOOM_FREE(tbl) +#define HASH_BLOOM_ADD(tbl,hashv) +#define HASH_BLOOM_TEST(tbl,hashv) (1) +#endif + +#define HASH_MAKE_TABLE(hh,head) \ +do { \ + (head)->hh.tbl = (UT_hash_table*)uthash_malloc( \ + sizeof(UT_hash_table)); \ + if (!((head)->hh.tbl)) { uthash_fatal( "out of memory"); } \ + memset((head)->hh.tbl, 0, sizeof(UT_hash_table)); \ + (head)->hh.tbl->tail = &((head)->hh); \ + (head)->hh.tbl->num_buckets = HASH_INITIAL_NUM_BUCKETS; \ + (head)->hh.tbl->log2_num_buckets = HASH_INITIAL_NUM_BUCKETS_LOG2; \ + (head)->hh.tbl->hho = (char*)(&(head)->hh) - (char*)(head); \ + (head)->hh.tbl->buckets = 
(UT_hash_bucket*)uthash_malloc( \ + HASH_INITIAL_NUM_BUCKETS*sizeof(struct UT_hash_bucket)); \ + if (! (head)->hh.tbl->buckets) { uthash_fatal( "out of memory"); } \ + memset((head)->hh.tbl->buckets, 0, \ + HASH_INITIAL_NUM_BUCKETS*sizeof(struct UT_hash_bucket)); \ + HASH_BLOOM_MAKE((head)->hh.tbl); \ + (head)->hh.tbl->signature = HASH_SIGNATURE; \ +} while(0) + +#define HASH_ADD(hh,head,fieldname,keylen_in,add) \ + HASH_ADD_KEYPTR(hh,head,&((add)->fieldname),keylen_in,add) + +#define HASH_ADD_KEYPTR(hh,head,keyptr,keylen_in,add) \ +do { \ + unsigned _ha_bkt; \ + (add)->hh.next = NULL; \ + (add)->hh.key = (char*)keyptr; \ + (add)->hh.keylen = keylen_in; \ + if (!(head)) { \ + head = (add); \ + (head)->hh.prev = NULL; \ + HASH_MAKE_TABLE(hh,head); \ + } else { \ + (head)->hh.tbl->tail->next = (add); \ + (add)->hh.prev = ELMT_FROM_HH((head)->hh.tbl, (head)->hh.tbl->tail); \ + (head)->hh.tbl->tail = &((add)->hh); \ + } \ + (head)->hh.tbl->num_items++; \ + (add)->hh.tbl = (head)->hh.tbl; \ + HASH_FCN(keyptr,keylen_in, (head)->hh.tbl->num_buckets, \ + (add)->hh.hashv, _ha_bkt); \ + HASH_ADD_TO_BKT((head)->hh.tbl->buckets[_ha_bkt],&(add)->hh); \ + HASH_BLOOM_ADD((head)->hh.tbl,(add)->hh.hashv); \ + HASH_EMIT_KEY(hh,head,keyptr,keylen_in); \ + HASH_FSCK(hh,head); \ +} while(0) + +#define HASH_TO_BKT( hashv, num_bkts, bkt ) \ +do { \ + bkt = ((hashv) & ((num_bkts) - 1)); \ +} while(0) + +/* delete "delptr" from the hash table. + * "the usual" patch-up process for the app-order doubly-linked-list. + * The use of _hd_hh_del below deserves special explanation. + * These used to be expressed using (delptr) but that led to a bug + * if someone used the same symbol for the head and deletee, like + * HASH_DELETE(hh,users,users); + * We want that to work, but by changing the head (users) below + * we were forfeiting our ability to further refer to the deletee (users) + * in the patch-up process. 
Solution: use scratch space to + * copy the deletee pointer, then the latter references are via that + * scratch pointer rather than through the repointed (users) symbol. + */ +#define HASH_DELETE(hh,head,delptr) \ +do { \ + unsigned _hd_bkt; \ + struct UT_hash_handle *_hd_hh_del; \ + if ( ((delptr)->hh.prev == NULL) && ((delptr)->hh.next == NULL) ) { \ + uthash_free((head)->hh.tbl->buckets, \ + (head)->hh.tbl->num_buckets*sizeof(struct UT_hash_bucket) ); \ + HASH_BLOOM_FREE((head)->hh.tbl); \ + uthash_free((head)->hh.tbl, sizeof(UT_hash_table)); \ + head = NULL; \ + } else { \ + _hd_hh_del = &((delptr)->hh); \ + if ((delptr) == ELMT_FROM_HH((head)->hh.tbl,(head)->hh.tbl->tail)) { \ + (head)->hh.tbl->tail = \ + (UT_hash_handle*)((char*)((delptr)->hh.prev) + \ + (head)->hh.tbl->hho); \ + } \ + if ((delptr)->hh.prev) { \ + ((UT_hash_handle*)((char*)((delptr)->hh.prev) + \ + (head)->hh.tbl->hho))->next = (delptr)->hh.next; \ + } else { \ + DECLTYPE_ASSIGN(head,(delptr)->hh.next); \ + } \ + if (_hd_hh_del->next) { \ + ((UT_hash_handle*)((char*)_hd_hh_del->next + \ + (head)->hh.tbl->hho))->prev = \ + _hd_hh_del->prev; \ + } \ + HASH_TO_BKT( _hd_hh_del->hashv, (head)->hh.tbl->num_buckets, _hd_bkt); \ + HASH_DEL_IN_BKT(hh,(head)->hh.tbl->buckets[_hd_bkt], _hd_hh_del); \ + (head)->hh.tbl->num_items--; \ + } \ + HASH_FSCK(hh,head); \ +} while (0) + + +/* convenience forms of HASH_FIND/HASH_ADD/HASH_DEL */ +#define HASH_FIND_STR(head,findstr,out) \ + HASH_FIND(hh,head,findstr,strlen(findstr),out) +#define HASH_ADD_STR(head,strfield,add) \ + HASH_ADD(hh,head,strfield,strlen(add->strfield),add) +#define HASH_FIND_INT(head,findint,out) \ + HASH_FIND(hh,head,findint,sizeof(int),out) +#define HASH_ADD_INT(head,intfield,add) \ + HASH_ADD(hh,head,intfield,sizeof(int),add) +#define HASH_FIND_PTR(head,findptr,out) \ + HASH_FIND(hh,head,findptr,sizeof(void *),out) +#define HASH_ADD_PTR(head,ptrfield,add) \ + HASH_ADD(hh,head,ptrfield,sizeof(void *),add) +#define HASH_DEL(head,delptr) 
\ + HASH_DELETE(hh,head,delptr) + +/* HASH_FSCK checks hash integrity on every add/delete when HASH_DEBUG is defined. + * This is for uthash developer only; it compiles away if HASH_DEBUG isn't defined. + */ +#ifdef HASH_DEBUG +#define HASH_OOPS(...) do { fprintf(stderr,__VA_ARGS__); exit(-1); } while (0) +#define HASH_FSCK(hh,head) \ +do { \ + unsigned _bkt_i; \ + unsigned _count, _bkt_count; \ + char *_prev; \ + struct UT_hash_handle *_thh; \ + if (head) { \ + _count = 0; \ + for( _bkt_i = 0; _bkt_i < (head)->hh.tbl->num_buckets; _bkt_i++) { \ + _bkt_count = 0; \ + _thh = (head)->hh.tbl->buckets[_bkt_i].hh_head; \ + _prev = NULL; \ + while (_thh) { \ + if (_prev != (char*)(_thh->hh_prev)) { \ + HASH_OOPS("invalid hh_prev %p, actual %p\n", \ + _thh->hh_prev, _prev ); \ + } \ + _bkt_count++; \ + _prev = (char*)(_thh); \ + _thh = _thh->hh_next; \ + } \ + _count += _bkt_count; \ + if ((head)->hh.tbl->buckets[_bkt_i].count != _bkt_count) { \ + HASH_OOPS("invalid bucket count %d, actual %d\n", \ + (head)->hh.tbl->buckets[_bkt_i].count, _bkt_count); \ + } \ + } \ + if (_count != (head)->hh.tbl->num_items) { \ + HASH_OOPS("invalid hh item count %d, actual %d\n", \ + (head)->hh.tbl->num_items, _count ); \ + } \ + /* traverse hh in app order; check next/prev integrity, count */ \ + _count = 0; \ + _prev = NULL; \ + _thh = &(head)->hh; \ + while (_thh) { \ + _count++; \ + if (_prev !=(char*)(_thh->prev)) { \ + HASH_OOPS("invalid prev %p, actual %p\n", \ + _thh->prev, _prev ); \ + } \ + _prev = (char*)ELMT_FROM_HH((head)->hh.tbl, _thh); \ + _thh = ( _thh->next ? 
(UT_hash_handle*)((char*)(_thh->next) + \ + (head)->hh.tbl->hho) : NULL ); \ + } \ + if (_count != (head)->hh.tbl->num_items) { \ + HASH_OOPS("invalid app item count %d, actual %d\n", \ + (head)->hh.tbl->num_items, _count ); \ + } \ + } \ +} while (0) +#else +#define HASH_FSCK(hh,head) +#endif + +/* When compiled with -DHASH_EMIT_KEYS, length-prefixed keys are emitted to + * the descriptor to which this macro is defined for tuning the hash function. + * The app can #include to get the prototype for write(2). */ +#ifdef HASH_EMIT_KEYS +#define HASH_EMIT_KEY(hh,head,keyptr,fieldlen) \ +do { \ + unsigned _klen = fieldlen; \ + write(HASH_EMIT_KEYS, &_klen, sizeof(_klen)); \ + write(HASH_EMIT_KEYS, keyptr, fieldlen); \ +} while (0) +#else +#define HASH_EMIT_KEY(hh,head,keyptr,fieldlen) +#endif + +/* default to Jenkin's hash unless overridden e.g. DHASH_FUNCTION=HASH_SAX */ +#ifdef HASH_FUNCTION +#define HASH_FCN HASH_FUNCTION +#else +#define HASH_FCN HASH_JEN +#endif + +/* The Bernstein hash function, used in Perl prior to v5.6 */ +#define HASH_BER(key,keylen,num_bkts,hashv,bkt) \ +do { \ + unsigned _hb_keylen=keylen; \ + char *_hb_key=(char*)(key); \ + (hashv) = 0; \ + while (_hb_keylen--) { (hashv) = ((hashv) * 33) + *_hb_key++; } \ + bkt = (hashv) & (num_bkts-1); \ +} while (0) + + +/* SAX/FNV/OAT/JEN hash functions are macro variants of those listed at + * http://eternallyconfuzzled.com/tuts/algorithms/jsw_tut_hashing.aspx */ +#define HASH_SAX(key,keylen,num_bkts,hashv,bkt) \ +do { \ + unsigned _sx_i; \ + char *_hs_key=(char*)(key); \ + hashv = 0; \ + for(_sx_i=0; _sx_i < keylen; _sx_i++) \ + hashv ^= (hashv << 5) + (hashv >> 2) + _hs_key[_sx_i]; \ + bkt = hashv & (num_bkts-1); \ +} while (0) + +#define HASH_FNV(key,keylen,num_bkts,hashv,bkt) \ +do { \ + unsigned _fn_i; \ + char *_hf_key=(char*)(key); \ + hashv = 2166136261UL; \ + for(_fn_i=0; _fn_i < keylen; _fn_i++) \ + hashv = (hashv * 16777619) ^ _hf_key[_fn_i]; \ + bkt = hashv & (num_bkts-1); \ +} while(0); + 
+#define HASH_OAT(key,keylen,num_bkts,hashv,bkt) \ +do { \ + unsigned _ho_i; \ + char *_ho_key=(char*)(key); \ + hashv = 0; \ + for(_ho_i=0; _ho_i < keylen; _ho_i++) { \ + hashv += _ho_key[_ho_i]; \ + hashv += (hashv << 10); \ + hashv ^= (hashv >> 6); \ + } \ + hashv += (hashv << 3); \ + hashv ^= (hashv >> 11); \ + hashv += (hashv << 15); \ + bkt = hashv & (num_bkts-1); \ +} while(0) + +#define HASH_JEN_MIX(a,b,c) \ +do { \ + a -= b; a -= c; a ^= ( c >> 13 ); \ + b -= c; b -= a; b ^= ( a << 8 ); \ + c -= a; c -= b; c ^= ( b >> 13 ); \ + a -= b; a -= c; a ^= ( c >> 12 ); \ + b -= c; b -= a; b ^= ( a << 16 ); \ + c -= a; c -= b; c ^= ( b >> 5 ); \ + a -= b; a -= c; a ^= ( c >> 3 ); \ + b -= c; b -= a; b ^= ( a << 10 ); \ + c -= a; c -= b; c ^= ( b >> 15 ); \ +} while (0) + +#define HASH_JEN(key,keylen,num_bkts,hashv,bkt) \ +do { \ + unsigned _hj_i,_hj_j,_hj_k; \ + char *_hj_key=(char*)(key); \ + hashv = 0xfeedbeef; \ + _hj_i = _hj_j = 0x9e3779b9; \ + _hj_k = keylen; \ + while (_hj_k >= 12) { \ + _hj_i += (_hj_key[0] + ( (unsigned)_hj_key[1] << 8 ) \ + + ( (unsigned)_hj_key[2] << 16 ) \ + + ( (unsigned)_hj_key[3] << 24 ) ); \ + _hj_j += (_hj_key[4] + ( (unsigned)_hj_key[5] << 8 ) \ + + ( (unsigned)_hj_key[6] << 16 ) \ + + ( (unsigned)_hj_key[7] << 24 ) ); \ + hashv += (_hj_key[8] + ( (unsigned)_hj_key[9] << 8 ) \ + + ( (unsigned)_hj_key[10] << 16 ) \ + + ( (unsigned)_hj_key[11] << 24 ) ); \ + \ + HASH_JEN_MIX(_hj_i, _hj_j, hashv); \ + \ + _hj_key += 12; \ + _hj_k -= 12; \ + } \ + hashv += keylen; \ + switch ( _hj_k ) { \ + case 11: hashv += ( (unsigned)_hj_key[10] << 24 ); \ + case 10: hashv += ( (unsigned)_hj_key[9] << 16 ); \ + case 9: hashv += ( (unsigned)_hj_key[8] << 8 ); \ + case 8: _hj_j += ( (unsigned)_hj_key[7] << 24 ); \ + case 7: _hj_j += ( (unsigned)_hj_key[6] << 16 ); \ + case 6: _hj_j += ( (unsigned)_hj_key[5] << 8 ); \ + case 5: _hj_j += _hj_key[4]; \ + case 4: _hj_i += ( (unsigned)_hj_key[3] << 24 ); \ + case 3: _hj_i += ( (unsigned)_hj_key[2] << 16 ); 
\ + case 2: _hj_i += ( (unsigned)_hj_key[1] << 8 ); \ + case 1: _hj_i += _hj_key[0]; \ + } \ + HASH_JEN_MIX(_hj_i, _hj_j, hashv); \ + bkt = hashv & (num_bkts-1); \ +} while(0) + +/* The Paul Hsieh hash function */ +#undef get16bits +#if (defined(__GNUC__) && defined(__i386__)) || defined(__WATCOMC__) \ + || defined(_MSC_VER) || defined (__BORLANDC__) || defined (__TURBOC__) +#define get16bits(d) (*((const uint16_t *) (d))) +#endif + +#if !defined (get16bits) +#define get16bits(d) ((((uint32_t)(((const uint8_t *)(d))[1])) << 8) \ + +(uint32_t)(((const uint8_t *)(d))[0]) ) +#endif +#define HASH_SFH(key,keylen,num_bkts,hashv,bkt) \ +do { \ + char *_sfh_key=(char*)(key); \ + uint32_t _sfh_tmp, _sfh_len = keylen; \ + \ + int _sfh_rem = _sfh_len & 3; \ + _sfh_len >>= 2; \ + hashv = 0xcafebabe; \ + \ + /* Main loop */ \ + for (;_sfh_len > 0; _sfh_len--) { \ + hashv += get16bits (_sfh_key); \ + _sfh_tmp = (get16bits (_sfh_key+2) << 11) ^ hashv; \ + hashv = (hashv << 16) ^ _sfh_tmp; \ + _sfh_key += 2*sizeof (uint16_t); \ + hashv += hashv >> 11; \ + } \ + \ + /* Handle end cases */ \ + switch (_sfh_rem) { \ + case 3: hashv += get16bits (_sfh_key); \ + hashv ^= hashv << 16; \ + hashv ^= _sfh_key[sizeof (uint16_t)] << 18; \ + hashv += hashv >> 11; \ + break; \ + case 2: hashv += get16bits (_sfh_key); \ + hashv ^= hashv << 11; \ + hashv += hashv >> 17; \ + break; \ + case 1: hashv += *_sfh_key; \ + hashv ^= hashv << 10; \ + hashv += hashv >> 1; \ + } \ + \ + /* Force "avalanching" of final 127 bits */ \ + hashv ^= hashv << 3; \ + hashv += hashv >> 5; \ + hashv ^= hashv << 4; \ + hashv += hashv >> 17; \ + hashv ^= hashv << 25; \ + hashv += hashv >> 6; \ + bkt = hashv & (num_bkts-1); \ +} while(0); + +#ifdef HASH_USING_NO_STRICT_ALIASING +/* The MurmurHash exploits some CPU's (x86,x86_64) tolerance for unaligned reads. + * For other types of CPU's (e.g. Sparc) an unaligned read causes a bus error. + * MurmurHash uses the faster approach only on CPU's where we know it's safe. 
+ * + * Note the preprocessor built-in defines can be emitted using: + * + * gcc -m64 -dM -E - < /dev/null (on gcc) + * cc -## a.c (where a.c is a simple test file) (Sun Studio) + */ +#if (defined(__i386__) || defined(__x86_64__)) +#define MUR_GETBLOCK(p,i) p[i] +#else /* non intel */ +#define MUR_PLUS0_ALIGNED(p) (((unsigned long)p & 0x3) == 0) +#define MUR_PLUS1_ALIGNED(p) (((unsigned long)p & 0x3) == 1) +#define MUR_PLUS2_ALIGNED(p) (((unsigned long)p & 0x3) == 2) +#define MUR_PLUS3_ALIGNED(p) (((unsigned long)p & 0x3) == 3) +#define WP(p) ((uint32_t*)((unsigned long)(p) & ~3UL)) +#if (defined(__BIG_ENDIAN__) || defined(SPARC) || defined(__ppc__) || defined(__ppc64__)) +#define MUR_THREE_ONE(p) ((((*WP(p))&0x00ffffff) << 8) | (((*(WP(p)+1))&0xff000000) >> 24)) +#define MUR_TWO_TWO(p) ((((*WP(p))&0x0000ffff) <<16) | (((*(WP(p)+1))&0xffff0000) >> 16)) +#define MUR_ONE_THREE(p) ((((*WP(p))&0x000000ff) <<24) | (((*(WP(p)+1))&0xffffff00) >> 8)) +#else /* assume little endian non-intel */ +#define MUR_THREE_ONE(p) ((((*WP(p))&0xffffff00) >> 8) | (((*(WP(p)+1))&0x000000ff) << 24)) +#define MUR_TWO_TWO(p) ((((*WP(p))&0xffff0000) >>16) | (((*(WP(p)+1))&0x0000ffff) << 16)) +#define MUR_ONE_THREE(p) ((((*WP(p))&0xff000000) >>24) | (((*(WP(p)+1))&0x00ffffff) << 8)) +#endif +#define MUR_GETBLOCK(p,i) (MUR_PLUS0_ALIGNED(p) ? ((p)[i]) : \ + (MUR_PLUS1_ALIGNED(p) ? MUR_THREE_ONE(p) : \ + (MUR_PLUS2_ALIGNED(p) ? 
MUR_TWO_TWO(p) : \ + MUR_ONE_THREE(p)))) +#endif +#define MUR_ROTL32(x,r) (((x) << (r)) | ((x) >> (32 - (r)))) +#define MUR_FMIX(_h) \ +do { \ + _h ^= _h >> 16; \ + _h *= 0x85ebca6b; \ + _h ^= _h >> 13; \ + _h *= 0xc2b2ae35l; \ + _h ^= _h >> 16; \ +} while(0) + +#define HASH_MUR(key,keylen,num_bkts,hashv,bkt) \ +do { \ + const uint8_t *_mur_data = (const uint8_t*)(key); \ + const int _mur_nblocks = (keylen) / 4; \ + uint32_t _mur_h1 = 0xf88D5353; \ + uint32_t _mur_c1 = 0xcc9e2d51; \ + uint32_t _mur_c2 = 0x1b873593; \ + const uint32_t *_mur_blocks = (const uint32_t*)(_mur_data+_mur_nblocks*4); \ + int _mur_i; \ + for(_mur_i = -_mur_nblocks; _mur_i; _mur_i++) { \ + uint32_t _mur_k1 = MUR_GETBLOCK(_mur_blocks,_mur_i); \ + _mur_k1 *= _mur_c1; \ + _mur_k1 = MUR_ROTL32(_mur_k1,15); \ + _mur_k1 *= _mur_c2; \ + \ + _mur_h1 ^= _mur_k1; \ + _mur_h1 = MUR_ROTL32(_mur_h1,13); \ + _mur_h1 = _mur_h1*5+0xe6546b64; \ + } \ + const uint8_t *_mur_tail = (const uint8_t*)(_mur_data + _mur_nblocks*4); \ + uint32_t _mur_k1=0; \ + switch((keylen) & 3) { \ + case 3: _mur_k1 ^= _mur_tail[2] << 16; \ + case 2: _mur_k1 ^= _mur_tail[1] << 8; \ + case 1: _mur_k1 ^= _mur_tail[0]; \ + _mur_k1 *= _mur_c1; \ + _mur_k1 = MUR_ROTL32(_mur_k1,15); \ + _mur_k1 *= _mur_c2; \ + _mur_h1 ^= _mur_k1; \ + } \ + _mur_h1 ^= (keylen); \ + MUR_FMIX(_mur_h1); \ + hashv = _mur_h1; \ + bkt = hashv & (num_bkts-1); \ +} while(0) +#endif /* HASH_USING_NO_STRICT_ALIASING */ + +/* key comparison function; return 0 if keys equal */ +#define HASH_KEYCMP(a,b,len) memcmp(a,b,len) + +/* iterate over items in a known bucket to find desired item */ +#define HASH_FIND_IN_BKT(tbl,hh,head,keyptr,keylen_in,out) \ +do { \ + if (head.hh_head) DECLTYPE_ASSIGN(out,ELMT_FROM_HH(tbl,head.hh_head)); \ + else out=NULL; \ + while (out) { \ + if (out->hh.keylen == keylen_in) { \ + if ((HASH_KEYCMP(out->hh.key,keyptr,keylen_in)) == 0) break; \ + } \ + if (out->hh.hh_next) DECLTYPE_ASSIGN(out,ELMT_FROM_HH(tbl,out->hh.hh_next)); \ + else out = 
NULL; \ + } \ +} while(0) + +/* add an item to a bucket */ +#define HASH_ADD_TO_BKT(head,addhh) \ +do { \ + head.count++; \ + (addhh)->hh_next = head.hh_head; \ + (addhh)->hh_prev = NULL; \ + if (head.hh_head) { (head).hh_head->hh_prev = (addhh); } \ + (head).hh_head=addhh; \ + if (head.count >= ((head.expand_mult+1) * HASH_BKT_CAPACITY_THRESH) \ + && (addhh)->tbl->noexpand != 1) { \ + HASH_EXPAND_BUCKETS((addhh)->tbl); \ + } \ +} while(0) + +/* remove an item from a given bucket */ +#define HASH_DEL_IN_BKT(hh,head,hh_del) \ + (head).count--; \ + if ((head).hh_head == hh_del) { \ + (head).hh_head = hh_del->hh_next; \ + } \ + if (hh_del->hh_prev) { \ + hh_del->hh_prev->hh_next = hh_del->hh_next; \ + } \ + if (hh_del->hh_next) { \ + hh_del->hh_next->hh_prev = hh_del->hh_prev; \ + } + +/* Bucket expansion has the effect of doubling the number of buckets + * and redistributing the items into the new buckets. Ideally the + * items will distribute more or less evenly into the new buckets + * (the extent to which this is true is a measure of the quality of + * the hash function as it applies to the key domain). + * + * With the items distributed into more buckets, the chain length + * (item count) in each bucket is reduced. Thus by expanding buckets + * the hash keeps a bound on the chain length. This bounded chain + * length is the essence of how a hash provides constant time lookup. + * + * The calculation of tbl->ideal_chain_maxlen below deserves some + * explanation. First, keep in mind that we're calculating the ideal + * maximum chain length based on the *new* (doubled) bucket count. + * In fractions this is just n/b (n=number of items,b=new num buckets). + * Since the ideal chain length is an integer, we want to calculate + * ceil(n/b). We don't depend on floating point arithmetic in this + * hash, so to calculate ceil(n/b) with integers we could write + * + * ceil(n/b) = (n/b) + ((n%b)?1:0) + * + * and in fact a previous version of this hash did just that. 
+ * But now we have improved things a bit by recognizing that b is + * always a power of two. We keep its base 2 log handy (call it lb), + * so now we can write this with a bit shift and logical AND: + * + * ceil(n/b) = (n>>lb) + ( (n & (b-1)) ? 1:0) + * + */ +#define HASH_EXPAND_BUCKETS(tbl) \ +do { \ + unsigned _he_bkt; \ + unsigned _he_bkt_i; \ + struct UT_hash_handle *_he_thh, *_he_hh_nxt; \ + UT_hash_bucket *_he_new_buckets, *_he_newbkt; \ + _he_new_buckets = (UT_hash_bucket*)uthash_malloc( \ + 2 * tbl->num_buckets * sizeof(struct UT_hash_bucket)); \ + if (!_he_new_buckets) { uthash_fatal( "out of memory"); } \ + memset(_he_new_buckets, 0, \ + 2 * tbl->num_buckets * sizeof(struct UT_hash_bucket)); \ + tbl->ideal_chain_maxlen = \ + (tbl->num_items >> (tbl->log2_num_buckets+1)) + \ + ((tbl->num_items & ((tbl->num_buckets*2)-1)) ? 1 : 0); \ + tbl->nonideal_items = 0; \ + for(_he_bkt_i = 0; _he_bkt_i < tbl->num_buckets; _he_bkt_i++) \ + { \ + _he_thh = tbl->buckets[ _he_bkt_i ].hh_head; \ + while (_he_thh) { \ + _he_hh_nxt = _he_thh->hh_next; \ + HASH_TO_BKT( _he_thh->hashv, tbl->num_buckets*2, _he_bkt); \ + _he_newbkt = &(_he_new_buckets[ _he_bkt ]); \ + if (++(_he_newbkt->count) > tbl->ideal_chain_maxlen) { \ + tbl->nonideal_items++; \ + _he_newbkt->expand_mult = _he_newbkt->count / \ + tbl->ideal_chain_maxlen; \ + } \ + _he_thh->hh_prev = NULL; \ + _he_thh->hh_next = _he_newbkt->hh_head; \ + if (_he_newbkt->hh_head) _he_newbkt->hh_head->hh_prev = \ + _he_thh; \ + _he_newbkt->hh_head = _he_thh; \ + _he_thh = _he_hh_nxt; \ + } \ + } \ + uthash_free( tbl->buckets, tbl->num_buckets*sizeof(struct UT_hash_bucket) ); \ + tbl->num_buckets *= 2; \ + tbl->log2_num_buckets++; \ + tbl->buckets = _he_new_buckets; \ + tbl->ineff_expands = (tbl->nonideal_items > (tbl->num_items >> 1)) ? 
\ + (tbl->ineff_expands+1) : 0; \ + if (tbl->ineff_expands > 1) { \ + tbl->noexpand=1; \ + uthash_noexpand_fyi(tbl); \ + } \ + uthash_expand_fyi(tbl); \ +} while(0) + + +/* This is an adaptation of Simon Tatham's O(n log(n)) mergesort */ +/* Note that HASH_SORT assumes the hash handle name to be hh. + * HASH_SRT was added to allow the hash handle name to be passed in. */ +#define HASH_SORT(head,cmpfcn) HASH_SRT(hh,head,cmpfcn) +#define HASH_SRT(hh,head,cmpfcn) \ +do { \ + unsigned _hs_i; \ + unsigned _hs_looping,_hs_nmerges,_hs_insize,_hs_psize,_hs_qsize; \ + struct UT_hash_handle *_hs_p, *_hs_q, *_hs_e, *_hs_list, *_hs_tail; \ + if (head) { \ + _hs_insize = 1; \ + _hs_looping = 1; \ + _hs_list = &((head)->hh); \ + while (_hs_looping) { \ + _hs_p = _hs_list; \ + _hs_list = NULL; \ + _hs_tail = NULL; \ + _hs_nmerges = 0; \ + while (_hs_p) { \ + _hs_nmerges++; \ + _hs_q = _hs_p; \ + _hs_psize = 0; \ + for ( _hs_i = 0; _hs_i < _hs_insize; _hs_i++ ) { \ + _hs_psize++; \ + _hs_q = (UT_hash_handle*)((_hs_q->next) ? \ + ((void*)((char*)(_hs_q->next) + \ + (head)->hh.tbl->hho)) : NULL); \ + if (! (_hs_q) ) break; \ + } \ + _hs_qsize = _hs_insize; \ + while ((_hs_psize > 0) || ((_hs_qsize > 0) && _hs_q )) { \ + if (_hs_psize == 0) { \ + _hs_e = _hs_q; \ + _hs_q = (UT_hash_handle*)((_hs_q->next) ? \ + ((void*)((char*)(_hs_q->next) + \ + (head)->hh.tbl->hho)) : NULL); \ + _hs_qsize--; \ + } else if ( (_hs_qsize == 0) || !(_hs_q) ) { \ + _hs_e = _hs_p; \ + _hs_p = (UT_hash_handle*)((_hs_p->next) ? \ + ((void*)((char*)(_hs_p->next) + \ + (head)->hh.tbl->hho)) : NULL); \ + _hs_psize--; \ + } else if (( \ + cmpfcn(DECLTYPE(head)(ELMT_FROM_HH((head)->hh.tbl,_hs_p)), \ + DECLTYPE(head)(ELMT_FROM_HH((head)->hh.tbl,_hs_q))) \ + ) <= 0) { \ + _hs_e = _hs_p; \ + _hs_p = (UT_hash_handle*)((_hs_p->next) ? \ + ((void*)((char*)(_hs_p->next) + \ + (head)->hh.tbl->hho)) : NULL); \ + _hs_psize--; \ + } else { \ + _hs_e = _hs_q; \ + _hs_q = (UT_hash_handle*)((_hs_q->next) ? 
\ + ((void*)((char*)(_hs_q->next) + \ + (head)->hh.tbl->hho)) : NULL); \ + _hs_qsize--; \ + } \ + if ( _hs_tail ) { \ + _hs_tail->next = ((_hs_e) ? \ + ELMT_FROM_HH((head)->hh.tbl,_hs_e) : NULL); \ + } else { \ + _hs_list = _hs_e; \ + } \ + _hs_e->prev = ((_hs_tail) ? \ + ELMT_FROM_HH((head)->hh.tbl,_hs_tail) : NULL); \ + _hs_tail = _hs_e; \ + } \ + _hs_p = _hs_q; \ + } \ + _hs_tail->next = NULL; \ + if ( _hs_nmerges <= 1 ) { \ + _hs_looping=0; \ + (head)->hh.tbl->tail = _hs_tail; \ + DECLTYPE_ASSIGN(head,ELMT_FROM_HH((head)->hh.tbl, _hs_list)); \ + } \ + _hs_insize *= 2; \ + } \ + HASH_FSCK(hh,head); \ + } \ +} while (0) + +/* This function selects items from one hash into another hash. + * The end result is that the selected items have dual presence + * in both hashes. There is no copy of the items made; rather + * they are added into the new hash through a secondary hash + * hash handle that must be present in the structure. */ +#define HASH_SELECT(hh_dst, dst, hh_src, src, cond) \ +do { \ + unsigned _src_bkt, _dst_bkt; \ + void *_last_elt=NULL, *_elt; \ + UT_hash_handle *_src_hh, *_dst_hh, *_last_elt_hh=NULL; \ + ptrdiff_t _dst_hho = ((char*)(&(dst)->hh_dst) - (char*)(dst)); \ + if (src) { \ + for(_src_bkt=0; _src_bkt < (src)->hh_src.tbl->num_buckets; _src_bkt++) { \ + for(_src_hh = (src)->hh_src.tbl->buckets[_src_bkt].hh_head; \ + _src_hh; \ + _src_hh = _src_hh->hh_next) { \ + _elt = ELMT_FROM_HH((src)->hh_src.tbl, _src_hh); \ + if (cond(_elt)) { \ + _dst_hh = (UT_hash_handle*)(((char*)_elt) + _dst_hho); \ + _dst_hh->key = _src_hh->key; \ + _dst_hh->keylen = _src_hh->keylen; \ + _dst_hh->hashv = _src_hh->hashv; \ + _dst_hh->prev = _last_elt; \ + _dst_hh->next = NULL; \ + if (_last_elt_hh) { _last_elt_hh->next = _elt; } \ + if (!dst) { \ + DECLTYPE_ASSIGN(dst,_elt); \ + HASH_MAKE_TABLE(hh_dst,dst); \ + } else { \ + _dst_hh->tbl = (dst)->hh_dst.tbl; \ + } \ + HASH_TO_BKT(_dst_hh->hashv, _dst_hh->tbl->num_buckets, _dst_bkt); \ + 
HASH_ADD_TO_BKT(_dst_hh->tbl->buckets[_dst_bkt],_dst_hh); \ + (dst)->hh_dst.tbl->num_items++; \ + _last_elt = _elt; \ + _last_elt_hh = _dst_hh; \ + } \ + } \ + } \ + } \ + HASH_FSCK(hh_dst,dst); \ +} while (0) + +#define HASH_CLEAR(hh,head) \ +do { \ + if (head) { \ + uthash_free((head)->hh.tbl->buckets, \ + (head)->hh.tbl->num_buckets*sizeof(struct UT_hash_bucket)); \ + HASH_BLOOM_FREE((head)->hh.tbl); \ + uthash_free((head)->hh.tbl, sizeof(UT_hash_table)); \ + (head)=NULL; \ + } \ +} while(0) + +#ifdef NO_DECLTYPE +#define HASH_ITER(hh,head,el,tmp) \ +for((el)=(head), (*(char**)(&(tmp)))=(char*)((head)?(head)->hh.next:NULL); \ + el; (el)=(tmp),(*(char**)(&(tmp)))=(char*)((tmp)?(tmp)->hh.next:NULL)) +#else +#define HASH_ITER(hh,head,el,tmp) \ +for((el)=(head),(tmp)=DECLTYPE(el)((head)?(head)->hh.next:NULL); \ + el; (el)=(tmp),(tmp)=DECLTYPE(el)((tmp)?(tmp)->hh.next:NULL)) +#endif + +/* obtain a count of items in the hash */ +#define HASH_COUNT(head) HASH_CNT(hh,head) +#define HASH_CNT(hh,head) ((head)?((head)->hh.tbl->num_items):0) + +typedef struct UT_hash_bucket { + struct UT_hash_handle *hh_head; + unsigned count; + + /* expand_mult is normally set to 0. In this situation, the max chain length + * threshold is enforced at its default value, HASH_BKT_CAPACITY_THRESH. (If + * the bucket's chain exceeds this length, bucket expansion is triggered). + * However, setting expand_mult to a non-zero value delays bucket expansion + * (that would be triggered by additions to this particular bucket) + * until its chain length reaches a *multiple* of HASH_BKT_CAPACITY_THRESH. + * (The multiplier is simply expand_mult+1). The whole idea of this + * multiplier is to reduce bucket expansions, since they are expensive, in + * situations where we know that a particular bucket tends to be overused. + * It is better to let its chain length grow to a longer yet-still-bounded + * value, than to do an O(n) bucket expansion too often. 
+ */ + unsigned expand_mult; + +} UT_hash_bucket; + +/* random signature used only to find hash tables in external analysis */ +#define HASH_SIGNATURE 0xa0111fe1 +#define HASH_BLOOM_SIGNATURE 0xb12220f2 + +typedef struct UT_hash_table { + UT_hash_bucket *buckets; + unsigned num_buckets, log2_num_buckets; + unsigned num_items; + struct UT_hash_handle *tail; /* tail hh in app order, for fast append */ + ptrdiff_t hho; /* hash handle offset (byte pos of hash handle in element */ + + /* in an ideal situation (all buckets used equally), no bucket would have + * more than ceil(#items/#buckets) items. that's the ideal chain length. */ + unsigned ideal_chain_maxlen; + + /* nonideal_items is the number of items in the hash whose chain position + * exceeds the ideal chain maxlen. these items pay the penalty for an uneven + * hash distribution; reaching them in a chain traversal takes >ideal steps */ + unsigned nonideal_items; + + /* ineffective expands occur when a bucket doubling was performed, but + * afterward, more than half the items in the hash had nonideal chain + * positions. If this happens on two consecutive expansions we inhibit any + * further expansion, as it's not helping; this happens when the hash + * function isn't a good fit for the key domain. When expansion is inhibited + * the hash will still work, albeit no longer in constant time. 
*/ + unsigned ineff_expands, noexpand; + + uint32_t signature; /* used only to find hash tables in external analysis */ +#ifdef HASH_BLOOM + uint32_t bloom_sig; /* used only to test bloom exists in external analysis */ + uint8_t *bloom_bv; + char bloom_nbits; +#endif + +} UT_hash_table; + +typedef struct UT_hash_handle { + struct UT_hash_table *tbl; + void *prev; /* prev element in app order */ + void *next; /* next element in app order */ + struct UT_hash_handle *hh_prev; /* previous hh in bucket order */ + struct UT_hash_handle *hh_next; /* next hh in bucket order */ + void *key; /* ptr to enclosing struct's key */ + unsigned keylen; /* enclosing struct's key len */ + unsigned hashv; /* result of hash-fcn(key) */ +} UT_hash_handle; + +#endif /* UTHASH_H */ diff --git a/opal/mca/common/ofacm/.opal_ignore b/opal/mca/common/ofacm/.opal_ignore deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/opal/mca/common/ofacm/Makefile.am b/opal/mca/common/ofacm/Makefile.am deleted file mode 100644 index b592f6badf4..00000000000 --- a/opal/mca/common/ofacm/Makefile.am +++ /dev/null @@ -1,83 +0,0 @@ -# -# Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. -# Copyright (c) 2009-2014 Cisco Systems, Inc. All rights reserved. -# Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# - -AM_CPPFLAGS = $(common_ofacm_CPPFLAGS) - -dist_opaldata_DATA = \ - help-mpi-common-ofacm-base.txt \ - help-mpi-common-ofacm-oob.txt -headers = \ - base.h \ - common_ofacm_oob.h \ - common_ofacm_empty.h \ - connect.h - -sources = \ - common_ofacm_base.c \ - common_ofacm_oob.c \ - common_ofacm_empty.c - -# If we have XRC support, build XOOB connection module -if MCA_common_ofacm_have_xrc -sources += \ - common_ofacm_xoob.c \ - common_ofacm_xoob.h -endif - -# See opal/mca/common/sm/Makefile.am for an explanation of the -# LTLIBRARIES values listed below. 
- -lib_LTLIBRARIES = -noinst_LTLIBRARIES = -comp_inst = lib@OPAL_LIB_PREFIX@mca_common_ofacm.la -comp_noinst = lib@OPAL_LIB_PREFIX@mca_common_ofacm_noinst.la - -if MCA_BUILD_opal_common_ofacm_DSO -lib_LTLIBRARIES += $(comp_inst) -else -noinst_LTLIBRARIES += $(comp_noinst) -endif - -lib@OPAL_LIB_PREFIX@mca_common_ofacm_la_SOURCES = $(headers) $(sources) -lib@OPAL_LIB_PREFIX@mca_common_ofacm_la_CPPFLAGS = $(common_ofacm_CPPFLAGS) -lib@OPAL_LIB_PREFIX@mca_common_ofacm_la_LDFLAGS = \ - -version-info $(libmca_common_ofacm_so_version) \ - $(common_ofacm_LDFLAGS) -lib@OPAL_LIB_PREFIX@mca_common_ofacm_la_LIBADD = $(common_ofacm_LIBS) -lib@OPAL_LIB_PREFIX@mca_common_ofacm_noinst_la_SOURCES = $(headers) $(sources) - -# Conditionally install the header files - -if WANT_INSTALL_HEADERS -opaldir = $(opalincludedir)/opal/mca/common/ofacm -opal_HEADERS = $(headers) -else -opaldir = $(includedir) -endif - -# See opal/mca/common/sm/Makefile.am for an explanation of the -# the *-local rules, below. - -V=0 -OMPI_V_LN_SCOMP = $(ompi__v_LN_SCOMP_$V) -ompi__v_LN_SCOMP_ = $(ompi__v_LN_SCOMP_$AM_DEFAULT_VERBOSITY) -ompi__v_LN_SCOMP_0 = @echo " LN_S " `basename $(comp_inst)`; - -all-local: - $(OMPI_V_LN_SCOMP) if test -z "$(lib_LTLIBRARIES)"; then \ - rm -f "$(comp_inst)"; \ - $(LN_S) "$(comp_noinst)" "$(comp_inst)"; \ - fi - -clean-local: - if test -z "$(lib_LTLIBRARIES)"; then \ - rm -f "$(comp_inst)"; \ - fi diff --git a/opal/mca/common/ofacm/base.h b/opal/mca/common/ofacm/base.h deleted file mode 100644 index feffad6f555..00000000000 --- a/opal/mca/common/ofacm/base.h +++ /dev/null @@ -1,190 +0,0 @@ -/* - * Copyright (c) 2007-2008 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef OPAL_COMMON_OFACM_BASE_H -#define OPAL_COMMON_OFACM_BASE_H -#include "opal_config.h" - -#include -#include - -#include "connect.h" - -BEGIN_C_DECLS - -#define HAVE_XRC (1 == OPAL_HAVE_CONNECTX_XRC) - -extern int opal_common_ofacm_base_output; -extern int opal_common_ofacm_base_verbose; /* disabled by default */ -/* File for sl data produced only for a 3D-Torus Cluster */ -extern char* opal_common_ofacm_three_dim_torus; - -static inline int opal_common_ofacm_base_err(const char* fmt, ...) -{ - va_list list; - int ret; - - va_start(list, fmt); - ret = vfprintf(stderr, fmt, list); - va_end(list); - return ret; -} - -#define OFACM_ERROR(args) \ - do { \ - opal_common_ofacm_base_err("[%s]%s[%s:%d:%s] ", \ - opal_proc_local_get()->proc_hostname, \ - OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), \ - __FILE__, __LINE__, __func__); \ - opal_common_ofacm_base_err args; \ - opal_common_ofacm_base_err("\n"); \ - } while(0); - -#if OPAL_ENABLE_DEBUG -#define OFACM_VERBOSE(args) \ - do { \ - if(opal_common_ofacm_base_verbose > 0) { \ - opal_common_ofacm_base_err("[%s]%s[%s:%d:%s] ",\ - opal_proc_local_get()->proc_hostname, \ - OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), \ - __FILE__, __LINE__, __func__); \ - opal_common_ofacm_base_err args; \ - opal_common_ofacm_base_err("\n"); \ - } \ - } while(0); -#else -#define OFACM_VERBOSE(args) -#endif - -/* - * PUBLIC functions - * **************** - */ - -/* - * Open function - */ -OPAL_DECLSPEC int opal_common_ofacm_base_register(mca_base_component_t *base); - -/* - * Query CPCs to see if they want to run on a specific port. 
- * Input: - * port - port information - * Output: - * cpcs - list of availible cpcs - * num_cpcs - number of cpcs - */ -OPAL_DECLSPEC int opal_common_ofacm_base_select_for_local_port - (opal_common_ofacm_base_dev_desc_t *dev, - opal_common_ofacm_base_module_t ***cpcs, int *num_cpcs); - -/* - * Select function - * Input: - * local_cpcs - local cpc modules - * num_local_cpcs - number of local cpc modules - * remote_cpc_data - cpc information from remote peer - * remote_cpc_data_count - num of remote information from remote peer - * Output: - * ret_local_cpc - matched cpc module - * ret_remote_cpc_data - matched remote cpc data - */ -OPAL_DECLSPEC int opal_common_ofacm_base_find_match - (opal_common_ofacm_base_module_t **local_cpcs, int num_local_cpcs, - opal_common_ofacm_base_module_data_t *remote_cpc_data, int remote_cpc_data_count, - opal_common_ofacm_base_module_t **ret_local_cpc, - opal_common_ofacm_base_module_data_t **ret_remote_cpc_data); - -/* - * Find a CPC's index so that we can send it in the modex - */ -OPAL_DECLSPEC int opal_common_ofacm_base_get_cpc_index - (opal_common_ofacm_base_component_t *cpc); - -/* - * Start a new connection to an endpoint - */ -OPAL_DECLSPEC int opal_common_ofacm_base_start_connect - (struct opal_common_ofacm_base_local_connection_context_t *context); - -/* - * Component-wide CPC finalize - */ -OPAL_DECLSPEC void opal_common_ofacm_base_finalize(void); - -/* - * Component-wide CPC init - */ -OPAL_DECLSPEC int opal_common_ofacm_base_init(void); - -/* - * Lookup a CPC by its index (received from the modex) - */ -OPAL_DECLSPEC opal_common_ofacm_base_component_t * - opal_common_ofacm_base_get_cpc_byindex(uint8_t index); - -/* - * PRIVATE functions (called only by cpcs) - * *************************************** - */ - -/* - * Proc initialization function - */ -void opal_common_ofacm_base_proc_setup - (opal_common_ofacm_base_proc_t *proc, - opal_common_ofacm_base_local_connection_context_t *context, - opal_proc_t *proc_opal); -/* - 
* Context initialization function - */ -int opal_common_ofacm_base_context_init - (opal_common_ofacm_base_local_connection_context_t *context, - opal_common_ofacm_base_module_t *cpc, - opal_common_ofacm_base_context_connect_cb_fn_t connect_cb, - opal_common_ofacm_base_context_error_cb_fn_t error_cb, - opal_common_ofacm_base_context_prepare_recv_cb_fn_t prepare_recv_cb, - opal_common_ofacm_base_proc_t *proc, - opal_common_ofacm_base_qp_config_t *qp_config, - struct ibv_pd *pd, uint64_t subnet_id, int cpc_type, - uint16_t lid, uint16_t rem_lid, - int32_t user_context_index, void *user_context); - -/* - * Remote context initialization. - * Returns operation status - */ -int opal_common_ofacm_base_remote_context_init - (opal_common_ofacm_base_remote_connection_context_t *context, - int num_qps, int num_srqs); - -/* Find OFACM proc on specific component */ -opal_common_ofacm_base_proc_t* opal_common_ofacm_base_find_proc - (opal_common_ofacm_base_component_t *component, opal_proc_t *proc); - -#if 0 -/* - * Allocate a CTS frag - */ -int opal_common_ofacm_base_alloc_cts( - struct mca_btl_base_endpoint_t *endpoint); - -/* - * Free a CTS frag - */ -int opal_common_ofacm_base_free_cts( - struct mca_btl_base_endpoint_t *endpoint); -#endif - -END_C_DECLS - -#endif diff --git a/opal/mca/common/ofacm/common_ofacm_base.c b/opal/mca/common/ofacm/common_ofacm_base.c deleted file mode 100644 index b3af7aa436d..00000000000 --- a/opal/mca/common/ofacm/common_ofacm_base.c +++ /dev/null @@ -1,699 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2007-2013 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2007-2012 Mellanox Technologies. All rights reserved. - * - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2013 Intel, Inc. All rights reserved - * Copyright (c) 2014 Research Organization for Information Science - * and Technology (RIST). All rights reserved. 
- * Copyright (c) 2015 Los Alamos National Security, LLC. All rights - * reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ -#include "opal_config.h" - -#include -#include "base.h" -#include "common_ofacm_oob.h" -#include "common_ofacm_empty.h" -#if HAVE_XRC -#include "common_ofacm_xoob.h" -#endif - -#include "opal/constants.h" -#include "opal/class/opal_list.h" -#include "opal/util/argv.h" -#include "opal/util/output.h" -#include "opal/util/show_help.h" -#include "infiniband/verbs.h" - -static opal_common_ofacm_base_component_t **available = NULL; -static int num_available = 0; -static char *opal_common_ofacm_cpc_include; -static char *opal_common_ofacm_cpc_exclude; - -/* Global variables */ -int opal_common_ofacm_base_verbose = 0; /* disabled by default */ -char* opal_common_ofacm_three_dim_torus = NULL; -bool cpc_explicitly_defined = false; -int opal_common_ofacm_base_output = 1; -bool opal_common_ofacm_base_register_was_called = false; -bool opal_common_ofacm_base_init_was_called = false; -/* - * Array of all possible connection functions - */ -static opal_common_ofacm_base_component_t *all[] = { - &opal_common_ofacm_oob, - - /* Always have an entry here so that the CP indexes will always be - the same: if XRC is not available, use the "empty" CPC */ -#if HAVE_XRC - &opal_common_ofacm_xoob, -#else - &opal_common_ofacm_empty, -#endif - NULL -}; - -static void ofacm_base_proc_contructor (opal_common_ofacm_base_proc_t *proc) -{ - proc->proc_opal = NULL; - OBJ_CONSTRUCT(&proc->all_contexts, opal_list_t); -} - -static void ofacm_base_proc_destructor (opal_common_ofacm_base_proc_t *proc) -{ - OBJ_DESTRUCT(&proc->all_contexts); -} - -void opal_common_ofacm_base_proc_setup(opal_common_ofacm_base_proc_t *proc, - opal_common_ofacm_base_local_connection_context_t *context, - opal_proc_t *proc_opal) -{ - if (NULL == proc->proc_opal) { - /* first init for the proc, lets set ompi proc */ - proc->proc_opal = proc_opal; - } - /* put the 
context on the proc list */ - opal_list_append(&proc->all_contexts, (opal_list_item_t *)context); -} - -OBJ_CLASS_INSTANCE(opal_common_ofacm_base_proc_t, - opal_list_item_t, - ofacm_base_proc_contructor, - ofacm_base_proc_destructor); - -/* Constructors / Destructors */ -static void base_local_context_constructor - (opal_common_ofacm_base_local_connection_context_t *context) -{ - context->proc = NULL; - context->state = MCA_COMMON_OFACM_CLOSED; - context->subnet_id = 0; - context->lid = 0; - context->num_of_qps = 0; - context->init_attr = NULL; - context->attr = NULL; - context->ib_pd = NULL; - context->qps = NULL; - context->user_context = NULL; - context->initiator = 0; - context->index = 0; - context->xrc_recv_qp_num = 0; - /* remote info we will allocate and fill during qp creation */ - memset(&context->remote_info, 0, sizeof(context->remote_info)); - OBJ_CONSTRUCT(&context->context_lock, opal_mutex_t); -} - -static void base_local_context_destructor - (opal_common_ofacm_base_local_connection_context_t *context) -{ - /* Release remote data arrays */ - if (NULL != context->remote_info.rem_qps) { - free(context->remote_info.rem_qps); - } - if (NULL != context->remote_info.rem_srqs) { - free(context->remote_info.rem_srqs); - } - OBJ_DESTRUCT(&context->context_lock); -} - -OBJ_CLASS_INSTANCE(opal_common_ofacm_base_local_connection_context_t, - opal_list_item_t, - base_local_context_constructor, - base_local_context_destructor); - -int opal_common_ofacm_base_context_init(opal_common_ofacm_base_local_connection_context_t *context, - opal_common_ofacm_base_module_t *cpc, - opal_common_ofacm_base_context_connect_cb_fn_t connect_cb, - opal_common_ofacm_base_context_error_cb_fn_t error_cb, - opal_common_ofacm_base_context_prepare_recv_cb_fn_t prepare_recv_cb, - opal_common_ofacm_base_proc_t *proc, - opal_common_ofacm_base_qp_config_t *qp_config, - struct ibv_pd *pd, uint64_t subnet_id, int cpc_type, - uint16_t lid, uint16_t rem_lid, - int32_t user_context_index, void 
*user_context) -{ - context->proc = proc; - context->cpc = cpc; - context->subnet_id = subnet_id; - context->cpc_type = cpc_type; - context->lid = lid; - context->rem_lid = rem_lid; - context->num_of_qps = qp_config->num_qps; - /* If upper layer defines the QPs we do not want to overwrite it */ - if (NULL == context->qps) { - context->qps = calloc(context->num_of_qps, sizeof(opal_common_ofacm_base_qp_t)); - if(NULL == context->qps) { - OFACM_ERROR(("Failed to allocate memory for qps")); - return OPAL_ERR_OUT_OF_RESOURCE; - } - } - - context->num_of_srqs = qp_config->num_srqs; - context->srq_num = qp_config->srq_num; - context->init_attr = qp_config->init_attr; - context->attr = qp_config->attr; - context->custom_init_attr_mask = qp_config->init_attr_mask; - context->custom_rtr_attr_mask = qp_config->rtr_attr_mask; - context->custom_rts_attr_mask = qp_config->rts_attr_mask; - context->ib_pd = pd; - context->connect_cb = connect_cb; - context->error_cb = error_cb; - context->prepare_recv_cb = prepare_recv_cb ; - context->index = user_context_index; - context->user_context = user_context; - return OPAL_SUCCESS; -} - -int opal_common_ofacm_base_remote_context_init(opal_common_ofacm_base_remote_connection_context_t *context, - int num_qps, int num_srqs) -{ - context->rem_qps = (opal_common_ofacm_base_rem_qp_info_t *) - calloc(num_qps, sizeof(opal_common_ofacm_base_rem_qp_info_t)); - if (NULL == context->rem_qps) { - return OPAL_ERROR; - } - - context->rem_srqs = (opal_common_ofacm_base_rem_srq_info_t *) - calloc(num_qps, sizeof(opal_common_ofacm_base_rem_srq_info_t)); - if (NULL == context->rem_srqs) { - return OPAL_ERROR; - } - - return OPAL_SUCCESS; -} - -opal_common_ofacm_base_proc_t* opal_common_ofacm_base_find_proc - (opal_common_ofacm_base_component_t *component, opal_proc_t *proc) -{ - opal_common_ofacm_base_proc_t *ret = NULL; - opal_list_item_t *item; - opal_list_t *list = &component->all_procs; - - for (item = opal_list_get_first(list); - item != 
opal_list_get_end(list); - item = opal_list_get_next(item)) { - if (proc == ((opal_common_ofacm_base_proc_t *)item)->proc_opal){ - ret = (opal_common_ofacm_base_proc_t *)item; - } - } - return ret; -} - -/* - * Register MCA parameters - */ -int opal_common_ofacm_base_register(mca_base_component_t *base) -{ - int i, j, save; - char **temp = NULL, *string = NULL, *all_cpc_names = NULL; - - if (opal_common_ofacm_base_register_was_called) { - return OPAL_SUCCESS; - } - - opal_common_ofacm_base_register_was_called = true; - - /* Make an MCA parameter to select which connect module to use */ - for (i = 0; NULL != all[i]; ++i) { - /* The CPC name "empty" is reserved for "fake" CPC modules */ - if (0 != strcmp(all[i]->cbc_name, "empty")) { - opal_argv_append_nosize(&temp, all[i]->cbc_name); - } - } - all_cpc_names = opal_argv_join(temp, ','); - opal_argv_free(temp); - asprintf(&string, - "Method used to select OpenFabrics connections (valid values: %s)", - all_cpc_names); - - opal_common_ofacm_cpc_include = NULL; - (void) mca_base_component_var_register(base, "ofacm_cpc_include", string, - MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, - OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, - &opal_common_ofacm_cpc_include); - free(string); - - asprintf(&string, - "Method used to exclude OpenFabrics connections (valid values: %s)", - all_cpc_names); - opal_common_ofacm_cpc_exclude = NULL; - (void) mca_base_component_var_register(base, "ofacm_cpc_exclude", string, - MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, - OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, - &opal_common_ofacm_cpc_exclude); - free(string); - - /* Register the name of the file containing the fabric's Service Levels (SL) */ - opal_common_ofacm_three_dim_torus = NULL; - (void) mca_base_var_register("ompi", "common", "ofacm", "three_dim_torus", - "The name of the file contating Service Level (SL) data for 3D-Torus cluster", - MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, - OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, - 
&opal_common_ofacm_three_dim_torus); - - opal_common_ofacm_base_verbose = 0; - (void) mca_base_var_register("ompi", "common", "ofacm", "base_verbose", - "Verbosity level of the OFACM framework", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, - OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, - &opal_common_ofacm_base_verbose); - - - /* Parse the if_[in|ex]clude paramters to come up with a list of - CPCs that are available */ - available = calloc(1, sizeof(all)); - - /* If we have an "include" list, then find all those CPCs and put - them in available[] */ - if (NULL != opal_common_ofacm_cpc_include) { - cpc_explicitly_defined = true; - temp = opal_argv_split(opal_common_ofacm_cpc_include, ','); - for (save = j = 0; NULL != temp[j]; ++j) { - for (i = 0; NULL != all[i]; ++i) { - if (0 == strcmp(temp[j], all[i]->cbc_name)) { - OFACM_VERBOSE(("include: saving %s", all[i]->cbc_name)); - available[save++] = all[i]; - ++num_available; - break; - } - } - if (NULL == all[i]) { - opal_show_help("help-mpi-common-ofacm-base.txt", - "cpc name not found", true, - "include", opal_proc_local_get()->proc_hostname, - "include", opal_common_ofacm_cpc_include, temp[j], - all_cpc_names); - opal_argv_free(temp); - free(all_cpc_names); - return OPAL_ERR_NOT_FOUND; - } - } - opal_argv_free(temp); - } - - /* Otherwise, if we have an "exclude" list, take all the CPCs that - are not in that list and put them in available[] */ - else if (NULL != opal_common_ofacm_cpc_exclude) { - cpc_explicitly_defined = true; - temp = opal_argv_split(opal_common_ofacm_cpc_exclude, ','); - /* First: error check -- ensure that all the names are valid */ - for (j = 0; NULL != temp[j]; ++j) { - for (i = 0; NULL != all[i]; ++i) { - if (0 == strcmp(temp[j], all[i]->cbc_name)) { - break; - } - } - if (NULL == all[i]) { - opal_show_help("help-mpi-common-ofacm-base.txt", - "cpc name not found", true, - "exclude", opal_proc_local_get()->proc_hostname, - "exclude", opal_common_ofacm_cpc_exclude, temp[j], - all_cpc_names); - 
opal_argv_free(temp); - free(all_cpc_names); - return OPAL_ERR_NOT_FOUND; - } - } - - /* Now do the exclude */ - for (save = i = 0; NULL != all[i]; ++i) { - for (j = 0; NULL != temp[j]; ++j) { - if (0 == strcmp(temp[j], all[i]->cbc_name)) { - break; - } - } - if (NULL == temp[j]) { - OFACM_VERBOSE(("exclude: saving %s", all[i]->cbc_name)); - available[save++] = all[i]; - ++num_available; - } - } - opal_argv_free(temp); - } - - /* If there's no include/exclude list, copy all[] into available[] */ - else { - OFACM_VERBOSE(("no include or exclude: saving all")); - memcpy(available, all, sizeof(all)); - num_available = (sizeof(all) / - sizeof(opal_common_ofacm_base_module_t *)) - 1; - } - - /* Call the register function on all the CPCs so that they may - setup any MCA params specific to the connection type */ - for (i = 0; NULL != available[i]; ++i) { - if (NULL != available[i]->cbc_register) { - available[i]->cbc_register(); - } - } - - return OPAL_SUCCESS; -} - -/* - * Called once during openib BTL component initialization to allow CPC - * components to initialize. - */ -int opal_common_ofacm_base_init(void) -{ - int i, rc; - - if (opal_common_ofacm_base_init_was_called) { - return OPAL_SUCCESS; - } - - opal_common_ofacm_base_init_was_called = true; - - /* Call each available CPC component's open function, if it has - one. If the CPC component open function returns OPAL_SUCCESS, - keep it. If it returns ERR_NOT_SUPPORTED, remove it from the - available[] array. If it returns something else, return that - error upward. 
*/ - for (i = num_available = 0; NULL != available[i]; ++i) { - if (NULL == available[i]->cbc_init) { - available[num_available++] = available[i]; - OFACM_VERBOSE(("found available cpc (NULL init): %s", - all[i]->cbc_name)); - continue; - } - - rc = available[i]->cbc_init(); - if (OPAL_SUCCESS == rc) { - available[num_available++] = available[i]; - OFACM_VERBOSE(("found available cpc (SUCCESS init): %s", - all[i]->cbc_name)); - continue; - } else if (OPAL_ERR_NOT_SUPPORTED == rc) { - continue; - } else { - return rc; - } - } - available[num_available] = NULL; - - return (num_available > 0) ? OPAL_SUCCESS : OPAL_ERR_NOT_AVAILABLE; -} - - -/* - * Find all the CPCs that are eligible for a single local port (i.e., - * openib module). - */ -int opal_common_ofacm_base_select_for_local_port(opal_common_ofacm_base_dev_desc_t *dev, - opal_common_ofacm_base_module_t ***cpcs, int *num_cpcs) -{ - char *msg = NULL; - int i, rc, cpc_index, len; - opal_common_ofacm_base_module_t **tmp_cpcs; - - tmp_cpcs = calloc(num_available, - sizeof(opal_common_ofacm_base_module_t *)); - if (NULL == tmp_cpcs) { - return OPAL_ERR_OUT_OF_RESOURCE; - } - - /* Go through all available CPCs and query them to see if they - want to run on this module. If they do, save them to a running - array. 
*/ - for (len = 1, i = 0; NULL != available[i]; ++i) { - len += strlen(available[i]->cbc_name) + 2; - } - msg = malloc(len); - if (NULL == msg) { - return OPAL_ERR_OUT_OF_RESOURCE; - } - msg[0] = '\0'; - for (cpc_index = i = 0; NULL != available[i]; ++i) { - if (i > 0) { - strcat(msg, ", "); - } - strcat(msg, available[i]->cbc_name); - - rc = available[i]->cbc_query(dev, &tmp_cpcs[cpc_index]); - if (OPAL_ERR_NOT_SUPPORTED == rc || OPAL_ERR_UNREACH == rc) { - continue; - } else if (OPAL_SUCCESS != rc) { - free(tmp_cpcs); - free(msg); - return rc; - } - OFACM_VERBOSE(("match cpc for local port: %s", - available[i]->cbc_name)); - - /* If the CPC wants to use the CTS protocol, check to ensure - that QP 0 is PP; if it's not, we can't use this CPC (or the - CTS protocol) */ - /* Pasha: Wrong place to check qp type, should be moved to CMs - if (cpcs[cpc_index]->cbm_uses_cts && - !BTL_OPENIB_QP_TYPE_PP(0)) { - OFACM_VERBOSE(("this CPC only supports when the first btl_openib_receive_queues QP is a PP QP")); - continue; - } - */ - - /* This CPC has indicated that it wants to run on this openib - BTL module. Woo hoo! */ - ++cpc_index; - } - - /* If we got an empty array, then no CPCs were eligible. Doh! */ - if (0 == cpc_index) { - opal_show_help("help-mpi-common-ofacm-base.txt", - "no cpcs for port", true, - opal_proc_local_get()->proc_hostname, - ibv_get_device_name(dev->ib_dev), - msg); - free(tmp_cpcs); - free(msg); - return OPAL_ERR_NOT_SUPPORTED; - } - free(msg); - - /* We got at least one eligible CPC; save the array into the - module's port_info */ - *num_cpcs = cpc_index; - *cpcs = tmp_cpcs; - - return OPAL_SUCCESS; -} - -/* - * This function is invoked when determining whether we have a CPC in - * common with a specific remote port. We already know that the - * subnet ID is the same between a specific local port and the target - * remote port; now we need to know if we can find a CPC in common - * between the two. 
- * - * If yes, be sure to find the *same* CPC on both sides. We know - * which CPCs are available on each side, and we know the priorities - * that were assigned on both sides. So find a CPC that is common to - * both sides and has the highest overall priority (between both - * sides). - * - * Return the matching CPC, or NULL if not found. - */ -int -opal_common_ofacm_base_find_match(opal_common_ofacm_base_module_t **local_cpcs, int num_local_cpcs, - opal_common_ofacm_base_module_data_t *remote_cpc_data, int remote_cpc_data_count, - opal_common_ofacm_base_module_t **ret_local_cpc, - opal_common_ofacm_base_module_data_t **ret_remote_cpc_data) -{ - int i, j, max = -1; - opal_common_ofacm_base_module_t *local_cpc, *local_selected = NULL; - opal_common_ofacm_base_module_data_t *local_cpcd, *remote_cpcd, - *remote_selected = NULL; - - /* Iterate over all the CPCs on the local module */ - for (i = 0; i < num_local_cpcs; ++i) { - local_cpc = local_cpcs[i]; - local_cpcd = &(local_cpc->data); - - /* Iterate over all the CPCs on the remote port */ - for (j = 0; j < remote_cpc_data_count; ++j) { - remote_cpcd = &(remote_cpc_data[j]); - - /* Are the components the same? */ - if (local_cpcd->cbm_component == remote_cpcd->cbm_component) { - /* If so, update the max priority found so far */ - if (max < local_cpcd->cbm_priority) { - max = local_cpcd->cbm_priority; - local_selected = local_cpc; - remote_selected = remote_cpcd; - } - if (max < remote_cpcd->cbm_priority) { - max = remote_cpcd->cbm_priority; - local_selected = local_cpc; - remote_selected = remote_cpcd; - } - } - } - } - - /* All done! 
*/ - if (NULL != local_selected) { - *ret_local_cpc = local_selected; - *ret_remote_cpc_data = remote_selected; - OFACM_VERBOSE(("find_match: found match!")); - return OPAL_SUCCESS; - } else { - OFACM_VERBOSE(("find_match: did NOT find match!")); - return OPAL_ERR_NOT_FOUND; - } -} - -/* - * Lookup a CPC component's index in the all[] array so that we can - * send it int the modex - */ -int opal_common_ofacm_base_get_cpc_index(opal_common_ofacm_base_component_t *cpc) -{ - int i; - for (i = 0; NULL != all[i]; ++i) { - if (all[i] == cpc) { - return i; - } - } - - /* Not found */ - return -1; -} - -/* - * Lookup a CPC by its index (received from the modex) - */ -opal_common_ofacm_base_component_t * -opal_common_ofacm_base_get_cpc_byindex(uint8_t index) -{ - return (index >= (sizeof(all) / - sizeof(opal_common_ofacm_base_module_t *))) ? - NULL : all[index]; -} - -/* - * This function we never call from BTL - so it is no reason to expose it - * in base. - */ -#if 0 -int opal_common_ofacm_base_alloc_cts(mca_btl_base_endpoint_t *endpoint) -{ - opal_free_list_item_t *fli; - int length = sizeof(mca_btl_openib_header_t) + - sizeof(mca_btl_openib_header_coalesced_t) + - sizeof(mca_btl_openib_control_header_t) + - sizeof(mca_btl_openib_footer_t) + - mca_btl_openib_component.qp_infos[mca_btl_openib_component.credits_qp].size; - - /* Explicitly don't use the mpool registration */ - fli = &(endpoint->endpoint_cts_frag.super.super.base.super); - fli->registration = NULL; - fli->ptr = malloc(length); - if (NULL == fli->ptr) { - BTL_ERROR(("malloc failed")); - return OPAL_ERR_OUT_OF_RESOURCE; - } - - endpoint->endpoint_cts_mr = - ibv_reg_mr(endpoint->endpoint_btl->device->ib_pd, - fli->ptr, length, - IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE | - IBV_ACCESS_REMOTE_READ); - OPAL_OUTPUT((-1, "registered memory %p, length %d", fli->ptr, length)); - if (NULL == endpoint->endpoint_cts_mr) { - free(fli->ptr); - BTL_ERROR(("Failed to reg mr!")); - return OPAL_ERR_OUT_OF_RESOURCE; - } 
- - /* Copy the lkey where it needs to go */ - endpoint->endpoint_cts_frag.super.sg_entry.lkey = - endpoint->endpoint_cts_frag.super.super.segment.seg_key.key32[0] = - endpoint->endpoint_cts_mr->lkey; - endpoint->endpoint_cts_frag.super.sg_entry.length = length; - - /* Construct the rest of the recv_frag_t */ - OBJ_CONSTRUCT(&(endpoint->endpoint_cts_frag), mca_btl_openib_recv_frag_t); - endpoint->endpoint_cts_frag.super.super.base.order = - mca_btl_openib_component.credits_qp; - endpoint->endpoint_cts_frag.super.endpoint = endpoint; - OPAL_OUTPUT((-1, "Got a CTS frag for peer %s, addr %p, length %d, lkey %d", - (NULL == endpoint->endpoint_proc->proc_opal->proc_hostname) ? - "unknown" : endpoint->endpoint_proc->proc_opal->proc_hostname, - (void*) endpoint->endpoint_cts_frag.super.sg_entry.addr, - endpoint->endpoint_cts_frag.super.sg_entry.length, - endpoint->endpoint_cts_frag.super.sg_entry.lkey)); - - return OPAL_SUCCESS; -} -#endif -/* This function is needed for CTS packet release on completion.. - * and it is bad idea...it is 2 possible solutions: - * - make the send operation blocking (simple and not optimal). - * - rdmacm should add own progress function (best but not trivial). - */ -#if 0 -int opal_common_ofacm_base_free_cts(mca_btl_base_endpoint_t *endpoint) -{ - if (NULL != endpoint->endpoint_cts_mr) { - ibv_dereg_mr(endpoint->endpoint_cts_mr); - endpoint->endpoint_cts_mr = NULL; - } - if (NULL != endpoint->endpoint_cts_frag.super.super.base.super.ptr) { - free(endpoint->endpoint_cts_frag.super.super.base.super.ptr); - endpoint->endpoint_cts_frag.super.super.base.super.ptr = NULL; - OPAL_OUTPUT((-1, "Freeing CTS frag")); - } - - return OPAL_SUCCESS; -} -#endif - -/* - * Called to start a connection - */ -int opal_common_ofacm_base_start_connect( - opal_common_ofacm_base_local_connection_context_t *context) -{ -#if 0 - /* If the CPC uses the CTS protocol, provide a frag buffer for the - CPC to post. 
Must allocate these frags up here in the main - thread because the FREE_LIST_WAIT is not thread safe. */ - if (cpc->cbm_uses_cts) { - int rc; - rc = opal_common_ofacm_base_alloc_cts(context); - if (OPAL_SUCCESS != rc) { - return rc; - } - } -#endif - - return context->cpc->cbm_start_connect(context); -} - -/* - * Called during openib btl component close - */ -void opal_common_ofacm_base_finalize(void) -{ - int i; - - if (NULL != available) { - for (i = 0; NULL != available[i]; ++i) { - if (NULL != available[i]->cbc_finalize) { - available[i]->cbc_finalize(); - } - } - free(available); - available = NULL; - } -} diff --git a/opal/mca/common/ofacm/common_ofacm_empty.c b/opal/mca/common/ofacm/common_ofacm_empty.c deleted file mode 100644 index 0af2ef0c9e2..00000000000 --- a/opal/mca/common/ofacm/common_ofacm_empty.c +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Copyright (c) 2008 Cisco Systems, Inc. All rights reserved. - * - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "opal_config.h" - -#include "base.h" -#include "connect.h" -#include "opal/constants.h" - -static void empty_component_register(void); -static int empty_component_init(void); -static int empty_component_query(opal_common_ofacm_base_dev_desc_t *dev, - opal_common_ofacm_base_module_t **cpc); - -opal_common_ofacm_base_component_t opal_common_ofacm_empty = { - "empty", - empty_component_register, - empty_component_init, - empty_component_query, - NULL -}; - -static void empty_component_register(void) -{ - /* Nothing to do */ -} - -static int empty_component_init(void) -{ - /* Never let this CPC run */ - return OPAL_ERR_NOT_SUPPORTED; -} - -static int empty_component_query(opal_common_ofacm_base_dev_desc_t *dev, - opal_common_ofacm_base_module_t **cpc) -{ - /* Never let this CPC run */ - return OPAL_ERR_NOT_SUPPORTED; -} diff --git a/opal/mca/common/ofacm/common_ofacm_empty.h b/opal/mca/common/ofacm/common_ofacm_empty.h deleted file mode 100644 index dbbc4a0585c..00000000000 --- a/opal/mca/common/ofacm/common_ofacm_empty.h +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright (c) 2007-2008 Cisco Systems, Inc. All rights reserved. - * - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef BTL_OPENIB_CONNECT_EMPTY_H -#define BTL_OPENIB_CONNECT_EMPTY_H - -#include "opal_config.h" - -#include "connect.h" - -extern opal_common_ofacm_base_component_t opal_common_ofacm_empty; - -#endif diff --git a/opal/mca/common/ofacm/common_ofacm_oob.c b/opal/mca/common/ofacm/common_ofacm_oob.c deleted file mode 100644 index 082dfdf231c..00000000000 --- a/opal/mca/common/ofacm/common_ofacm_oob.c +++ /dev/null @@ -1,1672 +0,0 @@ -/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ -/* - * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana - * University Research and Technology - * Corporation. All rights reserved. - * Copyright (c) 2004-2005 The University of Tennessee and The University - * of Tennessee Research Foundation. All rights - * reserved. - * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, - * University of Stuttgart. All rights reserved. - * Copyright (c) 2004-2005 The Regents of the University of California. - * All rights reserved. - * Copyright (c) 2006-2013 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2006-2014 Los Alamos National Security, LLC. All rights - * reserved. - * Copyright (c) 2008-2012 Mellanox Technologies. All rights reserved. - * - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2014 Research Organization for Information Science - * and Technology (RIST). All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "opal_config.h" - -#include "opal/runtime/opal_progress.h" -#include "opal/dss/dss.h" -#include "opal/util/alfg.h" -#include "opal/util/error.h" -#include "opal/util/output.h" -#include "opal/util/show_help.h" - -#include "ompi/mca/rte/rte.h" -#include "connect.h" -#include "base.h" -#include "opal/class/opal_hash_table.h" -#include "opal/class/opal_object.h" -#include "opal/constants.h" - -#include "opal_stdint.h" - -#define MAX_LINE_LEN 80 -#define NUM_OF_TOKENS 7 - - -typedef enum { - ENDPOINT_CONNECT_REQUEST, - ENDPOINT_CONNECT_RESPONSE, - ENDPOINT_CONNECT_ACK -} connect_message_type_t; - -typedef struct port_to_switch_lids{ - uint16_t port_lid; - uint16_t switch_lid; - struct port_to_switch_lids* next; -} port_to_switch_lids; - -typedef struct switch_to_switch_sl{ - uint16_t switch_lid; - uint8_t service_level; - struct switch_to_switch_sl* next; -} switch_to_switch_sl; - -static int oob_priority = 50; -static bool rml_recv_posted = false; -static opal_rng_buff_t rand_buff; - -static void oob_component_register(void); -static int oob_component_query(opal_common_ofacm_base_dev_desc_t *dev, - opal_common_ofacm_base_module_t **cpc); -static int oob_component_finalize(void); - -static int oob_module_start_connect(opal_common_ofacm_base_local_connection_context_t* context); -static int reply_start_connect(opal_common_ofacm_base_local_connection_context_t* context, - opal_common_ofacm_base_remote_connection_context_t *remote_info); -static int set_remote_info(opal_common_ofacm_base_local_connection_context_t *context, - opal_common_ofacm_base_remote_connection_context_t *remote_info); -static int qp_connect_all(opal_common_ofacm_base_local_connection_context_t* context); -static int qp_create_all(opal_common_ofacm_base_local_connection_context_t* context); -static int qp_create_one(opal_common_ofacm_base_local_connection_context_t* context, int qp); -static int 
send_connect_data(opal_common_ofacm_base_local_connection_context_t* context, - uint8_t message_type); -static opal_common_ofacm_base_local_connection_context_t* - oob_endpoint_init(opal_proc_t *proc, - opal_common_ofacm_base_qp_config_t *qp_config, - struct ibv_pd *pd, uint64_t subnet_id, int cpc_type, - uint16_t lid, uint16_t rem_lid, - int32_t user_context_index, void *user_context, - opal_common_ofacm_base_module_t *cpc, - opal_common_ofacm_base_context_connect_cb_fn_t connect_cb, - opal_common_ofacm_base_context_error_cb_fn_t error_cb, - opal_common_ofacm_base_context_prepare_recv_cb_fn_t prepare_recv_cb); -static int oob_endpoint_finalize(opal_common_ofacm_base_local_connection_context_t *context); - -static void report_error(opal_common_ofacm_base_local_connection_context_t* context); - -static void rml_send_cb(int status, opal_process_name_t* endpoint, - opal_buffer_t* buffer, ompi_rml_tag_t tag, - void* cbdata); -static void rml_recv_cb(int status, opal_process_name_t* process_name, - opal_buffer_t* buffer, ompi_rml_tag_t tag, - void* cbdata); - -/* Build service level hashtables per port */ -static int create_service_level_table_for_port(uint16_t lid, - opal_hash_table_t* port_to_switch_hash_table, - opal_hash_table_t* switch_to_switch_hash_table); - -/* Pick the service level of path between to endpoints */ -static int pick_service_level(uint16_t src_port_lid, uint16_t dst_port_lid, - uint8_t* service_level, - opal_hash_table_t* port_to_switch_hash_table, - opal_hash_table_t* switch_to_switch_hash_table); - -/* - * The "component" struct -- the top-level function pointers for the - * oob connection scheme. 
- */ -opal_common_ofacm_base_component_t opal_common_ofacm_oob = { - "oob", - /* Register */ - oob_component_register, - /* Init */ - NULL, - /* Query */ - oob_component_query, - /* Finalize */ - oob_component_finalize, -}; - -/* Open - this functions sets up any oob specific commandline params */ -static void oob_component_register(void) -{ - oob_priority = 50; - (void) mca_base_var_register("ompi", "common", "ofacm", "connect_oob_priority", - "The selection method priority for oob", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, - OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, - &oob_priority); -} - -/* - * Init function. Post non-blocking RML receive to accept incoming - * connection requests. - */ -static int oob_component_query(opal_common_ofacm_base_dev_desc_t *dev, - opal_common_ofacm_base_module_t **cpc) -{ - if (oob_priority > 100) { - oob_priority = 100; - } else if (oob_priority < -1) { - oob_priority = -1; - } - - /* If we have the transport_type member, check to ensure we're on - IB (this CPC will not work with iWarp). If we do not have the - transport_type member, then we must be < OFED v1.2, and - therefore we must be IB. */ -#if defined(HAVE_STRUCT_IBV_DEVICE_TRANSPORT_TYPE) - if (IBV_TRANSPORT_IB != dev->ib_dev->transport_type) { - OFACM_VERBOSE(("OFACM: oob CPC only supported on InfiniBand; skipped on device %s", - ibv_get_device_name(dev->ib_dev))); - return OPAL_ERR_NOT_SUPPORTED; - } -#endif - - if (dev->capabilities & OPAL_COMMON_OFACM_XRC_ONLY) { - OFACM_VERBOSE(("OFACM: oob CPC not supported with XRC receive queues, please try xoob CPC; skipped")); - return OPAL_ERR_NOT_SUPPORTED; - } - /* If this btl supports OOB, then post the RML message. But - ensure to only post it *once*, because another btl may have - come in before this and already posted it. 
*/ - if (!rml_recv_posted) { - ompi_rte_recv_buffer_nb(OMPI_NAME_WILDCARD, - OMPI_RML_TAG_OFACM, - OMPI_RML_PERSISTENT, - rml_recv_cb, - NULL); - rml_recv_posted = true; - } - - *cpc = malloc(sizeof(opal_common_ofacm_base_module_t)); - if (NULL == *cpc) { - ompi_rte_recv_cancel(OMPI_NAME_WILDCARD, OMPI_RML_TAG_OFACM); - rml_recv_posted = false; - OFACM_VERBOSE(("openib BTL: oob CPC system error (malloc failed)")); - return OPAL_ERR_OUT_OF_RESOURCE; - } - /* Init global list of all connection contexts */ - OBJ_CONSTRUCT(&opal_common_ofacm_oob.all_procs, opal_list_t); - (*cpc)->data.cbm_component = &opal_common_ofacm_oob; - (*cpc)->data.cbm_priority = oob_priority; - (*cpc)->data.cbm_modex_message = NULL; - (*cpc)->data.cbm_modex_message_len = 0; - - (*cpc)->cbm_endpoint_init = oob_endpoint_init; - (*cpc)->cbm_start_connect = oob_module_start_connect; - (*cpc)->cbm_endpoint_finalize = oob_endpoint_finalize; - (*cpc)->cbm_finalize = NULL; - (*cpc)->cbm_uses_cts = false; - - /* seed RNG */ - opal_srand(&rand_buff,(uint32_t) getpid()); - OFACM_VERBOSE(("openib BTL: oob CPC available for use on %s", - ibv_get_device_name(dev->ib_dev))); - return OPAL_SUCCESS; -} - -static opal_common_ofacm_base_proc_t* find_proc(opal_proc_t *proc) -{ - opal_common_ofacm_base_proc_t *ret = NULL; - opal_list_item_t *item; - opal_list_t *list = &opal_common_ofacm_oob.all_procs; - - for (item = opal_list_get_first(list); - item != opal_list_get_end(list); - item = opal_list_get_next(item)) { - if (proc == ((opal_common_ofacm_base_proc_t *)item)->proc_opal){ - ret = (opal_common_ofacm_base_proc_t *)item; - } - } - return ret; -} - -/* OOB connection context init */ -static opal_common_ofacm_base_local_connection_context_t* - oob_endpoint_init(opal_proc_t *proc, - opal_common_ofacm_base_qp_config_t *qp_config, - struct ibv_pd *pd, uint64_t subnet_id, int cpc_type, - uint16_t lid, uint16_t rem_lid, - int32_t user_context_index, void *user_context, - opal_common_ofacm_base_module_t *cpc, - 
opal_common_ofacm_base_context_connect_cb_fn_t connect_cb, - opal_common_ofacm_base_context_error_cb_fn_t error_cb, - opal_common_ofacm_base_context_prepare_recv_cb_fn_t prepare_recv_cb) -{ - int ret; - bool new_proc; - opal_common_ofacm_base_local_connection_context_t *context; - opal_common_ofacm_base_proc_t *context_proc; - - context = (opal_common_ofacm_base_local_connection_context_t*) - OBJ_NEW(opal_common_ofacm_base_local_connection_context_t); - context_proc = find_proc(proc); - - if (NULL == context_proc) { - new_proc = true; - /* constructing new proc */ - context_proc = (opal_common_ofacm_base_proc_t *) - OBJ_NEW(opal_common_ofacm_base_proc_t ); - } else { - new_proc = false; - OBJ_RETAIN(context_proc); - } - - opal_common_ofacm_base_proc_setup(context_proc, context, proc); - ret = opal_common_ofacm_base_context_init(context, cpc, connect_cb, error_cb, - prepare_recv_cb, context_proc, qp_config, - pd, subnet_id, cpc_type, lid, rem_lid, user_context_index, user_context); - if (OPAL_SUCCESS != ret) { - OBJ_DESTRUCT(context_proc); - OBJ_DESTRUCT(context); - return NULL; - } - - if (new_proc) { - opal_list_append(&opal_common_ofacm_oob.all_procs, (opal_list_item_t *)context_proc); - } - - return context; -} - -/* OOB connection context finalization */ -static int oob_endpoint_finalize - (opal_common_ofacm_base_local_connection_context_t *context) -{ - opal_list_item_t *proc_item, *cntx_item, *cntx_item_next; - bool found = false; - bool pfound = false; - int qp; - opal_list_t *proc_list = &opal_common_ofacm_oob.all_procs; - - /* Proc cleanup. We should find the context proc in all proc list and remove - * from the proc list our context. 
After it we try to release the proc context */ - for (proc_item = opal_list_get_first(proc_list); - proc_item != opal_list_get_end(proc_list); - proc_item = opal_list_get_next(proc_item)) { - if (context->proc == ((opal_common_ofacm_base_proc_t *)proc_item)){ - opal_common_ofacm_base_proc_t *proc = - (opal_common_ofacm_base_proc_t *)proc_item; - opal_list_t *cntx_list = &proc->all_contexts; - pfound = true; - - /* Remove the context from proc list */ - cntx_item = opal_list_get_first(cntx_list); - while(cntx_item != opal_list_get_end(cntx_list)) { - /* take the next before removing from the list */ - cntx_item_next = opal_list_get_next(cntx_item); - if (context == (opal_common_ofacm_base_local_connection_context_t *)cntx_item) { - found = true; - opal_list_remove_item(cntx_list, cntx_item); - } - cntx_item = cntx_item_next; - } - - /* Remove our proc from all list */ - if (opal_list_is_empty(cntx_list)) { - opal_list_remove_item(proc_list, (opal_list_item_t *)proc); - } - OBJ_RELEASE(proc); - } - } - - /* Release QPs */ - for (qp = 0; qp < context->num_of_qps; qp++) { - if(NULL != context->qps[qp].lcl_qp) { - if(ibv_destroy_qp(context->qps[qp].lcl_qp)) { - OFACM_ERROR(("Failed to destroy QP:%d\n", qp)); - } - } - } - - assert(true == found); - assert(true == pfound); - - /* We done with proc release and now we way destroy the context */ - OBJ_RELEASE(context); - - return OPAL_SUCCESS; -} - -/* - * Connect function. Start initiation of connections to a remote - * peer. We send our Queue Pair information over the RML/OOB - * communication mechanism. On completion of our send, a send - * completion handler is called. 
- */ -static int oob_module_start_connect(opal_common_ofacm_base_local_connection_context_t *context) -{ - int rc; - - if (OPAL_SUCCESS != (rc = qp_create_all(context))) { - return rc; - } - - /* Send connection info over to remote endpoint */ - context->state = MCA_COMMON_OFACM_CONNECTING; - if (OPAL_SUCCESS != - (rc = send_connect_data(context, ENDPOINT_CONNECT_REQUEST))) { - OFACM_ERROR(("error sending connect request, error code %d", rc)); - return rc; - } - - return OPAL_SUCCESS; -} - -/* - * Component finalize function. Cleanup RML non-blocking receive. - */ -static int oob_component_finalize(void) -{ - if (rml_recv_posted) { - ompi_rte_recv_cancel(OMPI_NAME_WILDCARD, OMPI_RML_TAG_OFACM); - rml_recv_posted = false; - } - - return OPAL_SUCCESS; -} - -/**************************************************************************/ - -/* - * Reply to a `start - connect' message - */ -static int reply_start_connect(opal_common_ofacm_base_local_connection_context_t* context, - opal_common_ofacm_base_remote_connection_context_t *remote_info) -{ - int rc; - - OFACM_VERBOSE(("Initialized QPs, LID = %d", context->lid)); - - /* Create local QP's and post receive resources */ - if (OPAL_SUCCESS != (rc = qp_create_all(context))) { - return rc; - } - - /* Set the remote side info */ - set_remote_info(context, remote_info); - - /* Connect to remote endpoint qp's */ - if (OPAL_SUCCESS != (rc = qp_connect_all(context))) { - return rc; - } - - /* Send connection info over to remote endpoint */ - context->state = MCA_COMMON_OFACM_CONNECT_ACK; - if (OPAL_SUCCESS != - (rc = send_connect_data(context, ENDPOINT_CONNECT_RESPONSE))) { - OFACM_ERROR(("error in endpoint send connect request error code is %d", - rc)); - return rc; - } - return OPAL_SUCCESS; -} - - -static int set_remote_info(opal_common_ofacm_base_local_connection_context_t *context, - opal_common_ofacm_base_remote_connection_context_t *remote_info) -{ - /* copy the remote_info stuff */ - memcpy(&context->remote_info, - 
remote_info, sizeof(opal_common_ofacm_base_remote_connection_context_t )); - - OFACM_VERBOSE(("Setting QP info, LID = %d", context->remote_info.rem_lid)); - return OPAL_SUCCESS; - -} - - -/* - * Connect the local ends of all qp's to the remote side - */ -static int qp_connect_all(opal_common_ofacm_base_local_connection_context_t* context) -{ - int i; - uint8_t service_level = 0; - uint32_t rtr_mask = 0, rts_mask = 0; - int rc = OPAL_SUCCESS; - - static bool is_hash_table_initialized = false; - static opal_hash_table_t switch_to_switch_hash_table; - static opal_hash_table_t port_to_switch_hash_table; - - - /* Create two hash tables for a given port in order to allow - * an efficient search of service level on any route exiting - * from it */ - if((NULL != opal_common_ofacm_three_dim_torus) && - (false == is_hash_table_initialized)){ - - rc = create_service_level_table_for_port(context->lid, &port_to_switch_hash_table, - &switch_to_switch_hash_table); - if(OPAL_SUCCESS != rc){ - /* Failed to create service table for port */ - return OPAL_ERROR; - } - is_hash_table_initialized = true; - } - - - /* Pick the Service Level of each route from the table */ - if(is_hash_table_initialized){ - rc = pick_service_level(context->lid, context->remote_info.rem_lid, &service_level, - &port_to_switch_hash_table, &switch_to_switch_hash_table); - if(OPAL_SUCCESS != rc){ - /* Failed to retrieve service level on the route */ - return OPAL_ERROR; - } - /*printf("Debug: qp_connect_all: lid %hu rem lid %hu num_qps %d SL %c\n", context->lid, - context->remote_info.rem_lid, context->num_of_qps, service_level);*/ - } - - - for (i = 0; i < context->num_of_qps; i++) { - struct ibv_qp_attr attr; - struct ibv_qp* qp = context->qps[i].lcl_qp; - enum ibv_mtu mtu = (context->attr[i].path_mtu < context->remote_info.rem_mtu) ? 
- context->attr[i].path_mtu : context->remote_info.rem_mtu; - - memset(&attr, 0, sizeof(attr)); - memcpy(&attr, context->attr, sizeof(struct ibv_qp_attr)); - attr.qp_state = IBV_QPS_RTR; - attr.path_mtu = mtu; - attr.dest_qp_num = context->remote_info.rem_qps[i].rem_qp_num; - attr.rq_psn = context->remote_info.rem_qps[i].rem_psn; - attr.ah_attr.dlid = context->remote_info.rem_lid; - - if(is_hash_table_initialized){ - attr.ah_attr.sl = service_level; - } - /* JMS to be filled in later dynamically */ - attr.ah_attr.static_rate = 0; - rtr_mask = IBV_QP_STATE | - IBV_QP_AV | - IBV_QP_PATH_MTU | - IBV_QP_DEST_QPN | - IBV_QP_RQ_PSN | - IBV_QP_MAX_DEST_RD_ATOMIC | - IBV_QP_MIN_RNR_TIMER; - - /* applying user specified rtr mask */ - if (NULL != context->custom_rtr_attr_mask) { - rtr_mask |= context->custom_rtr_attr_mask[i]; - } - - OFACM_VERBOSE(("Set MTU to IBV value %d (%s bytes)", mtu, - (mtu == IBV_MTU_256) ? "256" : - (mtu == IBV_MTU_512) ? "512" : - (mtu == IBV_MTU_1024) ? "1024" : - (mtu == IBV_MTU_2048) ? "2048" : - (mtu == IBV_MTU_4096) ? "4096" : - "unknown (!)")); - - if (ibv_modify_qp(qp, &attr, rtr_mask)) { - OFACM_ERROR(("Error modifing QP to RTR errno says %s", - strerror(errno))); - return OPAL_ERROR; - } - attr.qp_state = IBV_QPS_RTS; - /* On PP QPs we have SW flow control, no need for rnr retries. Setting - * it to zero helps to catch bugs */ - /* - attr.rnr_retry = BTL_OPENIB_QP_TYPE_PP(i) ? 
0 : - mca_btl_openib_component.ib_rnr_retry; - */ - attr.sq_psn = context->qps[i].lcl_psn; - rts_mask = IBV_QP_STATE | - IBV_QP_TIMEOUT | - IBV_QP_RETRY_CNT | - IBV_QP_RNR_RETRY | - IBV_QP_SQ_PSN | - IBV_QP_MAX_QP_RD_ATOMIC; - - /* applying user specified rts mask */ - if (NULL != context->custom_rts_attr_mask) { - rts_mask |= context->custom_rts_attr_mask[i]; - } - - if (ibv_modify_qp(qp, &attr, rts_mask)) { - OFACM_ERROR(("error modifying QP to RTS errno says %s", - strerror(errno))); - return OPAL_ERROR; - } - } - - return OPAL_SUCCESS; -} - - -/* - * Create the local side of all the qp's. The remote sides will be - * connected later. - */ -static int qp_create_all(opal_common_ofacm_base_local_connection_context_t* context) -{ - int qp, rc; - - for (qp = 0; qp < context->num_of_qps; ++qp) { - rc = qp_create_one(context, qp); - if (OPAL_SUCCESS != rc) { - return rc; - } - } - /* Now that all the qp's are created locally, post some receive - buffers, setup credits, etc. */ - return context->prepare_recv_cb(context->user_context); -} - -/* - * Create the local side of one qp. The remote side will be connected - * later. 
- */ -static int qp_create_one(opal_common_ofacm_base_local_connection_context_t *context, int qp) -{ - struct ibv_qp *my_qp; - struct ibv_qp_init_attr init_attr; - struct ibv_qp_attr attr; - size_t req_inline = context->init_attr[qp].cap.max_inline_data; - uint32_t init_mask = 0; - - /* Taking default init attributes from user */ - memcpy(&init_attr, &context->init_attr[qp], sizeof(init_attr)); - my_qp = ibv_create_qp(context->ib_pd, &init_attr); - - if (NULL == my_qp) { - OFACM_ERROR(("error creating qp errno says %s", strerror(errno))); - return OPAL_ERROR; - } - context->qps[qp].lcl_qp = my_qp; - - if (init_attr.cap.max_inline_data < req_inline) { - context->qps[qp].ib_inline_max = init_attr.cap.max_inline_data; - opal_show_help("help-mpi-common-ofacm-base.txt", - "inline truncated", true, opal_proc_local_get()->proc_hostname, - req_inline, init_attr.cap.max_inline_data); - } else { - context->qps[qp].ib_inline_max = req_inline; - } - - /* Taking default attributes from user */ - memcpy(&attr, &context->attr[qp], sizeof(attr)); - attr.qp_state = IBV_QPS_INIT; - attr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ; - init_mask = IBV_QP_STATE | - IBV_QP_PKEY_INDEX | - IBV_QP_PORT | - IBV_QP_ACCESS_FLAGS; - /* apply user specified init mask */ - if (NULL != context->custom_init_attr_mask) { - init_mask |= context->custom_init_attr_mask[qp]; - } - - if (ibv_modify_qp(context->qps[qp].lcl_qp, - &attr, init_mask)) { - OFACM_ERROR(("Error modifying qp to INIT errno says %s", strerror(errno))); - return OPAL_ERROR; - } - - /* Setup meta data on the endpoint */ - context->qps[qp].lcl_psn = opal_rand(&rand_buff) & 0xffffff; - - return OPAL_SUCCESS; -} - - -/* - * RML send connect information to remote endpoint - */ -static int send_connect_data(opal_common_ofacm_base_local_connection_context_t* context, - uint8_t message_type) -{ - opal_buffer_t* buffer = OBJ_NEW(opal_buffer_t); - int rc; - - if (NULL == buffer) { - 
OPAL_ERROR_LOG(OPAL_ERR_OUT_OF_RESOURCE); - return OPAL_ERR_OUT_OF_RESOURCE; - } - - /* pack the info in the send buffer */ - OFACM_VERBOSE(("packing %d of %d\n", 1, OPAL_UINT8)); - OFACM_VERBOSE(("type %d\n", message_type)); - rc = opal_dss.pack(buffer, &message_type, 1, OPAL_UINT8); - if (OPAL_SUCCESS != rc) { - OPAL_ERROR_LOG(rc); - return rc; - } - - OFACM_VERBOSE(("packing %d of %d\n", 1, OPAL_UINT64)); - rc = opal_dss.pack(buffer, &context->subnet_id, 1, OPAL_UINT64); - if (OPAL_SUCCESS != rc) { - OPAL_ERROR_LOG(rc); - return rc; - } - - if (message_type != ENDPOINT_CONNECT_REQUEST) { - /* send the QP connect request info we respond to */ - OFACM_VERBOSE(("packing %d of %d\n", 1, OPAL_UINT32)); - rc = opal_dss.pack(buffer, - &context->remote_info.rem_qps[0].rem_qp_num, 1, - OPAL_UINT32); - if (OPAL_SUCCESS != rc) { - OPAL_ERROR_LOG(rc); - return rc; - } - OFACM_VERBOSE(("packing %d of %d\n", 1, OPAL_UINT16)); - rc = opal_dss.pack(buffer, &context->remote_info.rem_lid, 1, OPAL_UINT16); - if (OPAL_SUCCESS != rc) { - OPAL_ERROR_LOG(rc); - return rc; - } - } - - if (message_type != ENDPOINT_CONNECT_ACK) { - int qp; - /* send CM type/family */ - OFACM_VERBOSE(("packing %d of %d\n", 1, OPAL_INT)); - rc = opal_dss.pack(buffer, &context->cpc_type, 1, OPAL_INT); - if (OPAL_SUCCESS != rc) { - OPAL_ERROR_LOG(rc); - return rc; - } - /* Pasha: Send number of qp here. We don't must to send number of QPs here, BUT - * recv side callback code is pretty complicated and I don't want to touch - * it now. So best work around on this stage is send another 1byte with number of - * qps. 
- */ - OFACM_VERBOSE(("packing %d of %d\n", 1, OPAL_UINT8)); - rc = opal_dss.pack(buffer, &context->num_of_qps, 1, OPAL_UINT8); - if (OPAL_SUCCESS != rc) { - OPAL_ERROR_LOG(rc); - return rc; - } - /* stuff all the QP info into the buffer */ - for (qp = 0; qp < context->num_of_qps; qp++) { - OFACM_VERBOSE(("packing %d of %d\n", 1, OPAL_UINT32)); - rc = opal_dss.pack(buffer, &context->qps[qp].lcl_qp->qp_num, - 1, OPAL_UINT32); - if (OPAL_SUCCESS != rc) { - OPAL_ERROR_LOG(rc); - return rc; - } - OFACM_VERBOSE(("packing %d of %d\n", 1, OPAL_UINT32)); - rc = opal_dss.pack(buffer, &context->qps[qp].lcl_psn, 1, - OPAL_UINT32); - if (OPAL_SUCCESS != rc) { - OPAL_ERROR_LOG(rc); - return rc; - } - } - - OFACM_VERBOSE(("packing %d of %d\n", 1, OPAL_UINT16)); - rc = opal_dss.pack(buffer, &context->lid, 1, OPAL_UINT16); - if (OPAL_SUCCESS != rc) { - OPAL_ERROR_LOG(rc); - return rc; - } - OFACM_VERBOSE(("packing %d of %d\n", 1, OPAL_UINT32)); - rc = opal_dss.pack(buffer, &context->attr[0].path_mtu, 1, - OPAL_UINT32); - if (OPAL_SUCCESS != rc) { - OPAL_ERROR_LOG(rc); - return rc; - } - OFACM_VERBOSE(("packing %d of %d\n", 1, OPAL_UINT32)); - rc = opal_dss.pack(buffer, &context->index, 1, OPAL_UINT32); - if (OPAL_SUCCESS != rc) { - OPAL_ERROR_LOG(rc); - return rc; - } - } - - /* send to remote endpoint */ - rc = ompi_rte_send_buffer_nb((orte_process_name_t*)&context->proc->proc_opal->proc_name, - buffer, OMPI_RML_TAG_OFACM, - rml_send_cb, NULL); - if (OPAL_SUCCESS != rc) { - OPAL_ERROR_LOG(rc); - return rc; - } - OFACM_VERBOSE(("Sent QP Info, LID = %d, SUBNET = %lx\n", - context->lid, - context->subnet_id)); - - return OPAL_SUCCESS; -} - -static void report_error(opal_common_ofacm_base_local_connection_context_t* context) -{ - if (NULL == context || NULL == context->error_cb) { - /* The context is undefined and we can not print specific error */ - opal_show_help("help-mpi-common-ofacm-oob.txt", - "ofacm oob fatal error", true, - opal_proc_local_get()->proc_hostname, - __FILE__, 
__LINE__); - exit(1); - } - - /* Other way, call to user error callback */ - context->error_cb(context->user_context); -} - -/* - * Callback when we have finished RML sending the connect data to a - * remote peer - */ -static void rml_send_cb(int status, opal_process_name_t* endpoint, - opal_buffer_t* buffer, ompi_rml_tag_t tag, - void* cbdata) -{ - OBJ_RELEASE(buffer); -} - - -/* - * Non blocking RML recv callback. Read incoming QP and other info, - * and if this endpoint is trying to connect, reply with our QP info, - * otherwise try to modify QP's and establish reliable connection - */ -static void rml_recv_cb(int status, opal_process_name_t* process_name, - opal_buffer_t* buffer, ompi_rml_tag_t tag, - void* cbdata) -{ - int context_state; - int rc; - uint32_t lcl_qp = 0; - uint16_t lcl_lid = 0; - int32_t cnt = 1; - opal_common_ofacm_base_remote_connection_context_t remote_info; - opal_common_ofacm_base_local_connection_context_t *l_context; - opal_common_ofacm_base_proc_t *proc; - uint8_t message_type, num_qps; - int cpc_type; - opal_list_t *procs_list = &opal_common_ofacm_oob.all_procs; - opal_list_t *context_list; - bool master; - - /* start by unpacking data first so we know who is knocking at - our door */ - OFACM_VERBOSE(("unpacking %d of %d\n", cnt, OPAL_UINT8)); - rc = opal_dss.unpack(buffer, &message_type, &cnt, OPAL_UINT8); - if (OPAL_SUCCESS != rc) { - OPAL_ERROR_LOG(rc); - report_error(NULL); - return; - } - - OFACM_VERBOSE(("unpacking %d of %d\n", cnt, OPAL_UINT64)); - rc = opal_dss.unpack(buffer, &remote_info.rem_subnet_id, &cnt, OPAL_UINT64); - if (OPAL_SUCCESS != rc) { - OPAL_ERROR_LOG(rc); - report_error(NULL); - return; - } - - if (ENDPOINT_CONNECT_REQUEST != message_type) { - OFACM_VERBOSE(("unpacking %d of %d\n", cnt, OPAL_UINT32)); - rc = opal_dss.unpack(buffer, &lcl_qp, &cnt, OPAL_UINT32); - if (OPAL_SUCCESS != rc) { - OPAL_ERROR_LOG(rc); - report_error(NULL); - return; - } - OFACM_VERBOSE(("unpacking %d of %d\n", cnt, OPAL_UINT16)); - rc = 
opal_dss.unpack(buffer, &lcl_lid, &cnt, OPAL_UINT16); - if (OPAL_SUCCESS != rc) { - OPAL_ERROR_LOG(rc); - report_error(NULL); - return; - } - } - - if (ENDPOINT_CONNECT_ACK != message_type) { - int qp; - - OFACM_VERBOSE(("unpacking %d of %d\n", cnt, OPAL_INT)); - rc = opal_dss.unpack(buffer, &cpc_type, &cnt, OPAL_INT); - if (OPAL_SUCCESS != rc) { - OPAL_ERROR_LOG(rc); - report_error(NULL); - return; - } - /* Pasha: Reading number of qps, in original code we tool it from - * btl component. In future we may change order of operations here. We may start - * lookup for connection descriptor after receiving subnet_id and lid. But in order - * to do it here I need totally to rewrite the recv callback...next time ;) - */ - OFACM_VERBOSE(("unpacking %d of %d\n", cnt, OPAL_UINT8)); - rc = opal_dss.unpack(buffer, &num_qps, &cnt, OPAL_UINT8); - if (OPAL_SUCCESS != rc) { - OPAL_ERROR_LOG(rc); - report_error(NULL); - return; - } - /* get ready for the data */ - opal_common_ofacm_base_remote_context_init(&remote_info, - num_qps, 0); - - /* unpack all the qp info */ - for (qp = 0; qp < num_qps; ++qp) { - OFACM_VERBOSE(("unpacking %d of %d\n", cnt, OPAL_UINT32)); - rc = opal_dss.unpack(buffer, &remote_info.rem_qps[qp].rem_qp_num, &cnt, - OPAL_UINT32); - if (OPAL_SUCCESS != rc) { - OPAL_ERROR_LOG(rc); - report_error(NULL); - return; - } - OFACM_VERBOSE(("unpacking %d of %d\n", cnt, OPAL_UINT32)); - rc = opal_dss.unpack(buffer, &remote_info.rem_qps[qp].rem_psn, &cnt, - OPAL_UINT32); - if (OPAL_SUCCESS != rc) { - OPAL_ERROR_LOG(rc); - report_error(NULL); - return; - } - } - - OFACM_VERBOSE(("unpacking %d of %d\n", cnt, OPAL_UINT16)); - rc = opal_dss.unpack(buffer, &remote_info.rem_lid, &cnt, OPAL_UINT16); - if (OPAL_SUCCESS != rc) { - OPAL_ERROR_LOG(rc); - report_error(NULL); - return; - } - - OFACM_VERBOSE(("unpacking %d of %d\n", cnt, OPAL_UINT32)); - rc = opal_dss.unpack(buffer, &remote_info.rem_mtu, &cnt, OPAL_UINT32); - if (OPAL_SUCCESS != rc) { - OPAL_ERROR_LOG(rc); - 
report_error(NULL); - return; - } - - OFACM_VERBOSE(("unpacking %d of %d\n", cnt, OPAL_UINT32)); - rc = opal_dss.unpack(buffer, &remote_info.rem_index, &cnt, OPAL_UINT32); - if (OPAL_SUCCESS != rc) { - OPAL_ERROR_LOG(rc); - report_error(NULL); - return; - } - } - - OFACM_VERBOSE(("Received QP Info, LID = %d, SUBNET = %lx, CPC_TYPE = %d", - remote_info.rem_lid, - remote_info.rem_subnet_id, - cpc_type)); - - master = ompi_rte_compare_name_fields(OMPI_RTE_CMP_ALL, OMPI_PROC_MY_NAME, - process_name) >= 0 ? true : false; - for (proc = (opal_common_ofacm_base_proc_t *)opal_list_get_first(procs_list); - proc != (opal_common_ofacm_base_proc_t *)opal_list_get_end(procs_list); - proc = (opal_common_ofacm_base_proc_t *)opal_list_get_next(proc)){ - bool found = false; - if (opal_compare_proc(proc->proc_opal->proc_name, - *process_name) != OPAL_EQUAL) { - continue; - } - context_list = &proc->all_contexts; - if (ENDPOINT_CONNECT_REQUEST != message_type) { - /* This is a reply message. Try to get the endpoint - instance the reply belongs to */ - for (l_context = (opal_common_ofacm_base_local_connection_context_t *)opal_list_get_first(context_list); - l_context != (opal_common_ofacm_base_local_connection_context_t *)opal_list_get_end(context_list); - l_context = (opal_common_ofacm_base_local_connection_context_t *)opal_list_get_next(l_context)) { - if (l_context->qps[0].lcl_qp != NULL && - lcl_lid == l_context->lid && - lcl_qp == l_context->qps[0].lcl_qp->qp_num && - remote_info.rem_subnet_id == l_context->subnet_id) { - found = true; - break; - } - } - } else { - /* This is new connection request. If this is master try - to find endpoint in a connecting state. 
If this is - slave try to find endpoint in closed state and - initiate connection back */ - opal_common_ofacm_base_local_connection_context_t *context_found = NULL; - for (l_context = (opal_common_ofacm_base_local_connection_context_t *)opal_list_get_first(context_list); - l_context != (opal_common_ofacm_base_local_connection_context_t *)opal_list_get_end(context_list); - l_context = (opal_common_ofacm_base_local_connection_context_t *)opal_list_get_next(l_context)) { - if (l_context->subnet_id != remote_info.rem_subnet_id || - l_context->cpc_type != cpc_type || - (l_context->state != MCA_COMMON_OFACM_CONNECTING - && l_context->state != MCA_COMMON_OFACM_CLOSED)) - continue; - found = true; - context_found = l_context; - if ((master && - MCA_COMMON_OFACM_CONNECTING == l_context->state) || - (!master && - MCA_COMMON_OFACM_CLOSED == l_context->state)) - break; /* Found one. No point to continue */ - } - l_context = context_found; - - /* if this is slave and there is no endpoints in closed - state then all connection are already in progress so - just ignore this connection request */ - if (found && !master && - MCA_COMMON_OFACM_CLOSED != l_context->state) { - return; - } - } - - if (!found) { - OFACM_ERROR(("can't find suitable endpoint for this peer\n")); - report_error(NULL); - return; - } - - OPAL_THREAD_LOCK(&l_context->context_lock); - context_state = l_context->state; - - /* Update status */ - switch (context_state) { - case MCA_COMMON_OFACM_CLOSED: - /* We had this connection closed before. The endpoint is - trying to connect. 
Move the status of this connection - to CONNECTING, and then reply with our QP - information */ - if (master) { - rc = reply_start_connect(l_context, &remote_info); - } else { - rc = oob_module_start_connect(l_context); - } - - if (OPAL_SUCCESS != rc) { - OFACM_ERROR(("error in endpoint reply start connect")); - report_error(l_context); - break; - } - - /* As long as we expect a message from the peer (in order - to setup the connection) let the event engine pool the - RML events. Note: we increment it once peer active - connection. */ - opal_progress_event_users_increment(); - break; - - case MCA_COMMON_OFACM_CONNECTING: - /* preparing remote info for this context */ - opal_common_ofacm_base_remote_context_init(&l_context->remote_info, - l_context->num_of_qps, 0); - /* need to check status here */ - set_remote_info(l_context, &remote_info); - if (OPAL_SUCCESS != (rc = qp_connect_all(l_context))) { - OFACM_ERROR(("endpoint connect error: %d", rc)); - report_error(l_context); - break; - } - - if (master) { - l_context->state = MCA_COMMON_OFACM_WAITING_ACK; - - /* Send him an ACK */ - send_connect_data(l_context, ENDPOINT_CONNECT_RESPONSE); - } else { - send_connect_data(l_context, ENDPOINT_CONNECT_ACK); - /* Tell main BTL that we're done */ - l_context->state = MCA_COMMON_OFACM_CONNECTED; - l_context->connect_cb(l_context->user_context); - } - break; - - case MCA_COMMON_OFACM_WAITING_ACK: - /* Tell main BTL that we're done */ - l_context->state = MCA_COMMON_OFACM_CONNECTED; - l_context->connect_cb(l_context->user_context); - break; - - case MCA_COMMON_OFACM_CONNECT_ACK: - send_connect_data(l_context, ENDPOINT_CONNECT_ACK); - /* Tell main BTL that we're done */ - l_context->state = MCA_COMMON_OFACM_CONNECTED; - l_context->connect_cb(l_context->user_context); - break; - - case MCA_COMMON_OFACM_CONNECTED: - break; - - default : - OFACM_ERROR(("Invalid endpoint state %d", context_state)); - report_error(l_context); - } - OPAL_THREAD_UNLOCK(&l_context->context_lock); - 
break; - } -} - -/* - * Get the service level on the route between - * source port LID and destination port LID. - * @Param src_port_lid - LID of the source port. - * @Param dst_port_lid - LID of destination port. - * @Param service_level - Returned value. - * The service level on the route between source port - * to destination port. - * @return - Error Code. Non Zero value on error. - */ -static int pick_service_level(uint16_t src_port_lid, uint16_t dst_port_lid, uint8_t* service_level, - opal_hash_table_t* port_to_switch_hash_table, opal_hash_table_t* switch_to_switch_hash_table) -{ - uint8_t* sl; - uint16_t* dst_switch_lid; - void* p_src_switch_lid = NULL; - void* p_dst_switch_lid = NULL; - void* p_service_level = NULL; - int rc = OPAL_SUCCESS; - - /* Get the switch LID connected tothe source HCA LID */ - rc = opal_hash_table_get_value_ptr(port_to_switch_hash_table, &src_port_lid, sizeof(uint16_t), &p_src_switch_lid); - if(OPAL_SUCCESS != rc){ - /* Could not find source port LID */ - rc = OPAL_ERROR; - return rc; - } - - - /* Get the switch LID connected to the destination HCA LID */ - rc = opal_hash_table_get_value_ptr(port_to_switch_hash_table, &dst_port_lid, sizeof(uint16_t), &p_dst_switch_lid); - if(OPAL_SUCCESS != rc){ - /* Could not find destination port LID */ - rc = OPAL_ERROR; - return rc; - } - dst_switch_lid = (uint16_t*)p_dst_switch_lid; - - - /* Get the service level of the route beween the source HCA LID and destination HCA LID */ - rc = opal_hash_table_get_value_ptr(switch_to_switch_hash_table, dst_switch_lid, sizeof(uint16_t), &p_service_level); - if(OPAL_SUCCESS != rc){ - /* Could not find destination switch LID in hashtable*/ - rc = OPAL_ERROR; - return rc; - } - sl = (uint8_t*)p_service_level; - *service_level = *sl; - - return rc; -} - - -/* - * Get the size of the port to switch hashtable from a file. - - * @Params fp - Descriptor of the input file. - * @Param hash_table_size - Pointer to the size of - * the port to switch hashtable. 
- * @param head - pointer to a linked list containing - * the pairs to be stored in the hashtable. - * @return - Error code. Non zero value for failure. - */ -static int get_port_to_switch_hashtable_data_from_file(FILE* fp, int* hash_table_size, port_to_switch_lids** head) -{ - int i; - char c; - int num_items; - int rc = OPAL_SUCCESS; - int ret = OPAL_SUCCESS; - - uint64_t guid; - uint16_t port_lid; - uint16_t switch_lid; - uint16_t mtu, rate, lmc; /* TODO: Check binary representation */ - int port_number; - - port_to_switch_lids* item = NULL; - port_to_switch_lids* p_head = *head; - port_to_switch_lids* p_next_item = NULL; - - char str[MAX_LINE_LEN] = "\0"; - char input_str[NUM_OF_TOKENS][MAX_LINE_LEN] = {"\0"}; - char expected_str[NUM_OF_TOKENS][MAX_LINE_LEN] = {"\0"}; - - - c = fgetc(fp); - fseek(fp, -1, SEEK_CUR); - - /* Init expected input strings */ - strcpy(expected_str[0], "Channel"); - strcpy(expected_str[1], "Adapter"); - strcpy(expected_str[2], "base"); - strcpy(expected_str[3], "LID"); - strcpy(expected_str[4], "LMC"); - strcpy(expected_str[5], "port"); - - /* Create list */ - p_head = (port_to_switch_lids*)calloc(1, sizeof(port_to_switch_lids)); - if(NULL == p_head){ - rc = OPAL_ERR_OUT_OF_RESOURCE; - return rc; - } - *head = p_head; - /* Pre-process the port-to-switch table */ - while(EOF != c) - { - ret = fscanf(fp, "%s %s %" PRIx64 " %c", input_str[0], input_str[1], &guid, &c); - ret += fscanf(fp, "%s %s %hx %c", input_str[2], input_str[3], &port_lid, &c); - ret += fscanf(fp, "%s %hu %c", input_str[4], &lmc, &c); - ret += fscanf(fp, "%s %s %d", input_str[6], input_str[5], &port_number); - - - if(14 != ret){ - rc = OPAL_ERR_FILE_READ_FAILURE; - return rc; - } - - for(i = 0; i < 6; i++) - { - /*if(strncmp(str, table_header, hash_table_header_size)){*/ - if(strcmp(input_str[i], expected_str[i])){ - /* Incorrect table header */ - rc = OPAL_ERROR; - return rc; - } - } - - c = fgetc(fp); - fgets(str, MAX_LINE_LEN, fp); - if(strncmp(str, "# LID : MTU : 
RATE", strlen(str) - 1)){ - /* Incorrect table header */ - rc = OPAL_ERROR; - return rc; - } - - c = fgetc(fp); - fseek(fp, -1, SEEK_CUR); - - - /* Read next line */ - fgets(str, MAX_LINE_LEN, fp); - - /* Update the port to switch hashtable size if read valid data */ - num_items = sscanf(str, "%hx %c %hu %c %hu", &switch_lid, &c, &mtu, &c, &rate); - if(5 == num_items){ - (*hash_table_size)++; - } - else{ - /* Wrong file format */ - rc = OPAL_ERROR; - return rc; - } - /* Store port LID and switch LID */ - item = calloc(1, sizeof(port_to_switch_lids)); - if(NULL == item){ - rc = OPAL_ERR_OUT_OF_RESOURCE; - return rc; - } - item->port_lid = port_lid; - item->switch_lid = switch_lid; - - /* Insert the item to the head of the list */ - p_next_item = p_head->next; - p_head->next = item; - item->next = p_next_item; - - - /* Get Next char */ - c = fgetc(fp); - fseek(fp, -1, SEEK_CUR); - } - - return rc; - } - -/* - * Get from the input file the size of the - * switch-to-switch hashtable dedicated for - * the input switch LID. - - * @Params fp - Descriptor of the input file. - * @Param switch_lid - the source switch local ID (LID). - * @Param hash_table_size - Pointer to the hashtable size. - * Value returned by this routine. - * @Param head - pointer to a linked list containing the pairs - * to be stored in the hashtable. - * @return - Error code. Non zero value for failure. 
- */ -static int get_switch_to_switch_hashtable_size_from_file(FILE* fp, uint16_t switch_lid, int* hash_table_size, switch_to_switch_sl** head) -{ - int i; - char c; - int num_items; - - int port; - uint64_t guid; - uint16_t source_lid; - uint16_t dest_lid; - - int rc = OPAL_SUCCESS; - int ret = OPAL_SUCCESS; - uint8_t service_level; - - switch_to_switch_sl* item = NULL; - switch_to_switch_sl* p_head = NULL; - switch_to_switch_sl* p_next_item = NULL; - - int table_offset = 0; - int offset_in_table = 0; - - char str[MAX_LINE_LEN] = "\0"; - char input_str[NUM_OF_TOKENS][MAX_LINE_LEN] = {"\0"}; - char expected_str[NUM_OF_TOKENS][MAX_LINE_LEN] = {"\0"}; - - - /* Init expected strings */ - strcpy(expected_str[0], "Switch"); - strcpy(expected_str[1], "base"); - strcpy(expected_str[2], "LID"); - strcpy(expected_str[3], "port"); - - - /* Allocate empty list */ - p_head = (switch_to_switch_sl*)calloc(1, sizeof(switch_to_switch_sl)); - if(NULL == p_head){ - rc = OPAL_ERR_OUT_OF_RESOURCE; - return rc; - } - *head = p_head; - - c = fgetc(fp); - fseek(fp, -1, SEEK_CUR); - - /* Read info */ - while(EOF != c){ - - /* Go over the switch-to-switch routing tables until the requested - * table dedicated for the input switch_lid is found */ - ret = fscanf(fp, "%s %" PRIx64 " %c", input_str[0], &guid, &c); - ret += fscanf(fp, "%s %s %hx %c", input_str[1], input_str[2], &source_lid, &c); - ret += fscanf(fp, "%s %s %d", input_str[4], input_str[3], &port); - c = fgetc(fp); - - if(10 != ret) - { - rc = OPAL_ERR_FILE_READ_FAILURE; - return rc; - } - - for(i = 0; i < 4; i++){ - /* Validate the table header correctness */ - if(strncmp(input_str[i], expected_str[i], strlen(input_str[i]))){ - /* Incorrect table header */ - rc = OPAL_ERROR; - return rc; - } - } - - /* Get next line acording to the currect structure of the file */ - fgets(str, MAX_LINE_LEN, fp); - if(strncmp(str, "# LID : SL : MTU : RATE", strlen(str) - 1)){ - rc = OPAL_ERROR; - return rc; - } - - /* Test if this is the requested 
table, - * dedicated for the input source switch lid */ - if(source_lid != switch_lid){ - /* Skip to next table */ - - while(EOF != c) - { - offset_in_table = ftell(fp); - fgets(str, MAX_LINE_LEN, fp); - if(!strncmp(str, "Switch", strlen("Switch"))){ - /* Found new table found - start over */ - fseek(fp, offset_in_table, SEEK_SET); - break; - } - /* Receive next charecter */ - c = fgetc(fp); - fseek(fp, -1, SEEK_CUR); - } - if(EOF == c){ - /* End-Of-File was met without - * finding the required routing table*/ - rc = OPAL_ERROR; - } - } - else{ - /* The right table was found */ - while(EOF != c){ - - fgets(str, MAX_LINE_LEN, fp); - - /* Test if a new table was found */ - if(!strncmp(str, "Switch", strlen("Switch"))){ - /* Quit the search - table was fully read */ - return rc; - } - /* Still in the required switch route table */ - else{ - /* Check correcness of the data and update table size */ - num_items = sscanf(str, "%hx %c %c", &dest_lid, &c, &service_level); - if(3 != num_items){ - /* Failed to read input data / wrong input formate */ - rc = OPAL_ERROR; - return rc; - } - (*hash_table_size)++; - - /* Add the data to the list*/ - item = (switch_to_switch_sl*)calloc(1, sizeof(switch_to_switch_sl)); - if(NULL == item){ - rc = OPAL_ERR_OUT_OF_RESOURCE; - return rc; - } - item->switch_lid = dest_lid; - item->service_level = service_level; - - p_next_item = p_head->next; - p_head->next = item; - item->next = p_next_item; - } - /* Get next charecter */ - c = fgetc(fp); - fseek(fp, -1, SEEK_CUR); - } - /* Set file descriptor to the beginning - * of the required table table */ - fseek(fp, table_offset, SEEK_SET); - } - } - return rc; -} - -/* - * Set port to switch hashtable according to data read from an input file. - * The hashtable Key is the port local ID (uint16_t). - * The hashtable Value is the local ID (uint16_t) of the switch connected to the port in the fabric. - * - * @Param hashtable - the hashtable to set. 
- * @Param hashtable_size - the number of hashtable elements. - * @Param head - Pointer to a linked list containing - * the pairs two be stored in the hashtable. - * @return - Error code. Non Zero value on error. - */ -static int set_port_to_switch_hash_table(opal_hash_table_t* hashtable, size_t hashtable_size, port_to_switch_lids** p_head) -{ - int ret; - uint16_t key; - uint16_t* value = NULL; - unsigned int i; - int rc = OPAL_SUCCESS; - - port_to_switch_lids* head = NULL; - port_to_switch_lids* p_item = NULL; - port_to_switch_lids* p_item_next = NULL; - - - if((NULL == p_head) || (NULL == *p_head)){ - rc = OPAL_ERROR; - return rc; - } - head = *p_head; - - for(i = 0; i < hashtable_size; i++){ - - /* Read pairs of port-lid and witch-lid from - * file and store them in the input hashtable */ - value = (uint16_t*)calloc(1, sizeof(uint16_t)); - if(NULL == value){ - rc = OPAL_ERR_OUT_OF_RESOURCE; - return rc; - } - - /* Get next pair to store */ - p_item = head->next; - if(NULL == p_item){ - rc = OPAL_ERROR; - return rc; - } - key = p_item->port_lid; - *value = p_item->switch_lid; - /* Remove item from list */ - p_item_next = p_item->next; - head->next = p_item_next; - free(p_item); - - /* Set the port to switch LIDS hashtable */ - ret = opal_hash_table_set_value_ptr(hashtable, &key, sizeof(uint16_t), (void*)value); - if(OPAL_SUCCESS != ret){ - OFACM_ERROR(("Failed to set port2switch hashtable\n")); - rc = OPAL_ERROR; - break; - } - } - - free(*p_head); - *p_head = NULL; - return rc; -} - -/* - * Set switch to switch hashtable according to data read from an input file. - * The hashtable Key is a switch local ID (uint16_t). - * The hashtable Value is the service level (uint8_t) of the route in the - * fabric between local switch LID (represented by key) and remote switch LID. - * - * @Param hashtable - The hashtable to set. - * @Param hashtable_size - The number of hashtable elements. 
- * @Param head - Pointer to a list of all the data - * pair to be inserted into the hashtable. - * @return - Error code. Non Zero value on error. - */ -static int set_switch_to_switch_hash_table(opal_hash_table_t* hashtable, size_t hashtable_size, switch_to_switch_sl** p_head) -{ - uint16_t key; /* switch lid */ - uint8_t* value = NULL; - unsigned int i; - int rc = OPAL_SUCCESS; - int ret = OPAL_SUCCESS; - - switch_to_switch_sl* head = NULL; - switch_to_switch_sl* item = NULL; - switch_to_switch_sl* p_next_item = NULL; - - - if((NULL == p_head) || (NULL == *p_head)){ - rc = OPAL_ERROR; - return rc; - } - head = *p_head; - - /* Read pairs of remote switch (LID) and - * route service level (SL) from file - * and store the in the input hashtable */ - for(i = 0; i < hashtable_size; i++) - { - - value = (uint8_t*)calloc(1, sizeof(uint8_t)); - if(NULL == value){ - rc = OPAL_ERR_OUT_OF_RESOURCE; - return rc; - } - - /* Get data from list */ - item = head->next; - if(NULL == item){ - rc = OPAL_ERROR; - return rc; - } - key = item->switch_lid; - *value = item->service_level; - - /* Remove data item from list */ - p_next_item = item->next; - head->next = p_next_item; - free(item); - - ret = opal_hash_table_set_value_ptr(hashtable, &key, sizeof(uint16_t), value); - if(OPAL_SUCCESS != ret){ - OFACM_ERROR(("Failed to set sw2sw hashtable\n")); - rc = OPAL_ERROR; - break; - } - } - - free(*p_head); - *p_head = NULL; - return rc; -} - -/* - * An efficient method that allows to find the service level of any - * any route from an input port to any other port in the fabric. - * - * Create two hashtables according to data read from an input file. - * The first table maps any port LID in the fabric to the LID of - * the switch it is connected to. - * The second table is dedicated to the switch LID to which the - * local port is connected. - * - * The table maps a remote switch LID to the service level - * of the route between the table's LID and this remote LID. 
- * - * @Param lid - the local ID of the port. - * @return - Error Code. Non Zero value in case of error. - */ -static int create_service_level_table_for_port(uint16_t lid, opal_hash_table_t* port_to_switch_hash_table, - opal_hash_table_t* switch_to_switch_hash_table) -{ - FILE* fp = NULL; - uint16_t* switch_lid; - void* p_switch_lid = NULL; - - int rc = OPAL_SUCCESS; - int ret = OPAL_SUCCESS; - - int file_name_len; - char* switch_to_sl = NULL; - - int port_to_switch_hash_table_size = 0; - int switch_to_switch_hash_table_size = 0; - - port_to_switch_lids* port_switch_lids = NULL; - switch_to_switch_sl* switch_sl = NULL; - - - - /* Open input configuration file */ - fp = fopen(opal_common_ofacm_three_dim_torus, "rt"); - if(NULL == fp){ - /* File Opening failed */ - fprintf(stderr, "Failed to open the input file for the fabric's service level\n"); - rc = OPAL_ERR_FILE_OPEN_FAILURE; - goto ERROR; - } - - /* Get port-to-switch hashtable size */ - rc = get_port_to_switch_hashtable_data_from_file(fp, &port_to_switch_hash_table_size, &port_switch_lids); - if(OPAL_SUCCESS != rc){ - goto ERROR; - } - fclose(fp); - fp = NULL; - - /* Build and initialize the port-to-swich hashtable */ - OBJ_CONSTRUCT(port_to_switch_hash_table, opal_hash_table_t); - opal_hash_table_init(port_to_switch_hash_table, port_to_switch_hash_table_size); - - /* Set the port-to-switch hashtable */ - rc = set_port_to_switch_hash_table(port_to_switch_hash_table, port_to_switch_hash_table_size, &port_switch_lids); - if(OPAL_SUCCESS != rc){ - goto ERROR; - } - - /* Get the LID of the switch connected to the port's LID */ - ret = opal_hash_table_get_value_ptr(port_to_switch_hash_table, &lid, sizeof(uint16_t), &p_switch_lid); - if(OPAL_SUCCESS != ret){ - rc = OPAL_ERROR; - goto ERROR; - } - - - /* Open the file containing the mapping from switch-to-switch route to service level */ - file_name_len = strlen(opal_common_ofacm_three_dim_torus); - switch_to_sl = (char*)calloc(file_name_len + 7, sizeof(char)); - 
if(NULL == switch_to_sl){ - rc = OPAL_ERR_OUT_OF_RESOURCE; - goto ERROR; - } - /* Build the switch-to-switch file name based on the port-to-switch file name */ - strncpy(switch_to_sl, opal_common_ofacm_three_dim_torus, - strlen(opal_common_ofacm_three_dim_torus) - strlen("peer-paths.dump") - 1); - strcat(switch_to_sl, "-sw2sw-path-records.dump"); - - /* Open path-to-SL file */ - fp = fopen(switch_to_sl, "rt"); - if(NULL == fp){ - /* File Opening failed */ - fprintf(stderr, "Failed to open the input file for the fabric's service level\n"); - rc = OPAL_ERR_FILE_OPEN_FAILURE; - goto ERROR; - } - free(switch_to_sl); - - switch_lid = (uint16_t*)p_switch_lid; - rc = get_switch_to_switch_hashtable_size_from_file(fp, *(uint16_t*)switch_lid, - &switch_to_switch_hash_table_size, &switch_sl); - if(OPAL_SUCCESS != rc){ - - goto ERROR; - } - fclose(fp); - fp = NULL; - - /* Build and initialize the switch-to-switch hashtable */ - OBJ_CONSTRUCT(switch_to_switch_hash_table, opal_hash_table_t); - opal_hash_table_init(switch_to_switch_hash_table, switch_to_switch_hash_table_size); - - /* Set the switch-to-switch hashtable */ - rc = set_switch_to_switch_hash_table(switch_to_switch_hash_table, - switch_to_switch_hash_table_size, &switch_sl); - if(OPAL_SUCCESS != rc){ - goto ERROR; - } - - - /* Use: opal_hash_table_get_value_uint64 */ - return OPAL_SUCCESS; -ERROR: - /* Close open files */ - if(NULL != fp){ - fclose(fp); - } - /* Release allocated resources */ - if(NULL != port_switch_lids){ - port_to_switch_lids* p_list = port_switch_lids; - port_to_switch_lids* p_item = NULL; - while(p_list->next != NULL){ - p_item = p_list->next; - if(NULL != p_item){ - p_list->next = p_item->next; - free(p_item); - } - } - free(p_list); - } - if(NULL != switch_sl){ - switch_to_switch_sl* p_list = switch_sl; - switch_to_switch_sl* p_item = NULL; - while(p_list->next != NULL){ - p_item = p_list->next; - if(NULL != p_item){ - p_list->next = p_item->next; - free(p_item); - } - } - free(p_list); - } - 
return rc; -} - diff --git a/opal/mca/common/ofacm/common_ofacm_oob.h b/opal/mca/common/ofacm/common_ofacm_oob.h deleted file mode 100644 index 4b4d9b42956..00000000000 --- a/opal/mca/common/ofacm/common_ofacm_oob.h +++ /dev/null @@ -1,20 +0,0 @@ -/* - * Copyright (c) 2007-2008 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. - * - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef COMMON_OFACM_OOB_H -#define COMMON_OFACM_OOB_H - -#include "connect.h" - -extern opal_common_ofacm_base_component_t opal_common_ofacm_oob; - -#endif diff --git a/opal/mca/common/ofacm/common_ofacm_xoob.c b/opal/mca/common/ofacm/common_ofacm_xoob.c deleted file mode 100644 index 0970a2da96d..00000000000 --- a/opal/mca/common/ofacm/common_ofacm_xoob.c +++ /dev/null @@ -1,1539 +0,0 @@ -/* - * Copyright (c) 2007-2012 Mellanox Technologies. All rights reserved. - * Copyright (c) 2008 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2012 Los Alamos National Security, LLC. - * All rights reserved. - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2013 NVIDIA Corporation. All rights reserved. - * Copyright (c) 2014 Research Organization for Information Science - * and Technology (RIST). All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#include "opal_config.h" - -#include "opal/runtime/opal_progress.h" -#include "opal/dss/dss.h" -#include "opal/util/alfg.h" -#include "opal/util/error.h" -#include "opal/util/output.h" -#include "opal/util/show_help.h" - -#include "ompi/mca/rte/rte.h" -#include "common_ofacm_xoob.h" -#include "opal/class/opal_hash_table.h" -#include "base.h" -#include "connect.h" -#include "opal/constants.h" - -#define SIZE_OF3(A, B, C) (sizeof(A) + sizeof(B) + sizeof(C)) -#define BASE_TO_XOOB(context) (opal_common_ofacm_xoob_local_connection_context_t *)context -#define XOOB_TO_BASE(xcontext) (opal_common_ofacm_base_local_connection_context_t *)xcontext - -static void xoob_component_register(void); -static int xoob_component_query(opal_common_ofacm_base_dev_desc_t *dev, - opal_common_ofacm_base_module_t **cpc); -static int xoob_component_finalize(void); - -static int xoob_module_start_connect - (opal_common_ofacm_base_local_connection_context_t *context); - -static void xoob_ib_address_constructor(ofacm_ib_address_t *ib_addr); -static void xoob_ib_address_destructor(ofacm_ib_address_t *ib_addr); - -OBJ_CLASS_INSTANCE(ofacm_ib_address_t, - opal_list_item_t, - xoob_ib_address_constructor, - xoob_ib_address_destructor); -/* - * The "component" struct -- the top-level function pointers for the - * xoob connection scheme. 
- */ -opal_common_ofacm_base_component_t opal_common_ofacm_xoob = { - "xoob", - /* Register */ - xoob_component_register, - /* Init */ - NULL, - /* Query */ - xoob_component_query, - /* Finalize */ - xoob_component_finalize, -}; - -typedef enum { - ENDPOINT_XOOB_CONNECT_REQUEST, - ENDPOINT_XOOB_CONNECT_RESPONSE, - ENDPOINT_XOOB_CONNECT_XRC_REQUEST, - ENDPOINT_XOOB_CONNECT_XRC_RESPONSE, - ENDPOINT_XOOB_CONNECT_XRC_NR_RESPONSE /* The xrc recv qp already was destroyed */ -} connect_message_type_t; - -static int xoob_priority = 60; -static bool rml_recv_posted = false; -static opal_rng_buff_t rand_buff; - -#define XOOB_SET_REMOTE_INFO(EP, INFO) \ -do { \ - /* copy the rem_info stuff */ \ - EP.rem_lid = INFO.rem_lid; \ - EP.rem_subnet_id = INFO.rem_subnet_id; \ - EP.rem_mtu = INFO.rem_mtu; \ - EP.rem_index = INFO.rem_index; \ - memcpy((void*)EP.rem_qps, (void*)INFO.rem_qps, \ - sizeof(mca_btl_openib_rem_qp_info_t)); \ - /* copy the rem_info stuff */ \ - memcpy((void*)EP.rem_srqs, (void*)INFO.rem_srqs, \ - sizeof(mca_btl_openib_rem_srq_info_t) * \ - mca_btl_openib_component.num_xrc_qps); \ -} while (0) - -/* Constructor destructor for xoob context. 
*/ -static void xoob_local_context_constructor - (opal_common_ofacm_xoob_local_connection_context_t *context) -{ - context->addr = NULL; - context->xrc_recv_psn = 0; -} - -static void xoob_local_context_destructor - (opal_common_ofacm_xoob_local_connection_context_t *context) -{ - if(NULL != context->addr) { - OBJ_RELEASE(context->addr); - } -} - -OBJ_CLASS_INSTANCE(opal_common_ofacm_xoob_local_connection_context_t, - opal_common_ofacm_base_local_connection_context_t, - xoob_local_context_constructor, - xoob_local_context_destructor); - -static void xoob_pending_context_constructor(pending_context_t *pcontext) -{ - pcontext->xcontext = NULL; -} - -static void xoob_pending_context_destructor(pending_context_t *pcontext) -{ - /* I have nothing to do !*/ -} - -static void xoob_pending_context_init(pending_context_t *pcontext, - opal_common_ofacm_xoob_local_connection_context_t *xcontext) -{ - pcontext->xcontext = xcontext; -} - -OBJ_CLASS_INSTANCE(pending_context_t, - opal_list_item_t, - xoob_pending_context_constructor, - xoob_pending_context_destructor); - -static void xoob_ib_address_constructor(ofacm_ib_address_t *ib_addr) -{ - ib_addr->key = NULL; - ib_addr->subnet_id = 0; - ib_addr->lid = 0; - ib_addr->status = XOOB_ADDR_CLOSED; - ib_addr->qps = NULL; - OBJ_CONSTRUCT(&ib_addr->addr_lock, opal_mutex_t); - OBJ_CONSTRUCT(&ib_addr->pending_contexts, opal_list_t); -} - -static void xoob_ib_address_destructor(ofacm_ib_address_t *ib_addr) -{ - if(NULL != ib_addr->qps && NULL != ib_addr->qps[0].lcl_qp) { - if(ibv_destroy_qp(ib_addr->qps[0].lcl_qp)) { - OFACM_ERROR(("Failed to destroy QP:%d\n", 0)); - } - } - if (NULL != ib_addr->key) { - free(ib_addr->key); - } - OBJ_DESTRUCT(&ib_addr->addr_lock); - OBJ_DESTRUCT(&ib_addr->pending_contexts); -} - -static int xoob_ib_address_init(ofacm_ib_address_t *ib_addr, uint16_t lid, uint64_t s_id, ompi_jobid_t ep_jobid) -{ - ib_addr->key = malloc(SIZE_OF3(s_id, lid, ep_jobid)); - if (NULL == ib_addr->key) { - OFACM_ERROR(("Failed to 
allocate memory for key\n")); - return OPAL_ERROR; - } - memset(ib_addr->key, 0, SIZE_OF3(s_id, lid, ep_jobid)); - /* creating the key = lid + s_id + ep_jobid */ - memcpy(ib_addr->key, &lid, sizeof(lid)); - memcpy((void*)((char*)ib_addr->key + sizeof(lid)), &s_id, sizeof(s_id)); - memcpy((void*)((char*)ib_addr->key + sizeof(lid) + sizeof(s_id)), - &ep_jobid, sizeof(ep_jobid)); - /* caching lid and subnet id */ - ib_addr->subnet_id = s_id; - ib_addr->lid = lid; - - return OPAL_SUCCESS; -} - -/* Create new entry in hash table for subnet_id and lid, - * update the context pointer. - * Before call to this function you need to protect with - */ -static ofacm_ib_address_t* xoob_ib_address_add_new (opal_common_ofacm_xoob_module_t *xcpc, - uint16_t lid, uint64_t s_id, ompi_jobid_t ep_jobid) -{ - void *tmp; - int ret; - struct ofacm_ib_address_t *ib_addr = OBJ_NEW(ofacm_ib_address_t); - - ret = xoob_ib_address_init(ib_addr, lid, s_id, ep_jobid); - if (OPAL_SUCCESS != ret ) { - OFACM_ERROR(("XRC Internal error. Failed to init ib_addr\n")); - OBJ_DESTRUCT(ib_addr); - return NULL; - } - /* is it already in the table ?*/ - if (OPAL_SUCCESS != opal_hash_table_get_value_ptr(&xcpc->ib_addr_table, - ib_addr->key, - SIZE_OF3(s_id, lid, ep_jobid), &tmp)) { - /* It is new one, lets put it on the table */ - ret = opal_hash_table_set_value_ptr(&xcpc->ib_addr_table, - ib_addr->key, SIZE_OF3(s_id, lid, ep_jobid), (void*)ib_addr); - if (OPAL_SUCCESS != ret) { - OFACM_ERROR(("XRC Internal error." 
- " Failed to add element to ib_addr_table\n")); - OBJ_DESTRUCT(ib_addr); - return NULL; - } - } else { - /* so we have this one in the table, just return the pointer */ - OBJ_DESTRUCT(ib_addr); - ib_addr = (ofacm_ib_address_t *)tmp; - OBJ_RETAIN(ib_addr); - assert(lid == ib_addr->lid && s_id == ib_addr->subnet_id); - } - - /* update the context with pointer to ib address */ - return ib_addr; -} - -static void xoob_connection_complete(opal_common_ofacm_xoob_local_connection_context_t *xcontext) -{ - bool master = false; - pending_context_t *pcon; - opal_common_ofacm_base_local_connection_context_t *con; - opal_common_ofacm_base_local_connection_context_t *context = - XOOB_TO_BASE(xcontext); - - OFACM_VERBOSE(("Now we are CONNECTED")); - OPAL_THREAD_LOCK(&xcontext->addr->addr_lock); - if (XOOB_ADDR_CONNECTED == xcontext->addr->status) { - /* We are not xrc master */ - /* set our qp pointer to master qp */ - master = false; - } else { - /* I'm master of XRC */ - xcontext->addr->status = XOOB_ADDR_CONNECTED; - master = true; - } - - /* The status was moved down to cpc */ - context->state = MCA_COMMON_OFACM_CONNECTED; - - while(master && !opal_list_is_empty(&xcontext->addr->pending_contexts)) { - pcon = (pending_context_t *)opal_list_remove_first(&xcontext->addr->pending_contexts); - con = XOOB_TO_BASE(pcon->xcontext); - OBJ_RELEASE(pcon); - if (OPAL_SUCCESS != - xoob_module_start_connect(con)) { - OFACM_ERROR(("Failed to connect pending endpoint\n")); - } - } - OPAL_THREAD_UNLOCK(&xcontext->addr->addr_lock); - - context->connect_cb(context->user_context); -} - -static int xoob_init_rem_info_alloc_qp(opal_common_ofacm_base_remote_connection_context_t *rem_info) -{ - rem_info->rem_qps = (opal_common_ofacm_base_rem_qp_info_t *) - malloc(sizeof(opal_common_ofacm_base_rem_qp_info_t)); - if (NULL == rem_info->rem_qps) { - OFACM_ERROR(("Failed to allocate memory for remote QP data\n")); - return OPAL_ERROR; - } - return OPAL_SUCCESS; -} - -static int 
xoob_init_rem_info_alloc_srq(opal_common_ofacm_base_remote_connection_context_t *rem_info, uint8_t num_srqs) -{ - rem_info->rem_srqs = (opal_common_ofacm_base_rem_srq_info_t*) - calloc(num_srqs, sizeof(opal_common_ofacm_base_rem_srq_info_t)); - if (NULL == rem_info->rem_srqs) { - OFACM_ERROR(("Failed to allocate memory for remote SRQ data\n")); - return OPAL_ERROR; - } - return OPAL_SUCCESS; -} - -/* Free remote information structs */ -static void xoob_free_rem_info(opal_common_ofacm_base_remote_connection_context_t *rem_info) -{ - if (NULL != rem_info->rem_qps) { - free(rem_info->rem_qps); - } - if (NULL != rem_info->rem_srqs) { - free(rem_info->rem_srqs); - } -} - -static int xoob_set_remote_info(opal_common_ofacm_xoob_local_connection_context_t *xcontext, - opal_common_ofacm_base_remote_connection_context_t *remote_info) -{ - opal_common_ofacm_base_local_connection_context_t *context = XOOB_TO_BASE(xcontext); - - /* If we got qp information - copy it */ - if (NULL != remote_info->rem_qps) { - xoob_init_rem_info_alloc_qp(&context->remote_info); - memcpy(context->remote_info.rem_qps, - remote_info->rem_qps, - sizeof(opal_common_ofacm_base_rem_qp_info_t)); - } - - if (NULL != remote_info->rem_srqs) { - xoob_init_rem_info_alloc_srq(&context->remote_info, context->num_of_srqs); - memcpy(context->remote_info.rem_srqs, remote_info->rem_srqs, - sizeof(opal_common_ofacm_base_rem_srq_info_t)*context->num_of_srqs); - } - - context->remote_info.rem_lid = remote_info->rem_lid; - context->remote_info.rem_subnet_id = remote_info->rem_subnet_id; - context->remote_info.rem_mtu = remote_info->rem_mtu; - context->remote_info.rem_index = remote_info->rem_index; - - OFACM_VERBOSE(("Setting QP info, LID = %d", context->remote_info.rem_lid)); - return OPAL_SUCCESS; - -} - -static void xoob_report_error(opal_common_ofacm_xoob_local_connection_context_t *xcontext) -{ - if (NULL == xcontext || NULL == (XOOB_TO_BASE(xcontext))->error_cb) { - /* The context is undefined and we can not 
print specific error */ - opal_show_help("help-mpi-common-ofacm-oob.txt", - "ofacm oob fatal error", true, - opal_proc_local_get()->proc_hostname, - __FILE__, __LINE__); - exit(1); - } - - /* Other way, call to user error callback */ - (XOOB_TO_BASE(xcontext))->error_cb((XOOB_TO_BASE(xcontext))->user_context); -} - -static int xoob_context_init(opal_common_ofacm_xoob_local_connection_context_t *xcontext, - opal_common_ofacm_xoob_module_t *xcpc, - opal_common_ofacm_base_context_connect_cb_fn_t connect_cb, - opal_common_ofacm_base_context_error_cb_fn_t error_cb, - opal_common_ofacm_base_context_prepare_recv_cb_fn_t prepare_recv_cb, - opal_common_ofacm_base_proc_t *proc, - opal_common_ofacm_base_qp_config_t *qp_config, - struct ibv_pd *pd, uint64_t subnet_id, int cpc_type, - uint16_t lid, uint16_t rem_lid, - int32_t user_context_index, void *user_context) -{ - int ret; - opal_common_ofacm_base_local_connection_context_t *context = - XOOB_TO_BASE(xcontext); - opal_common_ofacm_base_module_t *cpc = - (opal_common_ofacm_base_module_t *)xcpc; - - /* Set IB address for this context */ - xcontext->addr = xoob_ib_address_add_new(xcpc, rem_lid, subnet_id, proc->proc_opal->proc_name.jobid); - if (NULL == xcontext->addr) { - OFACM_ERROR(("Failed to allocate or found xoob ib address")); - return OPAL_ERROR; - } - - /* Allocate memory for QPs */ - if (NULL == xcontext->addr->qps) { - xcontext->addr->qps = - calloc(qp_config->num_qps, sizeof(opal_common_ofacm_base_qp_t)); - if(NULL == xcontext->addr->qps) { - OFACM_ERROR(("Failed to allocate memory for qps")); - return OPAL_ERR_OUT_OF_RESOURCE; - } - } - /* Update QP pointers */ - context->qps = xcontext->addr->qps; - - /* Init base context */ - ret = opal_common_ofacm_base_context_init(context, cpc, connect_cb, error_cb, - prepare_recv_cb, proc, qp_config, - pd, subnet_id, cpc_type, lid, rem_lid, user_context_index, user_context); - if (OPAL_SUCCESS != ret) { - return ret; - } - - return OPAL_SUCCESS; -} - -/* XOOB connection 
context init */ -static opal_common_ofacm_base_local_connection_context_t* - xoob_endpoint_init(opal_proc_t *proc, - opal_common_ofacm_base_qp_config_t *qp_config, - struct ibv_pd *pd, uint64_t subnet_id, int cpc_type, - uint16_t lid, uint16_t rem_lid, int32_t user_context_index, void *user_context, - opal_common_ofacm_base_module_t *cpc, - opal_common_ofacm_base_context_connect_cb_fn_t connect_cb, - opal_common_ofacm_base_context_error_cb_fn_t error_cb, - opal_common_ofacm_base_context_prepare_recv_cb_fn_t prepare_recv_cb) -{ - int ret; - bool new_proc; - opal_common_ofacm_xoob_local_connection_context_t *xcontext; - opal_common_ofacm_base_proc_t *context_proc; - opal_common_ofacm_xoob_module_t *xcpc = - (opal_common_ofacm_xoob_module_t *)cpc; - - xcontext = (opal_common_ofacm_xoob_local_connection_context_t*) - OBJ_NEW(opal_common_ofacm_xoob_local_connection_context_t); - context_proc = opal_common_ofacm_base_find_proc(&opal_common_ofacm_xoob, proc); - - if (NULL == context_proc) { - new_proc = true; - /* constructing new proc */ - context_proc = (opal_common_ofacm_base_proc_t *) - OBJ_NEW(opal_common_ofacm_base_proc_t ); - } else { - new_proc = false; - OBJ_RETAIN(context_proc); - } - - OFACM_VERBOSE(("Xoob endpoint init: cpc_type %d, rem_lid %d, my_lid %d, subnet id %d", - cpc_type, rem_lid, lid, subnet_id)); - - opal_common_ofacm_base_proc_setup(context_proc, XOOB_TO_BASE(xcontext), proc); - ret = xoob_context_init(xcontext, xcpc, connect_cb, error_cb, - prepare_recv_cb, context_proc, qp_config, - pd, subnet_id, cpc_type, lid, rem_lid, user_context_index, user_context); - if (OPAL_SUCCESS != ret) { - OBJ_DESTRUCT(context_proc); - OBJ_DESTRUCT(xcontext); - return NULL; - } - if(new_proc) { - opal_list_append(&opal_common_ofacm_xoob.all_procs, - (opal_list_item_t *)context_proc); - } - - return &xcontext->super; -} - -static int xoob_endpoint_finalize - (opal_common_ofacm_base_local_connection_context_t *context) -{ - opal_list_item_t *proc_item, *cntx_item, 
*cntx_item_next; - opal_list_t *proc_list = &opal_common_ofacm_xoob.all_procs; - opal_common_ofacm_xoob_local_connection_context_t *xcontext; - - /* Proc cleanup. We should find the context proc in all proc list and remove - * from the proc list our context. After it we try to release the proc context */ - for (proc_item = opal_list_get_first(proc_list); - proc_item != opal_list_get_end(proc_list); - proc_item = opal_list_get_next(proc_item)) { - if (context->proc == ((opal_common_ofacm_base_proc_t *)proc_item)){ - opal_common_ofacm_base_proc_t *proc = - (opal_common_ofacm_base_proc_t *)proc_item; - opal_list_t *cntx_list = &proc->all_contexts; - - /* Remove the context from proc list */ - cntx_item = opal_list_get_first(cntx_list); - while(cntx_item != opal_list_get_end(cntx_list)) { - /* take the next before removing from the list */ - cntx_item_next = opal_list_get_next(cntx_item); - if (context == (opal_common_ofacm_base_local_connection_context_t *)cntx_item) { - opal_list_remove_item(cntx_list, cntx_item); - } - cntx_item = cntx_item_next; - } - /* Remove our proc from all list */ - if (opal_list_is_empty(cntx_list)) { - opal_list_remove_item(proc_list, (opal_list_item_t *)proc); - } - OBJ_RELEASE(proc); - } - } - - if (0 != context->xrc_recv_qp_num) { - if(ibv_unreg_xrc_rcv_qp(context->init_attr[0].xrc_domain, - context->xrc_recv_qp_num)) { - OFACM_ERROR(("Failed to unregister XRC recv QP:%d\n", context->xrc_recv_qp_num)); - } - } - - xcontext = BASE_TO_XOOB(context); - - /* We done with proc release and now we way destroy the context */ - OBJ_DESTRUCT(xcontext); - - return OPAL_SUCCESS; -} - -/* - * Callback when we have finished RML sending the connect data to a - * remote peer - */ -static void xoob_rml_send_cb(int status, opal_process_name_t* context, - opal_buffer_t* buffer, ompi_rml_tag_t tag, - void* cbdata) -{ - OBJ_RELEASE(buffer); -} - -/* Receive connect information to remote context */ -static int 
xoob_receive_connect_data(opal_common_ofacm_base_remote_connection_context_t *info, uint16_t *lid, int *cpc_type, - uint8_t *message_type, opal_buffer_t* buffer) -{ - int cnt = 1, rc, srq; - uint8_t num_srqs; - - /* Recv standart header */ - OFACM_VERBOSE(("unpacking %d of %d\n", cnt, OPAL_UINT8)); - rc = opal_dss.unpack(buffer, message_type, &cnt, OPAL_UINT8); - if (OPAL_SUCCESS != rc) { - OPAL_ERROR_LOG(rc); - return OPAL_ERROR; - } - OFACM_VERBOSE(("Recv unpack Message type = %d", *message_type)); - - OFACM_VERBOSE(("unpacking %d of %d\n", cnt, OPAL_UINT64)); - rc = opal_dss.unpack(buffer, &info->rem_subnet_id, &cnt, OPAL_UINT64); - if (OPAL_SUCCESS != rc) { - OPAL_ERROR_LOG(rc); - return OPAL_ERROR; - } - OFACM_VERBOSE(("Recv unpack sid = %d", info->rem_subnet_id)); - - OFACM_VERBOSE(("unpacking %d of %d\n", cnt, OPAL_UINT16)); - rc = opal_dss.unpack(buffer, &info->rem_lid, &cnt, OPAL_UINT16); - if (OPAL_SUCCESS != rc) { - OPAL_ERROR_LOG(rc); - return OPAL_ERROR; - } - OFACM_VERBOSE(("Recv unpack lid = %d", info->rem_lid)); - - OFACM_VERBOSE(("unpacking %d of %d\n", cnt, OPAL_INT)); - rc = opal_dss.unpack(buffer, cpc_type, &cnt, OPAL_INT); - if (OPAL_SUCCESS != rc) { - OPAL_ERROR_LOG(rc); - return OPAL_ERROR; - } - OFACM_VERBOSE(("Recv unpack cpc_type = %d", *cpc_type)); - - /* Till now we got the standart header, now we continue to recieve data for - * different packet types - */ - if (ENDPOINT_XOOB_CONNECT_REQUEST == *message_type || - ENDPOINT_XOOB_CONNECT_RESPONSE == *message_type) { - OFACM_VERBOSE(("unpacking %d of %d\n", cnt, OPAL_UINT32)); - rc = opal_dss.unpack(buffer, &info->rem_qps->rem_qp_num, &cnt, - OPAL_UINT32); - if (OPAL_SUCCESS != rc) { - OPAL_ERROR_LOG(rc); - return OPAL_ERROR; - } - OFACM_VERBOSE(("Recv unpack remote qp = %x", info->rem_qps->rem_qp_num)); - - OFACM_VERBOSE(("unpacking %d of %d\n", cnt, OPAL_UINT32)); - rc = opal_dss.unpack(buffer, &info->rem_qps->rem_psn, &cnt, - OPAL_UINT32); - if (OPAL_SUCCESS != rc) { - 
OPAL_ERROR_LOG(rc); - return OPAL_ERROR; - } - OFACM_VERBOSE(("Recv unpack remote psn = %d", info->rem_qps->rem_psn)); - - OFACM_VERBOSE(("unpacking %d of %d\n", cnt, OPAL_UINT32)); - rc = opal_dss.unpack(buffer, &info->rem_mtu, &cnt, OPAL_UINT32); - if (OPAL_SUCCESS != rc) { - OPAL_ERROR_LOG(rc); - return OPAL_ERROR; - } - OFACM_VERBOSE(("Recv unpack remote mtu = %d", info->rem_mtu)); - } - - if (ENDPOINT_XOOB_CONNECT_REQUEST == *message_type || - ENDPOINT_XOOB_CONNECT_XRC_REQUEST == *message_type) { - /* unpack requested lid info */ - OFACM_VERBOSE(("unpacking %d of %d\n", cnt, OPAL_UINT16)); - rc = opal_dss.unpack(buffer, lid, &cnt, OPAL_UINT16); - if (OPAL_SUCCESS != rc) { - OPAL_ERROR_LOG(rc); - return OPAL_ERROR; - } - OFACM_VERBOSE(("Recv unpack requested lid = %d", *lid)); - } - - /* Unpack requested recv qp number */ - if (ENDPOINT_XOOB_CONNECT_XRC_REQUEST == *message_type) { - OFACM_VERBOSE(("unpacking %d of %d\n", cnt, OPAL_UINT32)); - /* In XRC request case we will use rem_qp_num as container for requested qp number */ - rc = opal_dss.unpack(buffer, &info->rem_qps->rem_qp_num, &cnt, - OPAL_UINT32); - if (OPAL_SUCCESS != rc) { - OPAL_ERROR_LOG(rc); - return rc; - } - OFACM_VERBOSE(("Recv unpack requested qp = %x", info->rem_qps->rem_qp_num)); - } - - if (ENDPOINT_XOOB_CONNECT_RESPONSE == *message_type || - ENDPOINT_XOOB_CONNECT_XRC_RESPONSE == *message_type) { - OFACM_VERBOSE(("unpacking %d of %d\n", cnt, OPAL_UINT32)); - rc = opal_dss.unpack(buffer, &info->rem_index, &cnt, OPAL_UINT32); - if (OPAL_SUCCESS != rc) { - OPAL_ERROR_LOG(rc); - return OPAL_ERROR; - } - OFACM_VERBOSE(("Recv unpack remote index = %d", info->rem_index)); - - OFACM_VERBOSE(("unpacking %d of %d\n", cnt, OPAL_UINT8)); - rc = opal_dss.unpack(buffer, &num_srqs, &cnt, OPAL_UINT8); - if (OPAL_SUCCESS != rc) { - OPAL_ERROR_LOG(rc); - return OPAL_ERROR; - } - OFACM_VERBOSE(("Recv unpack remote num of srqs = %d", num_srqs)); - - rc = xoob_init_rem_info_alloc_srq(info, num_srqs); - if 
(OPAL_SUCCESS != rc) { - return OPAL_ERROR; - } - for (srq = 0; srq < num_srqs; srq++) { - OFACM_VERBOSE(("unpacking %d of %d\n", cnt, OPAL_UINT8)); - rc = opal_dss.unpack(buffer, &info->rem_srqs[srq].rem_srq_num, &cnt, OPAL_UINT32); - if (OPAL_SUCCESS != rc) { - OPAL_ERROR_LOG(rc); - return OPAL_ERROR; - } - OFACM_VERBOSE(("Recv unpack remote index srq num[%d]= %d", srq, info->rem_srqs[srq].rem_srq_num)); - } - } - return OPAL_SUCCESS; -} - -/* - * send connect information to remote context - */ -static int xoob_send_connect_data(opal_common_ofacm_xoob_local_connection_context_t* xcontext, - uint8_t message_type) -{ - opal_buffer_t* buffer = OBJ_NEW(opal_buffer_t); - int rc, srq; - opal_common_ofacm_base_local_connection_context_t *context = XOOB_TO_BASE(xcontext); - - if (NULL == buffer) { - OPAL_ERROR_LOG(OPAL_ERR_OUT_OF_RESOURCE); - return OPAL_ERR_OUT_OF_RESOURCE; - } - - /* Bulding standart header that we use in all messages: - * - Message type, - * - Our subnet id - * - Our LID - */ - /* pack the info in the send buffer */ - OFACM_VERBOSE(("Send pack Message type = %d", message_type)); - OFACM_VERBOSE(("packing %d of %d\n", 1, OPAL_UINT8)); - rc = opal_dss.pack(buffer, &message_type, 1, OPAL_UINT8); - if (OPAL_SUCCESS != rc) { - OPAL_ERROR_LOG(rc); - return rc; - } - - OFACM_VERBOSE(("Send pack sid = %d", context->subnet_id)); - OFACM_VERBOSE(("packing %d of %d\n", 1, OPAL_UINT64)); - rc = opal_dss.pack(buffer, &context->subnet_id, 1, OPAL_UINT64); - if (OPAL_SUCCESS != rc) { - OPAL_ERROR_LOG(rc); - return rc; - } - - OFACM_VERBOSE(("Send pack lid = %d", context->lid)); - OFACM_VERBOSE(("packing %d of %d\n", 1, OPAL_UINT16)); - rc = opal_dss.pack(buffer, &context->lid, 1, OPAL_UINT16); - if (OPAL_SUCCESS != rc) { - OPAL_ERROR_LOG(rc); - return rc; - } - - OFACM_VERBOSE(("Send pack cpc type = %d", context->cpc_type)); - OFACM_VERBOSE(("packing %d of %d\n", 1, OPAL_INT)); - rc = opal_dss.pack(buffer, &context->cpc_type, 1, OPAL_INT); - if (OPAL_SUCCESS != rc) 
{ - OPAL_ERROR_LOG(rc); - return rc; - } - - /* Now we append to standart header additional information - * that is required for full (open qp,etc..) connect request and response: - * - qp_num of first qp - * - psn of first qp - * - MTU - */ - if (ENDPOINT_XOOB_CONNECT_REQUEST == message_type || - ENDPOINT_XOOB_CONNECT_RESPONSE == message_type) { - uint32_t psn, qp_num; - - if (ENDPOINT_XOOB_CONNECT_REQUEST == message_type) { - qp_num = context->qps[0].lcl_qp->qp_num; - psn = context->qps[0].lcl_psn; - } else { - qp_num = context->xrc_recv_qp_num; - psn = xcontext->xrc_recv_psn; - } - /* stuff all the QP info into the buffer */ - /* we need to send only one QP */ - OFACM_VERBOSE(("Send pack qp num = %x", qp_num)); - OFACM_VERBOSE(("packing %d of %d\n", 1, OPAL_UINT32)); - rc = opal_dss.pack(buffer, &qp_num, 1, OPAL_UINT32); - if (OPAL_SUCCESS != rc) { - OPAL_ERROR_LOG(rc); - return rc; - } - OFACM_VERBOSE(("Send pack lpsn = %d", psn)); - OFACM_VERBOSE(("packing %d of %d\n", 1, OPAL_UINT32)); - rc = opal_dss.pack(buffer, &psn, 1, OPAL_UINT32); - if (OPAL_SUCCESS != rc) { - OPAL_ERROR_LOG(rc); - return rc; - } - - OFACM_VERBOSE(("Send pack mtu = %d", context->attr[0].path_mtu)); - OFACM_VERBOSE(("packing %d of %d\n", 1, OPAL_UINT32)); - rc = opal_dss.pack(buffer, &context->attr[0].path_mtu, 1, - OPAL_UINT32); - if (OPAL_SUCCESS != rc) { - OPAL_ERROR_LOG(rc); - return rc; - } - } - - /* We append to header above additional information - * that is required for full & XRC connect request: - * - The lid ob btl on remote site that we want to connect - */ - if (ENDPOINT_XOOB_CONNECT_REQUEST == message_type || - ENDPOINT_XOOB_CONNECT_XRC_REQUEST == message_type) { - /* when we are sending request we add remote lid that we want to connect */ - - OFACM_VERBOSE(("Send pack remote lid = %d", context->rem_lid)); - OFACM_VERBOSE(("packing %d of %d\n", 1, OPAL_UINT16)); - rc = opal_dss.pack(buffer, &context->rem_lid, 1, OPAL_UINT16); - if (OPAL_SUCCESS != rc) { - 
OPAL_ERROR_LOG(rc); - return rc; - } - } - - /* when we are sending xrc request we add remote - * recv qp number that we want to connect. */ - if (ENDPOINT_XOOB_CONNECT_XRC_REQUEST == message_type) { - OFACM_VERBOSE(("Send pack remote qp = %x", xcontext->addr->remote_xrc_rcv_qp_num)); - OFACM_VERBOSE(("packing %d of %d\n", 1, OPAL_UINT32)); - rc = opal_dss.pack(buffer, &xcontext->addr->remote_xrc_rcv_qp_num, - 1, OPAL_UINT32); - if (OPAL_SUCCESS != rc) { - OPAL_ERROR_LOG(rc); - return rc; - } - } - /* We append to header above additional information - * that is required for full & XRC connect response: - * - index of our context - * - array of xrc-srq numbers - */ - if (ENDPOINT_XOOB_CONNECT_RESPONSE == message_type || - ENDPOINT_XOOB_CONNECT_XRC_RESPONSE == message_type) { - /* we need to send the context index for immidate send */ - OFACM_VERBOSE(("Send pack index = %d", context->index)); - OFACM_VERBOSE(("packing %d of %d\n", 1, OPAL_UINT32)); - rc = opal_dss.pack(buffer, &context->index, 1, OPAL_UINT32); - if (OPAL_SUCCESS != rc) { - OPAL_ERROR_LOG(rc); - return rc; - } - - OFACM_VERBOSE(("Send pack number of srqs = %d", context->num_of_srqs)); - OFACM_VERBOSE(("packing %d of %d\n", 1, OPAL_UINT8)); - rc = opal_dss.pack(buffer, &context->num_of_srqs, 1, OPAL_UINT8); - if (OPAL_SUCCESS != rc) { - OPAL_ERROR_LOG(rc); - return rc; - } - /* on response we add all SRQ numbers */ - for (srq = 0; srq < context->num_of_srqs; srq++) { - OFACM_VERBOSE(("Send pack srq[%d] num = %d", srq, context->srq_num[srq])); - OFACM_VERBOSE(("packing %d of %d\n", 1, OPAL_UINT32)); - rc = opal_dss.pack(buffer, &context->srq_num[srq], - 1, OPAL_UINT32); - if (OPAL_SUCCESS != rc) { - OPAL_ERROR_LOG(rc); - return rc; - } - } - } - - /* send to remote endpoint */ - rc = ompi_rte_send_buffer_nb(&context->proc->proc_opal->proc_name, - buffer, OMPI_RML_TAG_XOFACM, - xoob_rml_send_cb, NULL); - if (OPAL_SUCCESS != rc) { - OPAL_ERROR_LOG(rc); - return rc; - } - - OFACM_VERBOSE(("Send QP Info, 
LID = %d, SUBNET = %d, Message type = %d", - context->lid, - context->subnet_id, - message_type)); - - return OPAL_SUCCESS; -} - -/* Create XRC send qp */ -static int xoob_send_qp_create - (opal_common_ofacm_xoob_local_connection_context_t* xcontext) -{ - struct ibv_qp *qp; - struct ibv_qp_init_attr init_attr; - struct ibv_qp_attr attr; - int ret; - size_t req_inline; - uint32_t init_mask = 0; - opal_common_ofacm_base_local_connection_context_t *context = XOOB_TO_BASE(xcontext); - - /* Prepare QP structs */ - memcpy(&init_attr, &context->init_attr[0], sizeof(init_attr)); - req_inline = init_attr.cap.max_inline_data; - qp = ibv_create_qp(context->ib_pd, &init_attr); - if (NULL == qp) { - OFACM_ERROR(("Error creating QP, errno says: %s", strerror(errno))); - return OPAL_ERROR; - } - - context->qps[0].lcl_qp = qp; - - if (init_attr.cap.max_inline_data < req_inline) { - context->qps[0].ib_inline_max = init_attr.cap.max_inline_data; - opal_show_help("help-mpi-common-ofacm-base.txt", - "inline truncated", true, opal_proc_local_get()->proc_hostname, - req_inline, init_attr.cap.max_inline_data); - } else { - context->qps[0].ib_inline_max = req_inline; - } - - memcpy(&attr, &context->attr[0], sizeof(attr)); - attr.qp_state = IBV_QPS_INIT; - attr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ; - init_mask = IBV_QP_STATE | - IBV_QP_PKEY_INDEX | - IBV_QP_PORT | - IBV_QP_ACCESS_FLAGS; - - /* applying user specified init mask */ - if (NULL != context->custom_init_attr_mask) { - init_mask |= context->custom_init_attr_mask[0]; - } - - ret = ibv_modify_qp(qp, &attr, init_mask); - if (ret) { - OFACM_ERROR(("Error modifying QP[%x] to IBV_QPS_INIT errno says: %s [%d]", - qp->qp_num, strerror(ret), ret)); - return OPAL_ERROR; - } - - /* Setup meta data on the context */ - context->qps[0].lcl_psn = opal_rand(&rand_buff) & 0xffffff; - - /* Now that all the qp's are created locally, post some receive - buffers, setup credits, etc. 
*/ - return context->prepare_recv_cb(context->user_context); -} - -/* Send qp connect */ -static int xoob_send_qp_connect(opal_common_ofacm_xoob_local_connection_context_t *xcontext) -{ - struct ibv_qp* qp; - struct ibv_qp_attr attr; - uint32_t psn, rtr_mask = 0, rts_mask = 0; - int ret; - opal_common_ofacm_base_local_connection_context_t *context = XOOB_TO_BASE(xcontext); - enum ibv_mtu mtu = (context->attr[0].path_mtu < context->remote_info.rem_mtu) ? - context->attr[0].path_mtu : context->remote_info.rem_mtu; - - OFACM_VERBOSE(("Connecting Send QP\n")); - assert(NULL != context->qps); - qp = context->qps[0].lcl_qp; - psn = context->qps[0].lcl_psn; - - memset(&attr, 0, sizeof(attr)); - memcpy(&attr, context->attr, sizeof(struct ibv_qp_attr)); - attr.qp_state = IBV_QPS_RTR; - attr.path_mtu = mtu; - attr.dest_qp_num = context->remote_info.rem_qps[0].rem_qp_num; - attr.rq_psn = context->remote_info.rem_qps[0].rem_psn; - attr.ah_attr.dlid = context->remote_info.rem_lid; - attr.ah_attr.static_rate = 0; - rtr_mask = IBV_QP_STATE | - IBV_QP_AV | - IBV_QP_PATH_MTU | - IBV_QP_DEST_QPN | - IBV_QP_RQ_PSN | - IBV_QP_MAX_DEST_RD_ATOMIC | - IBV_QP_MIN_RNR_TIMER; - - /* applying user specified rtr mask */ - if (NULL != context->custom_rtr_attr_mask) { - rtr_mask |= context->custom_rtr_attr_mask[0]; - } - - OFACM_VERBOSE(("Set MTU to IBV value %d (%s bytes)", attr.path_mtu, - (attr.path_mtu == IBV_MTU_256) ? "256" : - (attr.path_mtu == IBV_MTU_512) ? "512" : - (attr.path_mtu == IBV_MTU_1024) ? "1024" : - (attr.path_mtu == IBV_MTU_2048) ? "2048" : - (attr.path_mtu == IBV_MTU_4096) ? 
"4096" : - "unknown (!)")); - - ret = ibv_modify_qp(qp, &attr, rtr_mask); - if (ret) { - OFACM_ERROR(("Error modifying QP[%x] to IBV_QPS_RTR errno says: %s [%d]", - qp->qp_num, strerror(ret), ret)); - return OPAL_ERROR; - } - - attr.qp_state = IBV_QPS_RTS; - attr.sq_psn = context->qps[0].lcl_psn; - /* applying user specified rts mask */ - rts_mask = IBV_QP_STATE | - IBV_QP_TIMEOUT | - IBV_QP_RETRY_CNT | - IBV_QP_RNR_RETRY | - IBV_QP_SQ_PSN | - IBV_QP_MAX_QP_RD_ATOMIC; - - /* applying user specified rts mask */ - - if (NULL != context->custom_rts_attr_mask) { - rts_mask |= context->custom_rts_attr_mask[0]; - } - - ret = ibv_modify_qp(qp, &attr, rts_mask); - if (ret) { - OFACM_ERROR(("Error modifying QP[%x] to IBV_QPS_RTS errno says: %s [%d]", - qp->qp_num, strerror(ret), ret)); - return OPAL_ERROR; - } - - return OPAL_SUCCESS; -} - -/* Recv qp create */ -static int xoob_recv_qp_create(opal_common_ofacm_xoob_local_connection_context_t *xcontext, - opal_common_ofacm_base_remote_connection_context_t *remote_info) -{ - struct ibv_qp_init_attr init_attr; - struct ibv_qp_attr attr; - int ret; - uint32_t init_mask = 0, rtr_mask = 0; - struct ibv_xrc_domain *xrc_domain; - opal_common_ofacm_base_local_connection_context_t *context = XOOB_TO_BASE(xcontext); - enum ibv_mtu mtu = (context->attr[0].path_mtu < remote_info->rem_mtu) ? 
- context->attr[0].path_mtu : remote_info->rem_mtu; - - OFACM_VERBOSE(("Connecting Recv QP\n")); - - memcpy(&init_attr, &context->init_attr[0], sizeof(init_attr)); - xrc_domain = init_attr.xrc_domain; - /* Only xrc_domain is required, all other are ignored */ - ret = ibv_create_xrc_rcv_qp(&init_attr, &context->xrc_recv_qp_num); - if (ret) { - OFACM_ERROR(("Error creating XRC recv QP[%x], errno says: %s [%d]", - context->xrc_recv_qp_num, strerror(ret), ret)); - return OPAL_ERROR; - } - - memcpy(&attr, &context->attr[0], sizeof(attr)); - attr.qp_state = IBV_QPS_INIT; - attr.qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ; - init_mask = IBV_QP_STATE | - IBV_QP_PKEY_INDEX | - IBV_QP_PORT | - IBV_QP_ACCESS_FLAGS; - - /* applying user specified init mask */ - if (NULL != context->custom_init_attr_mask) { - init_mask |= context->custom_init_attr_mask[0]; - } - - ret = ibv_modify_xrc_rcv_qp(xrc_domain, context->xrc_recv_qp_num, - &attr, init_mask); - if (ret) { - OFACM_ERROR(("Error modifying XRC recv QP[%x] to IBV_QPS_INIT, errno says: %s [%d]", - context->xrc_recv_qp_num, strerror(ret), ret)); - return OPAL_ERROR; - } - - memcpy(&attr, &context->attr[0], sizeof(attr)); - attr.qp_state = IBV_QPS_RTR; - attr.path_mtu = mtu; - attr.dest_qp_num = remote_info->rem_qps[0].rem_qp_num; - attr.rq_psn = remote_info->rem_qps[0].rem_psn; - attr.ah_attr.dlid = remote_info->rem_lid; - attr.ah_attr.static_rate = 0; - rtr_mask = IBV_QP_STATE | - IBV_QP_AV | - IBV_QP_PATH_MTU | - IBV_QP_DEST_QPN | - IBV_QP_RQ_PSN | - IBV_QP_MAX_DEST_RD_ATOMIC| - IBV_QP_MIN_RNR_TIMER; - - /* applying user specified rtr mask */ - if (NULL != context->custom_rtr_attr_mask) { - rtr_mask |= context->custom_rtr_attr_mask[0]; - } - - ret = ibv_modify_xrc_rcv_qp(xrc_domain, context->xrc_recv_qp_num, - &attr, rtr_mask); - if (ret) { - OFACM_ERROR(("Error modifying XRC recv QP[%x] to IBV_QPS_RTR, errno says: %s [%d]", - context->xrc_recv_qp_num, strerror(ret), ret)); - return OPAL_ERROR; - } - - 
return OPAL_SUCCESS; -} - -/* Recv qp connect */ -static int xoob_recv_qp_connect(opal_common_ofacm_xoob_local_connection_context_t *xcontext, - opal_common_ofacm_base_remote_connection_context_t *rem_info) -{ - int ret; - opal_common_ofacm_base_local_connection_context_t *context = XOOB_TO_BASE(xcontext); - - struct ibv_xrc_domain *xrc_domain = context->init_attr[0].xrc_domain; - - OFACM_VERBOSE(("Connecting Recv QP\n")); - ret = ibv_reg_xrc_rcv_qp(xrc_domain, rem_info->rem_qps->rem_qp_num); - if (ret) { /* failed to regester the qp, so it is already die and we should create new one */ - /* Return NOT READY !!!*/ - OFACM_ERROR(("Failed to register qp_num: %d , get error: %s (%d)\n. Replying with RNR", - rem_info->rem_qps->rem_qp_num, strerror(ret), ret)); - return OPAL_ERROR; - } else { - /* save the qp number for unregister */ - context->xrc_recv_qp_num = rem_info->rem_qps->rem_qp_num; - return OPAL_SUCCESS; - } -} - -/* - * Reply to a `start - connect' message - */ -static int xoob_reply_first_connect(opal_common_ofacm_xoob_local_connection_context_t *xcontext, - opal_common_ofacm_base_remote_connection_context_t *remote_info) -{ - int rc; - opal_common_ofacm_base_local_connection_context_t *context = - XOOB_TO_BASE(xcontext); - - OFACM_VERBOSE(("Initialized QPs, LID = %d", (XOOB_TO_BASE(xcontext))->lid)); - - /* Create local QP's and post receive resources */ - if (OPAL_SUCCESS != (rc = xoob_recv_qp_create(xcontext, remote_info))) { - return rc; - } - - /* prepost data on receiver site */ - if (OPAL_SUCCESS != (rc = context->prepare_recv_cb(context->user_context))) { - OFACM_ERROR(("Failed to post on XRC SRQs")); - xoob_report_error(xcontext); - return rc; - } - - if (OPAL_SUCCESS != - (rc = xoob_send_connect_data(xcontext, ENDPOINT_XOOB_CONNECT_RESPONSE))) { - OFACM_ERROR(("Error in send connect request error code is %d", - rc)); - return rc; - } - - return OPAL_SUCCESS; -} - -/* Find context for specific subnet/lid/message/cpc type */ -static 
opal_common_ofacm_xoob_local_connection_context_t* xoob_find_context - (opal_process_name_t* process_name, uint64_t subnet_id, - uint16_t lid, uint8_t message_type, int cpc_type) -{ - opal_common_ofacm_xoob_local_connection_context_t *xcontext = NULL; - opal_common_ofacm_base_proc_t *context_proc = NULL; - bool found = false; - opal_list_t *all_procs = - &opal_common_ofacm_xoob.all_procs; - - OFACM_VERBOSE(("Searching for ep and proc with follow parameters:" - "jobid %" PRIu32 ", vpid %" PRIu32 ", sid %d, lid %d, cpc type %d", - process_name->jobid, process_name->vpid, subnet_id, lid, cpc_type)); - /* find ibproc */ - for (context_proc = (opal_common_ofacm_base_proc_t*)opal_list_get_first(all_procs); - context_proc != (opal_common_ofacm_base_proc_t*)opal_list_get_end(all_procs); - context_proc = (opal_common_ofacm_base_proc_t*)opal_list_get_next(context_proc)) { - if (ompi_rte_compare_name_fields(OMPI_RTE_CMP_ALL, - &context_proc->proc_opal->proc_name, process_name) == OPAL_EQUAL) { - found = true; - break; - } - } - - /* we found our context_proc, lets find context now */ - if (found) { - opal_list_t *context_list = &context_proc->all_contexts; - opal_common_ofacm_base_local_connection_context_t *context; - for (context = (opal_common_ofacm_base_local_connection_context_t *) - opal_list_get_first(context_list); - context != (opal_common_ofacm_base_local_connection_context_t *) - opal_list_get_end(context_list); - context = (opal_common_ofacm_base_local_connection_context_t *) - opal_list_get_next(context)) { - /* we need to check different - * lid for different message type */ - if (ENDPOINT_XOOB_CONNECT_RESPONSE == message_type || - ENDPOINT_XOOB_CONNECT_XRC_RESPONSE == message_type) { - /* response message */ - if (context->subnet_id == subnet_id && - context->rem_lid == lid) { - xcontext = BASE_TO_XOOB(context); - break; /* Found one */ - } - } else { - /* request message */ - if (context->subnet_id == subnet_id && - context->lid == lid) { - xcontext = 
BASE_TO_XOOB(context); - break; /* Found one */ - } - } - } - if (NULL == xcontext) { - OFACM_ERROR(("can't find suitable context for this peer\n")); - } - } else { - OFACM_ERROR(("can't find suitable context for this peer\n")); - } - return xcontext; -} - -/* In case if XRC recv qp was closed and sender still don't know about it - * we need close the qp, reset the ib_adrr status to CLOSED and start everything - * from scratch. - */ -static void xoob_restart_connect - (opal_common_ofacm_xoob_local_connection_context_t *xcontext) -{ - opal_common_ofacm_base_local_connection_context_t *context = - XOOB_TO_BASE(xcontext); - OFACM_VERBOSE(("Restarting the connection for the context")); - OPAL_THREAD_LOCK(&xcontext->addr->addr_lock); - switch (xcontext->addr->status) { - case XOOB_ADDR_CONNECTED: - /* so we have the send qp, we just need the recive site. - * Send request for SRQ numbers */ - OFACM_VERBOSE(("Restart The IB addr: sid %d lid %d" - "in XOOB_ADDR_CONNECTED status," - " Changing to XOOB_ADDR_CLOSED and starting from scratch\n", - context->subnet_id, context->lid)); - /* Switching back to closed and starting from scratch */ - xcontext->addr->status = XOOB_ADDR_CLOSED; - /* destroy the qp */ - if(ibv_destroy_qp(context->qps[0].lcl_qp)) - OFACM_ERROR(("Failed to destroy QP")); - case XOOB_ADDR_CLOSED: - case XOOB_ADDR_CONNECTING: - OFACM_VERBOSE(("Restart The IB addr: sid %d lid %d" - "in XOOB_ADDR_CONNECTING or XOOB_ADDR_CLOSED status," - " starting from scratch\n", - context->subnet_id, context->lid)); - OPAL_THREAD_UNLOCK(&xcontext->addr->addr_lock); - /* xoob_module_start_connect() should automaticly handle all other cases */ - if (OPAL_SUCCESS != xoob_module_start_connect(XOOB_TO_BASE(xcontext))) - OFACM_ERROR(("Failed to restart connection from XOOB_ADDR_CONNECTING/CLOSED")); - break; - default : - OFACM_ERROR(("Invalid context status %d", xcontext->addr->status)); - OPAL_THREAD_UNLOCK(&xcontext->addr->addr_lock); - } -} - -/* - * Non blocking RML recv 
callback. Read incoming QP and other info, - * and if this endpoint is trying to connect, reply with our QP info, - * otherwise try to modify QP's and establish reliable connection - */ -static void xoob_rml_recv_cb(int status, opal_process_name_t* process_name, - opal_buffer_t* buffer, ompi_rml_tag_t tag, - void* cbdata) -{ - int rc; - uint8_t message_type; - uint16_t requested_lid = 0; - int cpc_type = -1; - opal_common_ofacm_base_local_connection_context_t *context; - opal_common_ofacm_xoob_local_connection_context_t *xcontext; - opal_common_ofacm_base_remote_connection_context_t remote_info; - - /* Init remote info */ - memset(&remote_info, 0, - sizeof(opal_common_ofacm_base_remote_connection_context_t)); - - if ( OPAL_SUCCESS != xoob_init_rem_info_alloc_qp(&remote_info)) { - return; - } - - /* Get data. */ - if ( OPAL_SUCCESS != - xoob_receive_connect_data(&remote_info, &requested_lid, &cpc_type, &message_type, buffer)) { - OFACM_ERROR(("Failed to read data\n")); - xoob_report_error(NULL); - return; - } - - /* Processing message */ - switch (message_type) { - case ENDPOINT_XOOB_CONNECT_REQUEST: - OFACM_VERBOSE(("Received ENDPOINT_XOOB_CONNECT_REQUEST: lid %d, sid %d, rlid %d\n", - remote_info.rem_lid, - remote_info.rem_subnet_id, - requested_lid)); - xcontext = xoob_find_context(process_name,remote_info.rem_subnet_id, - requested_lid, message_type, cpc_type); - if ( NULL == xcontext) { - OFACM_ERROR(("Got ENDPOINT_XOOB_CONNECT_REQUEST." 
- " Failed to find context with subnet %d and LID %d", - remote_info.rem_subnet_id, requested_lid)); - xoob_free_rem_info(&remote_info); - xoob_report_error(xcontext); - return; - } - context = XOOB_TO_BASE(xcontext); - OPAL_THREAD_LOCK(&context->context_lock); - /* we should create qp and send the info + srq to requestor */ - rc = xoob_reply_first_connect(xcontext, &remote_info); - if (OPAL_SUCCESS != rc) { - OFACM_ERROR(("error in context reply start connect")); - xoob_free_rem_info(&remote_info); - xoob_report_error(xcontext); - return; - } - /* enable pooling for this btl */ - OPAL_THREAD_UNLOCK(&context->context_lock); - break; - case ENDPOINT_XOOB_CONNECT_XRC_REQUEST: - OFACM_VERBOSE(("Received ENDPOINT_XOOB_CONNECT_XRC_REQUEST: lid %d, sid %d\n", - remote_info.rem_lid, - remote_info.rem_subnet_id)); - xcontext = xoob_find_context(process_name, remote_info.rem_subnet_id, - requested_lid, message_type, cpc_type); - if (NULL == xcontext) { - OFACM_ERROR(("Got ENDPOINT_XOOB_CONNECT_XRC_REQUEST." 
- " Failed to find context with subnet %d and LID %d", - remote_info.rem_subnet_id, requested_lid)); - xoob_free_rem_info(&remote_info); - xoob_report_error(xcontext); - return; - } - - context = XOOB_TO_BASE(xcontext); - - if (OPAL_SUCCESS == xoob_recv_qp_connect(xcontext, &remote_info)) { - if (OPAL_SUCCESS != context->prepare_recv_cb(context->user_context)) { - OFACM_ERROR(("Failed to post on XRC SRQs")); - xoob_free_rem_info(&remote_info); - xoob_report_error(xcontext); - return; - } - OPAL_THREAD_LOCK(&context->context_lock); - rc = xoob_send_connect_data(xcontext, ENDPOINT_XOOB_CONNECT_XRC_RESPONSE); - if (OPAL_SUCCESS != rc) { - OFACM_ERROR(("error in context reply start connect")); - xoob_free_rem_info(&remote_info); - xoob_report_error(xcontext); - return; - } - OPAL_THREAD_UNLOCK(&context->context_lock); - } else { - /* The XRC recv qp was destroyed */ - OPAL_THREAD_LOCK(&context->context_lock); - rc = xoob_send_connect_data(xcontext, ENDPOINT_XOOB_CONNECT_XRC_NR_RESPONSE); - if (OPAL_SUCCESS != rc) { - OFACM_ERROR(("error in context reply start connect")); - xoob_free_rem_info(&remote_info); - xoob_report_error(xcontext); - return; - } - OPAL_THREAD_UNLOCK(&context->context_lock); - } - break; - case ENDPOINT_XOOB_CONNECT_RESPONSE: - OFACM_VERBOSE(("Received ENDPOINT_XOOB_CONNECT_RESPONSE: lid %d, sid %d\n", - remote_info.rem_lid, - remote_info.rem_subnet_id)); - xcontext = xoob_find_context(process_name, remote_info.rem_subnet_id, - remote_info.rem_lid, message_type, cpc_type); - if (NULL == xcontext) { - OFACM_ERROR(("Got ENDPOINT_XOOB_CONNECT_RESPONSE." - " Failed to find context with subnet %d and LID %d", - remote_info.rem_subnet_id, remote_info.rem_lid)); - xoob_free_rem_info(&remote_info); - xoob_report_error(xcontext); - return; - } - - context = XOOB_TO_BASE(xcontext); - OPAL_THREAD_LOCK(&context->context_lock); - /* we got all the data srq. 
switch the context to connect mode */ - xoob_set_remote_info(xcontext, &remote_info); - /* update ib_addr with remote qp number */ - xcontext->addr->remote_xrc_rcv_qp_num = - remote_info.rem_qps->rem_qp_num; - OFACM_VERBOSE(("rem_info: lid %d, sid %d ep %d %d", - remote_info.rem_lid, - remote_info.rem_subnet_id, - context->remote_info.rem_lid, - context->remote_info.rem_subnet_id)); - if (OPAL_SUCCESS != xoob_send_qp_connect(xcontext)) { - OFACM_ERROR(("Failed to connect context\n")); - xoob_free_rem_info(&remote_info); - xoob_report_error(xcontext); - return; - } - xoob_connection_complete(xcontext); - OPAL_THREAD_UNLOCK(&context->context_lock); - break; - case ENDPOINT_XOOB_CONNECT_XRC_RESPONSE: - OFACM_VERBOSE(("Received ENDPOINT_XOOB_CONNECT_XRC_RESPONSE: lid %d, sid %d\n", - remote_info.rem_lid, - remote_info.rem_subnet_id)); - xcontext = xoob_find_context(process_name, remote_info.rem_subnet_id, - remote_info.rem_lid, message_type, cpc_type); - if ( NULL == xcontext) { - OFACM_ERROR(("Got ENDPOINT_XOOB_CONNECT_XRC_RESPONSE." - " Failed to find context with subnet %d and LID %d", - remote_info.rem_subnet_id, remote_info.rem_lid)); - xoob_report_error(xcontext); - return; - } - context = XOOB_TO_BASE(xcontext); - OPAL_THREAD_LOCK(&context->context_lock); - /* we got srq numbers on our request */ - xoob_set_remote_info(xcontext, &remote_info); - xoob_connection_complete(xcontext); - OPAL_THREAD_UNLOCK(&context->context_lock); - break; - case ENDPOINT_XOOB_CONNECT_XRC_NR_RESPONSE: - /* The XRC recv site already was destroyed so we need - * start to bringup the connection from scratch */ - OFACM_VERBOSE(("Received ENDPOINT_XOOB_CONNECT_XRC_NR_RESPONSE: lid %d, sid %d\n", - remote_info.rem_lid, - remote_info.rem_subnet_id)); - xcontext = xoob_find_context(process_name, remote_info.rem_subnet_id, - remote_info.rem_lid, message_type, cpc_type); - if ( NULL == xcontext) { - OFACM_ERROR(("Got ENDPOINT_XOOB_CONNECT_XRC_NR_RESPONSE." 
- " Failed to find context with subnet %d and LID %d", - remote_info.rem_subnet_id, remote_info.rem_lid)); - xoob_report_error(xcontext); - return; - } - xoob_restart_connect(xcontext); - break; - default : - OFACM_ERROR(("Invalid message type %d", message_type)); - } - - xoob_free_rem_info(&remote_info); -} - -/* - * XOOB interface functions - */ - -/* Quere for the XOOB priority - will be highest in XRC case */ -static int xoob_component_query(opal_common_ofacm_base_dev_desc_t *dev, - opal_common_ofacm_base_module_t **cpc) -{ - opal_common_ofacm_xoob_module_t *xcpc; /* xoob cpc module */ - opal_common_ofacm_base_module_t *bcpc; /* base cpc module */ - - if (xoob_priority > 100) { - xoob_priority = 100; - } else if (xoob_priority < -1) { - xoob_priority = -1; - } - - if (!(dev->capabilities & OPAL_COMMON_OFACM_XRC_ONLY)) { - OFACM_VERBOSE(("openib BTL: xoob CPC only supported with XRC receive queues; skipped on device %s", - ibv_get_device_name(dev->ib_dev))); - return OPAL_ERR_NOT_SUPPORTED; - } - - xcpc = malloc(sizeof(opal_common_ofacm_xoob_module_t)); - if (NULL == xcpc) { - OFACM_VERBOSE(("openib BTL: xoob CPC system error (malloc failed)")); - return OPAL_ERR_OUT_OF_RESOURCE; - } - - bcpc = &xcpc->super; - - /* If this btl supports XOOB, then post the RML message. But - ensure to only post it *once*, because another btl may have - come in before this and already posted it. 
*/ - if (!rml_recv_posted) { - ompi_rte_recv_buffer_nb(OMPI_NAME_WILDCARD, - OMPI_RML_TAG_XOFACM, - OMPI_RML_PERSISTENT, - xoob_rml_recv_cb, - NULL); - rml_recv_posted = true; - } - - OBJ_CONSTRUCT(&opal_common_ofacm_xoob.all_procs, opal_list_t); - bcpc->data.cbm_component = &opal_common_ofacm_xoob; - bcpc->data.cbm_priority = xoob_priority; - bcpc->data.cbm_modex_message = NULL; - bcpc->data.cbm_modex_message_len = 0; - - bcpc->cbm_endpoint_init = xoob_endpoint_init; - bcpc->cbm_start_connect = xoob_module_start_connect; - bcpc->cbm_endpoint_finalize = xoob_endpoint_finalize; - bcpc->cbm_finalize = NULL; - bcpc->cbm_uses_cts = false; - - /* seed RNG */ - opal_srand(&rand_buff,(uint32_t)(getpid())); - /* Build our hash table for subnetid-lid */ - OBJ_CONSTRUCT(&xcpc->ib_addr_table, opal_hash_table_t); - - assert(ompi_process_info.num_procs > 1); - if(NULL == xcpc->ib_addr_table.ht_table) { - if(OPAL_SUCCESS != opal_hash_table_init( - &xcpc->ib_addr_table, ompi_process_info.num_procs)) { - OFACM_ERROR(("XRC internal error. Failed to allocate ib_table")); - return OPAL_ERROR; - } - } - - *cpc = bcpc; - - OFACM_VERBOSE(("openib BTL: xoob CPC available for use on %s", - ibv_get_device_name(dev->ib_dev))); - - return OPAL_SUCCESS; -} - -/* Open - this functions sets up any xoob specific commandline params */ -static void xoob_component_register(void) -{ - xoob_priority = 60; - (void) mca_base_var_register("ompi", "common", "ofacm", "connect_xoob_priority", - "The selection method priority for xoob", - MCA_BASE_VAR_TYPE_INT, NULL, 0, 0, - OPAL_INFO_LVL_9, - MCA_BASE_VAR_SCOPE_READONLY, &xoob_priority); -} - -/* - * Connect function. Start initiation of connections to a remote - * peer. We send our Queue Pair information over the RML/OOB - * communication mechanism. On completion of our send, a send - * completion handler is called. 
- */ -static int xoob_module_start_connect - (opal_common_ofacm_base_local_connection_context_t *context) -{ - int rc = OPAL_SUCCESS; - opal_common_ofacm_xoob_local_connection_context_t *xcontext = - (opal_common_ofacm_xoob_local_connection_context_t *)context; - pending_context_t *pcontext; - - OPAL_THREAD_LOCK(&xcontext->addr->addr_lock); - switch (xcontext->addr->status) { - case XOOB_ADDR_CLOSED: - OFACM_VERBOSE(("The IB addr: sid %d lid %d" - "in XOOB_ADDR_CLOSED status," - " sending ENDPOINT_XOOB_CONNECT_REQUEST\n", - xcontext->addr->subnet_id, xcontext->addr->lid)); - if (OPAL_SUCCESS != (rc = xoob_send_qp_create(xcontext))) { - break; - } - - /* Send connection info over to remote endpoint */ - xcontext->super.state = MCA_COMMON_OFACM_CONNECTING; - xcontext->addr->status = XOOB_ADDR_CONNECTING; - if (OPAL_SUCCESS != - (rc = xoob_send_connect_data(xcontext, ENDPOINT_XOOB_CONNECT_REQUEST))) { - OFACM_ERROR(("Error sending connect request, error code %d", rc)); - } - break; - case XOOB_ADDR_CONNECTING: - OFACM_VERBOSE(("The IB addr: sid %d lid %d" - "in XOOB_ADDR_CONNECTING status," - " Subscribing to this address\n", - xcontext->addr->subnet_id, xcontext->addr->lid)); - pcontext = OBJ_NEW(pending_context_t); - xoob_pending_context_init(pcontext, xcontext); - /* some body already connectng to this machine, lets wait */ - opal_list_append(&xcontext->addr->pending_contexts, - (opal_list_item_t *)pcontext); - xcontext->super.state = MCA_COMMON_OFACM_CONNECTING; - break; - case XOOB_ADDR_CONNECTED: - /* so we have the send qp, we just need the recive site. 
- * Send request for SRQ numbers */ - OFACM_VERBOSE(("The IB addr: sid %d lid %d" - "in XOOB_ADDR_CONNECTED status," - " sending ENDPOINT_XOOB_CONNECT_XRC_REQUEST\n", - context->subnet_id, context->lid)); - xcontext->super.state = MCA_COMMON_OFACM_CONNECTING; - if (OPAL_SUCCESS != - (rc = xoob_send_connect_data(xcontext, ENDPOINT_XOOB_CONNECT_XRC_REQUEST))) { - OFACM_ERROR(("error sending xrc connect request, error code %d", rc)); - } - break; - default : - OFACM_ERROR(("Invalid context status %d", xcontext->addr->status)); - - } - OPAL_THREAD_UNLOCK(&xcontext->addr->addr_lock); - return rc; -} - - -/* - * Finalize function. Cleanup RML non-blocking receive. - */ -static int xoob_component_finalize(void) -{ - if (rml_recv_posted) { - ompi_rte_recv_cancel(OMPI_NAME_WILDCARD, OMPI_RML_TAG_XOFACM); - rml_recv_posted = false; - } - return OPAL_SUCCESS; -} diff --git a/opal/mca/common/ofacm/common_ofacm_xoob.h b/opal/mca/common/ofacm/common_ofacm_xoob.h deleted file mode 100644 index 51555a5ae8c..00000000000 --- a/opal/mca/common/ofacm/common_ofacm_xoob.h +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Copyright (c) 2007-2012 Mellanox Technologies. All rights reserved. - * Copyright (c) 2008 Cisco Systems, Inc. All rights reserved. - * - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -#ifndef COMMON_OFACM_XOOB_H -#define COMMON_OFACM_XOOB_H - -#include "opal/class/opal_hash_table.h" -#include "connect.h" - -extern opal_common_ofacm_base_component_t opal_common_ofacm_xoob; - -typedef enum { - XOOB_ADDR_CONNECTING = 100, - XOOB_ADDR_CONNECTED, - XOOB_ADDR_CLOSED -} opal_common_ofacm_ib_addr_state_t; - -struct ofacm_ib_address_t { - opal_list_item_t super; - void *key; /* the key with size 80bit - [subnet(64) LID(16bit)] */ - uint64_t subnet_id; /* caching subnet_id */ - uint16_t lid; /* caching lid */ - opal_list_t pending_contexts; /* list of endpoints that use this ib_address */ - struct opal_common_ofacm_base_qp_t *qps; /* pointer to qp that will be used - for communication with the - destination */ - uint32_t remote_xrc_rcv_qp_num; /* remote xrc qp number */ - opal_mutex_t addr_lock; /* protection */ - opal_common_ofacm_ib_addr_state_t status; /* ib port status */ -}; -typedef struct ofacm_ib_address_t - ofacm_ib_address_t; - -struct opal_common_ofacm_xoob_local_connection_context_t { - opal_common_ofacm_base_local_connection_context_t super; - ofacm_ib_address_t *addr; - uint32_t xrc_recv_qp_num; /* in xrc we will use it as recv qp */ - uint32_t xrc_recv_psn; -}; -typedef struct opal_common_ofacm_xoob_local_connection_context_t - opal_common_ofacm_xoob_local_connection_context_t; -OPAL_DECLSPEC OBJ_CLASS_DECLARATION(opal_common_ofacm_xoob_local_connection_context_t); - -struct opal_common_ofacm_xoob_module_t { - opal_common_ofacm_base_module_t super; - opal_hash_table_t ib_addr_table; /**< used only for xrc.hash-table that - keeps table of all lids/subnets */ -}; -typedef struct opal_common_ofacm_xoob_module_t - opal_common_ofacm_xoob_module_t; - -struct pending_context_t { - opal_list_item_t super; - opal_common_ofacm_xoob_local_connection_context_t *xcontext; -}; -typedef struct pending_context_t - pending_context_t; -OPAL_DECLSPEC 
OBJ_CLASS_DECLARATION(pending_context_t); - -#endif diff --git a/opal/mca/common/ofacm/configure.m4 b/opal/mca/common/ofacm/configure.m4 deleted file mode 100644 index c088cb6c055..00000000000 --- a/opal/mca/common/ofacm/configure.m4 +++ /dev/null @@ -1,61 +0,0 @@ -# -*- shell-script -*- -# -# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana -# University Research and Technology -# Corporation. All rights reserved. -# Copyright (c) 2004-2005 The University of Tennessee and The University -# of Tennessee Research Foundation. All rights -# reserved. -# Copyright (c) 2004-2005 High Performance Computing Center Stuttgart, -# University of Stuttgart. All rights reserved. -# Copyright (c) 2004-2005 The Regents of the University of California. -# All rights reserved. -# Copyright (c) 2007-2009 Cisco Systems, Inc. All rights reserved. -# Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. -# Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. 
-# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# - -# MCA_opal_common_ofacm_CONFIG([should_build]) -# ------------------------------------------ -AC_DEFUN([MCA_opal_common_ofacm_POST_CONFIG], [ - AM_CONDITIONAL([MCA_common_ofacm_have_xrc], [test $1 -eq 1 -a "x$common_ofacm_have_xrc" = "x1"]) -]) - - -# MCA_opal_common_ofacm_CONFIG([action-if-can-compile], -# [action-if-cant-compile]) -# ------------------------------------------------ -AC_DEFUN([MCA_opal_common_ofacm_CONFIG],[ - AC_CONFIG_FILES([opal/mca/common/ofacm/Makefile]) - OPAL_VAR_SCOPE_PUSH([modules ofacm_have_threads]) - modules="oob" - - common_ofacm_happy="no" - OPAL_CHECK_OPENFABRICS([common_ofacm], - [common_ofacm_happy="yes" - OPAL_CHECK_OPENFABRICS_CM([common_ofacm])]) - - AS_IF([test "$common_ofacm_happy" = "yes"], - [$1], - [$2]) - - AS_IF([test "$common_ofacm_happy" = "yes"], - [if test "x$common_ofacm_have_xrc" = "x1"; then - modules="$modules xoob" - fi - AC_MSG_CHECKING([which OpenFabrics CM modules will be built]) - AC_MSG_RESULT([$modules])]) - - # substitute in the things needed to build openib - AC_SUBST([common_ofacm_CFLAGS]) - AC_SUBST([common_ofacm_CPPFLAGS]) - AC_SUBST([common_ofacm_LDFLAGS]) - AC_SUBST([common_ofacm_LIBS]) - - OPAL_VAR_SCOPE_POP -])dnl diff --git a/opal/mca/common/ofacm/connect.h b/opal/mca/common/ofacm/connect.h deleted file mode 100644 index 326a2a07056..00000000000 --- a/opal/mca/common/ofacm/connect.h +++ /dev/null @@ -1,542 +0,0 @@ -/* - * Copyright (c) 2007-2008 Cisco Systems, Inc. All rights reserved. - * Copyright (c) 2009 Mellanox Technogies, Inc. All rights reserved. - * - * Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. - * Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. 
- * $COPYRIGHT$ - * - * Additional copyrights may follow - * - * $HEADER$ - */ - -/** - * @file - * - * This interface is designed to hide the back-end details of how IB - * RC connections are made from the rest of the openib BTL. There are - * module-like instances of the implemented functionality (dlopen and - * friends are not used, but all the functionality is accessed through - * struct's of function pointers, so you can swap between multiple - * different implementations at run time, just like real components). - * Hence, these entities are referred to as "Connect - * Pseudo-Components" (CPCs). - * - * The CPCs are referenced by their names (e.g., "oob", "rdma_cm"). - * - * CPCs are split into components and modules, similar to all other - * MCA frameworks in this code base. - * - * Before diving into the CPC interface, let's discuss some - * terminology and mappings of data structures: - * - * - a BTL module represents a network port (in the case of the openib - * BTL, a LID) - * - a CPC module represents one way to make connections to a BTL module - * - hence, a BTL module has potentially multiple CPC modules - * associated with it - * - an endpoint represnts a connection between a local BTL module and - * a remote BTL module (in the openib BTL, because of BSRQ, an - * endpoint can contain multiple QPs) - * - when an endpoint is created, one of the CPC modules associated - * with the local BTL is selected and associated with the endpoint - * (obviously, it is a CPC module that is common between the local - * and remote BTL modules) - * - endpoints may be created and destroyed during the MPI job - * - endpoints are created lazily, during the first communication - * between two peers - * - endpoints are destroyed when two MPI processes become - * disconnected (e.g., MPI-2 dynamics or MPI_FINALIZE) - * - hence, BTL modules and CPC modules outlive endpoints. - * Specifically, BTL modules and CPC modules live from MPI_INIT to - * MPI_FINALIZE. 
endpoints come and go as MPI semantics demand it. - * - therefore, CPC modules need to cache information on endpoints that - * are specific to that connection. - * - * Component interface: - * - * - component_register(): The openib BTL's component_open() function - * calls the connect_base_register() function, which scans all - * compiled-in CPC's. If they have component_register() functions, - * they are called (component_register() functions are only allowed to - * register MCA parameters). - * - * NOTE: The connect_base_register() function will process the - * btl_openib_cpc_include and btl_openib_cpc_exclude MCA parameters - * and automatically include/exclude CPCs as relevant. If a CPC is - * excluded, none of its other interface functions will be invoked for - * the duration of the process. - * - * - component_init(): The openib BTL's component_init() function - * calls connect_base_init(), which will invoke this query function on - * each CPC to see if it wants to run at all. CPCs can gracefully - * remove themselves from consideration in this process by returning - * OPAL_ERR_NOT_SUPPORTED. - * - * - component_query(): The openib BTL's init_one_port() calls the - * connect_base_select_for_local_port() function, which, for each LID - * on that port, calls the component_query() function on every - * available CPC on that LID. This function is intended to see if a - * CPC can run on a sepcific openib BTL module (i.e., LID). If it - * can, the CPC is supposed to create a CPC module that is specific to - * that BTL/LID and return it. If it cannot, it should return - * OPAL_ERR_NOT_SUPPORTED and be gracefully skipped for this - * OpenFabrics port. - * - * component_finalize(): The openib BTL's component_close() function - * calls connect_base_finalize(), which, in turn, calls the - * component_finalize() function on all available CPCs. 
Note that all - * CPC modules will have been finalized by this point; the CPC - * component_finalize() function is a chance for the CPC to clean up - * any component-specific resources. - * - * Module interface: - * - * cbm_component member: A pointer pointing to the single, global - * instance of the CPC component. This member is used for creating a - * unique index representing the modules' component so that it can be - * shared with remote peer processes. - * - * cbm_priority member: An integer between 0 and 100, inclusive, - * representing the priority of this CPC. - * - * cbm_modex_message member: A pointer to a blob buffer that will be - * included in the modex message for this port for this CPC (it is - * assumed that this blob is a) only understandable by the - * corresponding CPC in the peer process, and b) contains specific - * addressing/contact information for *this* port's CPC module). - * - * cbm_modex_message_len member: The length of the cbm_modex_message - * blob, in bytes. - * - * cbm_endpoint_init(): Called during endpoint creation, allowing a - * CPC module to cache information on the endpoint. A pointer to the - * endpoint's CPC module is already cached on the endpoint. - * - * cbm_start_connect(): initiate a connection to a remote peer. The - * CPC is responsible for setting itself up for asyncronous operation - * for progressing the outgoing connection request. - * - * cbm_endpoint_finalize(): Called during the endpoint destrouction, - * allowing the CPC module to destroy anything that it cached on the - * endpoint. - * - * cbm_finalize(): shut down all asynchronous handling and clean up - * any state that was setup for this CPC module/BTL. Some CPCs setup - * asynchronous support on a per-HCA/NIC basis (vs. per-port/LID). It - * is the reponsibility of the CPC to figure out such issues (e.g., - * via reference counting) -- there is no notification from the - * upper-level BTL about when an entire HCA/NIC is no longer being - * used. 
There is only this function, which tells when a specific - * CPC/BTL module is no longer being used. - * - * cbm_uses_cts: a bool that indicates whether the CPC will use the - * CTS protocol or not. - * - if true: the CPC will post the fragment on - * endpoint->endpoint_cts_frag as a receive buffer and will *not* - * call opal_btl_openib_post_recvs(). - * - if false: the CPC will call opal_btl_openib_post_recvs() before - * calling opal_btl_openib_cpc_complete(). - * - * There are two functions in the main openib BTL that the CPC may - * call: - * - * - opal_btl_openib_post_recvs(endpoint): once a QP is locally - * connected to the remote side (but we don't know if the remote side - * is connected to us yet), this function is invoked to post buffers - * on the QP, setup credits for the endpoint, etc. This function is - * *only* invoked if the CPC's cbm_uses_cts is false. - * - * - opal_btl_openib_cpc_complete(endpoint): once that a CPC knows - * that a QP is connected on *both* sides, this function is invoked to - * tell the main openib BTL "ok, you can use this connection now." - * (e.g., the main openib BTL will either invoke the CTS protocol or - * start sending out fragments that were queued while the connection - * was establishing, etc.). - */ -#ifndef OPAL_COMMON_OFACM_CONNECT_H -#define OPAL_COMMON_OFACM_CONNECT_H - -/* System includes */ -#include - -#include "opal/threads/mutex.h" -#include "opal/class/opal_list.h" -#include "opal/util/proc.h" - -BEGIN_C_DECLS - -#define BCF_MAX_NAME 64 - -/** - * Must forward declare these structs to avoid include file loops. - */ - -/** - * This is struct is defined below - */ -struct opal_common_ofacm_base_module_t; - -/* special capabilities */ -#define OPAL_COMMON_OFACM_XRC_ONLY 1 -#define OPAL_COMMON_OFACM_IWARP_ONLY 1 << 1 - -/** - * State of OFACM connection. 
- */ - -typedef enum { - /* Defines the state in which this BTL instance - * has started the process of connection */ - MCA_COMMON_OFACM_CONNECTING, - - /* Waiting for ack from endpoint */ - MCA_COMMON_OFACM_CONNECT_ACK, - - /*Waiting for final connection ACK from endpoint */ - MCA_COMMON_OFACM_WAITING_ACK, - - /* Connected ... both sender & receiver have - * buffers associated with this connection */ - MCA_COMMON_OFACM_CONNECTED, - - /* Connection is closed, there are no resources - * associated with this */ - MCA_COMMON_OFACM_CLOSED, - - /* Maximum number of retries have been used. - * Report failure on send to upper layer */ - MCA_COMMON_OFACM_FAILED, - - /* We found is useful to have one more - * state that maybe utilized for user needs */ - MCA_COMMON_OFACM_USER_CUSTOM -} opal_common_ofacm_connection_state_t; - -typedef enum { - MCA_COMMON_OFACM_BTL = 0, - MCA_COMMON_OFACM_COLL = 100 -} opal_common_ofacm_type; - -typedef struct opal_common_ofacm_base_dev_desc_t { - struct ibv_device* ib_dev; /* device */ - struct ibv_context* ib_dev_context; /* device context */ - int capabilities; /* Special capabilities like: XRC, Iwarp, etc.. 
*/ -} opal_common_ofacm_base_dev_desc_t; - -/* QPs configuration container that should be filled by - * upper layer, for example - btl */ -typedef struct opal_common_ofacm_base_qp_config_t { - int num_qps; - int num_srqs; - struct ibv_qp_init_attr *init_attr; - struct ibv_qp_attr *attr; - uint32_t *srq_num; - uint32_t *init_attr_mask; - uint32_t *rtr_attr_mask; - uint32_t *rts_attr_mask; -} opal_common_ofacm_base_qp_config_t; - -/* QP base data */ -typedef struct opal_common_ofacm_base_qp_t { - struct ibv_qp *lcl_qp; - size_t ib_inline_max; /**< max size of IB inline send */ - uint32_t lcl_psn; - int32_t sd_wqe; /**< number of available send wqe entries */ - int users; - opal_mutex_t lock; -} opal_common_ofacm_base_qp_t; - -/* Remote QP info */ -typedef struct opal_common_ofacm_base_rem_qp_info_t { - uint32_t rem_qp_num; - /* Remote QP number */ - uint32_t rem_psn; - /* Remote processes port sequence number */ -} opal_common_ofacm_base_rem_qp_info_t; - -/* Remote SRQ info */ -typedef struct opal_common_ofacm_base_rem_srq_info_t { - /* Remote SRQ number */ - uint32_t rem_srq_num; -} opal_common_ofacm_base_rem_srq_info_t; - -/* Remote connection context */ -typedef struct opal_common_ofacm_base_remote_connection_context_t { - opal_object_t super; - /* Local identifier of the remote process */ - uint16_t rem_lid; - /* subnet id of remote process */ - uint64_t rem_subnet_id; - /* MTU of remote process */ - uint32_t rem_mtu; /* TBD: not sure that i need this one */ - /* index of remote endpoint in endpoint array */ - uint32_t rem_index; /* TBD: the index we use as immidiate data */ - /* Remote QPs */ - opal_common_ofacm_base_rem_qp_info_t *rem_qps; - /* Remote xrc_srq info, used only with XRC connections */ - opal_common_ofacm_base_rem_srq_info_t *rem_srqs; -} opal_common_ofacm_base_remote_connection_context_t; - -typedef struct opal_common_ofacm_base_proc_t { - opal_list_item_t super; - opal_proc_t *proc_opal; /* target proc */ - opal_list_t all_contexts; /* list of 
all contexts connected to - this endpoint*/ -} opal_common_ofacm_base_proc_t; -OPAL_DECLSPEC OBJ_CLASS_DECLARATION(opal_common_ofacm_base_proc_t); - -/* Connection call back function that is called on connection setup */ -typedef void (*opal_common_ofacm_base_context_connect_cb_fn_t)(void *); - -/* Connection call back function that is called on context error */ -typedef void (*opal_common_ofacm_base_context_error_cb_fn_t)(void *); -/* Prepare recive call back function that is when recv side should be prepared, - * for example recv packet prepost */ -typedef int (*opal_common_ofacm_base_context_prepare_recv_cb_fn_t)(void *); - -/* Basic connection context - * ======================== - * The initial connection contxet is created during endpoint initialazation call. - * Each CPC will return opal_common_ofacm_base_local_connection_context_t that - * is based on CPC connection context. - * - * As Input for context creation user must provide: - * ================================================ - * number of QPs - * qp init atributes - * qp standart attribute - * pointer to protection domain - * pointer to user context (for example pointer to endpoint in case of btl) - */ -typedef struct opal_common_ofacm_base_local_connection_context_t { - opal_list_item_t super; - struct opal_common_ofacm_base_proc_t *proc; /* target proc */ - struct opal_common_ofacm_base_module_t *cpc; /* Pointer to context cpc */ - opal_common_ofacm_connection_state_t state; /* Connection context status */ - uint64_t subnet_id; /* caching subnet_id */ - int cpc_type; /* connection manager family: openib, coll, etc..*/ - uint16_t lid; /* caching lid */ - uint16_t rem_lid; /* remote lid */ - uint8_t num_of_qps; /* Number of qps that we want to open */ - struct opal_common_ofacm_base_qp_t *qps; /* qps data */ - uint8_t num_of_srqs; /* Number of qps that we want to open */ - uint32_t *srq_num; /* srq numbers for recv on this context */ - struct ibv_qp_init_attr *init_attr; /* list of initial attr for 
each qp */ - struct ibv_qp_attr *attr; /* qp attributes */ - struct ibv_pd* ib_pd; /* protection domain */ - uint32_t *custom_init_attr_mask; /* in additional to standard attr_mask we want allow to user - specify special custom masks for init */ - uint32_t *custom_rtr_attr_mask; /* in additional to standard attr_mask we want allow to user - specify special custom masks for rtr */ - uint32_t *custom_rts_attr_mask; /* in additional to standard attr_mask we want allow to user - specify special custom masks for rts */ - void *user_context; /* back pointer to endpoint */ - opal_common_ofacm_base_context_connect_cb_fn_t connect_cb; /* Connection callback function */ - opal_common_ofacm_base_context_error_cb_fn_t error_cb; /* Error callback function */ - opal_common_ofacm_base_context_prepare_recv_cb_fn_t prepare_recv_cb; /* Prepare recv side - (prepost) callback function */ - /* TBD: Need to check when we can update the index. I think during endpoint creation we do not - * have it. It mean that BTL should some how to update it later ...*/ - int32_t index; /* user context index */ - bool initiator; /* initiator of connection ? */ - opal_common_ofacm_base_remote_connection_context_t remote_info; /* data about remote side of this - connection*/ - uint32_t xrc_recv_qp_num ; /* in xrc we will use it as recv qp */ - - opal_mutex_t context_lock; /* protection */ -} opal_common_ofacm_base_local_connection_context_t; -OPAL_DECLSPEC OBJ_CLASS_DECLARATION(opal_common_ofacm_base_local_connection_context_t); -/* Constructor and destructor are located in common_ofacm_base.c */ - -/************************************************************************/ - -/** - * Function to register MCA params in the connect functions. It - * returns no value, so it cannot fail. - */ -typedef void (*opal_common_ofacm_base_component_register_fn_t)(void); - -/** - * This function is invoked once by the openib BTL component during - * startup. It is intended to have CPC component-wide startup. 
- * - * Return value: - * - * - OPAL_SUCCESS: this CPC component will be used in selection during - * this process. - * - * - OPAL_ERR_NOT_SUPPORTED: this CPC component will be silently - * ignored in this process. - * - * - Other OPAL_ERR_* values: the error will be propagated upwards, - * likely causing a fatal error (and/or the openib BTL component - * being ignored). - */ -typedef int (*opal_common_ofacm_base_component_init_fn_t)(void); - -/** - * Query the CPC to see if it wants to run on a specific port (i.e., a - * specific BTL module). If the component init function previously - * returned OPAL_SUCCESS, this function is invoked once per BTL module - * creation (i.e., for each port found by an MPI process). If this - * CPC wants to be used on this BTL module, it returns a CPC module - * that is specific to this BTL module. - * - * The BTL module in question is passed to the function; all of its - * attributes can be used to query to see if it's eligible for this - * CPC. - * - * If it is eligible, the CPC is responsible for creating a - * corresponding CPC module, filling in all the relevant fields on the - * modules, and for setting itself up to run (per above) and returning - * a CPC module (this is effectively the "module_init" function). - * Note that the module priority must be between 0 and 100 - * (inclusive). When multiple CPCs are eligible for a single module, - * the CPC with the highest priority will be used. - * - * Return value: - * - * - OPAL_SUCCESS if this CPC is eligible for and was able to be setup - * for this BTL module. It is assumed that the CPC is now completely - * setup to run on this openib module (per description above). - * - * - OPAL_ERR_NOT_SUPPORTED if this CPC cannot support this BTL - * module. This is not an error; it's just the CPC saying "sorry, I - * cannot support this BTL module." - * - * - Other OPAL_ERR_* code: an error occurred. 
- */ -typedef int (*opal_common_ofacm_base_func_component_query_t) - (struct opal_common_ofacm_base_dev_desc_t *dev, - struct opal_common_ofacm_base_module_t **cpc); - -/** - * This function is invoked once by the openib BTL component during - * shutdown. It is intended to have CPC component-wide shutdown. - */ -typedef int (*opal_common_ofacm_base_component_finalize_fn_t)(void); - -/** - * CPC component struct - */ -typedef struct opal_common_ofacm_base_component_t { - /** Name of this set of connection functions */ - char cbc_name[BCF_MAX_NAME]; - - /** Register function. Can be NULL. */ - opal_common_ofacm_base_component_register_fn_t cbc_register; - - /** CPC component init function. Can be NULL. */ - opal_common_ofacm_base_component_init_fn_t cbc_init; - - /** Query the CPC component to get a CPC module corresponding to - an openib BTL module. Cannot be NULL. */ - opal_common_ofacm_base_func_component_query_t cbc_query; - - /** CPC component finalize function. Can be NULL. */ - opal_common_ofacm_base_component_finalize_fn_t cbc_finalize; - /** All connection contexts that are using this CPC **/ - opal_list_t all_procs; -} opal_common_ofacm_base_component_t; - -/************************************************************************/ - -/** - * Function called when an endpoint has been created and has been - * associated with a CPC. - */ -typedef opal_common_ofacm_base_local_connection_context_t* - (*opal_common_ofacm_base_module_endpoint_init_fn_t) - (opal_proc_t *proc, - opal_common_ofacm_base_qp_config_t *qp_config, struct ibv_pd *pd, - uint64_t subnet_id, int cpc_type, uint16_t lid, uint16_t rem_lid, - int32_t user_context_index, void *user_context, - struct opal_common_ofacm_base_module_t *cpc, - opal_common_ofacm_base_context_connect_cb_fn_t connect_cb, - opal_common_ofacm_base_context_error_cb_fn_t error_cb, - opal_common_ofacm_base_context_prepare_recv_cb_fn_t prepare_recv_cb); - -/** - * Function to initiate a connection to a remote process. 
- */ -typedef int (*opal_common_ofacm_base_module_start_connect_fn_t) - (struct opal_common_ofacm_base_local_connection_context_t *context); - -/** - * Function called when an endpoint is being destroyed. - */ -typedef int (*opal_common_ofacm_base_module_endpoint_finalize_fn_t) - (struct opal_common_ofacm_base_local_connection_context_t *context); - -/** - * Function to finalize the CPC module. It is called once when the - * CPC module's corresponding openib BTL module is being finalized. - */ -typedef int (*opal_common_ofacm_base_module_finalize_fn_t)(void); - -/** - * Error callback that is called by cpc module on error. - * The callback should be set on upper layer (for example BTL) - */ -typedef int (*opal_common_ofacm_base_module_error_cb_fn_t)(void *); -/** - * Meta data about a CPC module. This is in a standalone struct - * because it is used in both the CPC module struct and the - * openib_btl_proc_t struct to hold information received from the - * modex. - */ -typedef struct opal_common_ofacm_base_module_data_t { - /** Pointer back to the component. Used by the base and openib - btl to calculate this module's index for the modex. */ - opal_common_ofacm_base_component_t *cbm_component; - - /** Priority of the CPC module (must be >=0 and <=100) */ - uint8_t cbm_priority; - - /** Blob that the CPC wants to include in the openib modex message - for a specific port, or NULL if the CPC does not want to - include a message in the modex. */ - void *cbm_modex_message; - - /** Length of the cbm_modex_message blob (0 if - cbm_modex_message==NULL). The message is intended to be short - (because the size of the modex broadcast is a function of - sum(cbm_modex_message_len[i]) for - i=(0...total_num_ports_in_MPI_job) -- e.g., IBCM imposes its - own [very short] limits (per IBTA volume 1, chapter 12). 
*/ - uint8_t cbm_modex_message_len; -} opal_common_ofacm_base_module_data_t; - -/** - * Struct for holding CPC module and associated meta data - */ -typedef struct opal_common_ofacm_base_module_t { - /** Meta data about the module */ - opal_common_ofacm_base_module_data_t data; - - /** Endpoint initialization function */ - opal_common_ofacm_base_module_endpoint_init_fn_t cbm_endpoint_init; - - /** Connect function */ - opal_common_ofacm_base_module_start_connect_fn_t cbm_start_connect; - - /** Endpoint finalization function */ - opal_common_ofacm_base_module_endpoint_finalize_fn_t cbm_endpoint_finalize; - - /** Finalize the cpc module */ - opal_common_ofacm_base_module_finalize_fn_t cbm_finalize; - - /** Whether this module will use the CTS protocol or not. This - directly states whether this module will call - mca_btl_openib_endpoint_post_recvs() or not: true = this - module will *not* call _post_recvs() and instead will post the - receive buffer provided at endpoint->endpoint_cts_frag on qp - 0. */ - bool cbm_uses_cts; -} opal_common_ofacm_base_module_t; - -END_C_DECLS - -#endif diff --git a/opal/mca/common/ofacm/help-mpi-common-ofacm-base.txt b/opal/mca/common/ofacm/help-mpi-common-ofacm-base.txt deleted file mode 100644 index de4dfa5587e..00000000000 --- a/opal/mca/common/ofacm/help-mpi-common-ofacm-base.txt +++ /dev/null @@ -1,41 +0,0 @@ -# -*- text -*- -# -# Copyright (c) 2008 Cisco Systems, Inc. All rights reserved. -# Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. -# Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# -# This is the US/English help file for Open MPI's OpenFabrics IB CPC -# support. -# -[no cpcs for port] -No OpenFabrics connection schemes reported that they were able to be -used on a specific port. As such, the openib BTL (OpenFabrics -support) will be disabled for this port. 
- - Local host: %s - Local device: %s - CPCs attempted: %s -# -[cpc name not found] -An invalid CPC name was specified via the btl_openib_cpc_%s MCA -parameter. - - Local host: %s - btl_openib_cpc_%s value: %s - Invalid name: %s - All possible valid names: %s -# -[inline truncated] -WARNING: The btl_openib_max_inline_data MCA parameter was used to -specify how much inline data should be used, but a device reduced this -value. This is not an error; it simply means that your run will use -a smaller inline data value than was requested. - - Local host: %s - Requested value: %d - Value used by device: %d diff --git a/opal/mca/common/ofacm/help-mpi-common-ofacm-oob.txt b/opal/mca/common/ofacm/help-mpi-common-ofacm-oob.txt deleted file mode 100644 index 8b1de30af14..00000000000 --- a/opal/mca/common/ofacm/help-mpi-common-ofacm-oob.txt +++ /dev/null @@ -1,20 +0,0 @@ -# -*- text -*- -# -# Copyright (c) 2009-2012 Mellanox Technologies. All rights reserved. -# Copyright (c) 2009-2012 Oak Ridge National Laboratory. All rights reserved. -# $COPYRIGHT$ -# -# Additional copyrights may follow -# -# $HEADER$ -# -[ofacm oob fatal error] -The OOB OpenFabrics Connection Manager module tried to raise fatal error, -but failed. - - Local host: %s - Source file: %s - Source line: %d - -Your job is now going to abort, sorry. -# diff --git a/opal/mca/common/ofacm/owner.txt b/opal/mca/common/ofacm/owner.txt deleted file mode 100644 index 5f137050801..00000000000 --- a/opal/mca/common/ofacm/owner.txt +++ /dev/null @@ -1,7 +0,0 @@ -# -# owner/status file -# owner: institution that is responsible for this package -# status: e.g. active, maintenance, unmaintained -# -owner: nobody -status: dead? 
diff --git a/opal/mca/pmix/native/usock.c b/opal/mca/pmix/native/usock.c index 4d06639e188..f1a10f1f31b 100644 --- a/opal/mca/pmix/native/usock.c +++ b/opal/mca/pmix/native/usock.c @@ -347,8 +347,9 @@ int usock_send_connect_ack(void) pmix_usock_hdr_t hdr; int rc; size_t sdsize; - opal_sec_cred_t *cred; - + char *cred; + size_t credsize; + opal_output_verbose(2, opal_pmix_base_framework.framework_output, "%s SEND CONNECT ACK", OPAL_NAME_PRINT(OPAL_PROC_MY_NAME)); @@ -359,15 +360,15 @@ int usock_send_connect_ack(void) hdr.type = PMIX_USOCK_IDENT; /* get our security credential */ - if (OPAL_SUCCESS != (rc = opal_sec.get_my_credential(NULL, opal_dstore_internal, &OPAL_PROC_MY_NAME, &cred))) { + if (OPAL_SUCCESS != (rc = opal_sec.get_my_credential(NULL, opal_dstore_internal, &OPAL_PROC_MY_NAME, &cred, &credsize))) { return rc; } /* set the number of bytes to be read beyond the header */ - hdr.nbytes = strlen(opal_version_string) + 1 + strlen(cred->method) + 1 + cred->size; + hdr.nbytes = strlen(opal_version_string) + 1 + credsize; /* create a space for our message */ - sdsize = (sizeof(hdr) + strlen(opal_version_string) + 1 + strlen(cred->method) + 1 + cred->size); + sdsize = (sizeof(hdr) + strlen(opal_version_string) + 1 + credsize); if (NULL == (msg = (char*)malloc(sdsize))) { return OPAL_ERR_OUT_OF_RESOURCE; } @@ -376,9 +377,10 @@ int usock_send_connect_ack(void) /* load the message */ memcpy(msg, &hdr, sizeof(hdr)); memcpy(msg+sizeof(hdr), opal_version_string, strlen(opal_version_string)); - memcpy(msg+sizeof(hdr)+strlen(opal_version_string)+1, cred->method, strlen(cred->method)); - memcpy(msg+sizeof(hdr)+strlen(opal_version_string)+1+strlen(cred->method)+1, cred->credential, cred->size); - + memcpy(msg+sizeof(hdr)+strlen(opal_version_string)+1, cred, credsize); + if (NULL != cred) { + free(cred); + } if (OPAL_SUCCESS != usock_send_blocking(msg, sdsize)) { free(msg); diff --git a/opal/mca/pmix/native/usock_sendrecv.c b/opal/mca/pmix/native/usock_sendrecv.c index 
bef50785829..b012d36be2e 100644 --- a/opal/mca/pmix/native/usock_sendrecv.c +++ b/opal/mca/pmix/native/usock_sendrecv.c @@ -545,7 +545,8 @@ static int usock_recv_connect_ack(void) char *msg; char *version; int rc; - opal_sec_cred_t creds; + char *cred; + size_t credsize; pmix_usock_hdr_t hdr; opal_output_verbose(2, opal_pmix_base_framework.framework_output, @@ -632,11 +633,14 @@ static int usock_recv_connect_ack(void) OPAL_NAME_PRINT(OPAL_PROC_MY_NAME)); /* check security token */ - creds.method = (char*)(msg + strlen(version) + 1); - creds.credential = (char*)(msg + strlen(version) + 1 + strlen(creds.method) + 1); - creds.size = hdr.nbytes - strlen(version) - 1 - strlen(creds.method) - 1; - if (OPAL_SUCCESS != (rc = opal_sec.authenticate(&creds))) { + cred = (char*)(msg + strlen(version) + 1); + credsize = hdr.nbytes - strlen(version) - 1; + if (OPAL_SUCCESS != (rc = opal_sec.authenticate(cred, credsize, NULL))) { OPAL_ERROR_LOG(rc); + mca_pmix_native_component.state = PMIX_USOCK_FAILED; + CLOSE_THE_SOCKET(mca_pmix_native_component.sd); + free(msg); + return OPAL_ERR_UNREACH; } free(msg); diff --git a/opal/mca/sec/base/base.h b/opal/mca/sec/base/base.h index 8e007cf649f..09437cb6fef 100644 --- a/opal/mca/sec/base/base.h +++ b/opal/mca/sec/base/base.h @@ -45,9 +45,9 @@ OPAL_DECLSPEC int opal_sec_base_select(void); OPAL_DECLSPEC int opal_sec_base_get_cred(char *method, int dstorehandle, opal_process_name_t *my_id, - opal_sec_cred_t **cred); + char **payload, size_t *size); -OPAL_DECLSPEC int opal_sec_base_validate(opal_sec_cred_t *cred); +OPAL_DECLSPEC int opal_sec_base_validate(char *payload, size_t size, char **method); END_C_DECLS diff --git a/opal/mca/sec/base/sec_base_stubs.c b/opal/mca/sec/base/sec_base_stubs.c index 896bce86263..d6897866379 100644 --- a/opal/mca/sec/base/sec_base_stubs.c +++ b/opal/mca/sec/base/sec_base_stubs.c @@ -12,58 +12,153 @@ #include "opal/constants.h" #include "opal/mca/mca.h" +#include "opal/util/error.h" #include 
"opal/util/output.h" #include "opal/mca/base/base.h" #include "opal/dss/dss_types.h" #include "opal/mca/sec/base/base.h" +static void cleanup_cred(opal_sec_cred_t *cred) +{ + if (NULL == cred) { + return; + } + if (NULL != cred->method) { + free(cred->method); + } + if (NULL != cred->credential) { + free(cred->credential); + } +} + int opal_sec_base_get_cred(char *method, int dstorehandle, opal_process_name_t *my_id, - opal_sec_cred_t **cred) + char **payload, size_t *size) { opal_sec_handle_t *hdl; - + opal_sec_cred_t cred; + opal_buffer_t buf; + int rc; + opal_output_verbose(5, opal_sec_base_framework.framework_output, "Requesting credential from source %s", (NULL == method) ? "ANY" : method); - + + OBJ_CONSTRUCT(&buf, opal_buffer_t); OPAL_LIST_FOREACH(hdl, &opal_sec_base_actives, opal_sec_handle_t) { if (NULL != method && 0 != strcmp(method, hdl->component->mca_component_name)) { continue; } - if (OPAL_SUCCESS == hdl->module->get_my_credential(dstorehandle, my_id, cred)) { + if (OPAL_SUCCESS == hdl->module->get_my_credential(dstorehandle, my_id, &cred)) { opal_output_verbose(5, opal_sec_base_framework.framework_output, "Created credential from source %s", hdl->component->mca_component_name); - /* record the source */ - (*cred)->method = strdup(hdl->component->mca_component_name); - return OPAL_SUCCESS; + /* pack the credential */ + if (OPAL_SUCCESS != (rc = opal_dss.pack(&buf, &cred.method, 1, OPAL_STRING))) { + OPAL_ERROR_LOG(rc); + cleanup_cred(&cred); + OBJ_DESTRUCT(&buf); + return rc; + } + if (OPAL_SUCCESS != (rc = opal_dss.pack(&buf, &cred.size, 1, OPAL_SIZE))) { + OPAL_ERROR_LOG(rc); + cleanup_cred(&cred); + OBJ_DESTRUCT(&buf); + return rc; + } + if (0 < cred.size) { + if (OPAL_SUCCESS != (rc = opal_dss.pack(&buf, cred.credential, cred.size, OPAL_BYTE))) { + OPAL_ERROR_LOG(rc); + cleanup_cred(&cred); + OBJ_DESTRUCT(&buf); + return rc; + } + } + opal_output_verbose(5, opal_sec_base_framework.framework_output, + "opal_sec: Created credential %s of size 
%lu", + cred.credential, (unsigned long)cred.size); + cleanup_cred(&cred); } } - return OPAL_ERROR; + if (0 == buf.bytes_used) { + OBJ_DESTRUCT(&buf); + return OPAL_ERROR; + } + *payload = buf.base_ptr; + *size = buf.bytes_used; + buf.base_ptr = NULL; + buf.bytes_used = 0; + OBJ_DESTRUCT(&buf); + return OPAL_SUCCESS; } -int opal_sec_base_validate(opal_sec_cred_t *cred) +int opal_sec_base_validate(char *payload, size_t size, char **method) { opal_sec_handle_t *hdl; - + opal_buffer_t buf; + int cnt, rc; + opal_sec_cred_t cred; + opal_output_verbose(5, opal_sec_base_framework.framework_output, - "Received credential %s from source %s", - (NULL == cred->credential) ? "NULL" : cred->credential, - (NULL == cred->method) ? "NULL" : cred->method); + "opal_sec: Received credential of size %lu", + (unsigned long)size); - OPAL_LIST_FOREACH(hdl, &opal_sec_base_actives, opal_sec_handle_t) { - if (NULL != cred->method && - 0 != strcmp(cred->method, hdl->component->mca_component_name)) { - continue; + OBJ_CONSTRUCT(&buf, opal_buffer_t); + opal_dss.load(&buf, payload, size); + + cnt = 1; + while (OPAL_SUCCESS == (rc = opal_dss.unpack(&buf, &cred.method, &cnt, OPAL_STRING))) { + opal_output_verbose(5, opal_sec_base_framework.framework_output, + "Received credential from source %s", cred.method); + cnt=1; + if (OPAL_SUCCESS != (rc = opal_dss.unpack(&buf, &cred.size, &cnt, OPAL_SIZE))) { + OPAL_ERROR_LOG(rc); + cleanup_cred(&cred); + goto done; } - if (OPAL_SUCCESS == hdl->module->authenticate(cred)) { - return OPAL_SUCCESS; + opal_output_verbose(5, opal_sec_base_framework.framework_output, + "Received credential of size %lu", (unsigned long)cred.size); + if (0 < cred.size) { + cred.credential = (char*)malloc(cred.size); + cnt=cred.size; + if (OPAL_SUCCESS != (rc = opal_dss.unpack(&buf, cred.credential, &cnt, OPAL_BYTE))) { + OPAL_ERROR_LOG(rc); + cleanup_cred(&cred); + goto done; + } + opal_output_verbose(5, opal_sec_base_framework.framework_output, + "Received credential %s", 
cred.credential); } + OPAL_LIST_FOREACH(hdl, &opal_sec_base_actives, opal_sec_handle_t) { + if (NULL != cred.method && + 0 != strcmp(cred.method, hdl->component->mca_component_name)) { + continue; + } + if (OPAL_SUCCESS == hdl->module->authenticate(&cred)) { + rc = OPAL_SUCCESS; + /* record the method */ + if (NULL != method) { + if (NULL != *method) { + free(*method); + } + *method = strdup(cred.method); + } + cleanup_cred(&cred); + goto done; + } + } + cleanup_cred(&cred); + cnt = 1; } - return OPAL_ERROR; + /* if we get here, then nothing authenticated */ + rc = OPAL_ERR_AUTHENTICATION_FAILED; + + done: + buf.base_ptr = NULL; + OBJ_DESTRUCT(&buf); + return rc; } diff --git a/opal/mca/sec/basic/sec_basic.c b/opal/mca/sec/basic/sec_basic.c index 0e3753b0297..37845f1ef7a 100644 --- a/opal/mca/sec/basic/sec_basic.c +++ b/opal/mca/sec/basic/sec_basic.c @@ -29,7 +29,7 @@ static int init(void); static void finalize(void); static int get_my_cred(int dstorehandle, opal_process_name_t *my_id, - opal_sec_cred_t **cred); + opal_sec_cred_t *cred); static int authenticate(opal_sec_cred_t *cred); opal_sec_base_module_t opal_sec_basic_module = { @@ -56,7 +56,7 @@ static void finalize(void) static int get_my_cred(int dstorehandle, opal_process_name_t *my_id, - opal_sec_cred_t **cred) + opal_sec_cred_t *cred) { opal_list_t vals; opal_value_t *kv; @@ -77,26 +77,31 @@ static int get_my_cred(int dstorehandle, my_cred.size = strlen(my_cred.credential)+1; // include the NULL } else { my_cred.credential = strdup(kv->data.string); - my_cred.size = strlen(kv->data.string); + my_cred.size = strlen(kv->data.string)+1; // include the NULL OBJ_RELEASE(kv); } } else { - my_cred.credential = strdup("12345"); + my_cred.credential = strdup("1234567"); my_cred.size = strlen(my_cred.credential)+1; // include the NULL } OPAL_LIST_DESTRUCT(&vals); } initialized = true; - *cred = &my_cred; + cred->method = strdup("basic"); + cred->credential = strdup(my_cred.credential); + cred->size = my_cred.size; 
return OPAL_SUCCESS; } static int authenticate(opal_sec_cred_t *cred) { + opal_output_verbose(5, opal_sec_base_framework.framework_output, + "opal_sec:basic Received credential %s of size %lu", + cred->credential, (unsigned long)cred->size); - if (0 == strncmp(cred->credential, "12345", strlen("12345"))) { + if (0 == strncmp(cred->credential, "1234567", strlen("1234567"))) { return OPAL_SUCCESS; } return OPAL_ERR_AUTHENTICATION_FAILED; diff --git a/opal/mca/sec/keystone/sec_keystone.c b/opal/mca/sec/keystone/sec_keystone.c index b08b3f0ec07..cdaf5c09881 100644 --- a/opal/mca/sec/keystone/sec_keystone.c +++ b/opal/mca/sec/keystone/sec_keystone.c @@ -34,7 +34,7 @@ static int init(void); static void finalize(void); static int get_my_cred(int dstorehandle, opal_process_name_t *my_id, - opal_sec_cred_t **cred); + opal_sec_cred_t *cred); static int authenticate(opal_sec_cred_t *cred); opal_sec_base_module_t opal_sec_keystone_module = { @@ -66,7 +66,7 @@ static size_t op_cbfunc(void *ptr, size_t size, size_t count, void *stream) static int get_my_cred(int dstorehandle, opal_process_name_t *my_id, - opal_sec_cred_t **cred) + opal_sec_cred_t *cred) { char *cmd; CURL *curl; diff --git a/opal/mca/sec/munge/sec_munge.c b/opal/mca/sec/munge/sec_munge.c index 32650b3f605..e6980f1c0fb 100644 --- a/opal/mca/sec/munge/sec_munge.c +++ b/opal/mca/sec/munge/sec_munge.c @@ -32,7 +32,7 @@ static int init(void); static void finalize(void); static int get_my_cred(int dstorehandle, opal_process_name_t *my_id, - opal_sec_cred_t **cred); + opal_sec_cred_t *cred); static int authenticate(opal_sec_cred_t *cred); opal_sec_base_module_t opal_sec_munge_module = { @@ -79,13 +79,12 @@ static void finalize(void) static int get_my_cred(int dstorehandle, opal_process_name_t *my_id, - opal_sec_cred_t **cred) + opal_sec_cred_t *cred) { int rc; if (initialized) { if (!refresh) { - *cred = &my_cred; refresh = true; } else { /* get a new credential as munge will not @@ -98,10 +97,12 @@ static int 
get_my_cred(int dstorehandle, } /* include the '\0' termination string character */ my_cred.size = strlen(my_cred.credential)+1; - *cred = &my_cred; } + cred->method = strdup("munge"); + cred->credential = strdup(my_cred.credential); + cred->size = my_cred.size; } else { - *cred = NULL; + rc = OPAL_ERROR; } return OPAL_SUCCESS; diff --git a/opal/mca/sec/sec.h b/opal/mca/sec/sec.h index 4fff46bfe09..242c442658e 100644 --- a/opal/mca/sec/sec.h +++ b/opal/mca/sec/sec.h @@ -79,12 +79,12 @@ typedef void (*opal_sec_base_module_finalize_fn_t)(void); */ typedef int (*opal_sec_base_module_get_my_cred_fn_t)(int dstorehandle, opal_process_name_t *my_id, - opal_sec_cred_t **cred); + opal_sec_cred_t *cred); typedef int (*opal_sec_API_module_get_my_cred_fn_t)(char *method, int dstorehandle, opal_process_name_t *my_id, - opal_sec_cred_t **cred); + char **payload, size_t *size); /* * Authenticate a security credential - given a security credential, * determine if the credential is valid. The credential is passed in @@ -95,6 +95,8 @@ typedef int (*opal_sec_API_module_get_my_cred_fn_t)(char *method, */ typedef int (*opal_sec_base_module_auth_fn_t)(opal_sec_cred_t *cred); +typedef int (*opal_sec_API_module_auth_fn_t)(char *payload, size_t size, char **method); + /* * the standard module data structure */ @@ -110,7 +112,7 @@ typedef struct opal_sec_base_module_1_0_0_t opal_sec_base_module_t; /* the API structure */ typedef struct { opal_sec_API_module_get_my_cred_fn_t get_my_credential; - opal_sec_base_module_auth_fn_t authenticate; + opal_sec_API_module_auth_fn_t authenticate; } opal_sec_API_module_t; /* diff --git a/orte/mca/oob/tcp/help-oob-tcp.txt b/orte/mca/oob/tcp/help-oob-tcp.txt index c5eda4009c6..e79f2abffe7 100644 --- a/orte/mca/oob/tcp/help-oob-tcp.txt +++ b/orte/mca/oob/tcp/help-oob-tcp.txt @@ -10,7 +10,7 @@ # University of Stuttgart. All rights reserved. # Copyright (c) 2004-2005 The Regents of the University of California. # All rights reserved. 
-# Copyright (c) 2014 Intel, Inc. All rights reserved. +# Copyright (c) 2014-2015 Intel, Inc. All rights reserved. # $COPYRIGHT$ # # Additional copyrights may follow @@ -64,3 +64,16 @@ value will be ignored. Local host: %s Value: %s Message: %s +# +[authent-fail] +An attempt was made to make a TCP connection between two hosts: + + Initiating host: %s + Receiving host: %s + +Unfortunately, the connection was refused due to a failure to +authenticate. This is usually caused by a mismatch between +the security domains of the two hosts - e.g., one might be +using Munge while the other is not. This can typically be +resolved by specifying the desired security method. For +example, add "--mca sec basic" to your command line. diff --git a/orte/mca/oob/tcp/oob_tcp_connection.c b/orte/mca/oob/tcp/oob_tcp_connection.c index 647ec804d01..4a5bca883c7 100644 --- a/orte/mca/oob/tcp/oob_tcp_connection.c +++ b/orte/mca/oob/tcp/oob_tcp_connection.c @@ -13,7 +13,7 @@ * All rights reserved. * Copyright (c) 2009-2014 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2011 Oak Ridge National Labs. All rights reserved. - * Copyright (c) 2013-2014 Intel, Inc. All rights reserved. + * Copyright (c) 2013-2015 Intel, Inc. All rights reserved. * Copyright (c) 2014-2015 Research Organization for Information Science * and Technology (RIST). All rights reserved. 
* $COPYRIGHT$ @@ -355,8 +355,9 @@ static int tcp_peer_send_connect_ack(mca_oob_tcp_peer_t* peer) mca_oob_tcp_hdr_t hdr; int rc; size_t sdsize; - opal_sec_cred_t *cred; - + char *cred; + size_t credsize; + opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output, "%s SEND CONNECT ACK", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); @@ -369,17 +370,22 @@ static int tcp_peer_send_connect_ack(mca_oob_tcp_peer_t* peer) /* get our security credential*/ if (OPAL_SUCCESS != (rc = opal_sec.get_my_credential(peer->auth_method, opal_dstore_internal, - ORTE_PROC_MY_NAME, &cred))) { + ORTE_PROC_MY_NAME, + &cred, &credsize))) { ORTE_ERROR_LOG(rc); return rc; } - + opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output, + "%s SENDING CREDENTIAL OF SIZE %lu", + ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), + (unsigned long)credsize); + /* set the number of bytes to be read beyond the header */ - hdr.nbytes = strlen(orte_version_string) + 1 + strlen(cred->method) + 1 + cred->size; + hdr.nbytes = strlen(orte_version_string) + 1 + credsize; MCA_OOB_TCP_HDR_HTON(&hdr); /* create a space for our message */ - sdsize = (sizeof(hdr) + strlen(orte_version_string) + 1 + strlen(cred->method) + 1 + cred->size); + sdsize = sizeof(hdr) + strlen(orte_version_string) + 1 + credsize; if (NULL == (msg = (char*)malloc(sdsize))) { return ORTE_ERR_OUT_OF_RESOURCE; } @@ -388,9 +394,12 @@ static int tcp_peer_send_connect_ack(mca_oob_tcp_peer_t* peer) /* load the message */ memcpy(msg, &hdr, sizeof(hdr)); memcpy(msg+sizeof(hdr), orte_version_string, strlen(orte_version_string)); - memcpy(msg+sizeof(hdr)+strlen(orte_version_string)+1, cred->method, strlen(cred->method)); - memcpy(msg+sizeof(hdr)+strlen(orte_version_string)+1+strlen(cred->method)+1, cred->credential, cred->size); - + memcpy(msg+sizeof(hdr)+strlen(orte_version_string)+1, cred, credsize); + /* clear the memory */ + if (NULL != cred) { + free(cred); + } + /* send it */ if (ORTE_SUCCESS != 
tcp_peer_send_blocking(peer->sd, msg, sdsize)) { free(msg); @@ -618,7 +627,8 @@ int mca_oob_tcp_peer_recv_connect_ack(mca_oob_tcp_peer_t* pr, char *msg; char *version; int rc; - opal_sec_cred_t creds; + char *cred; + size_t credsize; mca_oob_tcp_hdr_t hdr; mca_oob_tcp_peer_t *peer; uint64_t *ui64; @@ -799,18 +809,19 @@ int mca_oob_tcp_peer_recv_connect_ack(mca_oob_tcp_peer_t* pr, ORTE_NAME_PRINT(&peer->name)); /* check security token */ - creds.method = (char*)(msg + strlen(version) + 1); - creds.credential = (char*)(msg + strlen(version) + 1 + strlen(creds.method) + 1); - creds.size = hdr.nbytes - strlen(version) - 1 - strlen(creds.method) - 1; - if (OPAL_SUCCESS != (rc = opal_sec.authenticate(&creds))) { - ORTE_ERROR_LOG(rc); + cred = (char*)(msg + strlen(version) + 1); + credsize = hdr.nbytes - strlen(version) - 1; + if (OPAL_SUCCESS != (rc = opal_sec.authenticate(cred, credsize, &peer->auth_method))) { + char *hostname; + hostname = orte_get_proc_hostname(&peer->name); + orte_show_help("help-oob-tcp.txt", "authent-fail", true, + (NULL == hostname) ? 
"unknown" : hostname, + orte_process_info.nodename); + peer->state = MCA_OOB_TCP_FAILED; + mca_oob_tcp_peer_close(peer); free(msg); return ORTE_ERR_CONNECTION_REFUSED; } - /* record the method they used so we can reciprocate */ - if (NULL == peer->auth_method) { - peer->auth_method = strdup(creds.method); - } free(msg); opal_output_verbose(OOB_TCP_DEBUG_CONNECT, orte_oob_base_framework.framework_output, diff --git a/orte/mca/oob/usock/oob_usock_connection.c b/orte/mca/oob/usock/oob_usock_connection.c index 8201793628c..ab9ba573cfa 100644 --- a/orte/mca/oob/usock/oob_usock_connection.c +++ b/orte/mca/oob/usock/oob_usock_connection.c @@ -277,8 +277,9 @@ static int usock_peer_send_connect_ack(mca_oob_usock_peer_t* peer) mca_oob_usock_hdr_t hdr; int rc; size_t sdsize; - opal_sec_cred_t *cred; - + char *cred; + size_t credsize; + opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output, "%s SEND CONNECT ACK", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); @@ -293,16 +294,16 @@ static int usock_peer_send_connect_ack(mca_oob_usock_peer_t* peer) /* get our security credential*/ if (OPAL_SUCCESS != (rc = opal_sec.get_my_credential(peer->auth_method, opal_dstore_internal, - ORTE_PROC_MY_NAME, &cred))) { + ORTE_PROC_MY_NAME, &cred, &credsize))) { ORTE_ERROR_LOG(rc); return rc; } /* set the number of bytes to be read beyond the header */ - hdr.nbytes = strlen(orte_version_string) + 1 + strlen(cred->method) + 1 + cred->size; + hdr.nbytes = strlen(orte_version_string) + 1 + credsize; /* create a space for our message */ - sdsize = (sizeof(hdr) + strlen(orte_version_string) + 1 + strlen(cred->method) + 1 + cred->size); + sdsize = (sizeof(hdr) + strlen(orte_version_string) + 1 + credsize); if (NULL == (msg = (char*)malloc(sdsize))) { return ORTE_ERR_OUT_OF_RESOURCE; } @@ -311,10 +312,9 @@ static int usock_peer_send_connect_ack(mca_oob_usock_peer_t* peer) /* load the message */ memcpy(msg, &hdr, sizeof(hdr)); memcpy(msg+sizeof(hdr), orte_version_string, 
strlen(orte_version_string)); - memcpy(msg+sizeof(hdr)+strlen(orte_version_string)+1, cred->method, strlen(cred->method)); - memcpy(msg+sizeof(hdr)+strlen(orte_version_string)+1+strlen(cred->method)+1, cred->credential, cred->size); - - + memcpy(msg+sizeof(hdr)+strlen(orte_version_string)+1, cred, credsize); + free(cred); + if (ORTE_SUCCESS != usock_peer_send_blocking(peer, peer->sd, msg, sdsize)) { ORTE_ERROR_LOG(ORTE_ERR_UNREACH); free(msg); @@ -488,7 +488,8 @@ int mca_oob_usock_peer_recv_connect_ack(mca_oob_usock_peer_t* pr, int sd, char *msg; char *version; int rc, cmpval; - opal_sec_cred_t creds; + char *cred; + size_t credsize; mca_oob_usock_peer_t *peer; mca_oob_usock_hdr_t hdr; uint64_t *ui64; @@ -668,16 +669,11 @@ int mca_oob_usock_peer_recv_connect_ack(mca_oob_usock_peer_t* pr, int sd, ORTE_NAME_PRINT(&peer->name)); /* check security token */ - creds.method = (char*)(msg + strlen(version) + 1); - creds.credential = (char*)(msg + strlen(version) + 1 + strlen(creds.method) + 1); - creds.size = hdr.nbytes - strlen(version) - 1; - if (OPAL_SUCCESS != (rc = opal_sec.authenticate(&creds))) { + cred = (char*)(msg + strlen(version) + 1); + credsize = hdr.nbytes - strlen(version) - 1; + if (OPAL_SUCCESS != (rc = opal_sec.authenticate(cred, credsize, &peer->auth_method))) { ORTE_ERROR_LOG(rc); } - /* record the method they used so we can reciprocate */ - if (NULL == peer->auth_method) { - peer->auth_method = strdup(creds.method); - } free(msg); opal_output_verbose(OOB_USOCK_DEBUG_CONNECT, orte_oob_base_framework.framework_output, diff --git a/orte/orted/pmix/pmix_server_connection.c b/orte/orted/pmix/pmix_server_connection.c index d57f13e8857..e18b7c919a5 100644 --- a/orte/orted/pmix/pmix_server_connection.c +++ b/orte/orted/pmix/pmix_server_connection.c @@ -83,8 +83,9 @@ int pmix_server_send_connect_ack(pmix_server_peer_t* peer) pmix_server_hdr_t hdr; int rc; size_t sdsize; - opal_sec_cred_t *cred; - + char *cred; + size_t credsize; + opal_output_verbose(2, 
pmix_server_output, "%s SEND CONNECT ACK", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); @@ -98,16 +99,16 @@ int pmix_server_send_connect_ack(pmix_server_peer_t* peer) /* get our security credential*/ if (OPAL_SUCCESS != (rc = opal_sec.get_my_credential(peer->auth_method, opal_dstore_internal, - ORTE_PROC_MY_NAME, &cred))) { + ORTE_PROC_MY_NAME, &cred, &credsize))) { ORTE_ERROR_LOG(rc); return rc; } /* set the number of bytes to be read beyond the header */ - hdr.nbytes = strlen(orte_version_string) + 1 + + strlen(cred->method) + 1 + cred->size; + hdr.nbytes = strlen(orte_version_string) + 1 + credsize; /* create a space for our message */ - sdsize = (sizeof(hdr) + strlen(opal_version_string) + 1 + strlen(cred->method) + 1 + cred->size); + sdsize = (sizeof(hdr) + strlen(opal_version_string) + 1 + credsize); if (NULL == (msg = (char*)malloc(sdsize))) { return ORTE_ERR_OUT_OF_RESOURCE; } @@ -116,9 +117,8 @@ int pmix_server_send_connect_ack(pmix_server_peer_t* peer) /* load the message */ memcpy(msg, &hdr, sizeof(hdr)); memcpy(msg+sizeof(hdr), opal_version_string, strlen(opal_version_string)); - memcpy(msg+sizeof(hdr)+strlen(opal_version_string)+1, cred->method, strlen(cred->method)); - memcpy(msg+sizeof(hdr)+strlen(opal_version_string)+1+strlen(cred->method)+1, cred->credential, cred->size); - + memcpy(msg+sizeof(hdr)+strlen(opal_version_string)+1, cred, credsize); + free(cred); if (ORTE_SUCCESS != usock_peer_send_blocking(peer, peer->sd, msg, sdsize)) { ORTE_ERROR_LOG(ORTE_ERR_UNREACH); @@ -212,7 +212,8 @@ int pmix_server_recv_connect_ack(pmix_server_peer_t* pr, int sd, char *msg; char *version; int rc; - opal_sec_cred_t creds; + char *cred; + size_t credsize; pmix_server_peer_t *peer; pmix_server_hdr_t hdr; orte_process_name_t sender; @@ -367,15 +368,14 @@ int pmix_server_recv_connect_ack(pmix_server_peer_t* pr, int sd, ORTE_NAME_PRINT(&peer->name)); /* check security token */ - creds.method = (char*)(msg + strlen(version) + 1); - creds.credential = (char*)(msg + 
strlen(version) + 1 + strlen(creds.method) + 1); - creds.size = strlen(creds.credential); - if (OPAL_SUCCESS != (rc = opal_sec.authenticate(&creds))) { + cred = (char*)(msg + strlen(version) + 1); + credsize = hdr.nbytes - strlen(version) - 1; + if (OPAL_SUCCESS != (rc = opal_sec.authenticate(cred, credsize, &peer->auth_method))) { ORTE_ERROR_LOG(rc); - } - /* record the method they used so we can reciprocate */ - if (NULL == peer->auth_method) { - peer->auth_method = strdup(creds.method); + peer->state = PMIX_SERVER_FAILED; + CLOSE_THE_SOCKET(peer->sd); + free(msg); + return ORTE_ERR_UNREACH; } free(msg); @@ -459,8 +459,10 @@ static bool usock_peer_recv_blocking(pmix_server_peer_t* peer, ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (NULL == peer) ? "UNKNOWN" : ORTE_NAME_PRINT(&(peer->name)), (NULL == peer) ? 0 : peer->state); - peer->state = PMIX_SERVER_FAILED; - CLOSE_THE_SOCKET(peer->sd); + if (NULL != peer) { + peer->state = PMIX_SERVER_FAILED; + CLOSE_THE_SOCKET(peer->sd); + } return false; } diff --git a/orte/orted/pmix/pmix_server_sendrecv.c b/orte/orted/pmix/pmix_server_sendrecv.c index b1ab68d8360..146326af7ae 100644 --- a/orte/orted/pmix/pmix_server_sendrecv.c +++ b/orte/orted/pmix/pmix_server_sendrecv.c @@ -569,7 +569,8 @@ int pmix_server_peer_recv_connect_ack(pmix_server_peer_t* pr, char *msg; char *version; int rc; - opal_sec_cred_t creds; + char *cred; + size_t credsize; pmix_server_hdr_t hdr; pmix_server_peer_t *peer; uint64_t *ui64; @@ -720,9 +721,9 @@ int pmix_server_peer_recv_connect_ack(pmix_server_peer_t* pr, ORTE_NAME_PRINT(&peer->name)); /* check security token */ - creds.credential = (char*)(msg + strlen(version) + 1); - creds.size = hdr.nbytes - strlen(version) - 1; - if (OPAL_SUCCESS != (rc = opal_sec.authenticate(&creds))) { + cred = (char*)(msg + strlen(version) + 1); + credsize = hdr.nbytes - strlen(version) - 1; + if (OPAL_SUCCESS != (rc = opal_sec.authenticate(cred, credsize, NULL))) { ORTE_ERROR_LOG(rc); } free(msg); diff --git 
a/test/Makefile.am b/test/Makefile.am index 7aaecb7ced6..813ccc43115 100644 --- a/test/Makefile.am +++ b/test/Makefile.am @@ -2,7 +2,7 @@ # Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana # University Research and Technology # Corporation. All rights reserved. -# Copyright (c) 2004-2005 The University of Tennessee and The University +# Copyright (c) 2004-2015 The University of Tennessee and The University # of Tennessee Research Foundation. All rights # reserved. # Copyright (c) 2004-2009 High Performance Computing Center Stuttgart, @@ -19,5 +19,5 @@ # # support needs to be first for dependencies -SUBDIRS = support asm class threads datatype util +SUBDIRS = support asm class threads datatype util monitoring DIST_SUBDIRS = event $(SUBDIRS) diff --git a/test/monitoring/Makefile.am b/test/monitoring/Makefile.am new file mode 100644 index 00000000000..c328898cb44 --- /dev/null +++ b/test/monitoring/Makefile.am @@ -0,0 +1,22 @@ +# +# Copyright (c) 2013-2015 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2013-2015 Inria. All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# This test requires multiple processes to run. Don't run it as part +# of 'make check' +if PROJECT_OMPI + noinst_PROGRAMS = monitoring_test + + monitoring_test_SOURCES = monitoring_test.c + monitoring_test_LDFLAGS = $(WRAPPER_EXTRA_LDFLAGS) + monitoring_test_LDADD = $(top_builddir)/ompi/libmpi.la $(top_builddir)/opal/libopen-pal.la +endif + diff --git a/test/monitoring/aggregate_profile.pl b/test/monitoring/aggregate_profile.pl new file mode 100644 index 00000000000..da6d3780b00 --- /dev/null +++ b/test/monitoring/aggregate_profile.pl @@ -0,0 +1,71 @@ +#!/usr/bin/perl -w + +# +# Copyright (c) 2013-2015 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2013-2015 Inria. 
All rights reserved. +# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# +# Author Emmanuel Jeannot +# +# This script aggregates the profiles generated by the flush_monitoring function. +# The files need to be in in given format: name__ +# They are then aggregated by phases. +# If one needs the profile of all the phases he can concatenate the different files, +# or use the output of the monitoring system done at MPI_Finalize +# in the example it should be call as: +# ./aggregate_profile.pl prof/phase to generate +# prof/phase_1.prof +# prof/phase_2.prof +# +# ensure that this script as the executable right: chmod +x ... +# + +die "$0 \n\tProfile files should be of the form \"name_phaseid_processesid.prof\"\n\tFor instance if you saved the monitoring into phase_0_0.prof, phase_0_1.prof, ..., phase_1_0.prof etc you should call: $0 phase\n" if ($#ARGV!=0); + +$name = $ARGV[0]; + +@files = glob ($name."*"); + +%phaseid = (); + + +# Detect the different phases +foreach $file (@files) { + ($id)=($file =~ m/$name\_(\d+)_\d+/); + $phaseid{$id} = 1 if ($id); +} + +# for each phases aggregate the files +foreach $id (sort {$a <=> $b} keys %phaseid) { + aggregate($name."_".$id); +} + + + + +sub aggregate{ + $phase = $_[0]; + + print "Building $phase.prof\n"; + + open OUT,">$phase.prof"; + + @files = glob ($phase."*"); + + foreach $file ( @files) { + open IN,$file; + while () { + print OUT; + } + close IN; + } + close OUT; +} diff --git a/test/monitoring/monitoring_test.c b/test/monitoring/monitoring_test.c new file mode 100644 index 00000000000..d15439e17c1 --- /dev/null +++ b/test/monitoring/monitoring_test.c @@ -0,0 +1,212 @@ +/* + * Copyright (c) 2013-2015 The University of Tennessee and The University + * of Tennessee Research Foundation. All rights + * reserved. + * Copyright (c) 2013-2015 Inria. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +/* +pml monitoring tester. 
+ +Designed by George Bosilca and Emmanuel Jeannot +Contact the authors for questions. + +To be run as: + +mpirun -np 4 --mca pml_monitoring_enable 2 ./monitoring_test +pm +Then, the output should be: + +flushing to ./prof/phase_1_2.prof +flushing to ./prof/phase_1_0.prof +flushing to ./prof/phase_1_3.prof +flushing to ./prof/phase_2_1.prof +flushing to ./prof/phase_2_3.prof +flushing to ./prof/phase_2_0.prof +flushing to ./prof/phase_2_2.prof +I 0 1 108 bytes 27 msgs sent +E 0 1 1012 bytes 30 msgs sent +E 0 2 23052 bytes 61 msgs sent +I 1 2 104 bytes 26 msgs sent +I 1 3 208 bytes 52 msgs sent +E 1 0 860 bytes 24 msgs sent +E 1 3 2552 bytes 56 msgs sent +I 2 3 104 bytes 26 msgs sent +E 2 0 22804 bytes 49 msgs sent +E 2 3 860 bytes 24 msgs sent +I 3 0 104 bytes 26 msgs sent +I 3 1 204 bytes 51 msgs sent +E 3 1 2304 bytes 44 msgs sent +E 3 2 860 bytes 24 msgs sent + +or as + +mpirun -np 4 --mca pml_monitoring_enable 1 ./monitoring_test + +for an output as: + +flushing to ./prof/phase_1_1.prof +flushing to ./prof/phase_1_0.prof +flushing to ./prof/phase_1_2.prof +flushing to ./prof/phase_1_3.prof +flushing to ./prof/phase_2_1.prof +flushing to ./prof/phase_2_3.prof +flushing to ./prof/phase_2_2.prof +flushing to ./prof/phase_2_0.prof +I 0 1 1120 bytes 57 msgs sent +I 0 2 23052 bytes 61 msgs sent +I 1 0 860 bytes 24 msgs sent +I 1 2 104 bytes 26 msgs sent +I 1 3 2760 bytes 108 msgs sent +I 2 0 22804 bytes 49 msgs sent +I 2 3 964 bytes 50 msgs sent +I 3 0 104 bytes 26 msgs sent +I 3 1 2508 bytes 95 msgs sent +I 3 2 860 bytes 24 msgs sent +*/ + + + +#include +#include "mpi.h" + +/* opal mca header taken from opal/mca/base/mca_base_var.h + Required to flush monitoring phases +*/ +int mca_base_var_find_by_name (const char *full_name, int *vari); +int mca_base_var_get_value (int vari, const void *value, + void *source, /* should be mca_base_var_source_t *source, + but we do not need it + and we do not know what is mca_base_var_source_t */ + const char **source_file); + +int 
main(argc, argv) + int argc; + char **argv; +{ + int rank, size, n, to, from, tagno; + MPI_Status status; + MPI_Comm newcomm; + MPI_Request request; + char filename[1024]; + + + /* first phase : make a token circulated in MPI_COMM_WORLD */ + n = -1; + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &size); + to = (rank + 1) % size; + from = (rank - 1) % size; + tagno = 201; + if (rank == 0){ + n=25; + MPI_Isend(&n,1,MPI_INT,to,tagno,MPI_COMM_WORLD,&request); + } + while (1){ + MPI_Irecv(&n,1,MPI_INT,from,tagno,MPI_COMM_WORLD, &request); + MPI_Wait(&request,&status); + if (rank == 0) {n--;tagno++;} + MPI_Isend(&n,1,MPI_INT,to,tagno,MPI_COMM_WORLD, &request); + if (rank != 0) {n--;tagno++;} + if (n<0){ + break; + } + } + + + /* flush the monitoring of the first phase */ + int fctidx; + void* fct; + int (*flush_monitoring)(char*) = NULL; + /* + Get the function pointer of the flushing function of the monitoring + This uses Opal low level interface + */ + mca_base_var_find_by_name( "pml_monitoring_flush", &fctidx); + if(fctidx){ + mca_base_var_get_value(fctidx, &fct, NULL, NULL); + flush_monitoring = *(unsigned long*)fct; + } + /* Build one file per processes + Evevry thing that has been monitored by each + process since the last flush will be output in filename*/ + + /* + Requires directory prof to be created. + Filename format should display the phase number + and the process rank for ease of parsing with + aggregate_profile.pl script + */ + sprintf(filename,"./prof/phase_1_%d.prof",rank); + if(flush_monitoring){ + int r = flush_monitoring(filename); + if(r == -1){ + fprintf(stderr, "Process %d cannot save monitoring in %s\n", rank, filename); + } + } + + /* + Second phase. Work with different communicators. + even ranls will circulate a token + while odd ranks wil perform a all_to_all + */ + MPI_Comm_split(MPI_COMM_WORLD,rank%2,rank,&newcomm); + + /* the filename for flushing monitoring now uses 2 as phase number! 
*/ + sprintf(filename,"./prof/phase_2_%d.prof",rank); + + + if(rank%2){ /*even ranks (in COMM_WORD) circulate a token*/ + int old_rank=rank; + MPI_Comm_rank(newcomm,&rank); + MPI_Comm_size(newcomm,&size); + if( size > 1 ) { + to = (rank + 1) % size;; + from = (rank - 1) % size ; + tagno = 201; + if (rank == 0){ + n=50; + MPI_Send(&n,1,MPI_INT,to,tagno,newcomm); + } + while (1){ + MPI_Recv(&n,1,MPI_INT,from,tagno,newcomm, &status); + if (rank == 0) {n--;tagno++;} + MPI_Send(&n,1,MPI_INT,to,tagno,newcomm); + if (rank != 0) {n--;tagno++;} + if (n<0){ + if(flush_monitoring){ + int r = flush_monitoring(filename); + if(r == -1){ + fprintf(stderr, "Process %d cannot save monitoring in %s\n", old_rank, filename); + } + } + break; + } + } + } + }else{ /*odd ranks (in COMM_WORD) will perform a all_to_all and a barrier*/ + int send_buff[10240]; + int recv_buff[10240]; + MPI_Comm_rank(newcomm,&rank); + MPI_Comm_size(newcomm,&size); + MPI_Alltoall(send_buff,10240/size, MPI_INT,recv_buff,10240/size,MPI_INT,newcomm); + MPI_Comm_split(newcomm,rank%2,rank,&newcomm); + MPI_Barrier(newcomm); + if(flush_monitoring){ + int r = flush_monitoring(filename); + if(r == -1){ + fprintf(stderr, "Process %d cannot save monitoring in %s\n", rank, filename); + } + } + } + + /* Now, in MPI_Finalize(), the pml_monitoring library outputs, in STDERR, the aggregated recorded monitoring of all the phases*/ + MPI_Finalize(); + return 0; +} diff --git a/test/monitoring/prof/phase_1_0.prof b/test/monitoring/prof/phase_1_0.prof new file mode 100644 index 00000000000..500401685b3 --- /dev/null +++ b/test/monitoring/prof/phase_1_0.prof @@ -0,0 +1 @@ +I 0 1 108 bytes 27 msgs sent diff --git a/test/monitoring/prof/phase_1_1.prof b/test/monitoring/prof/phase_1_1.prof new file mode 100644 index 00000000000..1a314c842c6 --- /dev/null +++ b/test/monitoring/prof/phase_1_1.prof @@ -0,0 +1 @@ +I 1 2 104 bytes 26 msgs sent diff --git a/test/monitoring/prof/phase_1_2.prof b/test/monitoring/prof/phase_1_2.prof new file 
mode 100644 index 00000000000..da71c785a6b --- /dev/null +++ b/test/monitoring/prof/phase_1_2.prof @@ -0,0 +1 @@ +I 2 3 104 bytes 26 msgs sent diff --git a/test/monitoring/prof/phase_1_3.prof b/test/monitoring/prof/phase_1_3.prof new file mode 100644 index 00000000000..c2da6320791 --- /dev/null +++ b/test/monitoring/prof/phase_1_3.prof @@ -0,0 +1 @@ +I 3 0 104 bytes 26 msgs sent diff --git a/test/monitoring/prof/phase_2_0.prof b/test/monitoring/prof/phase_2_0.prof new file mode 100644 index 00000000000..a74fe682445 --- /dev/null +++ b/test/monitoring/prof/phase_2_0.prof @@ -0,0 +1,2 @@ +I 0 1 20 bytes 4 msgs sent +I 0 2 20528 bytes 9 msgs sent diff --git a/test/monitoring/prof/phase_2_1.prof b/test/monitoring/prof/phase_2_1.prof new file mode 100644 index 00000000000..43328d35205 --- /dev/null +++ b/test/monitoring/prof/phase_2_1.prof @@ -0,0 +1,2 @@ +I 1 0 20 bytes 4 msgs sent +I 1 3 236 bytes 56 msgs sent diff --git a/test/monitoring/prof/phase_2_2.prof b/test/monitoring/prof/phase_2_2.prof new file mode 100644 index 00000000000..5dc7d16ada6 --- /dev/null +++ b/test/monitoring/prof/phase_2_2.prof @@ -0,0 +1,2 @@ +I 2 0 20528 bytes 9 msgs sent +I 2 3 20 bytes 4 msgs sent diff --git a/test/monitoring/prof/phase_2_3.prof b/test/monitoring/prof/phase_2_3.prof new file mode 100644 index 00000000000..bc2c365b29c --- /dev/null +++ b/test/monitoring/prof/phase_2_3.prof @@ -0,0 +1,2 @@ +I 3 1 232 bytes 55 msgs sent +I 3 2 20 bytes 4 msgs sent diff --git a/test/monitoring/profile2mat.pl b/test/monitoring/profile2mat.pl new file mode 100644 index 00000000000..a6ea6a52bb4 --- /dev/null +++ b/test/monitoring/profile2mat.pl @@ -0,0 +1,123 @@ +#!/usr/bin/perl -w + +# +# Copyright (c) 2013-2015 The University of Tennessee and The University +# of Tennessee Research Foundation. All rights +# reserved. +# Copyright (c) 2013-2015 Inria. All rights reserved. 
+# $COPYRIGHT$ +# +# Additional copyrights may follow +# +# $HEADER$ +# + +# +# Author Emmanuel Jeannot +# +# Take a profile file and aggregates all the recorded communicaton into matrices. +# It generated a matrices for teh number of messages, (msg), +# for the total bytes transmitted (size) and +# the average nulber of bytes per messages (avg) +# +# The output matix is symetric +# +# If possible it creates file with "internal" tags (collexctive and eta data), +# "external" tags (point to point messages) and "all" (every messgaes). +# +# ensure that this script as the executable right: chmod +x ... +# + + +if($#ARGV < 0){ + die("Usage: $0 <\".prof\" filename>\n"); +}else{ + $filename=$ARGV[0]; +} + +profile($filename,"I|E","all"); +if ( profile($filename,"E","external") ){ + profile($filename,"I","internal"); +} + +sub profile{ + my $filename= $_[0]; + my $filter= $_[1]; + my $suffix= $_[2]; + my $done = 0; + + $outfile=$filename; + $outfile=~s/\.prof$/_size_$suffix\.mat/; + + + open IN,"<$filename"; + $n=0; + @mat1=(); + @mat2=(); + @mat3=(); + $i=0; + while () { + $i++; + if (($f,$p1,$p2,$s,$m)=/^($filter)\s+(\d+)\s+(\d+)\s+(\d+)\D+(\d+)/){ + $done = 1; + $f++; + #print "$p1 | $p2 | $s | $m\n"; + $mat1[$p1][$p2]+=$s; + $mat1[$p2][$p1]+=$s; + $mat2[$p1][$p2]+=$m; + $mat2[$p2][$p1]+=$m; + $n=$p1 if ($p1>$n); + $n=$p2 if ($p2>$n); + }else { + # print("file $filename line $i: $_\n"); + } + } + close IN; + + #print "$done\n"; + + foreach $i (0..$n) { + foreach $j (0..$n) { + $mat1[$i][$j]+=0; + $mat2[$i][$j]+=0; + $mat1[$i][$j]/=2; + $mat2[$i][$j]/=2; + if ($mat2[$i][$j]){ + $mat3[$i][$j]=$mat1[$i][$j]/$mat2[$i][$j] ; + #printf"%f\t%f\t%f\n",$mat1[$i][$j],$mat2[$i][$j],$mat3[$i][$j]; + }else{ + $mat3[$i][$j]=0; + } + } + } + + + if ($done) { + print "$filename -> $suffix\n"; + save_file($outfile,$n,\@mat1); + $outfile=~s/_size/_msg/; + save_file($outfile,$n,\@mat2); + $outfile=~s/_msg/_avg/; + save_file($outfile,$n,\@mat3); + print"\n"; + } + return $done; +} + + 
+sub save_file{ + my $outfile=$_[0]; + my $n=$_[1]; + my @mat=@{$_[2]}; + $s=$n+1; + print "$outfile\n"; + open OUT,">$outfile"; + foreach $i (0..$n) { + foreach $j (0..$n) { + printf OUT "%.0f ",$mat[$i][$j]; + } + print OUT "\n"; + } + # print"\n------------\n\n"; + close OUT; +}