diff --git a/python/hpc_benchmark/test_hpc_benchmark_hg/check.sh b/python/hpc_benchmark/test_hpc_benchmark_hg/check.sh new file mode 100644 index 000000000..7f0e36210 --- /dev/null +++ b/python/hpc_benchmark/test_hpc_benchmark_hg/check.sh @@ -0,0 +1 @@ +for i in $(seq 0 9); do diff raster_plot$i.png ../test_hpc_benchmark_p2p/raster_plot$i.png; done diff --git a/python/hpc_benchmark/test_hpc_benchmark_hg/check2.sh b/python/hpc_benchmark/test_hpc_benchmark_hg/check2.sh new file mode 100644 index 000000000..b5e9d168d --- /dev/null +++ b/python/hpc_benchmark/test_hpc_benchmark_hg/check2.sh @@ -0,0 +1 @@ +more log_* | grep firi > tmp1.txt; more ../test_hpc_benchmark_p2p/log_* | grep firi > tmp2.txt; diff tmp1.txt tmp2.txt diff --git a/python/hpc_benchmark/test_hpc_benchmark_wg/check.sh b/python/hpc_benchmark/test_hpc_benchmark_wg/check.sh new file mode 100644 index 000000000..7f0e36210 --- /dev/null +++ b/python/hpc_benchmark/test_hpc_benchmark_wg/check.sh @@ -0,0 +1 @@ +for i in $(seq 0 9); do diff raster_plot$i.png ../test_hpc_benchmark_p2p/raster_plot$i.png; done diff --git a/python/hpc_benchmark/test_hpc_benchmark_wg/check2.sh b/python/hpc_benchmark/test_hpc_benchmark_wg/check2.sh new file mode 100644 index 000000000..b5e9d168d --- /dev/null +++ b/python/hpc_benchmark/test_hpc_benchmark_wg/check2.sh @@ -0,0 +1 @@ +more log_* | grep firi > tmp1.txt; more ../test_hpc_benchmark_p2p/log_* | grep firi > tmp2.txt; diff tmp1.txt tmp2.txt diff --git a/python/test/logp3_connect.txt b/python/test/logp3_connect.txt index df4e459a0..aae630d38 100644 --- a/python/test/logp3_connect.txt +++ b/python/test/logp3_connect.txt @@ -8,6 +8,10 @@ Homepage: https://github.com/nest/nest-gpu +Calibrating ... +Allocating auxiliary GPU memory... +Sorting... +Done ######################################## Even to all {'index': 0, 'source': 0, 'target': 1, 'port': 0, 'syn_group': 0, 'delay': 1.0, 'weight': 100.0} @@ -100,7 +104,3 @@ Even to 3,4,5,6 {'index': 34, 'source': 6, 'target': 9, 'port': 0, 'syn_group': 0, 'delay': 69.0, 'weight': 6900.0} -Calibrating ... -Allocating auxiliary GPU memory... -Sorting... -Done diff --git a/python/test/test_all.sh b/python/test/test_all.sh index b07c30826..72be6659a 100755 --- a/python/test/test_all.sh +++ b/python/test/test_all.sh @@ -2,7 +2,7 @@ pass_str[0]="TEST PASSED" pass_str[1]="TEST NOT PASSED" :>log.txt for fn in test_iaf_psc_exp_g.py test_iaf_psc_alpha.py test_fixed_total_number.py test_iaf_psc_exp.py test_spike_times.py test_aeif_cond_alpha.py test_aeif_cond_beta.py test_aeif_psc_alpha.py test_aeif_psc_delta.py test_aeif_psc_exp.py test_aeif_cond_alpha_multisynapse.py test_aeif_cond_beta_multisynapse.py test_aeif_psc_alpha_multisynapse.py test_aeif_psc_exp_multisynapse.py test_stdp_list.py test_stdp.py test_syn_model.py test_brunel_list.py test_brunel_outdegree.py test_brunel_user_m1.py test_spike_detector.py; do - python3 $fn >> log.txt 2>err.txt + python3 -u $fn >> log.txt 2>err.txt res=$? cat err.txt >> log.txt rm -f err.txt @@ -12,7 +12,7 @@ for fn in test_iaf_psc_exp_g.py test_iaf_psc_alpha.py test_fixed_total_number.py echo $fn : ${pass_str[$res]} done for fn in syn_group connect getarr setvar2 group_param; do - python3 test_$fn.py 2>&1 | grep -v dyl | grep -v 'Time:' | grep -v 'storage bytes:' | grep -v Indexing | grep -v 'Total number' > tmp + python3 -u test_$fn.py 2>&1 | grep -v dyl | grep -v 'Time:' | grep -v 'storage bytes:' | grep -v Indexing | grep -v 'Total number' | grep -v 'Time from' > tmp diff -qs tmp logp3_$fn.txt 2>&1 >> log.txt res=$? echo $fn : ${pass_str[$res]} diff --git a/python/test/test_stdp/cases/test_all.sh b/python/test/test_stdp/cases/test_all.sh index b87eeda89..870bc901e 100755 --- a/python/test/test_stdp/cases/test_all.sh +++ b/python/test/test_stdp/cases/test_all.sh @@ -1 +1 @@ -for i in $(seq 1 10); do python3 case$i.py | grep '^dw'; done +for i in $(seq 1 10); do python3 -u case$i.py | grep '^dw'; done diff --git a/src/connect.h b/src/connect.h index 5f5d44bb1..e3d927592 100644 --- a/src/connect.h +++ b/src/connect.h @@ -118,7 +118,9 @@ class Connection virtual uint** getDevNodeTargetHostIMap() = 0; // get remote target host groups of all nodes - virtual std::vector< std::unordered_set < int > > &getNodeTargetHostGroup() = 0; + virtual const std::vector< std::vector < int > > &getNodeTargetHostGroup() const = 0; + + //virtual int copyNodeTargetHostGroup(inode_t i_node, uint *copy_array) = 0; // get map of local source nodes positions in host group node map virtual std::vector< std::vector< uint > > &getHostGroupLocalSourceNodeMap() = 0; @@ -135,7 +137,7 @@ class Connection #endif // return map of host group source nodes to local image nodes - virtual std::vector > > &getHostGroupLocalNodeIndex() = 0; + virtual std::vector > > &getHostGroupLocalNodeIndex() = 0; // get point-to-point MPI communication activation matrix virtual std::vector< std::vector < bool > > &getP2PHostConnMatrix() = 0; @@ -286,6 +288,12 @@ class Connection // calibrate the maps used to send spikes among remote hosts virtual int remoteConnectionMapCalibrate( inode_t n_nodes ) = 0; + // get vector of number of elements in the maps of remote source nodes to local image nodes + virtual std::vector< uint* > &getDevNRemoteSourceNodeMap() = 0; + + // only for debugging, save remote connection P2P maps + virtual int remoteConnectionMapSave() = 0; + // remote connection methods. 4 combinations where source and target can be either // of inode_t type (in case of a sequence) or pointers to inode_t // (in case of arbitrary arrays if node indexes) @@ -462,8 +470,10 @@ class ConnectionTemplate : public Connection // n_remote_source_node_map[group_local_id][i_host] // with i_host = 0, ..., host_group_[group_local_id].size()-1 excluding this host itself std::vector< std::vector< uint > > h_n_remote_source_node_map_; + + // vector of number of elements in the maps of remote source nodes to local image nodes std::vector< uint* > d_n_remote_source_node_map_; - + // remote_source_node_map_[group_local_id][i_host][i_block][i] std::vector< std::vector< std::vector< uint* > > > h_remote_source_node_map_; @@ -650,10 +660,10 @@ class ConnectionTemplate : public Connection // same as above, but ordered std::vector > > host_group_source_node_vect_; // map of host group source nodes to local image nodes - std::vector > > host_group_local_node_index_; + std::vector > > host_group_local_node_index_; // local ids of the host groups to which each node should send spikes - std::vector< std::unordered_set< int > > node_target_host_group_; // [n_local_nodes ][num. of target host groups ] + std::vector< std::vector< int > > node_target_host_group_; // [n_local_nodes ][num. of target host groups ] ////////////////////////////////////////////////// // class ConnectionTemplate methods ////////////////////////////////////////////////// @@ -776,7 +786,7 @@ class ConnectionTemplate : public Connection return d_node_target_host_i_map_; } - std::vector< std::unordered_set < int > > &getNodeTargetHostGroup() + const std::vector< std::vector < int > > &getNodeTargetHostGroup() const { return node_target_host_group_; } @@ -809,7 +819,7 @@ class ConnectionTemplate : public Connection } // return map of host group source nodes to local image nodes - std::vector > > &getHostGroupLocalNodeIndex() + std::vector > > &getHostGroupLocalNodeIndex() { return host_group_local_node_index_; } @@ -970,6 +980,15 @@ class ConnectionTemplate : public Connection // Calibrate the maps int remoteConnectionMapCalibrate( inode_t n_nodes ); + // get vector of number of elements in the maps of remote source nodes to local image nodes + std::vector< uint* > &getDevNRemoteSourceNodeMap() + { + return d_n_remote_source_node_map_; + } + + // only for debugging, save remote connection P2P maps + int remoteConnectionMapSave(); + // Allocate GPU memory for new remote-source-node-map blocks int allocRemoteSourceNodeMapBlocks( std::vector< uint* >& i_remote_src_node_map, std::vector< uint* >& i_local_spike_buf_map, @@ -1613,6 +1632,19 @@ setSource( ConnKeyT* conn_key_subarray, uint* rand_val, int64_t n_conn, T source setConnSource< ConnKeyT >( conn_key_subarray[ i_conn ], i_source ); } +template < class T, class ConnKeyT > +__global__ void +setSource( ConnKeyT* conn_key_subarray, inode_t* source, int64_t n_conn) +{ + int64_t i_conn = threadIdx.x + blockIdx.x * blockDim.x; + if ( i_conn >= n_conn ) + { + return; + } + inode_t i_source = source[ i_conn ]; + setConnSource< ConnKeyT >( conn_key_subarray[ i_conn ], i_source ); +} + template < class T > __global__ void setSource( inode_t* conn_source_ids, uint* rand_val, int64_t n_conn, T source, inode_t n_source ) @@ -1662,14 +1694,13 @@ setOneToOneSourceTarget( ConnKeyT* conn_key_subarray, template < class T > __global__ void -setOneToOneSource( inode_t* conn_source_ids, int64_t n_block_conn, int64_t n_prev_conn, T source ) +setOneToOneSource( inode_t* conn_source_ids, int64_t n_conn, T source ) { - int64_t i_block_conn = threadIdx.x + blockIdx.x * blockDim.x; - if ( i_block_conn >= n_block_conn ) + int64_t i_conn = threadIdx.x + blockIdx.x * blockDim.x; + if ( i_conn >= n_conn ) { return; } - int64_t i_conn = n_prev_conn + i_block_conn; inode_t i_source = getNodeIndex( source, ( int ) ( i_conn ) ); conn_source_ids[ i_conn ] = i_source; } @@ -1700,18 +1731,16 @@ setAllToAllSourceTarget( ConnKeyT* conn_key_subarray, template < class T1 > __global__ void setAllToAllSource( inode_t* conn_source_ids, - int64_t n_block_conn, - int64_t n_prev_conn, + int64_t n_conn, T1 source, inode_t n_source, inode_t n_target ) { - int64_t i_block_conn = threadIdx.x + blockIdx.x * blockDim.x; - if ( i_block_conn >= n_block_conn ) + int64_t i_conn = threadIdx.x + blockIdx.x * blockDim.x; + if ( i_conn >= n_conn ) { return; } - int64_t i_conn = n_prev_conn + i_block_conn; inode_t i_source = getNodeIndex( source, ( int ) ( i_conn / n_target ) ); conn_source_ids[ i_conn ] = i_source; } @@ -1750,14 +1779,13 @@ setOutdegreeSource( ConnKeyT* conn_key_subarray, int64_t n_block_conn, int64_t n template < class T > __global__ void -setOutdegreeSource( inode_t* conn_source_ids, int64_t n_block_conn, int64_t n_prev_conn, T source, int outdegree ) +setOutdegreeSource( inode_t* conn_source_ids, int64_t n_conn, T source, int outdegree ) { - int64_t i_block_conn = threadIdx.x + blockIdx.x * blockDim.x; - if ( i_block_conn >= n_block_conn ) + int64_t i_conn = threadIdx.x + blockIdx.x * blockDim.x; + if ( i_conn >= n_conn ) { return; } - int64_t i_conn = n_prev_conn + i_block_conn; inode_t i_source = getNodeIndex( source, ( int ) ( i_conn / outdegree ) ); conn_source_ids[ i_conn ] = i_source; } @@ -2917,15 +2945,18 @@ ConnectionTemplate< ConnKeyT, ConnStructT >::connectOneToOne( curandGenerator_t& int64_t n_new_conn = n_node; n_conn_ += n_new_conn; // new number of connections int new_n_block = ( int ) ( ( n_conn_ + conn_block_size_ - 1 ) / conn_block_size_ ); - if ( remote_source_flag ) - { + + if ( remote_source_flag ) { reallocConnSourceIds( n_new_conn ); + setOneToOneSource< T1 > <<< ( n_new_conn + 1023 ) / 1024, 1024 >>> + (d_conn_source_ids_, n_new_conn, source ); + DBGCUDASYNC; + + return 0; } - else - { - allocateNewBlocks( new_n_block ); - } - + + allocateNewBlocks( new_n_block ); + // printf("Generating connections with one-to-one rule...\n"); int64_t n_prev_conn = 0; int ib0 = ( int ) ( old_n_conn / conn_block_size_ ); @@ -2953,28 +2984,20 @@ ConnectionTemplate< ConnKeyT, ConnStructT >::connectOneToOne( curandGenerator_t& i_conn0 = 0; n_block_conn = conn_block_size_; } - if ( remote_source_flag ) - { - setOneToOneSource< T1 > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>( - d_conn_source_ids_, n_block_conn, n_prev_conn, source ); - DBGCUDASYNC; - } - else - { - setOneToOneSourceTarget< T1, T2, ConnKeyT, ConnStructT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>( - conn_key_vect_[ ib ] + i_conn0, conn_struct_vect_[ ib ] + i_conn0, n_block_conn, n_prev_conn, source, target ); - DBGCUDASYNC; - setConnectionWeights( - local_rnd_gen_, d_conn_storage_, conn_struct_vect_[ ib ] + i_conn0, n_block_conn, syn_spec ); - setConnectionDelays( local_rnd_gen_, d_conn_storage_, conn_key_vect_[ ib ] + i_conn0, n_block_conn, syn_spec ); - setPort< ConnKeyT, ConnStructT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>( - conn_key_vect_[ ib ] + i_conn0, conn_struct_vect_[ ib ] + i_conn0, syn_spec.port_, n_block_conn ); - DBGCUDASYNC; - setSynGroup< ConnKeyT, ConnStructT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>( - conn_key_vect_[ ib ] + i_conn0, conn_struct_vect_[ ib ] + i_conn0, syn_spec.syn_group_, n_block_conn ); - DBGCUDASYNC; - // CUDASYNC; - } + + setOneToOneSourceTarget< T1, T2, ConnKeyT, ConnStructT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>( + conn_key_vect_[ ib ] + i_conn0, conn_struct_vect_[ ib ] + i_conn0, n_block_conn, n_prev_conn, source, target ); + DBGCUDASYNC; + setConnectionWeights( + local_rnd_gen_, d_conn_storage_, conn_struct_vect_[ ib ] + i_conn0, n_block_conn, syn_spec ); + setConnectionDelays( local_rnd_gen_, d_conn_storage_, conn_key_vect_[ ib ] + i_conn0, n_block_conn, syn_spec ); + setPort< ConnKeyT, ConnStructT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>( + conn_key_vect_[ ib ] + i_conn0, conn_struct_vect_[ ib ] + i_conn0, syn_spec.port_, n_block_conn ); + DBGCUDASYNC; + setSynGroup< ConnKeyT, ConnStructT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>( + conn_key_vect_[ ib ] + i_conn0, conn_struct_vect_[ ib ] + i_conn0, syn_spec.syn_group_, n_block_conn ); + DBGCUDASYNC; + // CUDASYNC; n_prev_conn += n_block_conn; } @@ -3001,11 +3024,15 @@ ConnectionTemplate< ConnKeyT, ConnStructT >::connectAllToAll( curandGenerator_t& if ( remote_source_flag ) { reallocConnSourceIds( n_new_conn ); + setAllToAllSource< T1 > <<< ( n_new_conn + 1023 ) / 1024, 1024 >>>( + d_conn_source_ids_, n_new_conn, source, n_source, n_target ); + DBGCUDASYNC; + + return 0; } - else - { - allocateNewBlocks( new_n_block ); - } + + allocateNewBlocks( new_n_block ); + // printf("Generating connections with all-to-all rule...\n"); int64_t n_prev_conn = 0; int ib0 = ( int ) ( old_n_conn / conn_block_size_ ); @@ -3033,36 +3060,29 @@ ConnectionTemplate< ConnKeyT, ConnStructT >::connectAllToAll( curandGenerator_t& i_conn0 = 0; n_block_conn = conn_block_size_; } - if ( remote_source_flag ) - { - setAllToAllSource< T1 > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>( - d_conn_source_ids_, n_block_conn, n_prev_conn, source, n_source, n_target ); - DBGCUDASYNC; - } - else - { - setAllToAllSourceTarget< T1, T2, ConnKeyT, ConnStructT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>( - conn_key_vect_[ ib ] + i_conn0, - conn_struct_vect_[ ib ] + i_conn0, - n_block_conn, - n_prev_conn, - source, - n_source, - target, - n_target ); - DBGCUDASYNC; - setConnectionWeights( - local_rnd_gen_, d_conn_storage_, conn_struct_vect_[ ib ] + i_conn0, n_block_conn, syn_spec ); - - setConnectionDelays( local_rnd_gen_, d_conn_storage_, conn_key_vect_[ ib ] + i_conn0, n_block_conn, syn_spec ); - - setPort< ConnKeyT, ConnStructT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>( - conn_key_vect_[ ib ] + i_conn0, conn_struct_vect_[ ib ] + i_conn0, syn_spec.port_, n_block_conn ); - DBGCUDASYNC; - setSynGroup< ConnKeyT, ConnStructT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>( - conn_key_vect_[ ib ] + i_conn0, conn_struct_vect_[ ib ] + i_conn0, syn_spec.syn_group_, n_block_conn ); - DBGCUDASYNC; - } + + setAllToAllSourceTarget< T1, T2, ConnKeyT, ConnStructT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>( + conn_key_vect_[ ib ] + i_conn0, + conn_struct_vect_[ ib ] + i_conn0, + n_block_conn, + n_prev_conn, + source, + n_source, + target, + n_target ); + DBGCUDASYNC; + setConnectionWeights( + local_rnd_gen_, d_conn_storage_, conn_struct_vect_[ ib ] + i_conn0, n_block_conn, syn_spec ); + + setConnectionDelays( local_rnd_gen_, d_conn_storage_, conn_key_vect_[ ib ] + i_conn0, n_block_conn, syn_spec ); + + setPort< ConnKeyT, ConnStructT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>( + conn_key_vect_[ ib ] + i_conn0, conn_struct_vect_[ ib ] + i_conn0, syn_spec.port_, n_block_conn ); + DBGCUDASYNC; + setSynGroup< ConnKeyT, ConnStructT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>( + conn_key_vect_[ ib ] + i_conn0, conn_struct_vect_[ ib ] + i_conn0, syn_spec.syn_group_, n_block_conn ); + DBGCUDASYNC; + n_prev_conn += n_block_conn; } @@ -3090,14 +3110,19 @@ ConnectionTemplate< ConnKeyT, ConnStructT >::connectFixedTotalNumber( curandGene n_conn_ += n_new_conn; // new number of connections int new_n_block = ( int ) ( ( n_conn_ + conn_block_size_ - 1 ) / conn_block_size_ ); - if ( remote_source_flag ) - { - reallocConnSourceIds( n_new_conn ); - } - else - { - allocateNewBlocks( new_n_block ); + reallocConnSourceIds( n_new_conn ); + // generate random source index in range 0 - n_neuron + CURAND_CALL( curandGenerate( src_gen, ( uint* ) d_conn_source_ids_, n_new_conn ) ); + setSource< T1 > <<< ( n_new_conn + 1023 ) / 1024, 1024 >>> + (d_conn_source_ids_, d_conn_source_ids_, n_new_conn, source, n_source ); + DBGCUDASYNC; + + if ( remote_source_flag ) { + return 0; } + + allocateNewBlocks( new_n_block ); + // printf("Generating connections with fixed_total_number rule...\n"); int64_t conn_source_ids_offset = 0; int ib0 = ( int ) ( old_n_conn / conn_block_size_ ); @@ -3125,39 +3150,29 @@ ConnectionTemplate< ConnKeyT, ConnStructT >::connectFixedTotalNumber( curandGene i_conn0 = 0; n_block_conn = conn_block_size_; } - // generate random source index in range 0 - n_neuron - CURAND_CALL( curandGenerate( src_gen, ( uint* ) d_conn_storage_, n_block_conn ) ); - if ( remote_source_flag ) - { - setSource< T1 > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>( - d_conn_source_ids_ + conn_source_ids_offset, ( uint* ) d_conn_storage_, n_block_conn, source, n_source ); - DBGCUDASYNC; - conn_source_ids_offset += n_block_conn; - } - else - { - setSource< T1, ConnKeyT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>( - conn_key_vect_[ ib ] + i_conn0, ( uint* ) d_conn_storage_, n_block_conn, source, n_source ); - DBGCUDASYNC; - - // generate random target index in range 0 - n_neuron - CURAND_CALL( curandGenerate( local_rnd_gen_, ( uint* ) d_conn_storage_, n_block_conn ) ); - setTarget< T2, ConnStructT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>( - conn_struct_vect_[ ib ] + i_conn0, ( uint* ) d_conn_storage_, n_block_conn, target, n_target ); - DBGCUDASYNC; - - setConnectionWeights( - local_rnd_gen_, d_conn_storage_, conn_struct_vect_[ ib ] + i_conn0, n_block_conn, syn_spec ); - - setConnectionDelays( local_rnd_gen_, d_conn_storage_, conn_key_vect_[ ib ] + i_conn0, n_block_conn, syn_spec ); - - setPort< ConnKeyT, ConnStructT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>( - conn_key_vect_[ ib ] + i_conn0, conn_struct_vect_[ ib ] + i_conn0, syn_spec.port_, n_block_conn ); - DBGCUDASYNC; - setSynGroup< ConnKeyT, ConnStructT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>( - conn_key_vect_[ ib ] + i_conn0, conn_struct_vect_[ ib ] + i_conn0, syn_spec.syn_group_, n_block_conn ); - DBGCUDASYNC; - } + + setSource< T1, ConnKeyT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>> + (conn_key_vect_[ ib ] + i_conn0, d_conn_source_ids_ + conn_source_ids_offset, n_block_conn); + DBGCUDASYNC; + conn_source_ids_offset += n_block_conn; + + // generate random target index in range 0 - n_neuron + CURAND_CALL( curandGenerate( local_rnd_gen_, ( uint* ) d_conn_storage_, n_block_conn ) ); + setTarget< T2, ConnStructT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>( + conn_struct_vect_[ ib ] + i_conn0, ( uint* ) d_conn_storage_, n_block_conn, target, n_target ); + DBGCUDASYNC; + + setConnectionWeights( + local_rnd_gen_, d_conn_storage_, conn_struct_vect_[ ib ] + i_conn0, n_block_conn, syn_spec ); + + setConnectionDelays( local_rnd_gen_, d_conn_storage_, conn_key_vect_[ ib ] + i_conn0, n_block_conn, syn_spec ); + + setPort< ConnKeyT, ConnStructT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>( + conn_key_vect_[ ib ] + i_conn0, conn_struct_vect_[ ib ] + i_conn0, syn_spec.port_, n_block_conn ); + DBGCUDASYNC; + setSynGroup< ConnKeyT, ConnStructT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>( + conn_key_vect_[ ib ] + i_conn0, conn_struct_vect_[ ib ] + i_conn0, syn_spec.syn_group_, n_block_conn ); + DBGCUDASYNC; } return 0; @@ -3184,15 +3199,19 @@ ConnectionTemplate< ConnKeyT, ConnStructT >::connectFixedIndegree( curandGenerat n_conn_ += n_new_conn; // new number of connections int new_n_block = ( int ) ( ( n_conn_ + conn_block_size_ - 1 ) / conn_block_size_ ); - if ( remote_source_flag ) - { - reallocConnSourceIds( n_new_conn ); - } - else - { - allocateNewBlocks( new_n_block ); - } + reallocConnSourceIds( n_new_conn ); + // generate random source index in range 0 - n_neuron + CURAND_CALL( curandGenerate( src_gen, ( uint* ) d_conn_source_ids_, n_new_conn ) ); + setSource< T1 > <<< ( n_new_conn + 1023 ) / 1024, 1024 >>> + (d_conn_source_ids_, d_conn_source_ids_, n_new_conn, source, n_source ); + DBGCUDASYNC; + if ( remote_source_flag ) { + return 0; + } + + allocateNewBlocks( new_n_block ); + // printf("Generating connections with fixed_indegree rule...\n"); int64_t conn_source_ids_offset = 0; int64_t n_prev_conn = 0; @@ -3221,37 +3240,28 @@ ConnectionTemplate< ConnKeyT, ConnStructT >::connectFixedIndegree( curandGenerat i_conn0 = 0; n_block_conn = conn_block_size_; } - // generate random source index in range 0 - n_neuron - CURAND_CALL( curandGenerate( src_gen, ( uint* ) d_conn_storage_, n_block_conn ) ); - if ( remote_source_flag ) - { - setSource< T1 > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>( - d_conn_source_ids_ + conn_source_ids_offset, ( uint* ) d_conn_storage_, n_block_conn, source, n_source ); - DBGCUDASYNC; - conn_source_ids_offset += n_block_conn; - } - else - { - setSource< T1, ConnKeyT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>( - conn_key_vect_[ ib ] + i_conn0, ( uint* ) d_conn_storage_, n_block_conn, source, n_source ); - DBGCUDASYNC; - setIndegreeTarget< T2, ConnStructT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>( - conn_struct_vect_[ ib ] + i_conn0, n_block_conn, n_prev_conn, target, indegree ); - DBGCUDASYNC; + setSource< T1, ConnKeyT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>> + (conn_key_vect_[ ib ] + i_conn0, d_conn_source_ids_ + conn_source_ids_offset, n_block_conn); + DBGCUDASYNC; + conn_source_ids_offset += n_block_conn; - setConnectionWeights( - local_rnd_gen_, d_conn_storage_, conn_struct_vect_[ ib ] + i_conn0, n_block_conn, syn_spec ); + setIndegreeTarget< T2, ConnStructT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>( + conn_struct_vect_[ ib ] + i_conn0, n_block_conn, n_prev_conn, target, indegree ); + DBGCUDASYNC; - setConnectionDelays( local_rnd_gen_, d_conn_storage_, conn_key_vect_[ ib ] + i_conn0, n_block_conn, syn_spec ); + setConnectionWeights( + local_rnd_gen_, d_conn_storage_, conn_struct_vect_[ ib ] + i_conn0, n_block_conn, syn_spec ); - setPort< ConnKeyT, ConnStructT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>( - conn_key_vect_[ ib ] + i_conn0, conn_struct_vect_[ ib ] + i_conn0, syn_spec.port_, n_block_conn ); - DBGCUDASYNC; - setSynGroup< ConnKeyT, ConnStructT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>( - conn_key_vect_[ ib ] + i_conn0, conn_struct_vect_[ ib ] + i_conn0, syn_spec.syn_group_, n_block_conn ); - DBGCUDASYNC; - } + setConnectionDelays( local_rnd_gen_, d_conn_storage_, conn_key_vect_[ ib ] + i_conn0, n_block_conn, syn_spec ); + + setPort< ConnKeyT, ConnStructT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>( + conn_key_vect_[ ib ] + i_conn0, conn_struct_vect_[ ib ] + i_conn0, syn_spec.port_, n_block_conn ); + DBGCUDASYNC; + setSynGroup< ConnKeyT, ConnStructT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>( + conn_key_vect_[ ib ] + i_conn0, conn_struct_vect_[ ib ] + i_conn0, syn_spec.syn_group_, n_block_conn ); + DBGCUDASYNC; + n_prev_conn += n_block_conn; } @@ -3282,11 +3292,14 @@ ConnectionTemplate< ConnKeyT, ConnStructT >::connectFixedOutdegree( curandGenera if ( remote_source_flag ) { reallocConnSourceIds( n_new_conn ); + setOutdegreeSource< T1 > <<< ( n_new_conn + 1023 ) / 1024, 1024 >>>( + d_conn_source_ids_, n_new_conn, source, outdegree ); + DBGCUDASYNC; + + return 0; } - else - { - allocateNewBlocks( new_n_block ); - } + + allocateNewBlocks( new_n_block ); // printf("Generating connections with fixed_outdegree rule...\n"); int64_t n_prev_conn = 0; @@ -3315,37 +3328,29 @@ ConnectionTemplate< ConnKeyT, ConnStructT >::connectFixedOutdegree( curandGenera i_conn0 = 0; n_block_conn = conn_block_size_; } - if ( remote_source_flag ) - { - setOutdegreeSource< T1 > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>( - d_conn_source_ids_, n_block_conn, n_prev_conn, source, outdegree ); - DBGCUDASYNC; - } - else - { - setOutdegreeSource< T1, ConnKeyT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>( - conn_key_vect_[ ib ] + i_conn0, n_block_conn, n_prev_conn, source, outdegree ); - DBGCUDASYNC; - - // generate random target index in range 0 - n_neuron - CURAND_CALL( curandGenerate( local_rnd_gen_, ( uint* ) d_conn_storage_, n_block_conn ) ); - setTarget< T2, ConnStructT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>( - conn_struct_vect_[ ib ] + i_conn0, ( uint* ) d_conn_storage_, n_block_conn, target, n_target ); - DBGCUDASYNC; - - setConnectionWeights( - local_rnd_gen_, d_conn_storage_, conn_struct_vect_[ ib ] + i_conn0, n_block_conn, syn_spec ); - - setConnectionDelays( local_rnd_gen_, d_conn_storage_, conn_key_vect_[ ib ] + i_conn0, n_block_conn, syn_spec ); - - setPort< ConnKeyT, ConnStructT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>( - conn_key_vect_[ ib ] + i_conn0, conn_struct_vect_[ ib ] + i_conn0, syn_spec.port_, n_block_conn ); - DBGCUDASYNC; - setSynGroup< ConnKeyT, ConnStructT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>( - conn_key_vect_[ ib ] + i_conn0, conn_struct_vect_[ ib ] + i_conn0, syn_spec.syn_group_, n_block_conn ); - DBGCUDASYNC; - } + + setOutdegreeSource< T1, ConnKeyT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>( + conn_key_vect_[ ib ] + i_conn0, n_block_conn, n_prev_conn, source, outdegree ); + DBGCUDASYNC; + + // generate random target index in range 0 - n_neuron + CURAND_CALL( curandGenerate( local_rnd_gen_, ( uint* ) d_conn_storage_, n_block_conn ) ); + setTarget< T2, ConnStructT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>( + conn_struct_vect_[ ib ] + i_conn0, ( uint* ) d_conn_storage_, n_block_conn, target, n_target ); + DBGCUDASYNC; + setConnectionWeights( + local_rnd_gen_, d_conn_storage_, conn_struct_vect_[ ib ] + i_conn0, n_block_conn, syn_spec ); + + setConnectionDelays( local_rnd_gen_, d_conn_storage_, conn_key_vect_[ ib ] + i_conn0, n_block_conn, syn_spec ); + + setPort< ConnKeyT, ConnStructT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>( + conn_key_vect_[ ib ] + i_conn0, conn_struct_vect_[ ib ] + i_conn0, syn_spec.port_, n_block_conn ); + DBGCUDASYNC; + setSynGroup< ConnKeyT, ConnStructT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>( + conn_key_vect_[ ib ] + i_conn0, conn_struct_vect_[ ib ] + i_conn0, syn_spec.syn_group_, n_block_conn ); + DBGCUDASYNC; + n_prev_conn += n_block_conn; } diff --git a/src/cuda_error.h b/src/cuda_error.h index 044533624..5ee10695f 100644 --- a/src/cuda_error.h +++ b/src/cuda_error.h @@ -171,7 +171,7 @@ gpuAssert( cudaError_t code, const char* file, int line, bool abort = true ) } #endif -#define ACTIVATE_PRINT_TIME +//#define ACTIVATE_PRINT_TIME #ifdef ACTIVATE_PRINT_TIME #define PRINT_TIME \ gpuErrchk( cudaPeekAtLastError() ); \ diff --git a/src/input_spike_buffer.h b/src/input_spike_buffer.h index 353eec19b..b98a09fe5 100644 --- a/src/input_spike_buffer.h +++ b/src/input_spike_buffer.h @@ -132,7 +132,46 @@ getMaxInputDelayKernel( int64_t n_conn, int** max_input_delay ) atomicMax( &max_input_delay[ i_target ][ i_port ], i_delay + 1 ); } +template < class ConnKeyT, class ConnStructT > +__global__ void +testMaxInputDelayKernel( int64_t n_conn, int** max_input_delay, int n_local_nodes, int max_n_ports ) +{ + int64_t i_conn = ( int64_t ) blockIdx.x * blockDim.x + threadIdx.x; + if ( i_conn >= n_conn ) + { + return; + } + // get connection block and relative index within the block + uint i_block = ( uint ) ( i_conn / ConnBlockSize ); + int64_t i_block_conn = i_conn % ConnBlockSize; + + // get references to key-structure pair representing the connection + ConnKeyT& conn_key = ( ( ConnKeyT** ) ConnKeyArray )[ i_block ][ i_block_conn ]; + ConnStructT& conn_struct = ( ( ConnStructT** ) ConnStructArray )[ i_block ][ i_block_conn ]; + + // MAYBE CAN BE IMPROVED BY USING A BIT TO SPECIFY IF A CONNECTION IS DIRECT + // get target node index and delay + inode_t i_target = getConnTarget< ConnStructT >( conn_struct ); + int i_port = getConnPort< ConnKeyT, ConnStructT >( conn_key, conn_struct ); + int i_delay = getConnDelay< ConnKeyT >( conn_key ); + + if (i_target>=n_local_nodes) { + printf("i_target %d >= n_local_nodes %d\n", i_target, n_local_nodes); + printf("i_delay %d\n", i_delay); + } + if (i_port>=max_n_ports) { + printf("i_port %d >= max_n_port %d\t", i_port, max_n_ports); + printf("i_target %d\n", i_target); + + } + + // atomic operation to avoid conflicts in memory access + // if delay is larger than current maximum evaluated for target node, update the maximum + //atomicMax( &max_input_delay[ i_target ][ i_port ], i_delay + 1 ); +} + + // Evaluates the index of the first outgoing connection of each source node template < class ConnKeyT > __global__ void @@ -452,6 +491,8 @@ ConnectionTemplate< ConnKeyT, ConnStructT >::initInputSpikeBuffer( inode_t n_loc } if (n_conn_ > 0) { + //testMaxInputDelayKernel< ConnKeyT, ConnStructT > <<< ( n_conn_ + 1023 ) / 1024, 1024 >>>(n_conn_, d_max_input_delay_, n_local_nodes, 2 ); + //CUDASYNC; getMaxInputDelayKernel< ConnKeyT, ConnStructT > <<< ( n_conn_ + 1023 ) / 1024, 1024 >>>(n_conn_, d_max_input_delay_ ); DBGCUDASYNC; } diff --git a/src/mpi_comm.cu b/src/mpi_comm.cu index 8093575da..16c351cb7 100644 --- a/src/mpi_comm.cu +++ b/src/mpi_comm.cu @@ -44,7 +44,6 @@ int NESTGPU::SendSpikeToRemote( int n_ext_spikes ) { #ifdef HAVE_MPI - MPI_Request request; int mpi_id, tag = 1; // id is already in the class, can be removed MPI_Comm_rank( MPI_COMM_WORLD, &mpi_id ); @@ -106,26 +105,33 @@ NESTGPU::SendSpikeToRemote( int n_ext_spikes ) h_ExternalHostGroupSpikeIdx0[i] = 0; } } - + SendSpikeToRemote_CUDAcp_time_ += ( getRealTime() - time_mark ); time_mark = getRealTime(); // loop on remote MPI proc for ( int ih = 0; ih < n_hosts_; ih++ ) { - if ( ( int ) ih == mpi_id || p2p_host_conn_matrix[this_host_][ih]==false) + if (ih == mpi_id || p2p_host_conn_matrix[this_host_][ih]==false) { // skip self MPI proc and unused point-to-point MPI communications + recv_mpi_request[ n_hosts_ + ih ] = MPI_REQUEST_NULL; continue; } // get index and size of spike packet that must be sent to MPI proc ih // array_idx is the first index of the packet for host ih int array_idx = h_ExternalTargetSpikeIdx0[ ih ]; int n_spikes = h_ExternalTargetSpikeIdx0[ ih + 1 ] - array_idx; - // printf("MPI_Send (src,tgt,nspike): %d %d %d\n", mpi_id, ih, n_spike); - // nonblocking sent of spike packet to MPI proc ih - MPI_Isend( &h_ExternalTargetSpikeNodeId[ array_idx ], n_spikes, MPI_INT, ih, tag, MPI_COMM_WORLD, &request ); - MPI_Request_free(&request); + if (n_spikes >= max_spike_per_host_) { + throw ngpu_exception( std::string("MPI_Isend error from host ") + std::to_string(this_host_) + + " to host " + std::to_string(ih) + + "\nNumber of spikes to be sent remotely " + std::to_string( n_spikes ) + + " larger than limit " + std::to_string( max_spike_per_host_ ) + + "\nYou can try to increase the kernel parameter \"max_spike_per_host_fact_\"." ); + } + + MPI_Isend( &h_ExternalTargetSpikeNodeId[ array_idx ], n_spikes, MPI_UNSIGNED, ih, tag, MPI_COMM_WORLD, + &recv_mpi_request[ n_hosts_ + ih ] ); // printf("MPI_Send nspikes (src,tgt,nspike): " // "%d %d %d\n", mpi_id, ih, n_spikes); @@ -158,25 +164,26 @@ NESTGPU::RecvSpikeFromRemote() // loop on remote MPI proc for ( int i_host = 0; i_host < n_hosts_; i_host++ ) { - if ( ( int ) i_host == mpi_id || p2p_host_conn_matrix[i_host][this_host_]==false) + if (i_host == mpi_id || p2p_host_conn_matrix[i_host][this_host_]==false) { recv_mpi_request[ i_host ] = MPI_REQUEST_NULL; - continue; // skip self MPI proc + continue; } // start nonblocking MPI receive from MPI proc i_host MPI_Irecv( &h_ExternalSourceSpikeNodeId[0][ i_host * max_spike_per_host_ ], max_spike_per_host_, - MPI_INT, + MPI_UNSIGNED, i_host, tag, MPI_COMM_WORLD, &recv_mpi_request[ i_host ] ); } - MPI_Status statuses[ n_hosts_ ]; + MPI_Status statuses[ 2*n_hosts_ ]; //recv_mpi_request[ mpi_id ] = MPI_REQUEST_NULL; //MPI_Waitall( n_hosts_ + nhg - 1, recv_mpi_request, statuses ); - MPI_Waitall( n_hosts_, recv_mpi_request, statuses ); + MPI_Waitall( 2*n_hosts_, recv_mpi_request, statuses ); + std::vector< std::vector< int > > &host_group = conn_->getHostGroup(); std::vector &mpi_comm_vect = conn_->getMPIComm(); uint nhg = host_group.size(); @@ -215,13 +222,20 @@ NESTGPU::RecvSpikeFromRemote() } int count = 0; if (p2p_host_conn_matrix[i_host][this_host_]==true) { - MPI_Get_count( &statuses[ i_host ], MPI_INT, &count ); + MPI_Get_count( &statuses[ i_host ], MPI_UNSIGNED, &count ); + } + if (count < 0 || count > max_spike_per_host_) { + throw ngpu_exception( std::string("MPI_Irecv error in host ") + std::to_string(this_host_) + + " from host " + std::to_string(i_host) + + "\nNumber of spikes sent remotely larger than limit " + + std::to_string( max_spike_per_host_ ) + + "\nYou can try to increase the kernel parameter \"max_spike_per_host_fact_\"." ); } h_ExternalSourceSpikeNum[0][ i_host ] = count; } // Maybe the barrier is not necessary? - MPI_Barrier( MPI_COMM_WORLD ); + //MPI_Barrier( MPI_COMM_WORLD ); RecvSpikeFromRemote_comm_time_ += ( getRealTime() - time_mark ); return 0; @@ -249,7 +263,7 @@ NESTGPU::ConnectMpiInit( int argc, char* argv[] ) setNHosts( n_hosts ); setThisHost( this_host ); //conn_->remoteConnectionMapInit(); - recv_mpi_request = new MPI_Request[ n_hosts_ ]; + recv_mpi_request = new MPI_Request[ 2*n_hosts_ ]; return 0; #else diff --git a/src/nestgpu.cu b/src/nestgpu.cu index 7d0df80f4..37747a2d6 100644 --- a/src/nestgpu.cu +++ b/src/nestgpu.cu @@ -491,6 +491,8 @@ NESTGPU::Calibrate() ExternalSpikeInit(); PRINT_TIME; + + //conn_->remoteConnectionMapSave(); } if ( conn_->getRevConnFlag() ) diff --git a/src/remote_connect.cu b/src/remote_connect.cu index 8afe3b9cc..c1917c710 100644 --- a/src/remote_connect.cu +++ b/src/remote_connect.cu @@ -265,6 +265,39 @@ insertNodesInMapKernel( uint** node_map, } } +// Only for debugging P2P MPI spike communication +__global__ void +checkMapIndexToImageNodeKernel( uint n_hosts, uint* host_offset, uint* node_index, uint *n_map, + uint node_index_size, int this_host) +{ + const uint i_host = blockIdx.x; + if ( i_host < n_hosts ) + { + const uint pos = host_offset[ i_host ]; + const uint num = host_offset[ i_host + 1 ] - pos; + for ( uint i_elem = threadIdx.x; i_elem < num; i_elem += blockDim.x ) + { + const uint i_node_map = node_index[ pos + i_elem ]; + if ((pos + i_elem)>=node_index_size) { + printf("Error in checkMapIndexToImageNodeKernel.\n" + "\t(pos + i_elem)>=node_index_size\n" + "\tthis_host: %d\ti_host: %d\tpos: %d\tnum: %d\ti_elem: %d\tinode_map: %d\n", + this_host, i_host, pos, num, i_elem, i_node_map); + } + if (i_node_map>=n_map[i_host]) { + printf("Error in checkMapIndexToImageNodeKernel.\n" + "\ti_node_map>=n_map[i_host]\n" + "\tthis_host: %d\ti_host: %d\tpos: %d\tnum: %d\ti_elem: %d\tinode_map: %d\n", + this_host, i_host, pos, num, i_elem, i_node_map); + } + //const uint i_block = i_node_map / node_map_block_size; + //const uint i = i_node_map % node_map_block_size; + //const uint i_image_node = local_image_node_map[ 0 ][ i_host ][ i_block ][ i ]; + //node_index[ pos + i_elem ] = i_image_node; + } + } +} + // This function is used only by point-by-point communication, not by host groups __global__ void MapIndexToImageNodeKernel( uint n_hosts, uint* host_offset, uint* node_index ) diff --git a/src/remote_connect.h b/src/remote_connect.h index f66f710dd..c5d92a7d5 100644 --- a/src/remote_connect.h +++ b/src/remote_connect.h @@ -3,6 +3,8 @@ #define REMOTECONNECTH // #include #include +#include + #include "getRealTime.h" // #include "nestgpu.h" #include "connect.h" @@ -210,7 +212,10 @@ addOffsetToExternalNodeIdsKernel( int64_t n_conn, __global__ void MapIndexToImageNodeKernel( uint n_hosts, uint* host_offset, uint* node_index ); - +// only for debugging +__global__ void +checkMapIndexToImageNodeKernel( uint n_hosts, uint* host_offset, uint* node_index, uint *n_map, + uint node_index_size, int this_host); // Allocate GPU memory for new remote-source-node-map blocks template < class ConnKeyT, class ConnStructT > @@ -682,40 +687,41 @@ ConnectionTemplate< ConnKeyT, ConnStructT >::remoteConnectionMapCalibrate( inode PRINT_TIME; - uint n_src_max = 0; - + uint src_node_max = 0; + + std::vector< std::unordered_set > node_target_host_group_us; node_target_host_group_.resize(n_nodes); + node_target_host_group_us.resize(n_nodes); + for (uint group_local_id=1; group_local_id tmp_node_map(n_src_max, 0); - - + std::vector tmp_node_map; + //tmp_node_map.resize(src_node_max); + for (uint group_local_id=1; group_local_id 0) { hc_remote_source_node_map_[group_local_id][gi_host].resize(n_node_map); hc_image_node_map_[group_local_id][gi_host].resize(n_node_map); @@ -734,19 +740,44 @@ ConnectionTemplate< ConnKeyT, ConnStructT >::remoteConnectionMapCalibrate( inode gpuErrchk(cudaMemcpy(&hc_image_node_map_[group_local_id][gi_host][ib*node_map_block_size_], h_image_node_map_[group_local_id][gi_host][ib], n_elem*sizeof(uint), cudaMemcpyDeviceToHost )); } - + ///* + bool resize_flag = false; + for (uint i=0; isrc_node_max) { + resize_flag = true; + src_node_max = src_node; + //std::cerr << "Error. src_node: " << src_node << " greater than n_src_max: " << n_src_max << std::endl; + //exit(0); + } + } + if ( resize_flag ) { + tmp_node_map.resize(src_node_max+1); + } + std::fill(tmp_node_map.begin(), tmp_node_map.end(), -1); for (uint i=0; isrc_node_max) { + pos = -1; + } + else { + pos = tmp_node_map[src_node]; + } //if (pos<0) { // throw ngpu_exception( "source node not found in host map" ); //} host_group_local_node_index_[group_local_id][gi_host][i] = pos; } + + } + else { + std::fill(host_group_local_node_index_[group_local_id][gi_host].begin(), host_group_local_node_index_[group_local_id][gi_host].end(), -1); } } else { // only in the source, i.e. if src_host == this_host_ @@ -754,16 +785,128 @@ ConnectionTemplate< ConnKeyT, ConnStructT >::remoteConnectionMapCalibrate( inode for (uint i=0; i::iterator, bool> insert_it = + node_target_host_group_us[i_source].insert(group_local_id); + if (insert_it.second){ + node_target_host_group_[i_source].push_back(group_local_id); + } } } } } PRINT_TIME; - + return 0; } +template < class ConnKeyT, class ConnStructT > +int +ConnectionTemplate< ConnKeyT, ConnStructT >::remoteConnectionMapSave() +{ + std::vector< std::vector< uint > > hc_remote_source_node_map; + std::vector< std::vector< uint > > hc_image_node_map; + + hc_remote_source_node_map.resize(n_hosts_); + hc_image_node_map.resize(n_hosts_); + + for ( int src_host = 0; src_host < n_hosts_; src_host++ ) {// loop on hosts + if ( src_host != this_host_ ) { // skip self host + // get number of elements in the map + uint n_node_map; + gpuErrchk( + cudaMemcpy( &n_node_map, &d_n_remote_source_node_map_[0][ src_host ], sizeof( uint ), cudaMemcpyDeviceToHost ) ); + + if (n_node_map > 0) { + hc_remote_source_node_map[src_host].resize(n_node_map); + hc_image_node_map[src_host].resize(n_node_map); + // loop on remote-source-node-to-local-image-node map blocks + uint n_map_blocks = h_remote_source_node_map_[0][src_host].size(); + + for (uint ib=0; ib > hc_local_source_node_map; + + hc_local_source_node_map.resize(n_hosts_); + + for ( int tg_host = 0; tg_host < n_hosts_; tg_host++ ) {// loop on hosts + if ( tg_host != this_host_ ) { // skip self host + // get number of elements in the map + uint n_node_map; + gpuErrchk( + cudaMemcpy( &n_node_map, &d_n_local_source_node_map_[ tg_host ], sizeof( uint ), cudaMemcpyDeviceToHost ) ); + + if (n_node_map > 0) { + hc_local_source_node_map[tg_host].resize(n_node_map); + // loop on remote-source-node-to-local-image-node map blocks + uint n_map_blocks = h_local_source_node_map_[tg_host].size(); + + for (uint ib=0; ib int ConnectionTemplate< ConnKeyT, ConnStructT >::addOffsetToImageNodeMap( inode_t n_nodes ) @@ -1536,7 +1679,7 @@ ConnectionTemplate< ConnKeyT, ConnStructT >::CreateHostGroup(int *host_arr, int host_group_source_node_vect_.push_back(empty_node_vect); host_group_local_source_node_map_.push_back(std::vector< uint >()); - std::vector< std::vector< int > > hg_lni(hg.size(), std::vector< int >()); + std::vector< std::vector< int64_t > > hg_lni(hg.size(), std::vector< int64_t >()); host_group_local_node_index_.push_back(hg_lni); } #ifdef HAVE_MPI diff --git a/src/remote_spike.cu b/src/remote_spike.cu index 910b4322d..c6567d156 100644 --- a/src/remote_spike.cu +++ b/src/remote_spike.cu @@ -355,15 +355,14 @@ NESTGPU::ExternalSpikeInit() h_ExternalTargetSpikeNum.resize( n_hosts_ ); h_ExternalSourceSpikeIdx0.resize( n_hosts_ + 1 ); h_ExternalTargetSpikeNodeId.resize( max_remote_spike_num_ ); - h_ExternalHostGroupSpikeNodeId.resize( max_remote_spike_num_ ); - h_ExternalSourceSpikeDispl.resize( n_hosts_ ); + h_ExternalSourceSpikeDispl[0] = 0; for (int ih=1; ih > &node_target_host_group = conn_->getNodeTargetHostGroup(); + const std::vector< std::vector < int > > &node_target_host_group = conn_->getNodeTargetHostGroup(); std::vector< std::vector< uint > > &host_group_local_source_node_map = conn_->getHostGroupLocalSourceNodeMap(); std::vector < uint > n_node_target_host_group(n_node, 0); @@ -444,7 +445,7 @@ NESTGPU::ExternalSpikeInit() auto node_target_host_group_it = node_target_host_group_flat.begin(); uint *host_group_node_id_pt = &host_group_node_id_flat[0]; - + for (inode_t i_node = 0; i_node>>( n_hosts_, @@ -671,11 +671,12 @@ NESTGPU::CopySpikeFromRemote() if ( n_spike_tot >= max_remote_spike_num_ ) { throw ngpu_exception( std::string( "Number of spikes to be received remotely " ) + std::to_string( n_spike_tot ) - + " larger than limit " + std::to_string( max_remote_spike_num_ ) ); + + " larger than limit " + std::to_string( max_remote_spike_num_ ) + + "\nYou can try to increase the kernel parameter \"max_remote_spike_num_fact\"." ); } } } - std::vector > > &host_group_local_node_index = conn_->getHostGroupLocalNodeIndex(); + std::vector > > &host_group_local_node_index = conn_->getHostGroupLocalNodeIndex(); std::vector< std::vector< int > > &host_group = conn_->getHostGroup(); uint nhg = host_group.size(); @@ -686,10 +687,11 @@ NESTGPU::CopySpikeFromRemote() int i_host = host_group[group_local_id][gi_host]; if (i_host != this_host_) { int n_spike = h_ExternalSourceSpikeNum[group_local_id][ gi_host ]; + for ( int i_spike = 0; i_spike < n_spike; i_spike++ ) { // pack spikes received from remote hosts inode_t node_pos = h_ExternalSourceSpikeNodeId[group_local_id][ gi_host * max_spike_per_host_ + i_spike ]; - int node_local = host_group_local_node_index[group_local_id][gi_host][node_pos]; + int64_t node_local = host_group_local_node_index[group_local_id][gi_host][node_pos]; if (node_local >= 0) { h_ExternalSourceSpikeNodeId_flat[ n_spike_tot ] = node_local; n_spike_tot++; @@ -720,6 +722,17 @@ NESTGPU::CopySpikeFromRemote() cudaMemcpyHostToDevice ) ); DBGCUDASYNC; RecvSpikeFromRemote_CUDAcp_time_ += ( getRealTime() - time_mark ); + +//#define CHECK_MAP_INDEX_TO_IMAGE_NODE +#ifdef CHECK_MAP_INDEX_TO_IMAGE_NODE + std::vector< uint* > &d_n_remote_source_node_map = conn_->getDevNRemoteSourceNodeMap();; + uint *n_map = d_n_remote_source_node_map[0]; + checkMapIndexToImageNodeKernel<<< n_hosts_, 1024 >>>( + n_hosts_, d_ExternalSourceSpikeIdx0, d_ExternalSourceSpikeNodeId, + n_map, max_remote_spike_num_, this_host_); // n_spike_tot -> max_remote_spike_num_ + CUDASYNC; +#endif + // convert node map indexes to image node indexes MapIndexToImageNodeKernel<<< n_hosts_, 1024 >>>( n_hosts_, d_ExternalSourceSpikeIdx0, d_ExternalSourceSpikeNodeId ); diff --git a/src/spike_buffer.cu b/src/spike_buffer.cu index e696d45a1..a28ba7c5f 100644 --- a/src/spike_buffer.cu +++ b/src/spike_buffer.cu @@ -181,9 +181,8 @@ PushSpike( int i_spike_buffer, float mul ) if ( n_rec_spike_times >= max_n_rec_spike_times - 1 ) { printf( - "Maximum number of recorded spike times exceeded" - " for spike buffer %d\n", - i_spike_buffer ); + "Maximum number of recorded spike times %d exceeded" + " for spike buffer %d\n", max_n_rec_spike_times, i_spike_buffer ); } else { // record spike time