diff --git a/python/hpc_benchmark/test_hpc_benchmark_hg/check.sh b/python/hpc_benchmark/test_hpc_benchmark_hg/check.sh
new file mode 100644
index 000000000..7f0e36210
--- /dev/null
+++ b/python/hpc_benchmark/test_hpc_benchmark_hg/check.sh
@@ -0,0 +1 @@
+for i in $(seq 0 9); do diff raster_plot$i.png ../test_hpc_benchmark_p2p/raster_plot$i.png; done
diff --git a/python/hpc_benchmark/test_hpc_benchmark_hg/check2.sh b/python/hpc_benchmark/test_hpc_benchmark_hg/check2.sh
new file mode 100644
index 000000000..b5e9d168d
--- /dev/null
+++ b/python/hpc_benchmark/test_hpc_benchmark_hg/check2.sh
@@ -0,0 +1 @@
+more log_* | grep firi > tmp1.txt; more ../test_hpc_benchmark_p2p/log_* | grep firi > tmp2.txt; diff tmp1.txt tmp2.txt
diff --git a/python/hpc_benchmark/test_hpc_benchmark_wg/check.sh b/python/hpc_benchmark/test_hpc_benchmark_wg/check.sh
new file mode 100644
index 000000000..7f0e36210
--- /dev/null
+++ b/python/hpc_benchmark/test_hpc_benchmark_wg/check.sh
@@ -0,0 +1 @@
+for i in $(seq 0 9); do diff raster_plot$i.png ../test_hpc_benchmark_p2p/raster_plot$i.png; done
diff --git a/python/hpc_benchmark/test_hpc_benchmark_wg/check2.sh b/python/hpc_benchmark/test_hpc_benchmark_wg/check2.sh
new file mode 100644
index 000000000..b5e9d168d
--- /dev/null
+++ b/python/hpc_benchmark/test_hpc_benchmark_wg/check2.sh
@@ -0,0 +1 @@
+more log_* | grep firi > tmp1.txt; more ../test_hpc_benchmark_p2p/log_* | grep firi > tmp2.txt; diff tmp1.txt tmp2.txt
diff --git a/python/test/logp3_connect.txt b/python/test/logp3_connect.txt
index df4e459a0..aae630d38 100644
--- a/python/test/logp3_connect.txt
+++ b/python/test/logp3_connect.txt
@@ -8,6 +8,10 @@
 
  Homepage: https://github.com/nest/nest-gpu
 
+Calibrating ...
+Allocating auxiliary GPU memory...
+Sorting...
+Done
 ########################################
 Even to all
 {'index': 0, 'source': 0, 'target': 1, 'port': 0, 'syn_group': 0, 'delay': 1.0, 'weight': 100.0}
@@ -100,7 +104,3 @@ Even to 3,4,5,6
 {'index': 34, 'source': 6, 'target': 9, 'port': 0, 'syn_group': 0, 'delay': 69.0, 'weight': 6900.0}
 
 
-Calibrating ...
-Allocating auxiliary GPU memory...
-Sorting...
-Done
diff --git a/python/test/test_all.sh b/python/test/test_all.sh
index b07c30826..72be6659a 100755
--- a/python/test/test_all.sh
+++ b/python/test/test_all.sh
@@ -2,7 +2,7 @@ pass_str[0]="TEST PASSED"
 pass_str[1]="TEST NOT PASSED"
 :>log.txt
 for fn in test_iaf_psc_exp_g.py test_iaf_psc_alpha.py test_fixed_total_number.py test_iaf_psc_exp.py test_spike_times.py test_aeif_cond_alpha.py test_aeif_cond_beta.py test_aeif_psc_alpha.py test_aeif_psc_delta.py test_aeif_psc_exp.py test_aeif_cond_alpha_multisynapse.py  test_aeif_cond_beta_multisynapse.py  test_aeif_psc_alpha_multisynapse.py  test_aeif_psc_exp_multisynapse.py test_stdp_list.py test_stdp.py test_syn_model.py test_brunel_list.py test_brunel_outdegree.py test_brunel_user_m1.py test_spike_detector.py; do
-    python3 $fn >> log.txt 2>err.txt
+    python3 -u $fn >> log.txt 2>err.txt
     res=$?
     cat err.txt >> log.txt
     rm -f err.txt
@@ -12,7 +12,7 @@ for fn in test_iaf_psc_exp_g.py test_iaf_psc_alpha.py test_fixed_total_number.py
     echo $fn : ${pass_str[$res]}
 done
 for fn in syn_group connect getarr setvar2 group_param; do
-    python3 test_$fn.py 2>&1 | grep -v dyl | grep -v 'Time:' | grep -v 'storage bytes:' | grep -v Indexing | grep -v 'Total number' > tmp
+    python3 -u test_$fn.py 2>&1 | grep -v dyl | grep -v 'Time:' | grep -v 'storage bytes:' | grep -v Indexing | grep -v 'Total number' | grep -v 'Time from' > tmp
     diff -qs tmp logp3_$fn.txt 2>&1 >> log.txt
     res=$?
     echo $fn : ${pass_str[$res]}    
diff --git a/python/test/test_stdp/cases/test_all.sh b/python/test/test_stdp/cases/test_all.sh
index b87eeda89..870bc901e 100755
--- a/python/test/test_stdp/cases/test_all.sh
+++ b/python/test/test_stdp/cases/test_all.sh
@@ -1 +1 @@
-for i in $(seq 1 10); do python3 case$i.py | grep '^dw'; done
+for i in $(seq 1 10); do python3 -u case$i.py | grep '^dw'; done
diff --git a/src/connect.h b/src/connect.h
index 5f5d44bb1..e3d927592 100644
--- a/src/connect.h
+++ b/src/connect.h
@@ -118,7 +118,9 @@ class Connection
   virtual uint** getDevNodeTargetHostIMap() = 0;
 
   // get remote target host groups of all nodes
-  virtual std::vector< std::unordered_set < int > > &getNodeTargetHostGroup() = 0;
+  virtual const std::vector< std::vector < int > > &getNodeTargetHostGroup() const = 0;
+  
+  //virtual int copyNodeTargetHostGroup(inode_t i_node, uint *copy_array) = 0;
   
   // get map of local source nodes positions in host group node map
   virtual std::vector< std::vector< uint > > &getHostGroupLocalSourceNodeMap() = 0;
@@ -135,7 +137,7 @@ class Connection
 #endif
   
   // return map of host group source nodes to local image nodes
-  virtual std::vector<std::vector< std::vector< int > > > &getHostGroupLocalNodeIndex() = 0;
+  virtual std::vector<std::vector< std::vector< int64_t > > > &getHostGroupLocalNodeIndex() = 0;
 
   // get point-to-point MPI communication activation matrix
   virtual std::vector< std::vector < bool > > &getP2PHostConnMatrix() = 0;
@@ -286,6 +288,12 @@ class Connection
   // calibrate the maps used to send spikes among remote hosts
   virtual int remoteConnectionMapCalibrate( inode_t n_nodes ) = 0;
 
+  // get vector of number of elements in the maps of remote source nodes to local image nodes
+  virtual std::vector< uint* >  &getDevNRemoteSourceNodeMap() = 0;
+
+  // only for debugging, save remote connection P2P maps
+  virtual int remoteConnectionMapSave() = 0;
+  
   // remote connection methods. 4 combinations where source and target can be either
   // of inode_t type (in case of a sequence) or pointers to inode_t
   // (in case of arbitrary arrays if node indexes)
@@ -462,8 +470,10 @@ class ConnectionTemplate : public Connection
   // n_remote_source_node_map[group_local_id][i_host]
   // with i_host = 0, ...,  host_group_[group_local_id].size()-1 excluding this host itself
   std::vector< std::vector< uint > > h_n_remote_source_node_map_;
+  
+  // vector of number of elements in the maps of remote source nodes to local image nodes
   std::vector< uint* > d_n_remote_source_node_map_;
-
+  
   // remote_source_node_map_[group_local_id][i_host][i_block][i]
   std::vector< std::vector< std::vector< uint* > > > h_remote_source_node_map_;
 
@@ -650,10 +660,10 @@ class ConnectionTemplate : public Connection
   // same as above, but ordered
   std::vector<std::vector< std::vector< inode_t > > > host_group_source_node_vect_;
   // map of host group source nodes to local image nodes
-  std::vector<std::vector< std::vector< int > > > host_group_local_node_index_;
+  std::vector<std::vector< std::vector< int64_t > > > host_group_local_node_index_;
 
   // local ids of the host groups to which each node should send spikes
-  std::vector< std::unordered_set< int > > node_target_host_group_; // [n_local_nodes ][num. of target host groups ]
+  std::vector< std::vector< int > > node_target_host_group_; // [n_local_nodes ][num. of target host groups ]
   //////////////////////////////////////////////////
   // class ConnectionTemplate methods
   //////////////////////////////////////////////////
@@ -776,7 +786,7 @@ class ConnectionTemplate : public Connection
     return d_node_target_host_i_map_;
   }
 
-  std::vector< std::unordered_set < int > > &getNodeTargetHostGroup()
+  const std::vector< std::vector < int > > &getNodeTargetHostGroup() const
   {
     return node_target_host_group_;
   }
@@ -809,7 +819,7 @@ class ConnectionTemplate : public Connection
   }
   
   // return map of host group source nodes to local image nodes
-  std::vector<std::vector< std::vector< int > > > &getHostGroupLocalNodeIndex()
+  std::vector<std::vector< std::vector< int64_t > > > &getHostGroupLocalNodeIndex()
   {
     return host_group_local_node_index_;
   }
@@ -970,6 +980,15 @@ class ConnectionTemplate : public Connection
   // Calibrate the maps
   int remoteConnectionMapCalibrate( inode_t n_nodes );
 
+  // get vector of number of elements in the maps of remote source nodes to local image nodes
+  std::vector< uint* >  &getDevNRemoteSourceNodeMap()
+  {
+    return d_n_remote_source_node_map_;
+  }
+
+  // only for debugging, save remote connection P2P maps
+  int remoteConnectionMapSave();
+
   // Allocate GPU memory for new remote-source-node-map blocks
   int allocRemoteSourceNodeMapBlocks( std::vector< uint* >& i_remote_src_node_map,
     std::vector< uint* >& i_local_spike_buf_map,
@@ -1613,6 +1632,19 @@ setSource( ConnKeyT* conn_key_subarray, uint* rand_val, int64_t n_conn, T source
   setConnSource< ConnKeyT >( conn_key_subarray[ i_conn ], i_source );
 }
 
+template < class T, class ConnKeyT >
+__global__ void
+setSource( ConnKeyT* conn_key_subarray, inode_t* source, int64_t n_conn)
+{
+  int64_t i_conn = threadIdx.x + blockIdx.x * blockDim.x;
+  if ( i_conn >= n_conn )
+  {
+    return;
+  }
+  inode_t i_source = source[ i_conn ];
+  setConnSource< ConnKeyT >( conn_key_subarray[ i_conn ], i_source );
+}
+
 template < class T >
 __global__ void
 setSource( inode_t* conn_source_ids, uint* rand_val, int64_t n_conn, T source, inode_t n_source )
@@ -1662,14 +1694,13 @@ setOneToOneSourceTarget( ConnKeyT* conn_key_subarray,
 
 template < class T >
 __global__ void
-setOneToOneSource( inode_t* conn_source_ids, int64_t n_block_conn, int64_t n_prev_conn, T source )
+setOneToOneSource( inode_t* conn_source_ids, int64_t n_conn, T source )
 {
-  int64_t i_block_conn = threadIdx.x + blockIdx.x * blockDim.x;
-  if ( i_block_conn >= n_block_conn )
+  int64_t i_conn = threadIdx.x + blockIdx.x * blockDim.x;
+  if ( i_conn >= n_conn )
   {
     return;
   }
-  int64_t i_conn = n_prev_conn + i_block_conn;
   inode_t i_source = getNodeIndex( source, ( int ) ( i_conn ) );
   conn_source_ids[ i_conn ] = i_source;
 }
@@ -1700,18 +1731,16 @@ setAllToAllSourceTarget( ConnKeyT* conn_key_subarray,
 template < class T1 >
 __global__ void
 setAllToAllSource( inode_t* conn_source_ids,
-  int64_t n_block_conn,
-  int64_t n_prev_conn,
+  int64_t n_conn,
   T1 source,
   inode_t n_source,
   inode_t n_target )
 {
-  int64_t i_block_conn = threadIdx.x + blockIdx.x * blockDim.x;
-  if ( i_block_conn >= n_block_conn )
+  int64_t i_conn = threadIdx.x + blockIdx.x * blockDim.x;
+  if ( i_conn >= n_conn )
   {
     return;
   }
-  int64_t i_conn = n_prev_conn + i_block_conn;
   inode_t i_source = getNodeIndex( source, ( int ) ( i_conn / n_target ) );
   conn_source_ids[ i_conn ] = i_source;
 }
@@ -1750,14 +1779,13 @@ setOutdegreeSource( ConnKeyT* conn_key_subarray, int64_t n_block_conn, int64_t n
 
 template < class T >
 __global__ void
-setOutdegreeSource( inode_t* conn_source_ids, int64_t n_block_conn, int64_t n_prev_conn, T source, int outdegree )
+setOutdegreeSource( inode_t* conn_source_ids, int64_t n_conn, T source, int outdegree )
 {
-  int64_t i_block_conn = threadIdx.x + blockIdx.x * blockDim.x;
-  if ( i_block_conn >= n_block_conn )
+  int64_t i_conn = threadIdx.x + blockIdx.x * blockDim.x;
+  if ( i_conn >= n_conn )
   {
     return;
   }
-  int64_t i_conn = n_prev_conn + i_block_conn;
   inode_t i_source = getNodeIndex( source, ( int ) ( i_conn / outdegree ) );
   conn_source_ids[ i_conn ] = i_source;
 }
@@ -2917,15 +2945,18 @@ ConnectionTemplate< ConnKeyT, ConnStructT >::connectOneToOne( curandGenerator_t&
   int64_t n_new_conn = n_node;
   n_conn_ += n_new_conn; // new number of connections
   int new_n_block = ( int ) ( ( n_conn_ + conn_block_size_ - 1 ) / conn_block_size_ );
-  if ( remote_source_flag )
-  {
+
+  if ( remote_source_flag ) {
     reallocConnSourceIds( n_new_conn );
+    setOneToOneSource< T1 > <<< ( n_new_conn + 1023 ) / 1024, 1024 >>>
+      (d_conn_source_ids_, n_new_conn, source );
+    DBGCUDASYNC;
+    
+    return 0;
   }
-  else
-  {
-    allocateNewBlocks( new_n_block );
-  }
-
+  
+  allocateNewBlocks( new_n_block );
+  
   // printf("Generating connections with one-to-one rule...\n");
   int64_t n_prev_conn = 0;
   int ib0 = ( int ) ( old_n_conn / conn_block_size_ );
@@ -2953,28 +2984,20 @@ ConnectionTemplate< ConnKeyT, ConnStructT >::connectOneToOne( curandGenerator_t&
       i_conn0 = 0;
       n_block_conn = conn_block_size_;
     }
-    if ( remote_source_flag )
-    {
-      setOneToOneSource< T1 > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>(
-        d_conn_source_ids_, n_block_conn, n_prev_conn, source );
-      DBGCUDASYNC;
-    }
-    else
-    {
-      setOneToOneSourceTarget< T1, T2, ConnKeyT, ConnStructT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>(
-        conn_key_vect_[ ib ] + i_conn0, conn_struct_vect_[ ib ] + i_conn0, n_block_conn, n_prev_conn, source, target );
-      DBGCUDASYNC;
-      setConnectionWeights(
-        local_rnd_gen_, d_conn_storage_, conn_struct_vect_[ ib ] + i_conn0, n_block_conn, syn_spec );
-      setConnectionDelays( local_rnd_gen_, d_conn_storage_, conn_key_vect_[ ib ] + i_conn0, n_block_conn, syn_spec );
-      setPort< ConnKeyT, ConnStructT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>(
-        conn_key_vect_[ ib ] + i_conn0, conn_struct_vect_[ ib ] + i_conn0, syn_spec.port_, n_block_conn );
-      DBGCUDASYNC;
-      setSynGroup< ConnKeyT, ConnStructT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>(
-        conn_key_vect_[ ib ] + i_conn0, conn_struct_vect_[ ib ] + i_conn0, syn_spec.syn_group_, n_block_conn );
-      DBGCUDASYNC;
-      // CUDASYNC;
-    }
+    
+    setOneToOneSourceTarget< T1, T2, ConnKeyT, ConnStructT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>(
+      conn_key_vect_[ ib ] + i_conn0, conn_struct_vect_[ ib ] + i_conn0, n_block_conn, n_prev_conn, source, target );
+    DBGCUDASYNC;
+    setConnectionWeights(
+      local_rnd_gen_, d_conn_storage_, conn_struct_vect_[ ib ] + i_conn0, n_block_conn, syn_spec );
+    setConnectionDelays( local_rnd_gen_, d_conn_storage_, conn_key_vect_[ ib ] + i_conn0, n_block_conn, syn_spec );
+    setPort< ConnKeyT, ConnStructT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>(
+      conn_key_vect_[ ib ] + i_conn0, conn_struct_vect_[ ib ] + i_conn0, syn_spec.port_, n_block_conn );
+    DBGCUDASYNC;
+    setSynGroup< ConnKeyT, ConnStructT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>(
+      conn_key_vect_[ ib ] + i_conn0, conn_struct_vect_[ ib ] + i_conn0, syn_spec.syn_group_, n_block_conn );
+    DBGCUDASYNC;
+    // CUDASYNC;
 
     n_prev_conn += n_block_conn;
   }
@@ -3001,11 +3024,15 @@ ConnectionTemplate< ConnKeyT, ConnStructT >::connectAllToAll( curandGenerator_t&
   if ( remote_source_flag )
   {
     reallocConnSourceIds( n_new_conn );
+    setAllToAllSource< T1 > <<< ( n_new_conn + 1023 ) / 1024, 1024 >>>(
+        d_conn_source_ids_, n_new_conn, source, n_source, n_target );
+    DBGCUDASYNC;
+
+    return 0;  
   }
-  else
-  {
-    allocateNewBlocks( new_n_block );
-  }
+  
+  allocateNewBlocks( new_n_block );
+
   // printf("Generating connections with all-to-all rule...\n");
   int64_t n_prev_conn = 0;
   int ib0 = ( int ) ( old_n_conn / conn_block_size_ );
@@ -3033,36 +3060,29 @@ ConnectionTemplate< ConnKeyT, ConnStructT >::connectAllToAll( curandGenerator_t&
       i_conn0 = 0;
       n_block_conn = conn_block_size_;
     }
-    if ( remote_source_flag )
-    {
-      setAllToAllSource< T1 > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>(
-        d_conn_source_ids_, n_block_conn, n_prev_conn, source, n_source, n_target );
-      DBGCUDASYNC;
-    }
-    else
-    {
-      setAllToAllSourceTarget< T1, T2, ConnKeyT, ConnStructT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>(
-        conn_key_vect_[ ib ] + i_conn0,
-        conn_struct_vect_[ ib ] + i_conn0,
-        n_block_conn,
-        n_prev_conn,
-        source,
-        n_source,
-        target,
-        n_target );
-      DBGCUDASYNC;
-      setConnectionWeights(
-        local_rnd_gen_, d_conn_storage_, conn_struct_vect_[ ib ] + i_conn0, n_block_conn, syn_spec );
-
-      setConnectionDelays( local_rnd_gen_, d_conn_storage_, conn_key_vect_[ ib ] + i_conn0, n_block_conn, syn_spec );
-
-      setPort< ConnKeyT, ConnStructT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>(
-        conn_key_vect_[ ib ] + i_conn0, conn_struct_vect_[ ib ] + i_conn0, syn_spec.port_, n_block_conn );
-      DBGCUDASYNC;
-      setSynGroup< ConnKeyT, ConnStructT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>(
-        conn_key_vect_[ ib ] + i_conn0, conn_struct_vect_[ ib ] + i_conn0, syn_spec.syn_group_, n_block_conn );
-      DBGCUDASYNC;
-    }
+    
+    setAllToAllSourceTarget< T1, T2, ConnKeyT, ConnStructT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>(
+      conn_key_vect_[ ib ] + i_conn0,
+      conn_struct_vect_[ ib ] + i_conn0,
+      n_block_conn,
+      n_prev_conn,
+      source,
+      n_source,
+      target,
+      n_target );
+    DBGCUDASYNC;
+    setConnectionWeights(
+      local_rnd_gen_, d_conn_storage_, conn_struct_vect_[ ib ] + i_conn0, n_block_conn, syn_spec );
+
+    setConnectionDelays( local_rnd_gen_, d_conn_storage_, conn_key_vect_[ ib ] + i_conn0, n_block_conn, syn_spec );
+
+    setPort< ConnKeyT, ConnStructT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>(
+      conn_key_vect_[ ib ] + i_conn0, conn_struct_vect_[ ib ] + i_conn0, syn_spec.port_, n_block_conn );
+    DBGCUDASYNC;
+    setSynGroup< ConnKeyT, ConnStructT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>(
+      conn_key_vect_[ ib ] + i_conn0, conn_struct_vect_[ ib ] + i_conn0, syn_spec.syn_group_, n_block_conn );
+    DBGCUDASYNC;
+    
     n_prev_conn += n_block_conn;
   }
 
@@ -3090,14 +3110,19 @@ ConnectionTemplate< ConnKeyT, ConnStructT >::connectFixedTotalNumber( curandGene
   n_conn_ += n_new_conn; // new number of connections
   int new_n_block = ( int ) ( ( n_conn_ + conn_block_size_ - 1 ) / conn_block_size_ );
 
-  if ( remote_source_flag )
-  {
-    reallocConnSourceIds( n_new_conn );
-  }
-  else
-  {
-    allocateNewBlocks( new_n_block );
+  reallocConnSourceIds( n_new_conn );
+  // generate random source index in range 0 - n_neuron
+  CURAND_CALL( curandGenerate( src_gen, ( uint* ) d_conn_source_ids_, n_new_conn ) );
+  setSource< T1 > <<< ( n_new_conn + 1023 ) / 1024, 1024 >>>
+    (d_conn_source_ids_, d_conn_source_ids_, n_new_conn, source, n_source );
+  DBGCUDASYNC;
+
+  if ( remote_source_flag ) {
+    return 0;
   }
+  
+  allocateNewBlocks( new_n_block );
+  
   // printf("Generating connections with fixed_total_number rule...\n");
   int64_t conn_source_ids_offset = 0;
   int ib0 = ( int ) ( old_n_conn / conn_block_size_ );
@@ -3125,39 +3150,29 @@ ConnectionTemplate< ConnKeyT, ConnStructT >::connectFixedTotalNumber( curandGene
       i_conn0 = 0;
       n_block_conn = conn_block_size_;
     }
-    // generate random source index in range 0 - n_neuron
-    CURAND_CALL( curandGenerate( src_gen, ( uint* ) d_conn_storage_, n_block_conn ) );
-    if ( remote_source_flag )
-    {
-      setSource< T1 > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>(
-        d_conn_source_ids_ + conn_source_ids_offset, ( uint* ) d_conn_storage_, n_block_conn, source, n_source );
-      DBGCUDASYNC;
-      conn_source_ids_offset += n_block_conn;
-    }
-    else
-    {
-      setSource< T1, ConnKeyT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>(
-        conn_key_vect_[ ib ] + i_conn0, ( uint* ) d_conn_storage_, n_block_conn, source, n_source );
-      DBGCUDASYNC;
-
-      // generate random target index in range 0 - n_neuron
-      CURAND_CALL( curandGenerate( local_rnd_gen_, ( uint* ) d_conn_storage_, n_block_conn ) );
-      setTarget< T2, ConnStructT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>(
-        conn_struct_vect_[ ib ] + i_conn0, ( uint* ) d_conn_storage_, n_block_conn, target, n_target );
-      DBGCUDASYNC;
-
-      setConnectionWeights(
-        local_rnd_gen_, d_conn_storage_, conn_struct_vect_[ ib ] + i_conn0, n_block_conn, syn_spec );
-
-      setConnectionDelays( local_rnd_gen_, d_conn_storage_, conn_key_vect_[ ib ] + i_conn0, n_block_conn, syn_spec );
-
-      setPort< ConnKeyT, ConnStructT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>(
-        conn_key_vect_[ ib ] + i_conn0, conn_struct_vect_[ ib ] + i_conn0, syn_spec.port_, n_block_conn );
-      DBGCUDASYNC;
-      setSynGroup< ConnKeyT, ConnStructT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>(
-        conn_key_vect_[ ib ] + i_conn0, conn_struct_vect_[ ib ] + i_conn0, syn_spec.syn_group_, n_block_conn );
-      DBGCUDASYNC;
-    }
+    
+    setSource< T1, ConnKeyT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>
+      (conn_key_vect_[ ib ] + i_conn0, d_conn_source_ids_ + conn_source_ids_offset, n_block_conn);
+    DBGCUDASYNC;
+    conn_source_ids_offset += n_block_conn;
+
+    // generate random target index in range 0 - n_neuron
+    CURAND_CALL( curandGenerate( local_rnd_gen_, ( uint* ) d_conn_storage_, n_block_conn ) );
+    setTarget< T2, ConnStructT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>(
+      conn_struct_vect_[ ib ] + i_conn0, ( uint* ) d_conn_storage_, n_block_conn, target, n_target );
+    DBGCUDASYNC;
+
+    setConnectionWeights(
+      local_rnd_gen_, d_conn_storage_, conn_struct_vect_[ ib ] + i_conn0, n_block_conn, syn_spec );
+
+    setConnectionDelays( local_rnd_gen_, d_conn_storage_, conn_key_vect_[ ib ] + i_conn0, n_block_conn, syn_spec );
+
+    setPort< ConnKeyT, ConnStructT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>(
+      conn_key_vect_[ ib ] + i_conn0, conn_struct_vect_[ ib ] + i_conn0, syn_spec.port_, n_block_conn );
+    DBGCUDASYNC;
+    setSynGroup< ConnKeyT, ConnStructT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>(
+      conn_key_vect_[ ib ] + i_conn0, conn_struct_vect_[ ib ] + i_conn0, syn_spec.syn_group_, n_block_conn );
+    DBGCUDASYNC;
   }
 
   return 0;
@@ -3184,15 +3199,19 @@ ConnectionTemplate< ConnKeyT, ConnStructT >::connectFixedIndegree( curandGenerat
   n_conn_ += n_new_conn; // new number of connections
   int new_n_block = ( int ) ( ( n_conn_ + conn_block_size_ - 1 ) / conn_block_size_ );
 
-  if ( remote_source_flag )
-  {
-    reallocConnSourceIds( n_new_conn );
-  }
-  else
-  {
-    allocateNewBlocks( new_n_block );
-  }
+  reallocConnSourceIds( n_new_conn );  
+  // generate random source index in range 0 - n_neuron
+  CURAND_CALL( curandGenerate( src_gen, ( uint* ) d_conn_source_ids_, n_new_conn ) );
+  setSource< T1 > <<< ( n_new_conn + 1023 ) / 1024, 1024 >>>
+    (d_conn_source_ids_, d_conn_source_ids_, n_new_conn, source, n_source );
+  DBGCUDASYNC;
 
+  if ( remote_source_flag ) {
+    return 0;
+  }
+  
+  allocateNewBlocks( new_n_block );
+  
   // printf("Generating connections with fixed_indegree rule...\n");
   int64_t conn_source_ids_offset = 0;
   int64_t n_prev_conn = 0;
@@ -3221,37 +3240,28 @@ ConnectionTemplate< ConnKeyT, ConnStructT >::connectFixedIndegree( curandGenerat
       i_conn0 = 0;
       n_block_conn = conn_block_size_;
     }
-    // generate random source index in range 0 - n_neuron
-    CURAND_CALL( curandGenerate( src_gen, ( uint* ) d_conn_storage_, n_block_conn ) );
-    if ( remote_source_flag )
-    {
-      setSource< T1 > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>(
-        d_conn_source_ids_ + conn_source_ids_offset, ( uint* ) d_conn_storage_, n_block_conn, source, n_source );
-      DBGCUDASYNC;
-      conn_source_ids_offset += n_block_conn;
-    }
-    else
-    {
-      setSource< T1, ConnKeyT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>(
-        conn_key_vect_[ ib ] + i_conn0, ( uint* ) d_conn_storage_, n_block_conn, source, n_source );
-      DBGCUDASYNC;
 
-      setIndegreeTarget< T2, ConnStructT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>(
-        conn_struct_vect_[ ib ] + i_conn0, n_block_conn, n_prev_conn, target, indegree );
-      DBGCUDASYNC;
+    setSource< T1, ConnKeyT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>
+      (conn_key_vect_[ ib ] + i_conn0, d_conn_source_ids_ + conn_source_ids_offset, n_block_conn);
+    DBGCUDASYNC;
+    conn_source_ids_offset += n_block_conn;
 
-      setConnectionWeights(
-        local_rnd_gen_, d_conn_storage_, conn_struct_vect_[ ib ] + i_conn0, n_block_conn, syn_spec );
+    setIndegreeTarget< T2, ConnStructT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>(
+      conn_struct_vect_[ ib ] + i_conn0, n_block_conn, n_prev_conn, target, indegree );
+    DBGCUDASYNC;
 
-      setConnectionDelays( local_rnd_gen_, d_conn_storage_, conn_key_vect_[ ib ] + i_conn0, n_block_conn, syn_spec );
+    setConnectionWeights(
+      local_rnd_gen_, d_conn_storage_, conn_struct_vect_[ ib ] + i_conn0, n_block_conn, syn_spec );
 
-      setPort< ConnKeyT, ConnStructT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>(
-        conn_key_vect_[ ib ] + i_conn0, conn_struct_vect_[ ib ] + i_conn0, syn_spec.port_, n_block_conn );
-      DBGCUDASYNC;
-      setSynGroup< ConnKeyT, ConnStructT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>(
-        conn_key_vect_[ ib ] + i_conn0, conn_struct_vect_[ ib ] + i_conn0, syn_spec.syn_group_, n_block_conn );
-      DBGCUDASYNC;
-    }
+    setConnectionDelays( local_rnd_gen_, d_conn_storage_, conn_key_vect_[ ib ] + i_conn0, n_block_conn, syn_spec );
+
+    setPort< ConnKeyT, ConnStructT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>(
+      conn_key_vect_[ ib ] + i_conn0, conn_struct_vect_[ ib ] + i_conn0, syn_spec.port_, n_block_conn );
+    DBGCUDASYNC;
+    setSynGroup< ConnKeyT, ConnStructT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>(
+      conn_key_vect_[ ib ] + i_conn0, conn_struct_vect_[ ib ] + i_conn0, syn_spec.syn_group_, n_block_conn );
+    DBGCUDASYNC;
+    
     n_prev_conn += n_block_conn;
   }
 
@@ -3282,11 +3292,14 @@ ConnectionTemplate< ConnKeyT, ConnStructT >::connectFixedOutdegree( curandGenera
   if ( remote_source_flag )
   {
     reallocConnSourceIds( n_new_conn );
+    setOutdegreeSource< T1 > <<< ( n_new_conn + 1023 ) / 1024, 1024 >>>(
+        d_conn_source_ids_, n_new_conn, source, outdegree );
+    DBGCUDASYNC;
+
+    return 0;  
   }
-  else
-  {
-    allocateNewBlocks( new_n_block );
-  }
+  
+  allocateNewBlocks( new_n_block );
 
   // printf("Generating connections with fixed_outdegree rule...\n");
   int64_t n_prev_conn = 0;
@@ -3315,37 +3328,29 @@ ConnectionTemplate< ConnKeyT, ConnStructT >::connectFixedOutdegree( curandGenera
       i_conn0 = 0;
       n_block_conn = conn_block_size_;
     }
-    if ( remote_source_flag )
-    {
-      setOutdegreeSource< T1 > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>(
-        d_conn_source_ids_, n_block_conn, n_prev_conn, source, outdegree );
-      DBGCUDASYNC;
-    }
-    else
-    {
-      setOutdegreeSource< T1, ConnKeyT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>(
-        conn_key_vect_[ ib ] + i_conn0, n_block_conn, n_prev_conn, source, outdegree );
-      DBGCUDASYNC;
-
-      // generate random target index in range 0 - n_neuron
-      CURAND_CALL( curandGenerate( local_rnd_gen_, ( uint* ) d_conn_storage_, n_block_conn ) );
-      setTarget< T2, ConnStructT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>(
-        conn_struct_vect_[ ib ] + i_conn0, ( uint* ) d_conn_storage_, n_block_conn, target, n_target );
-      DBGCUDASYNC;
-
-      setConnectionWeights(
-        local_rnd_gen_, d_conn_storage_, conn_struct_vect_[ ib ] + i_conn0, n_block_conn, syn_spec );
-
-      setConnectionDelays( local_rnd_gen_, d_conn_storage_, conn_key_vect_[ ib ] + i_conn0, n_block_conn, syn_spec );
-
-      setPort< ConnKeyT, ConnStructT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>(
-        conn_key_vect_[ ib ] + i_conn0, conn_struct_vect_[ ib ] + i_conn0, syn_spec.port_, n_block_conn );
-      DBGCUDASYNC;
-      setSynGroup< ConnKeyT, ConnStructT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>(
-        conn_key_vect_[ ib ] + i_conn0, conn_struct_vect_[ ib ] + i_conn0, syn_spec.syn_group_, n_block_conn );
-      DBGCUDASYNC;
-    }
+    
+    setOutdegreeSource< T1, ConnKeyT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>(
+      conn_key_vect_[ ib ] + i_conn0, n_block_conn, n_prev_conn, source, outdegree );
+    DBGCUDASYNC;
+
+    // generate random target index in range 0 - n_neuron
+    CURAND_CALL( curandGenerate( local_rnd_gen_, ( uint* ) d_conn_storage_, n_block_conn ) );
+    setTarget< T2, ConnStructT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>(
+      conn_struct_vect_[ ib ] + i_conn0, ( uint* ) d_conn_storage_, n_block_conn, target, n_target );
+    DBGCUDASYNC;
 
+    setConnectionWeights(
+      local_rnd_gen_, d_conn_storage_, conn_struct_vect_[ ib ] + i_conn0, n_block_conn, syn_spec );
+
+    setConnectionDelays( local_rnd_gen_, d_conn_storage_, conn_key_vect_[ ib ] + i_conn0, n_block_conn, syn_spec );
+
+    setPort< ConnKeyT, ConnStructT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>(
+      conn_key_vect_[ ib ] + i_conn0, conn_struct_vect_[ ib ] + i_conn0, syn_spec.port_, n_block_conn );
+    DBGCUDASYNC;
+    setSynGroup< ConnKeyT, ConnStructT > <<< ( n_block_conn + 1023 ) / 1024, 1024 >>>(
+      conn_key_vect_[ ib ] + i_conn0, conn_struct_vect_[ ib ] + i_conn0, syn_spec.syn_group_, n_block_conn );
+    DBGCUDASYNC;
+    
     n_prev_conn += n_block_conn;
   }
 
diff --git a/src/cuda_error.h b/src/cuda_error.h
index 044533624..5ee10695f 100644
--- a/src/cuda_error.h
+++ b/src/cuda_error.h
@@ -171,7 +171,7 @@ gpuAssert( cudaError_t code, const char* file, int line, bool abort = true )
   }
 #endif
 
-#define ACTIVATE_PRINT_TIME
+//#define ACTIVATE_PRINT_TIME
 #ifdef ACTIVATE_PRINT_TIME
 #define PRINT_TIME                              \
   gpuErrchk( cudaPeekAtLastError() );           \
diff --git a/src/input_spike_buffer.h b/src/input_spike_buffer.h
index 353eec19b..b98a09fe5 100644
--- a/src/input_spike_buffer.h
+++ b/src/input_spike_buffer.h
@@ -132,7 +132,46 @@ getMaxInputDelayKernel( int64_t n_conn, int** max_input_delay )
   atomicMax( &max_input_delay[ i_target ][ i_port ], i_delay + 1 );
 }
 
+template < class ConnKeyT, class ConnStructT >
+__global__ void
+testMaxInputDelayKernel( int64_t n_conn, int** max_input_delay, int n_local_nodes, int max_n_ports )
+{
+  int64_t i_conn = ( int64_t ) blockIdx.x * blockDim.x + threadIdx.x;
+  if ( i_conn >= n_conn )
+  {
+    return;
+  }
+  // get connection block and relative index within the block
+  uint i_block = ( uint ) ( i_conn / ConnBlockSize );
+  int64_t i_block_conn = i_conn % ConnBlockSize;
+
+  // get references to key-structure pair representing the connection
+  ConnKeyT& conn_key = ( ( ConnKeyT** ) ConnKeyArray )[ i_block ][ i_block_conn ];
+  ConnStructT& conn_struct = ( ( ConnStructT** ) ConnStructArray )[ i_block ][ i_block_conn ];
+
+  // MAYBE CAN BE IMPROVED BY USING A BIT TO SPECIFY IF A CONNECTION IS DIRECT
+  // get target node index and delay
+  inode_t i_target = getConnTarget< ConnStructT >( conn_struct );
+  int i_port = getConnPort< ConnKeyT, ConnStructT >( conn_key, conn_struct );
+  int i_delay = getConnDelay< ConnKeyT >( conn_key );
+
+  if (i_target>=n_local_nodes) {
+    printf("i_target %d >= n_local_nodes %d\n", i_target, n_local_nodes);
+    printf("i_delay %d\n", i_delay);
+  }
+  if (i_port>=max_n_ports) {
+    printf("i_port %d >= max_n_port %d\t", i_port, max_n_ports);
+    printf("i_target %d\n", i_target);
+
+  }
+  
+  // atomic operation to avoid conflicts in memory access
+  // if delay is larger than current maximum evaluated for target node, update the maximum
+  //atomicMax( &max_input_delay[ i_target ][ i_port ], i_delay + 1 );
+}
+
 
+  
 // Evaluates the index of the first outgoing connection of each source node
 template < class ConnKeyT >
 __global__ void
@@ -452,6 +491,8 @@ ConnectionTemplate< ConnKeyT, ConnStructT >::initInputSpikeBuffer( inode_t n_loc
   }
 
   if (n_conn_ > 0) {
+    //testMaxInputDelayKernel< ConnKeyT, ConnStructT > <<< ( n_conn_ + 1023 ) / 1024, 1024 >>>(n_conn_, d_max_input_delay_, n_local_nodes, 2 );
+    //CUDASYNC;
     getMaxInputDelayKernel< ConnKeyT, ConnStructT > <<< ( n_conn_ + 1023 ) / 1024, 1024 >>>(n_conn_, d_max_input_delay_ );
     DBGCUDASYNC;
   }
diff --git a/src/mpi_comm.cu b/src/mpi_comm.cu
index 8093575da..16c351cb7 100644
--- a/src/mpi_comm.cu
+++ b/src/mpi_comm.cu
@@ -44,7 +44,6 @@ int
 NESTGPU::SendSpikeToRemote( int n_ext_spikes )
 {
 #ifdef HAVE_MPI
-  MPI_Request request;
   int mpi_id, tag = 1; // id is already in the class, can be removed
   MPI_Comm_rank( MPI_COMM_WORLD, &mpi_id );
 
@@ -106,26 +105,33 @@ NESTGPU::SendSpikeToRemote( int n_ext_spikes )
       h_ExternalHostGroupSpikeIdx0[i] = 0;
     }
   }
- 
+
   SendSpikeToRemote_CUDAcp_time_ += ( getRealTime() - time_mark );
   time_mark = getRealTime();
 
   // loop on remote MPI proc
   for ( int ih = 0; ih < n_hosts_; ih++ )
   {
-    if ( ( int ) ih == mpi_id || p2p_host_conn_matrix[this_host_][ih]==false)
+    if (ih == mpi_id || p2p_host_conn_matrix[this_host_][ih]==false)
     { // skip self MPI proc and unused point-to-point MPI communications
+      recv_mpi_request[ n_hosts_ + ih ] = MPI_REQUEST_NULL;
       continue;
     }
     // get index and size of spike packet that must be sent to MPI proc ih
     // array_idx is the first index of the packet for host ih
     int array_idx = h_ExternalTargetSpikeIdx0[ ih ];
     int n_spikes = h_ExternalTargetSpikeIdx0[ ih + 1 ] - array_idx;
-    // printf("MPI_Send (src,tgt,nspike): %d %d %d\n", mpi_id, ih, n_spike);
-
     // nonblocking sent of spike packet to MPI proc ih
-    MPI_Isend( &h_ExternalTargetSpikeNodeId[ array_idx ], n_spikes, MPI_INT, ih, tag, MPI_COMM_WORLD, &request );
-    MPI_Request_free(&request);
+    if (n_spikes >= max_spike_per_host_) {
+      throw ngpu_exception( std::string("MPI_Isend error from host ") + std::to_string(this_host_) +
+			    " to host " + std::to_string(ih) +
+			    "\nNumber of spikes to be sent remotely " + std::to_string( n_spikes ) +
+			    " larger than limit " + std::to_string( max_spike_per_host_ ) +
+			    "\nYou can try to increase the kernel parameter \"max_spike_per_host_fact_\"." );
+    }
+
+    MPI_Isend( &h_ExternalTargetSpikeNodeId[ array_idx ], n_spikes, MPI_UNSIGNED, ih, tag, MPI_COMM_WORLD,
+               &recv_mpi_request[ n_hosts_ + ih ] );
 
     // printf("MPI_Send nspikes (src,tgt,nspike): "
     //	   "%d %d %d\n", mpi_id, ih, n_spikes);
@@ -158,25 +164,26 @@ NESTGPU::RecvSpikeFromRemote()
   // loop on remote MPI proc
   for ( int i_host = 0; i_host < n_hosts_; i_host++ )
   {
-    if ( ( int ) i_host == mpi_id || p2p_host_conn_matrix[i_host][this_host_]==false)
+    if (i_host == mpi_id || p2p_host_conn_matrix[i_host][this_host_]==false)
     {
       recv_mpi_request[ i_host ] = MPI_REQUEST_NULL;
-      continue; // skip self MPI proc
+      continue;
     }
     // start nonblocking MPI receive from MPI proc i_host
     MPI_Irecv( &h_ExternalSourceSpikeNodeId[0][ i_host * max_spike_per_host_ ],
       max_spike_per_host_,
-      MPI_INT,
+      MPI_UNSIGNED,
       i_host,
       tag,
       MPI_COMM_WORLD,
       &recv_mpi_request[ i_host ] );
   }
-  MPI_Status statuses[ n_hosts_ ];
+  MPI_Status statuses[ 2*n_hosts_ ];
   //recv_mpi_request[ mpi_id ] = MPI_REQUEST_NULL;
   //MPI_Waitall( n_hosts_ + nhg - 1, recv_mpi_request, statuses );
-  MPI_Waitall( n_hosts_, recv_mpi_request, statuses );
+  MPI_Waitall( 2*n_hosts_, recv_mpi_request, statuses );
 
+  
   std::vector< std::vector< int > > &host_group = conn_->getHostGroup();
   std::vector<MPI_Comm> &mpi_comm_vect = conn_->getMPIComm();
   uint nhg = host_group.size();
@@ -215,13 +222,20 @@ NESTGPU::RecvSpikeFromRemote()
     }
     int count = 0;
     if (p2p_host_conn_matrix[i_host][this_host_]==true) {
-      MPI_Get_count( &statuses[ i_host ], MPI_INT, &count );
+      MPI_Get_count( &statuses[ i_host ], MPI_UNSIGNED, &count );
+    }
+    if (count < 0 || count > max_spike_per_host_) {
+      throw ngpu_exception( std::string("MPI_Irecv error in host ") + std::to_string(this_host_) +
+			    " from host " + std::to_string(i_host) +
+			    "\nNumber of spikes sent remotely larger than limit " +
+			    std::to_string( max_spike_per_host_ ) +
+			    "\nYou can try to increase the kernel parameter \"max_spike_per_host_fact_\"." );
     }
     h_ExternalSourceSpikeNum[0][ i_host ] = count;
   }
   
   // Maybe the barrier is not necessary?
-  MPI_Barrier( MPI_COMM_WORLD );
+  //MPI_Barrier( MPI_COMM_WORLD );
   RecvSpikeFromRemote_comm_time_ += ( getRealTime() - time_mark );
   
   return 0;
@@ -249,7 +263,7 @@ NESTGPU::ConnectMpiInit( int argc, char* argv[] )
   setNHosts( n_hosts );
   setThisHost( this_host );
   //conn_->remoteConnectionMapInit();
-  recv_mpi_request = new MPI_Request[ n_hosts_ ];
+  recv_mpi_request = new MPI_Request[ 2*n_hosts_ ];
 
   return 0;
 #else
diff --git a/src/nestgpu.cu b/src/nestgpu.cu
index 7d0df80f4..37747a2d6 100644
--- a/src/nestgpu.cu
+++ b/src/nestgpu.cu
@@ -491,6 +491,8 @@ NESTGPU::Calibrate()
     ExternalSpikeInit();
 
     PRINT_TIME;
+
+    //conn_->remoteConnectionMapSave();
   }
 
   if ( conn_->getRevConnFlag() )
diff --git a/src/remote_connect.cu b/src/remote_connect.cu
index 8afe3b9cc..c1917c710 100644
--- a/src/remote_connect.cu
+++ b/src/remote_connect.cu
@@ -265,6 +265,39 @@ insertNodesInMapKernel( uint** node_map,
   }
 }
 
+// Only for debugging P2P MPI spike communication
+__global__ void
+checkMapIndexToImageNodeKernel( uint n_hosts, uint* host_offset, uint* node_index, uint *n_map,
+                               uint node_index_size, int this_host)
+{
+  const uint i_host = blockIdx.x;
+  if ( i_host < n_hosts )
+  {
+    const uint pos = host_offset[ i_host ];
+    const uint num = host_offset[ i_host + 1 ] - pos;
+    for ( uint i_elem = threadIdx.x; i_elem < num; i_elem += blockDim.x )
+    {
+      const uint i_node_map = node_index[ pos + i_elem ];
+      if ((pos + i_elem)>=node_index_size) {
+	printf("Error in checkMapIndexToImageNodeKernel.\n"
+	       "\t(pos + i_elem)>=node_index_size\n"
+	       "\tthis_host: %d\ti_host: %d\tpos: %d\tnum: %d\ti_elem: %d\tinode_map: %d\n",
+	       this_host, i_host, pos, num, i_elem, i_node_map);
+      }
+      if (i_node_map>=n_map[i_host]) {
+	printf("Error in checkMapIndexToImageNodeKernel.\n"
+	       "\ti_node_map>=n_map[i_host]\n"
+	       "\tthis_host: %d\ti_host: %d\tpos: %d\tnum: %d\ti_elem: %d\tinode_map: %d\n",
+	       this_host, i_host, pos, num, i_elem, i_node_map);
+      }
+      //const uint i_block = i_node_map / node_map_block_size;
+      //const uint i = i_node_map % node_map_block_size;
+      //const uint i_image_node = local_image_node_map[ 0 ][ i_host ][ i_block ][ i ]; 
+      //node_index[ pos + i_elem ] = i_image_node;
+    }
+  }
+}
+
 // This function is used only by point-by-point communication, not by host groups
 __global__ void
 MapIndexToImageNodeKernel( uint n_hosts, uint* host_offset, uint* node_index )
diff --git a/src/remote_connect.h b/src/remote_connect.h
index f66f710dd..c5d92a7d5 100644
--- a/src/remote_connect.h
+++ b/src/remote_connect.h
@@ -3,6 +3,8 @@
 #define REMOTECONNECTH
 // #include <cub/cub.cuh>
 #include <vector>
+#include <fstream>
+
 #include "getRealTime.h"
 // #include "nestgpu.h"
 #include "connect.h"
@@ -210,7 +212,10 @@ addOffsetToExternalNodeIdsKernel( int64_t n_conn,
 
 __global__ void MapIndexToImageNodeKernel( uint n_hosts, uint* host_offset, uint* node_index );
 
-
+// only for debugging
+__global__ void
+checkMapIndexToImageNodeKernel( uint n_hosts, uint* host_offset, uint* node_index, uint *n_map,
+                               uint node_index_size, int this_host);
 
 // Allocate GPU memory for new remote-source-node-map blocks
 template < class ConnKeyT, class ConnStructT >
@@ -682,40 +687,41 @@ ConnectionTemplate< ConnKeyT, ConnStructT >::remoteConnectionMapCalibrate( inode
 
   PRINT_TIME;
 
-  uint n_src_max = 0;
-  
+  uint src_node_max = 0;
+
+  std::vector< std::unordered_set <int> > node_target_host_group_us;
   node_target_host_group_.resize(n_nodes);
+  node_target_host_group_us.resize(n_nodes);
+  
   for (uint group_local_id=1; group_local_id<nhg; group_local_id++) {
     host_group_local_source_node_map_[group_local_id].resize(n_nodes);
     uint nh = host_group_[group_local_id].size(); // number of hosts in the group
     for ( uint gi_host = 0; gi_host < nh; gi_host++ ) {// loop on hosts
       uint n_src = host_group_source_node_[group_local_id][gi_host].size();
-      n_src_max = max(n_src_max, n_src);
+      //n_src_max = max(n_src_max, n_src);
       host_group_source_node_vect_[group_local_id][gi_host].resize(n_src);
       std::copy(host_group_source_node_[group_local_id][gi_host].begin(), host_group_source_node_[group_local_id][gi_host].end(),
 		host_group_source_node_vect_[group_local_id][gi_host].begin());
       
       host_group_local_node_index_[group_local_id][gi_host].resize(n_src);
 
-      /////////////// check that the following is not needed
-      //std::fill(host_group_local_node_index_[group_local_id][gi_host].begin(), host_group_local_node_index_[group_local_id][gi_host].end(), -1);
     }
   }
   
   PRINT_TIME;
-  std::vector<uint> tmp_node_map(n_src_max, 0);
-  
-    
+  std::vector<int64_t> tmp_node_map;
+  //tmp_node_map.resize(src_node_max);
+
   for (uint group_local_id=1; group_local_id<nhg; group_local_id++) {
     uint nh = host_group_[group_local_id].size(); // number of hosts in the group
     for ( uint gi_host = 0; gi_host < nh; gi_host++ ) {// loop on hosts
       int src_host = host_group_[group_local_id][gi_host];
+      ///*
       if ( src_host != this_host_ ) { // skip self host
 	// get number of elements in the map
 	uint n_node_map;
 	gpuErrchk(
 		  cudaMemcpy( &n_node_map, &d_n_remote_source_node_map_[group_local_id][ gi_host ], sizeof( uint ), cudaMemcpyDeviceToHost ) );
-
 	if (n_node_map > 0) {
 	  hc_remote_source_node_map_[group_local_id][gi_host].resize(n_node_map);
 	  hc_image_node_map_[group_local_id][gi_host].resize(n_node_map);
@@ -734,19 +740,44 @@ ConnectionTemplate< ConnKeyT, ConnStructT >::remoteConnectionMapCalibrate( inode
 	    gpuErrchk(cudaMemcpy(&hc_image_node_map_[group_local_id][gi_host][ib*node_map_block_size_],
 				 h_image_node_map_[group_local_id][gi_host][ib], n_elem*sizeof(uint), cudaMemcpyDeviceToHost ));
 	  }
-       
+       	  ///*
+	  bool resize_flag = false;
+	  for (uint i=0; i<n_node_map; i++) {
+	    inode_t src_node = hc_remote_source_node_map_[group_local_id][gi_host][i];
+	    if (src_node>src_node_max) {
+	      resize_flag = true;
+	      src_node_max = src_node;
+	      //std::cerr << "Error. src_node: " << src_node << " greater than n_src_max: " << n_src_max << std::endl;
+	      //exit(0);
+	    }
+	  }
+	  if ( resize_flag ) {
+	    tmp_node_map.resize(src_node_max+1);
+	  }
+	  std::fill(tmp_node_map.begin(), tmp_node_map.end(), -1);
 	  for (uint i=0; i<n_node_map; i++) {
 	    inode_t src_node = hc_remote_source_node_map_[group_local_id][gi_host][i];
 	    tmp_node_map[src_node] = hc_image_node_map_[group_local_id][gi_host][i];
 	  }
+	  
 	  for (uint i=0; i<host_group_source_node_vect_[group_local_id][gi_host].size(); i++) {
 	    inode_t src_node = host_group_source_node_vect_[group_local_id][gi_host][i];
-	    inode_t pos = tmp_node_map[src_node];
+	    int64_t pos;
+	    if (src_node>src_node_max) {
+	      pos = -1;
+	    }
+	    else {
+	      pos = tmp_node_map[src_node];
+	    }
 	    //if (pos<0) {
 	    //  throw ngpu_exception( "source node not found in host map" );
 	    //}
 	    host_group_local_node_index_[group_local_id][gi_host][i] = pos;
 	  }
+	  
+	}
+	else {
+	  std::fill(host_group_local_node_index_[group_local_id][gi_host].begin(), host_group_local_node_index_[group_local_id][gi_host].end(), -1);
 	}
       }
       else { // only in the source, i.e. if src_host == this_host_
@@ -754,16 +785,128 @@ ConnectionTemplate< ConnKeyT, ConnStructT >::remoteConnectionMapCalibrate( inode
 	for (uint i=0; i<n_src; i++) {
 	  inode_t i_source = host_group_source_node_vect_[group_local_id][gi_host][i];
 	  host_group_local_source_node_map_[group_local_id][i_source] = i;
-	  node_target_host_group_[i_source].insert(group_local_id);	  
+	  std::pair<std::unordered_set<int>::iterator, bool> insert_it =
+	    node_target_host_group_us[i_source].insert(group_local_id);
+	  if (insert_it.second){
+	    node_target_host_group_[i_source].push_back(group_local_id);
+	  }
 	}
       }
     }
   }
   PRINT_TIME;
- 
+
   return 0;
 }
 
+template < class ConnKeyT, class ConnStructT >
+int
+ConnectionTemplate< ConnKeyT, ConnStructT >::remoteConnectionMapSave()
+{
+  std::vector< std::vector< uint > > hc_remote_source_node_map;
+  std::vector< std::vector< uint > > hc_image_node_map;
+  
+  hc_remote_source_node_map.resize(n_hosts_);
+  hc_image_node_map.resize(n_hosts_);
+  
+  for ( int src_host = 0; src_host < n_hosts_; src_host++ ) {// loop on hosts
+    if ( src_host != this_host_ ) { // skip self host
+      // get number of elements in the map
+      uint n_node_map;
+      gpuErrchk(
+		cudaMemcpy( &n_node_map, &d_n_remote_source_node_map_[0][ src_host ], sizeof( uint ), cudaMemcpyDeviceToHost ) );
+      
+      if (n_node_map > 0) {
+	hc_remote_source_node_map[src_host].resize(n_node_map);
+	hc_image_node_map[src_host].resize(n_node_map);
+	// loop on remote-source-node-to-local-image-node map blocks
+	uint n_map_blocks =  h_remote_source_node_map_[0][src_host].size();
+	
+	for (uint ib=0; ib<n_map_blocks; ib++) {
+	  uint n_elem;
+	  if (ib<n_map_blocks-1) {
+	    n_elem = node_map_block_size_;
+	  }
+	  else {
+	    n_elem = (n_node_map - 1) % node_map_block_size_ + 1;
+	  }
+	  gpuErrchk(cudaMemcpy(&hc_remote_source_node_map[src_host][ib*node_map_block_size_],
+			       h_remote_source_node_map_[0][src_host][ib], n_elem*sizeof(uint), cudaMemcpyDeviceToHost ));
+	  gpuErrchk(cudaMemcpy(&hc_image_node_map[src_host][ib*node_map_block_size_],
+			       h_image_node_map_[0][src_host][ib], n_elem*sizeof(uint), cudaMemcpyDeviceToHost ));
+	}
+	
+      }
+      
+      std::string filename = std::string("map_remote_src_") + std::to_string(this_host_) + "_" + std::to_string(src_host) + ".dat";
+      std::ofstream ofs;
+      ofs.open(filename, std::ios::out);      
+      if ( ofs.fail() ) {
+	std::cerr << "Cannot open output file\n"; 
+	exit(-1);
+      }
+      ofs << n_node_map << std::endl;
+      for (uint i=0; i<n_node_map; i++) {
+	ofs << hc_remote_source_node_map[src_host][i] << "\t" << hc_image_node_map[src_host][i] << std::endl;
+      }
+      ofs.close();
+
+    }
+  }
+
+
+  std::vector< std::vector< uint > > hc_local_source_node_map;
+  
+  hc_local_source_node_map.resize(n_hosts_);
+  
+  for ( int tg_host = 0; tg_host < n_hosts_; tg_host++ ) {// loop on hosts
+    if ( tg_host != this_host_ ) { // skip self host
+      // get number of elements in the map
+      uint n_node_map;
+      gpuErrchk(
+		cudaMemcpy( &n_node_map, &d_n_local_source_node_map_[ tg_host ], sizeof( uint ), cudaMemcpyDeviceToHost ) );
+      
+      if (n_node_map > 0) {
+	hc_local_source_node_map[tg_host].resize(n_node_map);
+	// loop on remote-source-node-to-local-image-node map blocks
+	uint n_map_blocks =  h_local_source_node_map_[tg_host].size();
+	
+	for (uint ib=0; ib<n_map_blocks; ib++) {
+	  uint n_elem;
+	  if (ib<n_map_blocks-1) {
+	    n_elem = node_map_block_size_;
+	  }
+	  else {
+	    n_elem = (n_node_map - 1) % node_map_block_size_ + 1;
+	  }
+	  gpuErrchk(cudaMemcpy(&hc_local_source_node_map[tg_host][ib*node_map_block_size_],
+			       h_local_source_node_map_[tg_host][ib], n_elem*sizeof(uint), cudaMemcpyDeviceToHost ));
+	}
+	
+      }
+      
+      std::string filename = std::string("map_local_src_") + std::to_string(this_host_) + "_" + std::to_string(tg_host) + ".dat";
+      std::ofstream ofs;
+      ofs.open(filename, std::ios::out);      
+      if ( ofs.fail() ) {
+	std::cerr << "Cannot open output file\n"; 
+	exit(-1);
+      }
+      ofs << n_node_map << std::endl;
+      for (uint i=0; i<n_node_map; i++) {
+	ofs << hc_local_source_node_map[tg_host][i] << std::endl;
+      }
+      ofs.close();
+
+    }
+  }
+  
+  return 0;
+}
+
+
+
+
 template < class ConnKeyT, class ConnStructT >
 int
 ConnectionTemplate< ConnKeyT, ConnStructT >::addOffsetToImageNodeMap( inode_t n_nodes )
@@ -1536,7 +1679,7 @@ ConnectionTemplate< ConnKeyT, ConnStructT >::CreateHostGroup(int *host_arr, int
     host_group_source_node_vect_.push_back(empty_node_vect);
     
     host_group_local_source_node_map_.push_back(std::vector< uint >());
-    std::vector< std::vector< int > > hg_lni(hg.size(), std::vector< int >());
+    std::vector< std::vector< int64_t > > hg_lni(hg.size(), std::vector< int64_t >());
     host_group_local_node_index_.push_back(hg_lni);
   }
 #ifdef HAVE_MPI
diff --git a/src/remote_spike.cu b/src/remote_spike.cu
index 910b4322d..c6567d156 100644
--- a/src/remote_spike.cu
+++ b/src/remote_spike.cu
@@ -355,15 +355,14 @@ NESTGPU::ExternalSpikeInit()
   h_ExternalTargetSpikeNum.resize( n_hosts_ );
   h_ExternalSourceSpikeIdx0.resize( n_hosts_ + 1 );
   h_ExternalTargetSpikeNodeId.resize( max_remote_spike_num_ );
-
   h_ExternalHostGroupSpikeNodeId.resize( max_remote_spike_num_ );
-
   h_ExternalSourceSpikeDispl.resize( n_hosts_ );
+  
   h_ExternalSourceSpikeDispl[0] = 0;
   for (int ih=1; ih<n_hosts_; ih++ ) {
     h_ExternalSourceSpikeDispl[ih] = h_ExternalSourceSpikeDispl[ih-1] + max_spike_per_host_;
   }
-  
+
   CUDAMALLOCCTRL( "&d_ExternalSpikeNum", &d_ExternalSpikeNum, sizeof( uint ) );
   CUDAMALLOCCTRL( "&d_ExternalSpikeSourceNode", &d_ExternalSpikeSourceNode, max_spike_per_host_ * sizeof( uint ) );
 
@@ -411,6 +410,7 @@ NESTGPU::ExternalSpikeInit()
 
   h_ExternalSourceSpikeNum[0].resize( n_hosts_ );
   h_ExternalSourceSpikeNodeId[0].resize( max_remote_spike_num_ );
+
   for (uint ihg=1; ihg<nhg; ihg++) {
     h_ExternalSourceSpikeNum[ihg].resize( host_group[ihg].size() );
     h_ExternalSourceSpikeNodeId[ihg].resize( max_remote_spike_num_ );
@@ -419,10 +419,11 @@ NESTGPU::ExternalSpikeInit()
   
   h_ExternalHostGroupSpikeIdx0.resize( nhg + 1 );
   h_ExternalHostGroupSpikeNum.resize( nhg );
+
   CUDAMALLOCCTRL( "&d_ExternalHostGroupSpikeIdx0", &d_ExternalHostGroupSpikeIdx0, ( nhg + 1 ) * sizeof( uint ) );
   CUDAMALLOCCTRL( "&d_ExternalHostGroupSpikeNum", &d_ExternalHostGroupSpikeNum, nhg * sizeof( uint ) );
   
-  std::vector< std::unordered_set < int > > &node_target_host_group = conn_->getNodeTargetHostGroup();
+  const std::vector< std::vector < int > > &node_target_host_group = conn_->getNodeTargetHostGroup();
   std::vector< std::vector< uint > > &host_group_local_source_node_map = conn_->getHostGroupLocalSourceNodeMap();
 
   std::vector < uint > n_node_target_host_group(n_node, 0);
@@ -444,7 +445,7 @@ NESTGPU::ExternalSpikeInit()
 
   auto node_target_host_group_it = node_target_host_group_flat.begin();
   uint *host_group_node_id_pt = &host_group_node_id_flat[0];
-    
+
   for (inode_t i_node = 0; i_node<n_node; i_node++) {
     uint ntg = n_node_target_host_group[i_node];
     if (node_target_host_group[i_node].size() != ntg) {
@@ -464,10 +465,10 @@ NESTGPU::ExternalSpikeInit()
     }
     if (i_node < n_node - 1) {
       hd_ExternalNodeTargetHostGroupId[i_node + 1] = hd_ExternalNodeTargetHostGroupId[i_node] + ntg;
-      hd_ExternalHostGroupNodeId[i_node + 1] = hd_ExternalHostGroupNodeId[i_node] + ntg; 
+      hd_ExternalHostGroupNodeId[i_node + 1] = hd_ExternalHostGroupNodeId[i_node] + ntg;
     }
   }
-
+  
   gpuErrchk( cudaMemcpy( hd_ExternalNodeTargetHostGroupId[0], &node_target_host_group_flat[0], ntg_tot * sizeof( uint ),
 			 cudaMemcpyHostToDevice ) );
   gpuErrchk( cudaMemcpy( hd_ExternalHostGroupNodeId[0], &host_group_node_id_flat[0], ntg_tot * sizeof( uint ),
@@ -482,7 +483,6 @@ NESTGPU::ExternalSpikeInit()
   gpuErrchk( cudaMemcpy( d_NExternalNodeTargetHostGroup, &n_node_target_host_group[0], n_node*sizeof(uint),
 			 cudaMemcpyHostToDevice ) );
   
-  
   if ( remote_spike_mul_ )
   {
     DeviceExternalSpikeInit<<< 1, 1 >>>( n_hosts_,
@@ -671,11 +671,12 @@ NESTGPU::CopySpikeFromRemote()
       if ( n_spike_tot >= max_remote_spike_num_ )
 	{
 	  throw ngpu_exception( std::string( "Number of spikes to be received remotely " ) + std::to_string( n_spike_tot )
-				+ " larger than limit " + std::to_string( max_remote_spike_num_ ) );
+				+ " larger than limit " + std::to_string( max_remote_spike_num_ )
+				+ "\nYou can try to increase the kernel parameter \"max_remote_spike_num_fact\"." );
 	} 
     }
   }
-  std::vector<std::vector< std::vector< int > > > &host_group_local_node_index = conn_->getHostGroupLocalNodeIndex();
+  std::vector<std::vector< std::vector< int64_t > > > &host_group_local_node_index = conn_->getHostGroupLocalNodeIndex();
 
   std::vector< std::vector< int > > &host_group = conn_->getHostGroup();
   uint nhg = host_group.size();
@@ -686,10 +687,11 @@ NESTGPU::CopySpikeFromRemote()
       int i_host = host_group[group_local_id][gi_host];
       if (i_host != this_host_) {
 	int n_spike = h_ExternalSourceSpikeNum[group_local_id][ gi_host ];
+
 	for ( int i_spike = 0; i_spike < n_spike; i_spike++ ) {
 	  // pack spikes received from remote hosts
 	  inode_t node_pos = h_ExternalSourceSpikeNodeId[group_local_id][ gi_host * max_spike_per_host_ + i_spike ];
-	  int node_local = host_group_local_node_index[group_local_id][gi_host][node_pos];
+	  int64_t node_local = host_group_local_node_index[group_local_id][gi_host][node_pos];
 	  if (node_local >= 0) {
 	    h_ExternalSourceSpikeNodeId_flat[ n_spike_tot ] = node_local;
 	    n_spike_tot++;
@@ -720,6 +722,17 @@ NESTGPU::CopySpikeFromRemote()
       cudaMemcpyHostToDevice ) );
     DBGCUDASYNC;
     RecvSpikeFromRemote_CUDAcp_time_ += ( getRealTime() - time_mark );
+
+//#define CHECK_MAP_INDEX_TO_IMAGE_NODE
+#ifdef CHECK_MAP_INDEX_TO_IMAGE_NODE
+    std::vector< uint* > &d_n_remote_source_node_map = conn_->getDevNRemoteSourceNodeMap();;
+    uint *n_map = d_n_remote_source_node_map[0];
+    checkMapIndexToImageNodeKernel<<< n_hosts_, 1024 >>>(
+      n_hosts_, d_ExternalSourceSpikeIdx0, d_ExternalSourceSpikeNodeId,
+      n_map, max_remote_spike_num_, this_host_); // n_spike_tot -> max_remote_spike_num_
+    CUDASYNC;
+#endif
+    
     // convert node map indexes to image node indexes
     MapIndexToImageNodeKernel<<< n_hosts_, 1024 >>>(
       n_hosts_, d_ExternalSourceSpikeIdx0, d_ExternalSourceSpikeNodeId );
diff --git a/src/spike_buffer.cu b/src/spike_buffer.cu
index e696d45a1..a28ba7c5f 100644
--- a/src/spike_buffer.cu
+++ b/src/spike_buffer.cu
@@ -181,9 +181,8 @@ PushSpike( int i_spike_buffer, float mul )
       if ( n_rec_spike_times >= max_n_rec_spike_times - 1 )
       {
         printf(
-          "Maximum number of recorded spike times exceeded"
-          " for spike buffer %d\n",
-          i_spike_buffer );
+          "Maximum number of recorded spike times %d exceeded"
+          " for spike buffer %d\n", max_n_rec_spike_times, i_spike_buffer );
       }
       else
       { // record spike time