Skip to content

Commit

Permalink
Merge pull request #87 from golosio/local-2.0-dev-mpi
Browse files Browse the repository at this point in the history
Removed unnecessary memory allocation and updates for image nodes
  • Loading branch information
golosio committed Oct 19, 2023
2 parents b90f386 + e2b88a8 commit 772d38e
Show file tree
Hide file tree
Showing 6 changed files with 162 additions and 124 deletions.
86 changes: 55 additions & 31 deletions src/nestgpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ NESTGPU::NESTGPU()
sim_time_ = 1000.0; //Simulation time in ms
//n_poiss_nodes_ = 0;
n_remote_nodes_.assign(1, 0);
n_ext_nodes_ = 0;
n_image_nodes_ = 0;
SetTimeResolution(0.1); // time resolution in ms
max_spike_num_fact_ = 1.0;
max_spike_per_host_fact_ = 1.0;
Expand Down Expand Up @@ -390,31 +390,45 @@ int NESTGPU::GetMaxSpikeBufferSize()
return max_spike_buffer_size_;
}

uint NESTGPU::GetNNode()
uint NESTGPU::GetNLocalNodes()
{
return node_group_map_.size();
}

int NESTGPU::CreateNodeGroup(int n_node, int n_port)
int NESTGPU::CheckImageNodes(int n_nodes)
{
int i_node_0 = GetNNode();
int max_n_neurons = IntPow(2,h_MaxNodeNBits);
int i_node_0 = GetNLocalNodes();
int max_n_nodes = IntPow(2,h_MaxNodeNBits);

if ((i_node_0 + n_nodes) > max_n_nodes) {
throw ngpu_exception(std::string("Local plus Image nodes exceed maximum"
" number of nodes ")
+ std::to_string(max_n_nodes));
}

return i_node_0;
}

int NESTGPU::CreateNodeGroup(int n_nodes, int n_ports)
{
int i_node_0 = GetNLocalNodes();
int max_n_nodes = IntPow(2,h_MaxNodeNBits);
int max_n_ports = IntPow(2,h_MaxPortNBits);

if ((i_node_0 + n_node) > max_n_neurons) {
throw ngpu_exception(std::string("Maximum number of neurons ")
+ std::to_string(max_n_neurons) + " exceeded");
if ((i_node_0 + n_nodes) > max_n_nodes) {
throw ngpu_exception(std::string("Maximum number of nodes ")
+ std::to_string(max_n_nodes) + " exceeded");
}
if (n_port > max_n_ports) {
if (n_ports > max_n_ports) {
throw ngpu_exception(std::string("Maximum number of ports ")
+ std::to_string(max_n_ports) + " exceeded");
}
int i_group = node_vect_.size() - 1;
node_group_map_.insert(node_group_map_.end(), n_node, i_group);
node_group_map_.insert(node_group_map_.end(), n_nodes, i_group);

node_vect_[i_group]->random_generator_ = random_generator_;
node_vect_[i_group]->Init(i_node_0, n_node, n_port, i_group);
node_vect_[i_group]->get_spike_array_ = InitGetSpikeArray(n_node, n_port);
node_vect_[i_group]->Init(i_node_0, n_nodes, n_ports, i_group);
node_vect_[i_group]->get_spike_array_ = InitGetSpikeArray(n_nodes, n_ports);

return i_node_0;
}
Expand All @@ -439,17 +453,20 @@ int NESTGPU::Calibrate()
gpuErrchk(cudaMemcpyToSymbol(NESTGPUTimeResolution, &time_resolution_,
sizeof(float)));
///////////////////////////////////
i_ext_node_0_ = GetNNode();
// std::cout << "i_ext_node_0_: " << i_ext_node_0_ << " n_ext_nodes_: "
// << n_ext_nodes_ << "\n";
if (n_ext_nodes_ > 0) {
_Create("ext_neuron", n_ext_nodes_, 1);
int n_nodes = GetNLocalNodes();
gpuErrchk(cudaMemcpyToSymbol(n_local_nodes, &n_nodes,
sizeof(int)));

// std::cout << "n_local_nodes: " << n_nodes << " n_image_nodes_: "
// << n_image_nodes_ << "\n";
if (n_image_nodes_ > 0) {
CheckImageNodes(n_image_nodes_);
addOffsetToExternalNodeIds();
}

calibrate_flag_ = true;

organizeConnections(time_resolution_, GetNNode(),
organizeConnections(time_resolution_, GetTotalNNodes(),
NConn, h_ConnBlockSize,
KeySubarray, ConnectionSubarray);

Expand All @@ -459,7 +476,7 @@ int NESTGPU::Calibrate()

int max_delay_num = h_MaxDelayNum;

unsigned int n_spike_buffers = GetNNode();
unsigned int n_spike_buffers = GetTotalNNodes();
NestedLoop::Init(n_spike_buffers);

// temporary
Expand All @@ -472,17 +489,17 @@ int NESTGPU::Calibrate()
NodeGroupArrayInit();

max_spike_num_ = (int)round(max_spike_num_fact_
* GetNNode()
* GetTotalNNodes()
* max_delay_num);
max_spike_num_ = (max_spike_num_>1) ? max_spike_num_ : 1;

max_spike_per_host_ = (int)round(max_spike_per_host_fact_
* GetNNode()
* GetNLocalNodes()
* max_delay_num);
max_spike_per_host_ = (max_spike_per_host_>1) ? max_spike_per_host_ : 1;

SpikeInit(max_spike_num_);
SpikeBufferInit(GetNNode(), max_spike_buffer_size_);
SpikeBufferInit(GetTotalNNodes(), max_spike_buffer_size_);

//#ifndef CHECKRC
if (n_hosts_ > 1) {
Expand Down Expand Up @@ -577,7 +594,7 @@ int NESTGPU::Calibrate()
//#endif

if (rev_conn_flag_) {
RevSpikeInit(GetNNode());
RevSpikeInit(GetNLocalNodes());
}

multimeter_->OpenFiles();
Expand Down Expand Up @@ -706,7 +723,7 @@ int NESTGPU::SimulationStep()
double time_mark;

time_mark = getRealTime();
SpikeBufferUpdate<<<(GetNNode()+1023)/1024, 1024>>>();
SpikeBufferUpdate<<<(GetTotalNNodes()+1023)/1024, 1024>>>();
gpuErrchk( cudaPeekAtLastError() );
SpikeBufferUpdate_time_ += (getRealTime() - time_mark);
time_mark = getRealTime();
Expand Down Expand Up @@ -744,23 +761,27 @@ int NESTGPU::SimulationStep()
if (n_ext_spike != 0) {
time_mark = getRealTime();
SendExternalSpike<<<(n_ext_spike+1023)/1024, 1024>>>();
gpuErrchk( cudaPeekAtLastError() );
//gpuErrchk( cudaPeekAtLastError() );
CUDASYNC;
SendExternalSpike_time_ += (getRealTime() - time_mark);
}
//for (int ih=0; ih<connect_mpi_->mpi_np_; ih++) {
//if (ih == connect_mpi_->mpi_id_) {

time_mark = getRealTime();
SendSpikeToRemote(n_hosts_, max_spike_per_host_);
CUDASYNC;
SendSpikeToRemote_time_ += (getRealTime() - time_mark);
time_mark = getRealTime();
RecvSpikeFromRemote(n_hosts_, max_spike_per_host_);
CUDASYNC;
RecvSpikeFromRemote_time_ += (getRealTime() - time_mark);
CopySpikeFromRemote(n_hosts_, max_spike_per_host_);
CUDASYNC;
MPI_Barrier(MPI_COMM_WORLD);

}
CUDASYNC;

int n_spikes;
time_mark = getRealTime();
Expand Down Expand Up @@ -824,7 +845,8 @@ int NESTGPU::SimulationStep()
//time_mark = getRealTime();
RevSpikeReset<<<1, 1>>>();
gpuErrchk( cudaPeekAtLastError() );
RevSpikeBufferUpdate<<<(GetNNode()+1023)/1024, 1024>>>(GetNNode());
RevSpikeBufferUpdate<<<(GetNLocalNodes()+1023)/1024, 1024>>>
(GetNLocalNodes());
gpuErrchk( cudaPeekAtLastError() );
unsigned int n_rev_spikes;
gpuErrchk(cudaMemcpy(&n_rev_spikes, d_RevSpikeNum, sizeof(unsigned int),
Expand Down Expand Up @@ -1624,11 +1646,12 @@ int64_t *NESTGPU::GetConnections(int i_source, int n_source,
{
if (n_source<=0) {
i_source = 0;
n_source = GetNNode();
// gets also connections from image neurons
n_source = GetTotalNNodes();
}
if (n_target<=0) {
i_target = 0;
n_target = GetNNode();
n_target = GetNLocalNodes();
}
int *i_source_pt = new int[n_source];
for (int i=0; i<n_source; i++) {
Expand All @@ -1654,7 +1677,7 @@ int64_t *NESTGPU::GetConnections(int *i_source_pt, int n_source,
{
if (n_target<=0) {
i_target = 0;
n_target = GetNNode();
n_target = GetNLocalNodes();
}
int *i_target_pt = new int[n_target];
for (int i=0; i<n_target; i++) {
Expand All @@ -1676,7 +1699,8 @@ int64_t *NESTGPU::GetConnections(int i_source, int n_source,
{
if (n_source<=0) {
i_source = 0;
n_source = GetNNode();
// gets also connections from image neurons
n_source = GetTotalNNodes();
}
int *i_source_pt = new int[n_source];
for (int i=0; i<n_source; i++) {
Expand Down
14 changes: 9 additions & 5 deletions src/nestgpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -148,9 +148,10 @@ class NESTGPU
long long Nt_; // number of simulation time steps
//int n_poiss_nodes_;
std::vector<int> n_remote_nodes_;
int n_ext_nodes_;
int i_ext_node_0_;
//int n_ext_nodes_;
//int i_ext_node_0_;
//int i_remote_node_0_;
int n_image_nodes_;

double start_real_time_;
double build_real_time_;
Expand All @@ -177,13 +178,14 @@ class NESTGPU
int InitConnRandomGenerator();
int FreeConnRandomGenerator();

int CreateNodeGroup(int n_neuron, int n_port);
int CreateNodeGroup(int n_nodes, int n_ports);
int CheckUncalibrated(std::string message);
double *InitGetSpikeArray(int n_node, int n_port);
double *InitGetSpikeArray(int n_nodes, int n_ports);
int NodeGroupArrayInit();
int ClearGetSpikeArrays();
int FreeGetSpikeArrays();
int FreeNodeGroupMap();
int CheckImageNodes(int n_nodes);

NodeSeq _Create(std::string model_name, int n_nodes, int n_ports);

Expand Down Expand Up @@ -300,7 +302,9 @@ class NESTGPU
int SetMaxSpikeBufferSize(int max_size);
int GetMaxSpikeBufferSize();

uint GetNNode();
uint GetNLocalNodes();

uint GetTotalNNodes() { return GetNLocalNodes() + n_image_nodes_; }

int HostNum() {
return n_hosts_;
Expand Down
21 changes: 13 additions & 8 deletions src/remote_connect.cu
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,8 @@ int **d_node_map_index; // [i_node]
// - false otherwise (fixed_indegree, fixed_total_number, pairwise_bernoulli)
bool *use_all_source_nodes; // [n_connection_rules]:

__constant__ int n_local_nodes; // number of local nodes

// Allocate GPU memory for new remote-source-node-map blocks
int allocRemoteSourceNodeMapBlocks(std::vector<int*> &i_remote_src_node_map,
std::vector<int*> &i_local_spike_buf_map,
Expand Down Expand Up @@ -379,7 +381,7 @@ int NESTGPU::RemoteConnectionMapCalibrate(int i_host, int n_hosts)
////////////////////////////////////////
#endif

int n_nodes = GetNNode(); // number of nodes
int n_nodes = GetNLocalNodes(); // number of nodes
// n_target_hosts[i_node] is the number of remote target hosts
// on which each local node
// has outgoing connections
Expand Down Expand Up @@ -845,30 +847,31 @@ __global__ void MapIndexToSpikeBufferKernel(int n_hosts, int *host_offset,

__global__ void addOffsetToExternalNodeIdsKernel
(int64_t n_conn, uint *key_subarray, connection_struct *conn_subarray,
int i_ext_node_0)
int i_image_node_0)
{
int64_t i_conn = threadIdx.x + blockIdx.x * blockDim.x;
if (i_conn>=n_conn) return;
uint target_port = conn_subarray[i_conn].target_port;
if (target_port & (1 << (MaxPortNBits - 1))) {
target_port = target_port ^ (1 << (MaxPortNBits - 1));
conn_subarray[i_conn].target_port = target_port;
key_subarray[i_conn] += (i_ext_node_0 << MaxPortNBits);
key_subarray[i_conn] += (i_image_node_0 << MaxPortNBits);
}
}

int NESTGPU::addOffsetToExternalNodeIds()
{
int64_t block_size = h_ConnBlockSize;
int n_blocks = (NConn - 1) / block_size + 1;

int i_image_node_0 = GetNLocalNodes();

for (int ib=0; ib<n_blocks; ib++) {
uint64_t n_block_conn = block_size; // number of connections in the block
if (ib == n_blocks-1) { // last block
n_block_conn = (NConn - 1) % block_size + 1;
}
addOffsetToExternalNodeIdsKernel<<<(n_block_conn+1023)/1024, 1024>>>
(n_block_conn, KeySubarray[ib], ConnectionSubarray[ib], i_ext_node_0_);
(n_block_conn, KeySubarray[ib], ConnectionSubarray[ib], i_image_node_0);
gpuErrchk( cudaPeekAtLastError() );
gpuErrchk( cudaDeviceSynchronize() );
}
Expand Down Expand Up @@ -897,24 +900,26 @@ int NESTGPU::addOffsetToExternalNodeIds()
}

__global__ void addOffsetToSpikeBufferMapKernel(int i_host, int n_node_map,
int i_ext_node_0)
int i_image_node_0)
{
int i_node_map = threadIdx.x + blockIdx.x * blockDim.x;
if (i_node_map>=n_node_map) return;

const int i_block = i_node_map / node_map_block_size;
const int i = i_node_map % node_map_block_size;
local_spike_buffer_map[i_host][i_block][i] += i_ext_node_0;
local_spike_buffer_map[i_host][i_block][i] += i_image_node_0;
}

int NESTGPU::addOffsetToSpikeBufferMap()
{
int i_image_node_0 = GetNLocalNodes();

for (int i_host=0; i_host<n_hosts_; i_host++) {
if (i_host != this_host_) {
int n_node_map = h_n_remote_source_node_map[i_host];
if (n_node_map > 0) {
addOffsetToSpikeBufferMapKernel<<<(n_node_map+1023)/1024, 1024>>>
(i_host, n_node_map, i_ext_node_0_);
(i_host, n_node_map, i_image_node_0);
gpuErrchk( cudaPeekAtLastError() );
gpuErrchk( cudaDeviceSynchronize() );
}
Expand Down
12 changes: 7 additions & 5 deletions src/remote_connect.h
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@ extern int **d_node_target_host_i_map; // [i_node]
// - false otherwise (fixed_indegree, fixed_total_number, pairwise_bernoulli)
extern bool *use_all_source_nodes; // [n_connection_rules]:

extern __constant__ int n_local_nodes; // number of local nodes

// device function that checks if an int value is in a sorted 2d-array
// assuming that the entries in the 2d-array are sorted.
// The 2d-array is divided in noncontiguous blocks of size block_size
Expand Down Expand Up @@ -284,7 +286,7 @@ int NESTGPU::_RemoteConnectSource(int source_host, T1 source, int n_source,
// n_nodes will be the first index for new mapping of remote source nodes
// to local spike buffers
//int spike_buffer_map_i0 = GetNNode();
int spike_buffer_map_i0 = n_ext_nodes_;
int spike_buffer_map_i0 = n_image_nodes_;
syn_spec.port_ = syn_spec.port_ | (1 << (h_MaxPortNBits-1));

// check if the flag UseAllSourceNodes[conn_rule] is false
Expand Down Expand Up @@ -774,12 +776,12 @@ int NESTGPU::_RemoteConnectSource(int source_host, T1 source, int n_source,
fixConnectionSourceNodeIndexes(KeySubarray, old_n_conn, NConn,
h_ConnBlockSize, d_local_node_index);

// On target host. Create n_nodes_to_map nodes of type ext_neuron
// On target host. Create n_nodes_to_map nodes of type image_node
//std::cout << "h_n_node_to_map " << h_n_node_to_map <<"\n";
if (h_n_node_to_map > 0) {
//_Create("ext_neuron", h_n_node_to_map);
n_ext_nodes_ += h_n_node_to_map;
//std::cout << "n_ext_nodes_ " << n_ext_nodes_ <<"\n";
//_Create("image_node", h_n_node_to_map);
n_image_nodes_ += h_n_node_to_map;
//std::cout << "n_image_nodes_ " << n_image_nodes_ <<"\n";
}

return 0;
Expand Down
Loading

0 comments on commit 772d38e

Please sign in to comment.