Allocating chunk of mem for fixed working buffers
milakov committed Nov 23, 2015
1 parent 29196b1 commit 0586d45
Showing 16 changed files with 121 additions and 52 deletions.
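
At a glance: the commit changes get_temporary_working_fixed_buffer_size() on testers and updaters to return std::pair<size_t, bool> instead of a plain size_t. The first element is the minimum fixed working buffer size; the second marks the buffer as "hungry", i.e. the layer can put a larger buffer to good use. Hungry buffers are then grown up to a budget derived from the new cuda_fixed_working_buffers_ratio setting. A minimal sketch of the new contract (a hypothetical free function, not code from the commit):

#include <cstddef>
#include <utility>

// First element: minimum buffer size in bytes the kernels require.
// Second element: true if the layer can make use of a larger buffer.
std::pair<std::size_t, bool> example_fixed_buffer_requirement()
{
    std::size_t min_bytes = 576 * sizeof(int); // bare minimum to run
    bool can_use_more = true;                  // request a share of the fixed-buffer budget
    return std::make_pair(min_bytes, can_use_more);
}
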
30 changes: 25 additions & 5 deletions nnforge/cuda/backward_propagation_cuda.cpp
@@ -1031,12 +1031,17 @@ namespace nnforge

void backward_propagation_cuda::setup_temporary_working_fixed_buffer_sizes()
{
size_t max_fixed_working_buffers_size = cuda_config->get_max_fixed_working_buffers_size();

std::vector<std::vector<std::pair<layer_name_with_action, buffer_lifetime> > > temporary_working_fixed_buffer_set_list;
{
std::map<layer_name_with_action, std::vector<std::pair<buffer_lifetime, float> > > buffers;
for(std::vector<layer_name_with_action>::const_iterator it = actions_in_execution_order.begin(); it != actions_in_execution_order.end(); ++it)
{
size_t temporary_working_fixed_buffer_size = updaters[it->get_name()]->get_temporary_working_fixed_buffer_size(it->get_action());
std::pair<size_t, bool> temporary_working_fixed_buffer_size_and_flag = updaters[it->get_name()]->get_temporary_working_fixed_buffer_size(it->get_action());
size_t temporary_working_fixed_buffer_size = temporary_working_fixed_buffer_size_and_flag.first;
if (temporary_working_fixed_buffer_size_and_flag.second)
temporary_working_fixed_buffer_size = std::max(temporary_working_fixed_buffer_size, max_fixed_working_buffers_size);
if (temporary_working_fixed_buffer_size > 0)
buffers.insert(std::make_pair(*it, std::vector<std::pair<buffer_lifetime, float> >())).first->second.push_back(std::make_pair(buffer_lifetime(buffer_lifetime::working_buffer), static_cast<float>(temporary_working_fixed_buffer_size)));
}
@@ -1062,15 +1067,30 @@ namespace nnforge

temporary_working_fixed_set_size_list.clear();
temporary_working_fixed_data_action_to_set_map.clear();

std::set<unsigned int> set_ids_with_hungry_working_buffers;
for(unsigned int set_id = 0; set_id < temporary_working_fixed_buffer_set_list.size(); ++set_id)
{
const std::vector<std::pair<layer_name_with_action, buffer_lifetime> >& action_list = temporary_working_fixed_buffer_set_list[set_id];
size_t max_buffer_size = 0;
for(std::vector<std::pair<layer_name_with_action, buffer_lifetime> >::const_iterator it = action_list.begin(); it != action_list.end(); ++it)
{
std::string layer_name = it->first.get_name();
if (updaters[layer_name]->get_temporary_working_fixed_buffer_size(it->first.get_action()).second)
set_ids_with_hungry_working_buffers.insert(set_id);
}
}
if (set_ids_with_hungry_working_buffers.size() > 1)
max_fixed_working_buffers_size /= set_ids_with_hungry_working_buffers.size();

for(unsigned int set_id = 0; set_id < temporary_working_fixed_buffer_set_list.size(); ++set_id)
{
const std::vector<std::pair<layer_name_with_action, buffer_lifetime> >& action_list = temporary_working_fixed_buffer_set_list[set_id];
size_t max_buffer_size = (set_ids_with_hungry_working_buffers.find(set_id) != set_ids_with_hungry_working_buffers.end()) ? max_fixed_working_buffers_size : 1;
for(std::vector<std::pair<layer_name_with_action, buffer_lifetime> >::const_iterator it = action_list.begin(); it != action_list.end(); ++it)
{
std::string layer_name = it->first.get_name();
temporary_working_fixed_data_action_to_set_map.insert(std::make_pair(it->first, set_id));
size_t buffer_size = updaters[layer_name]->get_temporary_working_fixed_buffer_size(it->first.get_action());
size_t buffer_size = updaters[layer_name]->get_temporary_working_fixed_buffer_size(it->first.get_action()).first;
max_buffer_size = std::max(max_buffer_size, buffer_size);
}
temporary_working_fixed_set_size_list.push_back(max_buffer_size);
@@ -1087,10 +1107,10 @@ namespace nnforge
{
if (it != temporary_working_fixed_set_size_list.begin())
debug_str << ", ";
debug_str << ((*it + 1024 - 1) / 1024) << " KB";
debug_str << ((*it + (1024 * 1024) - 1) / (1024 * 1024)) << " MB";
total_buffer_size += *it;
}
debug_str << "), total " << ((total_buffer_size + 1024 - 1) / 1024) << " KB";
debug_str << "), total " << ((total_buffer_size + (1024 * 1024) - 1) / (1024 * 1024)) << " MB";
}
debug->output_message(debug_str.str().c_str());
boost::filesystem::ofstream out(debug->get_path_to_unique_file("backward_prop_cuda_temporary_fixed_buffers", "gv"), std::ios_base::out | std::ios_base::trunc);
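
The sizing logic in the hunk above proceeds in three steps: collect the set ids containing at least one hungry action, split the budget evenly when more than one such set exists, then size each set to the larger of its members' minima and (for hungry sets) its budget share. A standalone sketch of the same idea with simplified inputs (the real code keys buffers by layer_name_with_action and queries the updaters):

#include <algorithm>
#include <cstddef>
#include <set>
#include <utility>
#include <vector>

// Per set: one (min_size_in_bytes, hungry) pair per action in the set.
std::vector<std::size_t> compute_set_sizes(
    const std::vector<std::vector<std::pair<std::size_t, bool> > >& sets,
    std::size_t budget)
{
    // Step 1: which sets contain at least one hungry buffer?
    std::set<unsigned int> hungry_set_ids;
    for (unsigned int set_id = 0; set_id < sets.size(); ++set_id)
        for (std::size_t i = 0; i < sets[set_id].size(); ++i)
            if (sets[set_id][i].second)
                hungry_set_ids.insert(set_id);

    // Step 2: hungry sets coexist in memory, so each gets an equal share.
    if (hungry_set_ids.size() > 1)
        budget /= hungry_set_ids.size();

    // Step 3: a hungry set starts from its budget share, others from 1 byte;
    // either way the set must cover the largest declared minimum.
    std::vector<std::size_t> set_sizes;
    for (unsigned int set_id = 0; set_id < sets.size(); ++set_id)
    {
        std::size_t max_size = (hungry_set_ids.count(set_id) != 0) ? budget : 1;
        for (std::size_t i = 0; i < sets[set_id].size(); ++i)
            max_size = std::max(max_size, sets[set_id][i].first);
        set_sizes.push_back(max_size);
    }
    return set_sizes;
}
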
4 changes: 2 additions & 2 deletions nnforge/cuda/convolution_layer_tester_cuda.cpp
@@ -151,12 +151,12 @@ namespace nnforge
zero_padding);
}

size_t convolution_layer_tester_cuda::get_temporary_working_fixed_buffer_size() const
std::pair<size_t, bool> convolution_layer_tester_cuda::get_temporary_working_fixed_buffer_size() const
{
unsigned int working_buffer_elem_count = input_configuration_specific_list[0].feature_map_count;
for(int i = 0; i < window_sizes.size(); ++i)
working_buffer_elem_count *= window_sizes[i];
return working_buffer_elem_count * sizeof(int);
return std::make_pair(working_buffer_elem_count * sizeof(int), true);
}
}
}
2 changes: 1 addition & 1 deletion nnforge/cuda/convolution_layer_tester_cuda.h
@@ -43,7 +43,7 @@ namespace nnforge
cuda_linear_buffer_device::ptr temporary_working_per_entry_buffer,
unsigned int entry_count);

virtual size_t get_temporary_working_fixed_buffer_size() const;
virtual std::pair<size_t, bool> get_temporary_working_fixed_buffer_size() const;

protected:
virtual void tester_configured();
42 changes: 25 additions & 17 deletions nnforge/cuda/convolution_layer_updater_cuda.cpp
@@ -299,26 +299,34 @@ namespace nnforge
zero_padding);
}

size_t convolution_layer_updater_cuda::get_temporary_working_fixed_buffer_size(const layer_action& action) const
std::pair<size_t, bool> convolution_layer_updater_cuda::get_temporary_working_fixed_buffer_size(const layer_action& action) const
{
if (action.get_action_type() == layer_action::forward)
switch (action.get_action_type())
{
unsigned int working_buffer_elem_count = input_configuration_specific_list[0].feature_map_count;
for(int i = 0; i < window_sizes.size(); ++i)
working_buffer_elem_count *= window_sizes[i];

return working_buffer_elem_count * sizeof(int);
}
else if (action.get_action_type() == layer_action::backward_weights)
{
unsigned int working_buffer_elem_count = std::max(input_configuration_specific_list[0].feature_map_count, output_configuration_specific.feature_map_count);
for(int i = 0; i < window_sizes.size(); ++i)
working_buffer_elem_count *= window_sizes[i];

return working_buffer_elem_count * sizeof(int);
case layer_action::forward:
{
unsigned int working_buffer_elem_count = input_configuration_specific_list[0].feature_map_count;
for(int i = 0; i < window_sizes.size(); ++i)
working_buffer_elem_count *= window_sizes[i];
return std::make_pair(working_buffer_elem_count * sizeof(int), true);
}
case layer_action::backward_data:
{
unsigned int working_buffer_elem_count = std::max(input_configuration_specific_list[0].feature_map_count, output_configuration_specific.feature_map_count);
for(int i = 0; i < window_sizes.size(); ++i)
working_buffer_elem_count *= window_sizes[i];
return std::make_pair(working_buffer_elem_count * sizeof(int), true);
}
case layer_action::backward_weights:
{
unsigned int working_buffer_elem_count = std::max(input_configuration_specific_list[0].feature_map_count, output_configuration_specific.feature_map_count);
for(int i = 0; i < window_sizes.size(); ++i)
working_buffer_elem_count *= window_sizes[i];
return std::make_pair(working_buffer_elem_count * sizeof(int), true);
}
default:
return std::make_pair(0, false);
}
else
return layer_updater_cuda::get_temporary_working_fixed_buffer_size(action);
}

bool convolution_layer_updater_cuda::is_backward_data_dependent_on_input_buffer(unsigned int action_input_index, unsigned int data_input_index) const
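
For a sense of scale, the minimum the convolution updater reports here is small; a worked example with assumed sizes (64 input feature maps, a 3x3 window — illustrative numbers, not from the commit):

#include <cstddef>

std::size_t convolution_min_fixed_buffer_bytes()
{
    unsigned int elem_count = 64;            // input feature map count (assumed)
    unsigned int window_sizes[2] = { 3, 3 }; // assumed 3x3 window
    for (int i = 0; i < 2; ++i)
        elem_count *= window_sizes[i];       // 64 * 3 * 3 = 576 elements
    return elem_count * sizeof(int);         // 576 * 4 = 2304 bytes
}

The accompanying true flag is what lets such a small minimum be grown to the shared fixed-buffer budget when the buffer is actually allocated.
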
2 changes: 1 addition & 1 deletion nnforge/cuda/convolution_layer_updater_cuda.h
@@ -74,7 +74,7 @@ namespace nnforge
cuda_linear_buffer_device::const_ptr temporary_per_entry_buffer,
unsigned int entry_count);

virtual size_t get_temporary_working_fixed_buffer_size(const layer_action& action) const;
virtual std::pair<size_t, bool> get_temporary_working_fixed_buffer_size(const layer_action& action) const;

virtual bool is_backward_data_dependent_on_input_buffer(unsigned int action_input_index, unsigned int data_input_index) const;

12 changes: 10 additions & 2 deletions nnforge/cuda/cuda_running_configuration.cpp
@@ -42,13 +42,15 @@ namespace nnforge
unsigned int reserved_thread_count,
bool dont_share_buffers,
bool single_command_stream,
unsigned int optimize_action_graph_assumed_chunk_size)
unsigned int optimize_action_graph_assumed_chunk_size,
float cuda_fixed_working_buffers_ratio)
: device_id(device_id)
, max_global_memory_usage_ratio(max_global_memory_usage_ratio)
, reserved_thread_count(reserved_thread_count)
, dont_share_buffers(dont_share_buffers)
, single_command_stream(single_command_stream)
, optimize_action_graph_assumed_chunk_size(optimize_action_graph_assumed_chunk_size)
, cuda_fixed_working_buffers_ratio(cuda_fixed_working_buffers_ratio)
, cublas_handle(0)
, cusparse_handle(0)
, cudnn_handle(0)
@@ -180,11 +182,12 @@
#ifdef _WIN32
out << "Driver mode = " << (running_configuration.tcc_mode ? "TCC" : "WDDM") << std::endl;
#endif
out << "Estimated GFLOPS = " << static_cast<int>(running_configuration.get_flops() / 1.0e+12F) << std::endl;
out << "Estimated GFLOPS = " << static_cast<int>(running_configuration.get_flops() / 1.0e+9F) << std::endl;

out << "--- Settings ---" << std::endl;

out << "Max global memory usage ratio = " << running_configuration.max_global_memory_usage_ratio << std::endl;
out << "Fixed working buffers ratio = " << running_configuration.cuda_fixed_working_buffers_ratio << std::endl;
out << "Threads reserved for CUDA sync (others will be used for on-the-fly data processing by job runner) = " << running_configuration.reserved_thread_count << std::endl;
out << "Don't share buffers = " << running_configuration.dont_share_buffers << std::endl;
out << "Use single command stream = " << running_configuration.single_command_stream << std::endl;
@@ -220,6 +223,11 @@
return entry_count;
}

size_t cuda_running_configuration::get_max_fixed_working_buffers_size() const
{
return static_cast<size_t>(static_cast<float>(global_memory_size) * max_global_memory_usage_ratio * cuda_fixed_working_buffers_ratio);
}

cublasHandle_t cuda_running_configuration::get_cublas_handle() const
{
return cublas_handle;
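
The new budget is a product of three numbers. For an assumed 4 GB device with the default ratios (0.9 for cuda_max_global_memory_usage_ratio, 0.1 for the new option), the fixed working buffers get roughly 369 MB:

#include <cstddef>

std::size_t example_fixed_buffer_budget()
{
    std::size_t global_memory_size = 4ULL * 1024 * 1024 * 1024; // assumed 4 GB device
    float max_global_memory_usage_ratio = 0.9F;                 // default for -G
    float cuda_fixed_working_buffers_ratio = 0.1F;              // default, new in this commit
    return static_cast<std::size_t>(static_cast<float>(global_memory_size)
        * max_global_memory_usage_ratio
        * cuda_fixed_working_buffers_ratio);                    // ~386.5 million bytes, ~369 MB
}
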
6 changes: 5 additions & 1 deletion nnforge/cuda/cuda_running_configuration.h
@@ -43,14 +43,17 @@ namespace nnforge
unsigned int reserved_thread_count,
bool dont_share_buffers,
bool single_command_stream,
unsigned int optimize_action_graph_assumed_chunk_size);
unsigned int optimize_action_graph_assumed_chunk_size,
float cuda_fixed_working_buffers_ratio);

~cuda_running_configuration();

unsigned int get_max_entry_count(
const buffer_cuda_size_configuration& buffers_config,
float ratio = 1.0F) const;

size_t get_max_fixed_working_buffers_size() const;

cublasHandle_t get_cublas_handle() const;

cusparseHandle_t get_cusparse_handle() const;
@@ -85,6 +88,7 @@ namespace nnforge
bool dont_share_buffers;
bool single_command_stream;
unsigned int optimize_action_graph_assumed_chunk_size;
float cuda_fixed_working_buffers_ratio;

int driver_version;
int runtime_version;
12 changes: 8 additions & 4 deletions nnforge/cuda/factory_generator_cuda.cpp
@@ -31,13 +31,15 @@ namespace nnforge
unsigned int cuda_reserved_thread_count,
bool cuda_dont_share_buffers,
bool cuda_single_command_stream,
unsigned int optimize_action_graph_assumed_chunk_size)
unsigned int cuda_optimize_action_graph_assumed_chunk_size,
float cuda_fixed_working_buffers_ratio)
: cuda_device_id(cuda_device_id)
, cuda_max_global_memory_usage_ratio(cuda_max_global_memory_usage_ratio)
, cuda_reserved_thread_count(cuda_reserved_thread_count)
, cuda_dont_share_buffers(cuda_dont_share_buffers)
, cuda_single_command_stream(cuda_single_command_stream)
, optimize_action_graph_assumed_chunk_size(optimize_action_graph_assumed_chunk_size)
, cuda_optimize_action_graph_assumed_chunk_size(cuda_optimize_action_graph_assumed_chunk_size)
, cuda_fixed_working_buffers_ratio(cuda_fixed_working_buffers_ratio)
{
}

@@ -57,7 +59,8 @@ namespace nnforge
cuda_reserved_thread_count,
cuda_dont_share_buffers,
cuda_single_command_stream,
optimize_action_graph_assumed_chunk_size));
cuda_optimize_action_graph_assumed_chunk_size,
cuda_fixed_working_buffers_ratio));
}

forward_propagation_factory::ptr factory_generator_cuda::create_forward_propagation_factory() const
@@ -75,6 +78,7 @@ namespace nnforge
std::vector<float_option> res;

res.push_back(float_option("cuda_max_global_memory_usage_ratio,G", &cuda_max_global_memory_usage_ratio, 0.9F, "Part of the global memory to be used by a single CUDA configuration. Set to smaller value if the device is used for graphics as well"));
res.push_back(float_option("cuda_fixed_working_buffers_ratio", &cuda_fixed_working_buffers_ratio, 0.1F, "Part of memory use dby app, which is allocated to working buffers (independent of batch size)"));

return res;
}
@@ -85,7 +89,7 @@

res.push_back(int_option("cuda_device_id,D", &cuda_device_id, 0, "CUDA device ID"));
res.push_back(int_option("cuda_reserved_thread_count", &cuda_reserved_thread_count, 1, "The number of hw threads not used for input data processing"));
res.push_back(int_option("optimize_action_graph_assumed_chunk_size", &optimize_action_graph_assumed_chunk_size, 32, "Assumed chunk size when optimizing action graph"));
res.push_back(int_option("cuda_optimize_action_graph_assumed_chunk_size", &cuda_optimize_action_graph_assumed_chunk_size, 32, "Assumed chunk size when optimizing action graph"));

return res;
}
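
Callers constructing the factory directly now pass the ratio as the final constructor argument. A sketch with assumed values (the nnforge::cuda namespace qualification is inferred from the file layout):

nnforge::cuda::factory_generator_cuda factory(
    0,     // cuda_device_id
    0.9F,  // cuda_max_global_memory_usage_ratio
    1,     // cuda_reserved_thread_count
    false, // cuda_dont_share_buffers
    false, // cuda_single_command_stream
    32,    // cuda_optimize_action_graph_assumed_chunk_size
    0.1F); // cuda_fixed_working_buffers_ratio (new)
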
6 changes: 4 additions & 2 deletions nnforge/cuda/factory_generator_cuda.h
@@ -32,7 +32,8 @@ namespace nnforge
unsigned int cuda_reserved_thread_count,
bool cuda_dont_share_buffers,
bool cuda_single_command_stream,
unsigned int optimize_action_graph_assumed_chunk_size);
unsigned int cuda_optimize_action_graph_assumed_chunk_size,
float cuda_fixed_working_buffers_ratio);

factory_generator_cuda();

@@ -58,7 +59,8 @@ namespace nnforge
int cuda_reserved_thread_count;
bool cuda_dont_share_buffers;
bool cuda_single_command_stream;
int optimize_action_graph_assumed_chunk_size;
int cuda_optimize_action_graph_assumed_chunk_size;
float cuda_fixed_working_buffers_ratio;

cuda_running_configuration::const_ptr cuda_config;
};
31 changes: 26 additions & 5 deletions nnforge/cuda/forward_propagation_cuda.cpp
@@ -744,12 +744,17 @@ namespace nnforge

void forward_propagation_cuda::setup_temporary_working_fixed_buffer_sizes()
{
size_t max_fixed_working_buffers_size = cuda_config->get_max_fixed_working_buffers_size();

std::vector<std::vector<std::pair<layer_name_with_action, buffer_lifetime> > > temporary_working_fixed_buffer_set_list;
{
std::map<layer_name_with_action, std::vector<std::pair<buffer_lifetime, float> > > buffers;
for(std::map<std::string, layer_tester_cuda::ptr>::const_iterator it = testers.begin(); it != testers.end(); ++it)
{
size_t temporary_working_fixed_buffer_size = it->second->get_temporary_working_fixed_buffer_size();
std::pair<size_t, bool> temporary_working_fixed_buffer_size_and_flag = it->second->get_temporary_working_fixed_buffer_size();
size_t temporary_working_fixed_buffer_size = temporary_working_fixed_buffer_size_and_flag.first;
if (temporary_working_fixed_buffer_size_and_flag.second)
temporary_working_fixed_buffer_size = std::max(temporary_working_fixed_buffer_size, max_fixed_working_buffers_size);
if (temporary_working_fixed_buffer_size > 0)
buffers.insert(std::make_pair(layer_name_with_action(it->first, layer_action::forward), std::vector<std::pair<buffer_lifetime, float> >())).first->second.push_back(std::make_pair(buffer_lifetime(buffer_lifetime::working_buffer), static_cast<float>(temporary_working_fixed_buffer_size)));
}
@@ -775,15 +780,31 @@ namespace nnforge

temporary_working_fixed_set_size_list.clear();
temporary_working_fixed_data_action_to_set_map.clear();

std::set<unsigned int> set_ids_with_hungry_working_buffers;
for(unsigned int set_id = 0; set_id < temporary_working_fixed_buffer_set_list.size(); ++set_id)
{
const std::vector<std::pair<layer_name_with_action, buffer_lifetime> >& action_list = temporary_working_fixed_buffer_set_list[set_id];
size_t max_buffer_size = 0;
for(std::vector<std::pair<layer_name_with_action, buffer_lifetime> >::const_iterator it = action_list.begin(); it != action_list.end(); ++it)
{
std::string layer_name = it->first.get_name();
if (testers.find(layer_name)->second->get_temporary_working_fixed_buffer_size().second)
set_ids_with_hungry_working_buffers.insert(set_id);
}
}
if (set_ids_with_hungry_working_buffers.size() > 1)
max_fixed_working_buffers_size /= set_ids_with_hungry_working_buffers.size();

for(unsigned int set_id = 0; set_id < temporary_working_fixed_buffer_set_list.size(); ++set_id)
{
const std::vector<std::pair<layer_name_with_action, buffer_lifetime> >& action_list = temporary_working_fixed_buffer_set_list[set_id];
size_t max_buffer_size = (set_ids_with_hungry_working_buffers.find(set_id) != set_ids_with_hungry_working_buffers.end()) ? max_fixed_working_buffers_size : 1;

for(std::vector<std::pair<layer_name_with_action, buffer_lifetime> >::const_iterator it = action_list.begin(); it != action_list.end(); ++it)
{
std::string layer_name = it->first.get_name();
temporary_working_fixed_data_action_to_set_map.insert(std::make_pair(it->first, set_id));
size_t buffer_size = testers.find(layer_name)->second->get_temporary_working_fixed_buffer_size();
size_t buffer_size = testers.find(layer_name)->second->get_temporary_working_fixed_buffer_size().first;
max_buffer_size = std::max(max_buffer_size, buffer_size);
}
temporary_working_fixed_set_size_list.push_back(max_buffer_size);
@@ -800,10 +821,10 @@ namespace nnforge
{
if (it != temporary_working_fixed_set_size_list.begin())
debug_str << ", ";
debug_str << ((*it + 1024 - 1) / 1024) << " KB";
debug_str << ((*it + (1024 * 1024) - 1) / (1024 * 1024)) << " MB";
total_buffer_size += *it;
}
debug_str << "), total " << ((total_buffer_size + 1024 - 1) / 1024) << " KB";
debug_str << "), total " << ((total_buffer_size + (1024 * 1024) - 1) / (1024 * 1024)) << " MB";
}
debug->output_message(debug_str.str().c_str());
boost::filesystem::ofstream out(debug->get_path_to_unique_file("forward_prop_cuda_temporary_fixed_buffers", "gv"), std::ios_base::out | std::ios_base::trunc);
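
The reporting change in both propagation classes is the usual round-up (ceiling) division, now in megabytes instead of kilobytes; for example:

std::size_t bytes = 386547057;                                // sample buffer size
std::size_t mb = (bytes + (1024 * 1024) - 1) / (1024 * 1024); // rounds up to 369
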
4 changes: 2 additions & 2 deletions nnforge/cuda/layer_tester_cuda.cpp
@@ -107,9 +107,9 @@ namespace nnforge
return std::vector<cuda_linear_buffer_device::const_ptr>();
}

size_t layer_tester_cuda::get_temporary_working_fixed_buffer_size() const
std::pair<size_t, bool> layer_tester_cuda::get_temporary_working_fixed_buffer_size() const
{
return 0;
return std::make_pair(0, false);
}

size_t layer_tester_cuda::get_temporary_working_per_entry_buffer_size() const
3 changes: 2 additions & 1 deletion nnforge/cuda/layer_tester_cuda.h
@@ -64,7 +64,8 @@ namespace nnforge

virtual int get_input_index_layer_can_write() const;

virtual size_t get_temporary_working_fixed_buffer_size() const;
// The function should return the minimum size and a flag indicating whether the tester would be happy to have a larger working buffer
virtual std::pair<size_t, bool> get_temporary_working_fixed_buffer_size() const;

virtual size_t get_temporary_working_per_entry_buffer_size() const;

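
Per the comment above, the base-class default of (0, false) means "no fixed working buffer needed"; a subclass that wants scratch memory returns its minimum plus the hungry flag. A hypothetical override, not from the commit:

std::pair<size_t, bool> my_tester_cuda::get_temporary_working_fixed_buffer_size() const
{
    // 1 MB minimum; true lets the scheduler grow it up to the shared budget.
    return std::make_pair(static_cast<size_t>(1024 * 1024), true);
}
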