Skip to content

Commit

Permalink
Reduce memory usage of the hash-table. Fixes #97.
Browse files Browse the repository at this point in the history
Each bucket was a std::vector, which internally stores three pointers. Now each
bucket is a std::forward_list, which stores just one pointer.
  • Loading branch information
dimztimz committed Feb 3, 2021
1 parent 2686d8f commit 56af716
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 58 deletions.
10 changes: 1 addition & 9 deletions src/nuspell/aff_data.hxx
Original file line number Diff line number Diff line change
Expand Up @@ -81,13 +81,6 @@ class Encoding {

enum class Flag_Type { SINGLE_CHAR, DOUBLE_CHAR, NUMBER, UTF8 };

struct Extractor_First_of_Word_Pair {
auto& operator()(const std::pair<std::wstring, Flag_Set>& p) const
{
return p.first;
}
};

/**
* @internal
* @brief Map between words and word_flags.
Expand All @@ -98,8 +91,7 @@ struct Extractor_First_of_Word_Pair {
* Does not store morphological data, as it is a low-priority feature and is out
* of scope.
*/
using Word_List = Hash_Multiset<std::pair<std::wstring, Flag_Set>, std::wstring,
Extractor_First_of_Word_Pair>;
using Word_List = Hash_Multimap<std::wstring, Flag_Set>;

struct NUSPELL_EXPORT Aff_Data {
static constexpr auto HIDDEN_HOMONYM_FLAG = char16_t(-1);
Expand Down
75 changes: 26 additions & 49 deletions src/nuspell/structures.hxx
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@

#include <algorithm>
#include <cmath>
#include <forward_list>
#include <functional>
#include <iterator>
#include <stack>
Expand Down Expand Up @@ -536,18 +537,18 @@ struct identity {
}
};

template <class Value, class Key = Value, class KeyExtract = identity>
class Hash_Multiset {
private:
using bucket_type = std::vector<Value>;
template <class Key, class T>
class Hash_Multimap {
using bucket_type = std::forward_list<std::pair<Key, T>>;
static constexpr float max_load_fact = 7.0 / 8.0;
std::vector<bucket_type> data;
size_t sz = 0;
size_t max_load_factor_capacity = 0;

public:
using key_type = Key;
using value_type = Value;
using mapped_type = T;
using value_type = std::pair<Key, T>;
using size_type = std::size_t;
using difference_type = std::ptrdiff_t;
using hasher = std::hash<Key>;
Expand All @@ -558,7 +559,7 @@ class Hash_Multiset {
using local_iterator = typename bucket_type::iterator;
using local_const_iterator = typename bucket_type::const_iterator;

Hash_Multiset() = default;
Hash_Multimap() = default;

auto size() const { return sz; }
auto empty() const { return size() == 0; }
Expand All @@ -576,7 +577,7 @@ class Hash_Multiset {
}
if (count < size() / max_load_fact)
count = size() / max_load_fact;
auto n = Hash_Multiset();
auto n = Hash_Multimap();
n.rehash(count);
for (auto& b : data) {
for (auto& x : b) {
Expand All @@ -597,33 +598,27 @@ class Hash_Multiset {
{
using namespace std;
auto hash = hasher();
auto key_extract = KeyExtract();
if (sz == max_load_factor_capacity) {
reserve(sz + 1);
}
auto&& key = key_extract(value);
auto&& key = value.first;
auto h = hash(key);
auto h_mod = h & (data.size() - 1);
auto& bucket = data[h_mod];
if (bucket.size() == 0 || bucket.size() == 1 ||
key == key_extract(bucket.back())) {
bucket.push_back(move(value));
++sz;
return end(bucket) - 1;
auto prev = bucket.before_begin();
auto curr = begin(bucket);
// find first entry with same key
for (; curr != end(bucket); prev = curr++) {
if (curr->first == key)
break;
}
auto last =
std::find_if(rbegin(bucket), rend(bucket), [&](auto& x) {
return key == key_extract(x);
});
if (last != rend(bucket)) {
auto ret = bucket.insert(last.base(), move(value));
++sz;
return ret;
// find last entry with same key
for (; curr != end(bucket); prev = curr++) {
if (curr->first != key)
break;
}

bucket.push_back(move(value));
++sz;
return end(bucket) - 1;
// insert after last
return bucket.insert_after(prev, move(value));
}
template <class... Args>
auto emplace(Args&&... a)
Expand All @@ -636,35 +631,17 @@ class Hash_Multiset {
{
using namespace std;
auto hash = hasher();
auto key_extract = KeyExtract();
if (data.empty())
return {};
auto h = hash(key);
auto h_mod = h & (data.size() - 1);
auto& bucket = data[h_mod];
if (bucket.empty())
return {begin(bucket), begin(bucket)}; // ret empty
// return {} here ^^^^^^^ is OK, but GCC debug iterators have
// this bug https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70303
if (bucket.size() == 1) {
if (key == key_extract(bucket.front()))
return {begin(bucket), end(bucket)};
return {begin(bucket), begin(bucket)}; // ret empty
}
auto first =
std::find_if(begin(bucket), end(bucket), [&](auto& x) {
return key == key_extract(x);
});
auto eq_key = [&](auto& x) { return key == x.first; };
auto first = std::find_if(begin(bucket), end(bucket), eq_key);
if (first == end(bucket))
return {begin(bucket), begin(bucket)}; // ret empty
auto next = first + 1;
if (next == end(bucket) || key != key_extract(*next))
return {first, next};
auto last =
std::find_if(rbegin(bucket), rend(bucket), [&](auto& x) {
return key == key_extract(x);
});
return {first, last.base()};
return {first, first}; // ret empty
auto last = std::find_if_not(next(first), end(bucket), eq_key);
return {first, last};
}

auto bucket_count() const -> size_type { return data.size(); }
Expand Down

0 comments on commit 56af716

Please sign in to comment.