Skip to content
100 changes: 16 additions & 84 deletions include/zim/archive.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,73 +42,26 @@ namespace zim
efficientOrder
};

/**
* Configuration to pass to archive constructors.
/** Get the maximum size of the cluster cache.
*
* Some configuration option specifying how to open a zim archive.
* For now, it is only related to preload data but it may change in the future.
* @return The maximum memory size used by the cluster cache.
*/
size_t LIBZIM_API getClusterCacheMaxSize();

/** Get the current size of the cluster cache.
*
* Archive may preload few data to speedup future accessing.
* However, this preload itself can take times.
* @return The current memory size used by the cluster cache.
*/
size_t LIBZIM_API getClusterCacheCurrentSize();

/** Set the size of the cluster cache.
*
* OpenConfig allow user to define how Archive should preload data.
* If the new size is lower than the number of currently stored clusters
* some clusters will be dropped from cache to respect the new size.
*
* @param sizeInB The memory limit (in bytes) for the cluster cache.
*/
struct LIBZIM_API OpenConfig {
/**
* Default configuration.
*
* - Dirent ranges is activated.
* - Xapian preloading is activated.
*/
OpenConfig();

/**
* Configure xapian preloading.
*
* This method modify the configuration and return itelf.
*/
OpenConfig& preloadXapianDb(bool load) { m_preloadXapianDb = load; return *this; }

/**
* Configure xapian preloading.
*
* This method create a new configuration with the new value.
*/
OpenConfig preloadXapianDb(bool load) const {
auto other = *this;
other.m_preloadXapianDb = load;
return other;
}

/**
* Configure direntRanges preloading.
*
* libzim will load `nbRanges + 1` dirents to create `nbRanges` dirent ranges.
* This will be used to speedup dirent lookup. This is an extra layer on top of
* classic dirent cache.
*
* This method modify the configuration and return itelf.
*/
OpenConfig& preloadDirentRanges(int nbRanges) { m_preloadDirentRanges = nbRanges; return *this; }

/**
* Configure direntRanges preloading.
*
* libzim will load `nbRanges + 1` dirents to create `nbRanges` dirent ranges.
* This will be used to speedup dirent lookup. This is an extra layer on top of
* classic dirent cache.
*
* This method create a new configuration with the new value.
*/
OpenConfig preloadDirentRanges(int nbRanges) const {
auto other = *this;
other.m_preloadDirentRanges = nbRanges;
return other;
}

bool m_preloadXapianDb;
int m_preloadDirentRanges;
};
void LIBZIM_API setClusterCacheMaxSize(size_t sizeInB);


/**
Expand Down Expand Up @@ -679,27 +632,6 @@ namespace zim
*/
std::shared_ptr<FileImpl> getImpl() const { return m_impl; }

/** Get the maximum size of the cluster cache.
*
* @return The maximum number of clusters stored in the cache.
*/
size_t getClusterCacheMaxSize() const;

/** Get the current size of the cluster cache.
*
* @return The number of clusters currently stored in the cache.
*/
size_t getClusterCacheCurrentSize() const;

/** Set the size of the cluster cache.
*
* If the new size is lower than the number of currently stored clusters
* some clusters will be dropped from cache to respect the new size.
*
* @param nbClusters The maximum number of clusters stored in the cache.
*/
void setClusterCacheMaxSize(size_t nbClusters);

/** Get the size of the dirent cache.
*
* @return The maximum number of dirents stored in the cache.
Expand Down
72 changes: 72 additions & 0 deletions include/zim/zim.h
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,78 @@ namespace zim
// An offset.
typedef uint64_t offset_type;

/**
 * Options controlling how a ZIM archive is opened.
 *
 * At the moment the options only concern preloading of data, though this may
 * change in the future.
 *
 * An Archive may eagerly preload certain data so that later operations are
 * faster; the preloading itself, however, takes some time. OpenConfig lets
 * the user choose which data is preloaded when the archive is opened.
 *
 * Every option comes as a pair of overloads: the non-const one updates this
 * object in place and returns a reference to it, while the const one leaves
 * this object untouched and returns a modified copy.
 */
struct LIBZIM_API OpenConfig {
  /**
   * Default configuration.
   *
   * - Dirent ranges preloading is activated.
   * - Xapian preloading is activated.
   */
  OpenConfig();

  /**
   * Configure xapian preloading.
   *
   * In-place variant: stores `load` in this configuration and returns a
   * reference to this same object.
   */
  OpenConfig& preloadXapianDb(bool load) { m_preloadXapianDb = load; return *this; }

  /**
   * Configure xapian preloading.
   *
   * Copying variant: this configuration is left unchanged; a copy carrying
   * the new value is returned.
   */
  OpenConfig preloadXapianDb(bool load) const {
    OpenConfig updated(*this);
    updated.m_preloadXapianDb = load;
    return updated;
  }

  /**
   * Configure direntRanges preloading.
   *
   * libzim will load `nbRanges + 1` dirents to create `nbRanges` dirent
   * ranges. These ranges are used to speed up dirent lookup, as an extra
   * layer on top of the classic dirent cache.
   *
   * In-place variant: stores `nbRanges` in this configuration and returns a
   * reference to this same object.
   */
  OpenConfig& preloadDirentRanges(int nbRanges) { m_preloadDirentRanges = nbRanges; return *this; }

  /**
   * Configure direntRanges preloading.
   *
   * libzim will load `nbRanges + 1` dirents to create `nbRanges` dirent
   * ranges. These ranges are used to speed up dirent lookup, as an extra
   * layer on top of the classic dirent cache.
   *
   * Copying variant: this configuration is left unchanged; a copy carrying
   * the new value is returned.
   */
  OpenConfig preloadDirentRanges(int nbRanges) const {
    OpenConfig updated(*this);
    updated.m_preloadDirentRanges = nbRanges;
    return updated;
  }

  // Whether the xapian database is preloaded when the archive is opened.
  bool m_preloadXapianDb;
  // Number of dirent ranges to preload (see preloadDirentRanges()).
  int m_preloadDirentRanges;
};

struct FdInput {
// An open file descriptor
int fd;
Expand Down
4 changes: 2 additions & 2 deletions meson_options.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
option('CLUSTER_CACHE_SIZE', type : 'string', value : '16',
description : 'set cluster cache size to number (default:16)')
option('CLUSTER_CACHE_SIZE', type : 'integer', min: 0, max: 1000000000000, value : 536870912,
description : 'set default cluster cache size in bytes (default:512MB)')
option('DIRENT_CACHE_SIZE', type : 'string', value : '512',
description : 'set dirent cache size to number (default:512)')
option('DIRENT_LOOKUP_CACHE_SIZE', type : 'string', value : '1024',
Expand Down
14 changes: 7 additions & 7 deletions src/archive.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ log_define("zim.archive")
namespace zim
{
// Builds the default configuration: xapian preloading is enabled and the
// number of preloaded dirent ranges is DIRENT_LOOKUP_CACHE_SIZE.
OpenConfig::OpenConfig()
  : m_preloadXapianDb(true),
    m_preloadDirentRanges(DIRENT_LOOKUP_CACHE_SIZE)
{ }
Expand Down Expand Up @@ -530,19 +530,19 @@ namespace zim
return m_impl->hasNewNamespaceScheme();
}

size_t Archive::getClusterCacheMaxSize() const
size_t getClusterCacheMaxSize()
{
return m_impl->getClusterCacheMaxSize();
return getClusterCache().getMaxCost();
}

size_t Archive::getClusterCacheCurrentSize() const
size_t getClusterCacheCurrentSize()
{
return m_impl->getClusterCacheCurrentSize();
return getClusterCache().getCurrentCost();
}

void Archive::setClusterCacheMaxSize(size_t nbClusters)
void setClusterCacheMaxSize(size_t sizeInB)
{
m_impl->setClusterCacheMaxSize(nbClusters);
getClusterCache().setMaxCost(sizeInB);
}

size_t Archive::getDirentCacheMaxSize() const
Expand Down
5 changes: 5 additions & 0 deletions src/buffer_reader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,11 @@
return source.size();
}

size_t BufferReader::getMemorySize() const

Check warning on line 47 in src/buffer_reader.cpp

View check run for this annotation

Codecov / codecov/patch

src/buffer_reader.cpp#L47

Added line #L47 was not covered by tests
{
return source.size().v;

Check warning on line 49 in src/buffer_reader.cpp

View check run for this annotation

Codecov / codecov/patch

src/buffer_reader.cpp#L49

Added line #L49 was not covered by tests
}

offset_t BufferReader::offset() const
{
return offset_t((offset_type)(static_cast<const void*>(source.data(offset_t(0)))));
Expand Down
1 change: 1 addition & 0 deletions src/buffer_reader.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ class LIBZIM_PRIVATE_API BufferReader : public Reader {
virtual ~BufferReader() {};

zsize_t size() const override;
size_t getMemorySize() const override;
offset_t offset() const override;

const Buffer get_buffer(offset_t offset, zsize_t size) const override;
Expand Down
37 changes: 35 additions & 2 deletions src/cluster.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,6 @@

#include "log.h"

#include "config.h"

log_define("zim.cluster")

#define log_debug1(e)
Expand Down Expand Up @@ -179,4 +177,39 @@ getClusterReader(const Reader& zimReader, offset_t offset, Cluster::Compression*
}
}

// Estimates the memory consumed by this cluster so that the value can serve
// as the cluster's cost during caching.
//
// The real footprint is not constant: because decompression is partial
// (incremental), it depends on how far decompression has progressed:
//   - new blob readers are created in `m_blobReaders` as decompression
//     advances;
//   - the decoding/decompressing stream itself may allocate memory.
// The estimate therefore represents an average: it assumes that half of the
// cluster's data has been decompressed.
//
// Note: the result is neither memoized nor protected against concurrent
// access. This is on purpose, as the method is intended to be called by
// ConcurrentCache, which should invoke it exactly once per cluster object.
size_t Cluster::getMemorySize() const {
  // Memory taken by the table of blob offsets.
  const auto offsetTableBytes = sizeof(offset_t) * m_blobOffsets.size();

  // Size of the cluster's data once decompressed, read from the last blob
  // offset.
  const auto fullDataBytes = m_blobOffsets.back().v;

  // Data of a non-compressed cluster is not counted: for it we rely on mmap
  // and the kernel to do the memory management.
  const auto countedDataBytes = isCompressed() ? fullDataBytes : 0;

  // Memory consumed by the decompressor stream (assumed to be 0 for a
  // non-compressed data reader, see the comment in
  // BaseFileReader::getMemorySize()).
  //
  // A compression level may define a huge window and make the decompression
  // stream allocate a huge amount of memory for it. However (at least on
  // linux) the memory actually used, as opposed to allocated, will not exceed
  // the content size even for a larger window, hence the clamping to the size
  // of the content itself.
  const auto streamBytes =
      std::min<size_type>(m_reader->getMemorySize(), fullDataBytes);

  // Only half of the data is counted (the "half decompressed" assumption).
  return offsetTableBytes + countedDataBytes/2 + streamBytes;
}
}
8 changes: 8 additions & 0 deletions src/cluster.h
Original file line number Diff line number Diff line change
Expand Up @@ -90,9 +90,17 @@ namespace zim
Blob getBlob(blob_index_t n) const;
Blob getBlob(blob_index_t n, offset_t offset, zsize_t size) const;

size_t getMemorySize() const;

static std::shared_ptr<Cluster> read(const Reader& zimReader, offset_t clusterOffset);
};

// Cost functor for clusters: the cost of a cluster is the memory size it
// reports through Cluster::getMemorySize().
// NOTE(review): presumably plugged into the cluster cache as its cost
// estimator - confirm against the cache declaration.
struct ClusterMemorySize {
// Returns `cluster->getMemorySize()`. `cluster` is dereferenced
// unconditionally, so it must not be null.
static size_t cost(const std::shared_ptr<const Cluster>& cluster) {
return cluster->getMemorySize();
}
};

}

#endif // ZIM_CLUSTER_H
13 changes: 13 additions & 0 deletions src/compression.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,11 @@
lzma_end(stream);
}

// Returns the memory usage of the given LZMA stream, as reported by
// lzma_memusage().
size_t LZMA_INFO::state_size(const stream_t& stream)
{
return lzma_memusage(&stream);
}


const std::string ZSTD_INFO::name = "zstd";

Expand Down Expand Up @@ -170,3 +175,11 @@
void ZSTD_INFO::stream_end_encode(stream_t* stream)
{
}

// Returns the memory currently used by the zstd state held in `stream`.
//
// A stream wraps either an encoder or a decoder. The size must be queried
// with the function matching the pointer that is actually set: when
// `encoder_stream` is set its size is reported via ZSTD_sizeof_CStream(),
// otherwise the size of `decoder_stream` is reported via
// ZSTD_sizeof_DStream().
size_t ZSTD_INFO::state_size(const stream_t& stream) {
  if (stream.encoder_stream) {
    return ZSTD_sizeof_CStream(stream.encoder_stream);
  } else {
    return ZSTD_sizeof_DStream(stream.decoder_stream);
  }
}
2 changes: 2 additions & 0 deletions src/compression.h
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ struct LZMA_INFO {
static CompStatus stream_run_decode(stream_t* stream, CompStep step);
static CompStatus stream_run(stream_t* stream, CompStep step);
static void stream_end_decode(stream_t* stream);
static size_t state_size(const stream_t& stream);
};


Expand Down Expand Up @@ -94,6 +95,7 @@ struct LIBZIM_PRIVATE_API ZSTD_INFO {
static CompStatus stream_run_decode(stream_t* stream, CompStep step);
static void stream_end_encode(stream_t* stream);
static void stream_end_decode(stream_t* stream);
static size_t state_size(const stream_t& stream);
};


Expand Down
7 changes: 7 additions & 0 deletions src/concurrent_cache.h
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,13 @@ class ConcurrentCache
return impl_.drop(key);
}

// Forwards `f` to `impl_.dropAll()` after taking `lock_` through a
// std::unique_lock. The lock statement is wrapped in
// log_debug_raii_sync_statement, so it is presumably held until the end of
// this function - confirm against the macro definition.
// NOTE(review): the contract of `f` (what it receives and how its result is
// used) is defined by impl_.dropAll(), which is not visible here.
template<class F>
void dropAll(F f) {
log_debug_func_call("ConcurrentCache::dropAll");
log_debug_raii_sync_statement(std::unique_lock<std::mutex> l(lock_));
impl_.dropAll(f);
}

size_t getMaxCost() const {
std::unique_lock<std::mutex> l(lock_);
return impl_.getMaxCost();
Expand Down
Loading