180 changes: 180 additions & 0 deletions TODO
Original file line number Diff line number Diff line change
@@ -0,0 +1,180 @@
- docs, more tests

- extended attributes:
- number of blocks
- number of chunks
- number of times opened?

- per-file "hotness" (how often a file was opened); dump to a file on unmount

- nanofsextract? ---> --unpack
- readahead?
- typedef -> using

- start filename compression early

- remove multiple blockhash window sizes, one is enough apparently?

- window-increment-shift seems silly to configure?

- identify blocks that contain mostly binary data and adjust compressor?

- get rid of glog dependency

- get rid of passing by shared/unique ptr where possible

- weak_ptr, e.g. in inode implementation?

- --repack and --unpack option

- metadata stripping (i.e. re-write metadata without owner/time info)


/*

scanner:
bhw= - 388.3s 13.07 GiB
bhw= 8 812.9s 7.559 GiB
bhw= 9 693.1s 7.565 GiB
bhw=10 651.8s 7.617 GiB
bhw=11 618.7s 7.313 GiB
bhw=12 603.6s 7.625 GiB
bhw=13 591.2s 7.858 GiB
bhw=14 574.1s 8.306 GiB
bhw=15 553.8s 8.869 GiB
bhw=16 541.9s 9.529 GiB


lz4:
<---- 1m29.535s / 9m31.212s

lz4hc:
1 - 20.94s - 2546 MiB
2 - 21.67s - 2441 MiB
3 - 24.19s - 2377 MiB
4 - 27.29s - 2337 MiB
5 - 31.49s - 2311 MiB
6 - 36.39s - 2294 MiB
7 - 42.04s - 2284 MiB
8 - 48.67s - 2277 MiB
9 - 56.94s - 2273 MiB <---- 1m27.979s / 9m20.637s
10 - 68.03s - 2271 MiB
11 - 79.54s - 2269 MiB
12 - 94.84s - 2268 MiB

zstd:
1 - 11.42s - 1667 MiB
2 - 12.95s - 1591 MiB <---- 2m8.351s / 15m25.752s
3 - 22.03s - 1454 MiB
4 - 25.64s - 1398 MiB
5 - 32.34s - 1383 MiB
6 - 41.45s - 1118 MiB <---- 2m4.258s / 14m28.627s
7 - 46.26s - 1104 MiB
8 - 53.34s - 1077 MiB
9 - 59.99s - 1066 MiB
10 - 63.3s - 1066 MiB
11 - 66.97s - 956 MiB <---- 2m3.496s / 14m17.862s
12 - 79.89s - 953 MiB
13 - 89.8s - 943 MiB
14 - 118.1s - 941 MiB
15 - 230s - 951 MiB
16 - 247.4s - 863 MiB <---- 2m11.202s / 14m57.245s
17 - 294.5s - 854 MiB
18 - 634s - 806 MiB
19 - 762.5s - 780 MiB
20 - 776.8s - 718 MiB <---- 2m16.448s / 15m43.923s
21 - 990.4s - 716 MiB
22 - 984.3s - 715 MiB <---- 2m18.133s / 15m55.263s

lzma:
level=6:dict_size=21 921.9s - 838.8 MiB <---- 5m11.219s / 37m36.002s

*/





Perl:
542 versions of perl
found/scanned: 152809/152809 dirs, 0/0 links, 1325098/1325098 files
original size: 32.03 GiB, saved: 19.01 GiB by deduplication (1133032 duplicate files), 5.835 GiB by segmenting
filesystem size: 7.183 GiB in 460 blocks (499389 chunks, 192066/192066 inodes), 460 blocks/662.3 MiB written

bench
build real user
-----------------------------------------------------------------------------------------------------
-rw-r--r-- 1 mhx users 14G Jul 27 23:11 perl-install-0.dwarfs 8:05 0:38 0:45
-rw-r--r-- 1 mhx users 4.8G Jul 27 23:18 perl-install-1.dwarfs 6:34 0:14 1:24
-rw-r--r-- 1 mhx users 3.8G Jul 27 23:26 perl-install-2.dwarfs 7:31 0:17 1:11
-rw-r--r-- 1 mhx users 3.2G Jul 27 23:36 perl-install-3.dwarfs 10:11 0:11 0:59
-rw-r--r-- 1 mhx users 1.8G Jul 27 23:47 perl-install-4.dwarfs 11:05 0:14 1:24
-rw-r--r-- 1 mhx users 1.2G Jul 27 23:59 perl-install-5.dwarfs 11:53 0:13 1:15
-rw-r--r-- 1 mhx users 901M Jul 28 00:16 perl-install-6.dwarfs 17:42 0:14 1:25
-rw-r--r-- 1 mhx users 704M Jul 28 00:37 perl-install-7.dwarfs 20:52 0:20 2:14
-rw-r--r-- 1 mhx users 663M Jul 28 04:04 perl-install-8.dwarfs 24:13 0:50 6:02
-rw-r--r-- 1 mhx users 615M Jul 28 02:50 perl-install-9.dwarfs 34:40 0:51 5:50

-rw-r--r-- 1 mhx users 3.6G Jul 28 09:13 perl-install-defaults.squashfs 17:20
-rw-r--r-- 1 mhx users 2.4G Jul 28 10:42 perl-install-opt.squashfs 71:49








soak:

-7 (cache=1g)

Passed with 542 of 542 combinations.

real 75m21.191s
user 68m3.903s
sys 6m21.020s

-9 (cache=1g)

Passed with 542 of 542 combinations.

real 118m48.371s
user 107m35.685s
sys 7m16.438s

squashfs-opt

real 81m36.957s
user 62m37.369s
sys 20m52.367s


-1 (cache=2g)
mhx@gimli ~ $ time find tmp/mount/ -type f | xargs -n 1 -P 32 -d $'\n' -I {} dd of=/dev/null if={} bs=64K status=none

real 2m19.927s
user 0m16.813s
sys 2m4.293s

-7 (cache=2g)
mhx@gimli ~ $ time find tmp/mount/ -type f | xargs -n 1 -P 32 -d $'\n' -I {} dd of=/dev/null if={} bs=64K status=none

real 2m24.346s
user 0m17.007s
sys 1m59.823s

squash-default
mhx@gimli ~ $ time find tmp/mount/ -type f | xargs -n 1 -P 32 -d $'\n' -I {} dd of=/dev/null if={} bs=64K status=none

real 8m41.594s
user 1m25.346s
sys 19m12.036s

squash-opt
mhx@gimli ~ $ time find tmp/mount/ -type f | xargs -n 1 -P 32 -d $'\n' -I {} dd of=/dev/null if={} bs=64K status=none

real 141m41.092s
user 1m12.650s
sys 59m18.194s

69 changes: 69 additions & 0 deletions dwarfs.lua
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
--- Decide whether an entry is kept.
-- The default rule accepts everything. A sample exclusion rule is retained
-- here for reference:
--   if f.name == 'Jamroot' or (f.name == 'test' and f.type == 'dir') then
--     return false
--   end
-- @param f the entry being examined (not inspected by the default rule)
-- @return always `true`
function filter(f)
  local keep = true
  return keep
end

--- Walk down a nested table, creating missing levels on the way.
-- Follows the keys `args[1] .. args[num]` starting at `C`; whenever a key is
-- absent (`nil`) a fresh empty table is stored under it.
-- @param C    root table to descend from
-- @param args array of keys describing the path
-- @param num  how many leading keys of `args` to follow
-- @return the table reached after `num` steps (`C` itself when `num` < 1)
function autovivify(C, args, num)
  local node = C
  for depth = 1, num do
    local key = args[depth]
    local child = node[key]
    if child == nil then
      child = {}
      node[key] = child
    end
    node = child
  end
  return node
end

--- Add a number to a counter stored in a nested table.
-- The variadic arguments are `key1, ..., keyN, field, amount`: the keys are
-- resolved (and created if missing) via `autovivify`, then `amount` is added
-- to `field` in the resulting table, treating a missing field as 0.
-- @param C   root table
-- @param ... path keys, followed by the field name and the amount to add
function incr(C, ...)
  local count = select("#", ...)
  local path = { ... }
  local target = autovivify(C, path, count - 2)
  local key, delta = path[count - 1], path[count]
  target[key] = (target[key] or 0) + delta
end

--- Append a value to a list stored in a nested table.
-- The variadic arguments are `key1, ..., keyN, value`: the keys are resolved
-- (and created if missing) via `autovivify`, then `value` is appended to the
-- resulting table with `table.insert`.
-- @param C   root table
-- @param ... path keys, followed by the value to append
function push(C, ...)
  local count = select("#", ...)
  local path = { ... }
  local list = autovivify(C, path, count - 1)
  table.insert(list, path[count])
end

--- Build a `table.sort` comparator that orders keys of `tbl` by average size.
-- Each `tbl[key]` must provide the numeric fields `size` and `num`; keys are
-- ordered by `size / num`, largest first.
--
-- Keys with the same average are ordered by the key itself (ascending). The
-- keys handed to this comparator are collected with `pairs`, whose traversal
-- order is unspecified, and `table.sort` is not stable; without a tie-break
-- the relative order of equal entries could change from run to run.
-- @param tbl table mapping key -> { size = number, num = number, ... }
-- @return strict "comes before" predicate taking two keys of `tbl`
function sortbysize(tbl)
  local function average(key)
    local entry = tbl[key]
    return entry["size"] / entry["num"]
  end

  return function (a, b)
    local avg_a, avg_b = average(a), average(b)
    if avg_a ~= avg_b then
      return avg_b < avg_a
    end
    -- Equal averages: fall back to the key for a reproducible order.
    return a < b
  end
end

--- Produce an ordering of the given files.
-- Files are bucketed by extension and then by base name. A name only counts
-- as having an extension when it ends in `.` followed by alphanumerics and
-- that suffix contains at least one lowercase letter; otherwise the whole
-- name is the base and the extension is the empty string.
--
-- Extension buckets, and the base-name buckets inside them, are emitted by
-- descending average file size (see `sortbysize`); the files of one base-name
-- bucket are emitted by descending size.
-- @param filelist table of file objects exposing `name` and `size`
-- @return array containing every file of `filelist` in the computed order
function order(filelist)
  local stats = {}

  -- Pass 1: accumulate per-extension and per-(extension, base) totals and
  -- collect the files of every (extension, base) pair.
  for _, f in pairs(filelist) do
    local _, _, stem, suffix = string.find(f.name, "(.*)(%.%w+)$")
    local has_ext = suffix ~= nil and string.find(suffix, "[a-z]") ~= nil
    if not has_ext then
      stem, suffix = f.name, ""
    end
    incr(stats, suffix, "size", f.size)
    incr(stats, suffix, "num", 1)
    incr(stats, suffix, "name", stem, "size", f.size)
    incr(stats, suffix, "name", stem, "num", 1)
    push(stats, suffix, "name", stem, "files", f)
  end

  -- Keys of `tbl`, sorted with the average-size comparator.
  local function sorted_keys(tbl)
    local keys = {}
    for k, _ in pairs(tbl) do table.insert(keys, k) end
    table.sort(keys, sortbysize(tbl))
    return keys
  end

  local function bigger_first(a, b)
    return b.size < a.size
  end

  -- Pass 2: flatten the buckets into the result list.
  local ordered = {}
  for _, ext in ipairs(sorted_keys(stats)) do
    local by_name = stats[ext]["name"]
    for _, base in ipairs(sorted_keys(by_name)) do
      local files = by_name[base]["files"]
      table.sort(files, bigger_first)
      for _, file in ipairs(files) do
        ordered[#ordered + 1] = file
      end
    end
  end
  return ordered
end
1 change: 1 addition & 0 deletions folly
Submodule folly added at a5e2a7
86 changes: 86 additions & 0 deletions include/dwarfs/block_cache.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
/* vim:set ts=2 sw=2 sts=2 et: */
/**
* \author Marcus Holland-Moritz (github@mhxnet.de)
* \copyright Copyright (c) Marcus Holland-Moritz
*
* This file is part of dwarfs.
*
* dwarfs is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* dwarfs is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
*/

#pragma once

#include <future>
#include <limits>
#include <memory>
#include <mutex>

#include "fstypes.h"
#include "logger.h"

namespace dwarfs {

struct block_cache_options;

class cached_block;

/**
 * Read-only view of a contiguous byte range `[begin, end)`.
 *
 * Besides the two pointers the object stores a `shared_ptr` to a
 * `cached_block` (presumably the block that backs the range, so that it
 * stays alive as long as the range does -- the constructor is defined
 * elsewhere, confirm there).
 */
class block_range {
 public:
  block_range(std::shared_ptr<cached_block const> block, size_t offset,
              size_t size);

  /// First byte of the range.
  uint8_t const* begin() const { return begin_; }
  /// One past the last byte of the range.
  uint8_t const* end() const { return end_; }
  /// Same pointer as begin().
  uint8_t const* data() const { return begin_; }
  /// Number of bytes between begin() and end().
  size_t size() const { return end_ - begin_; }

 private:
  uint8_t const* const begin_;
  uint8_t const* const end_;
  std::shared_ptr<cached_block const> block_;
};

/**
 * Facade over a block cache implementation.
 *
 * Every public member function forwards to the `impl` object owned by this
 * instance; the constructor (defined elsewhere) is responsible for creating
 * that object.
 */
class block_cache {
 public:
  /// Interface a concrete cache implementation has to provide.
  class impl {
   public:
    virtual ~impl() = default;

    virtual size_t block_count() const = 0;
    virtual void
    insert(compression_type comp, uint8_t const* data, size_t size) = 0;
    virtual void set_block_size(size_t size) = 0;
    virtual std::future<block_range>
    get(size_t block_no, size_t offset, size_t length) const = 0;
  };

  block_cache(logger& lgr, block_cache_options const& options);

  /// Number of blocks as reported by the implementation.
  size_t block_count() const { return impl_->block_count(); }

  /// Hand a block of `size` bytes at `data`, compressed with `comp`, to the
  /// implementation.
  void insert(compression_type comp, uint8_t const* data, size_t size) {
    impl_->insert(comp, data, size);
  }

  void set_block_size(size_t size) { impl_->set_block_size(size); }

  /// Request `size` bytes starting at `offset` of block `block_no`; the
  /// result is delivered through a future.
  std::future<block_range>
  get(size_t block_no, size_t offset, size_t size) const {
    return impl_->get(block_no, offset, size);
  }

 private:
  std::unique_ptr<impl> impl_;
};
} // namespace dwarfs
119 changes: 119 additions & 0 deletions include/dwarfs/block_compressor.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
/* vim:set ts=2 sw=2 sts=2 et: */
/**
* \author Marcus Holland-Moritz (github@mhxnet.de)
* \copyright Copyright (c) Marcus Holland-Moritz
*
* This file is part of dwarfs.
*
* dwarfs is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* dwarfs is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
*/

#pragma once

#include <memory>
#include <vector>

namespace dwarfs {

// Identifies a compression algorithm.
// The underlying type is uint8_t and every enumerator carries an explicit
// value, so the numeric codes do not depend on declaration order.
// NOTE(review): presumably these codes are persisted in filesystem images,
// in which case existing values must never be renumbered -- confirm.
enum class compression_type : uint8_t {
NONE = 0,
LZMA = 1,
ZSTD = 2,
LZ4 = 3,
LZ4HC = 4,
};

/**
 * Value-type handle around a polymorphic compressor implementation.
 *
 * The object owns an `impl` and forwards every operation to it. It can be
 * copy-constructed (which clones the implementation), move-constructed and
 * move-assigned.
 */
class block_compressor {
 public:
  /// Create a compressor from a textual specification (parsed by the
  /// out-of-line definition).
  block_compressor(const std::string& spec, size_t block_size = 0);

  /// Copy by cloning the implementation.
  /// A moved-from source has no implementation; copying it yields another
  /// empty object instead of dereferencing a null pointer.
  block_compressor(const block_compressor& bc)
      : impl_(bc.impl_ ? bc.impl_->clone() : std::unique_ptr<impl>()) {}

  block_compressor(block_compressor&& bc) = default;
  block_compressor& operator=(block_compressor&& rhs) = default;

  /// Compress `data` and return the result.
  std::vector<uint8_t> compress(const std::vector<uint8_t>& data) const {
    return impl_->compress(data);
  }

  /// Overload taking ownership of the input buffer.
  std::vector<uint8_t> compress(std::vector<uint8_t>&& data) const {
    return impl_->compress(std::move(data));
  }

  /// Feed `size` bytes at `data` to the implementation; `last` is forwarded
  /// unchanged (presumably marking the final chunk -- confirm in the
  /// implementations).
  void append(const uint8_t* data, size_t size, bool last) {
    impl_->append(data, size, last);
  }

  /// Take the data accumulated by the implementation.
  std::vector<uint8_t> move_data() { return impl_->move_data(); }

  /// Compression algorithm reported by the implementation.
  compression_type type() const { return impl_->type(); }

  /// Interface a concrete compressor has to provide.
  class impl {
   public:
    virtual ~impl() = default;

    virtual std::unique_ptr<impl> clone() const = 0;

    // TODO: obsolete
    virtual std::vector<uint8_t>
    compress(const std::vector<uint8_t>& data) const = 0;
    virtual std::vector<uint8_t>
    compress(std::vector<uint8_t>&& data) const = 0;

    virtual void append(const uint8_t* data, size_t size, bool last) = 0;
    virtual std::vector<uint8_t> move_data() = 0;

    virtual compression_type type() const = 0;
  };

 private:
  std::unique_ptr<impl> impl_;
};

/**
 * Handle around a polymorphic decompressor implementation.
 *
 * The constructor (defined elsewhere) receives the compressed input and a
 * reference to the vector that receives the output; all other members
 * forward to the owned `impl`.
 */
class block_decompressor {
 public:
  /// Interface a concrete decompressor has to provide.
  class impl {
   public:
    virtual ~impl() = default;

    virtual bool decompress_frame(size_t frame_size) = 0;
    virtual size_t uncompressed_size() const = 0;

    virtual compression_type type() const = 0;
  };

  block_decompressor(compression_type type, uint8_t const* data, size_t size,
                     std::vector<uint8_t>& target);

  /// Forward a request to decompress `frame_size` bytes (default: BUFSIZ)
  /// and return the implementation's result.
  bool decompress_frame(size_t frame_size = BUFSIZ) {
    return impl_->decompress_frame(frame_size);
  }

  size_t uncompressed_size() const { return impl_->uncompressed_size(); }

  compression_type type() const { return impl_->type(); }

  /// Convenience helper: decompress a buffer with a single
  /// decompress_frame() call sized to the reported uncompressed size and
  /// return the target vector.
  static std::vector<uint8_t>
  decompress(compression_type type, uint8_t const* data, size_t size) {
    std::vector<uint8_t> out;
    block_decompressor dec(type, data, size, out);
    dec.decompress_frame(dec.uncompressed_size());
    return out;
  }

 private:
  std::unique_ptr<impl> impl_;
};
} // namespace dwarfs
71 changes: 71 additions & 0 deletions include/dwarfs/block_manager.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
/* vim:set ts=2 sw=2 sts=2 et: */
/**
* \author Marcus Holland-Moritz (github@mhxnet.de)
* \copyright Copyright (c) Marcus Holland-Moritz
*
* This file is part of dwarfs.
*
* dwarfs is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* dwarfs is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
*/

#pragma once

#include <memory>
#include <vector>

#include "logger.h"

namespace dwarfs {

class filesystem_writer;
class inode;
class os_access;
class progress;

/**
 * Facade over a block manager implementation.
 *
 * All member functions forward to the owned `impl`; the constructor is
 * defined elsewhere.
 */
class block_manager {
 public:
  /// Tunables passed to the constructor; the default constructor is defined
  /// out of line.
  struct config {
    config();

    std::vector<size_t> blockhash_window_size;
    unsigned window_increment_shift;
    size_t memory_limit;
    unsigned block_size_bits;
  };

  /// Interface a concrete block manager has to provide.
  class impl {
   public:
    virtual ~impl() = default;

    virtual void add_inode(std::shared_ptr<inode> ino) = 0;
    virtual void finish_blocks() = 0;
    virtual size_t total_size() const = 0;
    virtual size_t total_blocks() const = 0;
  };

  block_manager(logger& lgr, progress& prog, config const& cfg,
                std::shared_ptr<os_access> os, filesystem_writer& fsw);

  /// Pass an inode on to the implementation.
  void add_inode(std::shared_ptr<inode> ino) { impl_->add_inode(ino); }

  void finish_blocks() { impl_->finish_blocks(); }

  size_t total_size() const { return impl_->total_size(); }

  size_t total_blocks() const { return impl_->total_blocks(); }

 private:
  std::unique_ptr<impl> impl_;
};
} // namespace dwarfs
28 changes: 28 additions & 0 deletions include/dwarfs/config.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
/* vim:set ts=2 sw=2 sts=2 et: */
/**
* \author Marcus Holland-Moritz (github@mhxnet.de)
* \copyright Copyright (c) Marcus Holland-Moritz
*
* This file is part of dwarfs.
*
* dwarfs is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* dwarfs is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
*/

#pragma once

namespace dwarfs {

// Lower and upper bound for a block size expressed in bits.
// NOTE(review): presumably these limit block_manager::config::block_size_bits,
// i.e. block sizes from 2^12 to 2^28 bytes -- confirm against the users of
// these constants.
static const unsigned MIN_BLOCK_BITS_SIZE = 12;
static const unsigned MAX_BLOCK_BITS_SIZE = 28;
} // namespace dwarfs
53 changes: 53 additions & 0 deletions include/dwarfs/console_writer.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
/* vim:set ts=2 sw=2 sts=2 et: */
/**
* \author Marcus Holland-Moritz (github@mhxnet.de)
* \copyright Copyright (c) Marcus Holland-Moritz
*
* This file is part of dwarfs.
*
* dwarfs is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* dwarfs is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
*/

#pragma once

#include <array>
#include <ostream>

#include "logger.h"
#include "progress.h"

namespace dwarfs {

class console_writer : public logger {
public:
console_writer(std::ostream& os, bool is_terminal, size_t width,
level_type threshold);

void write(level_type level, const std::string& output) override;

void update(const progress& p, bool last);

private:
void rewind();

std::ostream& os_;
std::mutex mx_;
std::atomic<level_type> threshold_;
std::string statebuf_;
double frac_;
std::atomic<size_t> counter_{0};
const bool show_progress_;
const size_t width_;
};
} // namespace dwarfs
77 changes: 77 additions & 0 deletions include/dwarfs/cyclic_hash.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
/* vim:set ts=2 sw=2 sts=2 et: */
/**
* \author Marcus Holland-Moritz (github@mhxnet.de)
* \copyright Copyright (c) Marcus Holland-Moritz
*
* This file is part of dwarfs.
*
* dwarfs is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* dwarfs is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
*/

#pragma once

#include <array>
#include <random>
#include <stdexcept>

namespace dwarfs {

/**
 * Lookup table assigning a pseudo-random value of type `T` to every possible
 * byte value.
 *
 * The table is filled from a default-constructed
 * `std::default_random_engine`, so no external seed is involved.
 */
template <typename T>
class byte_hash {
 public:
  byte_hash() {
    std::default_random_engine rng;
    std::uniform_int_distribution<T> dist(0, static_cast<T>(-1));

    // Fill the slots in index order, one draw per slot.
    for (auto& slot : hash_) {
      slot = dist(rng);
    }
  }

  /// Table entry for byte `c`.
  T operator()(uint8_t c) const { return hash_[c]; }

 private:
  // One entry per possible uint8_t value.
  std::array<T, std::numeric_limits<uint8_t>::max() + 1> hash_;
};

/**
 * Rolling hash that rotates its state left by one bit per step and XORs in
 * `byte_hash` values.
 *
 * Only window sizes that are a multiple of the bit width of `T` are
 * accepted. The `byte_hash` is held by reference and must therefore outlive
 * this object.
 */
template <typename T>
class cyclic_hash {
 public:
  /// \throws std::runtime_error if `window_size` is not a multiple of the
  ///         number of bits in `T`.
  cyclic_hash(size_t window_size, const byte_hash<T>& ch)
      : hash_(0)
      , byte_hash_(ch) {
    if (window_size % hash_bits != 0) {
      throw std::runtime_error("unsupported window size");
    }
  }

  /// Set the hash state back to zero.
  void reset() { hash_ = 0; }

  /// Advance by one step, mixing in the table values of both `outbyte` and
  /// `inbyte`.
  void update(uint8_t outbyte, uint8_t inbyte) {
    T const rotated = rol(hash_);
    hash_ = rotated ^ byte_hash_(outbyte) ^ byte_hash_(inbyte);
  }

  /// Advance by one step, mixing in the table value of `inbyte` only.
  void update(uint8_t inbyte) {
    T const rotated = rol(hash_);
    hash_ = rotated ^ byte_hash_(inbyte);
  }

  /// Current hash value.
  T operator()() const { return hash_; }

 private:
  static const size_t hash_bits = 8 * sizeof(T);

  /// Rotate `x` left by one bit.
  inline T rol(T x) const { return (x << 1) | (x >> (hash_bits - 1)); }

  T hash_;
  const byte_hash<T>& byte_hash_;
};
} // namespace dwarfs
183 changes: 183 additions & 0 deletions include/dwarfs/entry.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,183 @@
/* vim:set ts=2 sw=2 sts=2 et: */
/**
* \author Marcus Holland-Moritz (github@mhxnet.de)
* \copyright Copyright (c) Marcus Holland-Moritz
*
* This file is part of dwarfs.
*
* dwarfs is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* dwarfs is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
*/

#pragma once

#include <array>
#include <functional>
#include <memory>
#include <vector>

#include <sys/stat.h>

#include <folly/Range.h>

#include "file_interface.h"
#include "fstypes.h"

namespace dwarfs {

class file;
class link;
class dir;
class inode;
class os_access;
class progress;

/**
 * Visitor interface with one `visit()` overload per concrete entry kind
 * (`file`, `link`, `dir`).
 */
class entry_visitor {
 public:
  virtual ~entry_visitor() = default;

  virtual void visit(file* p) = 0;
  virtual void visit(link* p) = 0;
  virtual void visit(dir* p) = 0;
};

/**
 * Abstract base class of the entries `file`, `dir` and `link`.
 *
 * An entry stores its name, a weak reference to its parent entry, a copy of
 * the `stat` structure passed to the constructor and a name offset. It
 * implements `file_interface`; `size()` reports `st_size` of the stored
 * `stat` data.
 */
class entry : public file_interface {
 public:
  enum type_t { E_FILE, E_DIR, E_LINK };

  entry(std::string const& name, std::shared_ptr<entry> parent,
        const struct ::stat& st);

  void scan(os_access& os, progress& prog);
  bool has_parent() const;
  std::shared_ptr<entry> parent() const;
  void set_name(std::string const& name);
  void set_name_offset(size_t offset);
  std::string path() const override;
  std::string const& name() const override { return name_; }
  size_t size() const override { return stat_.st_size; }
  virtual type_t type() const = 0;
  std::string type_string() const override;
  virtual size_t total_size() const;
  virtual void walk(std::function<void(entry*)> const& f);
  virtual void walk(std::function<void(entry const*)> const& f) const;
  void pack(dir_entry& de) const;
  void pack(dir_entry_ug& de) const;
  void pack(dir_entry_ug_time& de) const;
  virtual void accept(entry_visitor& v, bool preorder = false) = 0;
  virtual uint32_t inode_num() const = 0;

 protected:
  /// Type-specific part of packing; implemented by each concrete entry.
  virtual void pack_specific(dir_entry& de) const = 0;
  /// Type-specific part of scanning; implemented by each concrete entry.
  virtual void scan(os_access& os, std::string const& p, progress& prog) = 0;

 private:
  std::string name_;
  std::weak_ptr<entry> parent_;
  struct ::stat stat_;
  uint32_t name_offset_;
};

/**
 * Entry type for files.
 *
 * On top of the `entry` state it carries a 32-bit similarity hash, a flag
 * recording whether similarity was requested at construction, a 20-byte
 * hash buffer and a shared pointer to an `inode`.
 */
class file : public entry {
 public:
  file(std::string const& name, std::shared_ptr<entry> parent,
       const struct ::stat& st, bool with_similarity)
      : entry(name, parent, st)
      , with_similarity_(with_similarity) {}

  type_t type() const override;
  folly::StringPiece hash() const;
  void set_inode(std::shared_ptr<inode> ino);
  std::shared_ptr<inode> get_inode() const;
  void accept(entry_visitor& v, bool preorder) override;
  uint32_t inode_num() const override;
  /// Stored similarity hash (initialized to 0).
  uint32_t similarity_hash() const { return similarity_hash_; }

 protected:
  void pack_specific(dir_entry& de) const override;
  void scan(os_access& os, std::string const& p, progress& prog) override;

 private:
  uint32_t similarity_hash_{0};
  bool const with_similarity_;
  std::array<char, 20> hash_{0};
  std::shared_ptr<inode> inode_;
};

class dir : public entry {
public:
using entry::entry;

type_t type() const override;
void add(std::shared_ptr<entry> e);
size_t total_size() const override;
void walk(std::function<void(entry*)> const& f) override;
void walk(std::function<void(const entry*)> const& f) const override;
void accept(entry_visitor& v, bool preorder) override;
void sort();
void set_offset(size_t offset);
void set_inode(uint32_t inode);
virtual size_t packed_size() const = 0;
virtual void
pack(uint8_t* buf,
std::function<void(const entry* e, size_t offset)> const& offset_cb)
const = 0;
virtual size_t packed_entry_size() const = 0;
virtual void pack_entry(uint8_t* buf) const = 0;
uint32_t inode_num() const override { return inode_; }

protected:
void pack_specific(dir_entry& de) const override;
void scan(os_access& os, const std::string& p, progress& prog) override;

using entry_ptr = std::shared_ptr<entry>;

std::vector<std::shared_ptr<entry>> entries_;
uint32_t offset_ = 0;
uint32_t inode_ = 0;
};

/**
 * Entry type for links.
 *
 * Stores a link name string plus an offset and an inode number.
 */
class link : public entry {
 public:
  using entry::entry;

  type_t type() const override;
  std::string const& linkname() const;
  void set_offset(size_t offset);
  void set_inode(uint32_t inode);
  void accept(entry_visitor& v, bool preorder) override;
  /// Stored inode number (0 until set_inode() is called).
  uint32_t inode_num() const override { return inode_; }

 protected:
  void pack_specific(dir_entry& de) const override;
  void scan(os_access& os, std::string const& p, progress& prog) override;

 private:
  std::string link_;
  uint32_t offset_ = 0;
  uint32_t inode_ = 0;
};

class entry_factory {
public:
static std::shared_ptr<entry_factory>
create(bool no_owner = false, bool no_time = false,
bool with_similarity = false);

virtual ~entry_factory() = default;

virtual std::shared_ptr<entry>
create(os_access& os, const std::string& name,
std::shared_ptr<entry> parent = std::shared_ptr<entry>()) = 0;
virtual dir_entry_type de_type() const = 0;
};
} // namespace dwarfs
37 changes: 37 additions & 0 deletions include/dwarfs/file_interface.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
/* vim:set ts=2 sw=2 sts=2 et: */
/**
* \author Marcus Holland-Moritz (github@mhxnet.de)
* \copyright Copyright (c) Marcus Holland-Moritz
*
* This file is part of dwarfs.
*
* dwarfs is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* dwarfs is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
*/

#pragma once

#include <string>

namespace dwarfs {

/**
 * Minimal read-only description of a file system object: path, name, a
 * textual type and a size.
 */
class file_interface {
 public:
  virtual ~file_interface() = default;

  virtual std::string path() const = 0;
  virtual std::string const& name() const = 0;
  virtual std::string type_string() const = 0;
  virtual size_t size() const = 0;
};
} // namespace dwarfs
37 changes: 37 additions & 0 deletions include/dwarfs/file_vector.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
/* vim:set ts=2 sw=2 sts=2 et: */
/**
* \author Marcus Holland-Moritz (github@mhxnet.de)
* \copyright Copyright (c) Marcus Holland-Moritz
*
* This file is part of dwarfs.
*
* dwarfs is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* dwarfs is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
*/

#pragma once

#include <functional>

namespace dwarfs {

// Only pointers to file_interface are used below, so a forward declaration
// is sufficient; it makes this header usable without requiring callers to
// include "file_interface.h" first.
class file_interface;

/**
 * Abstract, indexable and sortable collection of `file_interface` pointers.
 */
class file_vector {
 public:
  virtual ~file_vector() = default;

  /// Element at index `i`.
  virtual const file_interface* operator[](size_t i) const = 0;

  /// Number of elements.
  virtual size_t size() const = 0;

  /// Sort the elements using `less` as the "comes before" predicate.
  virtual void
  sort(std::function<bool(const file_interface* a,
                          const file_interface* b)> const& less) = 0;
};
} // namespace dwarfs
171 changes: 171 additions & 0 deletions include/dwarfs/filesystem.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,171 @@
/* vim:set ts=2 sw=2 sts=2 et: */
/**
* \author Marcus Holland-Moritz (github@mhxnet.de)
* \copyright Copyright (c) Marcus Holland-Moritz
*
* This file is part of dwarfs.
*
* dwarfs is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* dwarfs is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
*/

#pragma once

#include <exception>
#include <functional>
#include <memory>
#include <ostream>
#include <string>

#include <sys/stat.h>
#include <sys/statvfs.h>
#include <sys/types.h>

#include "logger.h"
#include "mmif.h"

namespace dwarfs {

struct iovec_read_buf;

/**
 * Exception carrying a message together with an errno-style integer code.
 */
class error : public std::exception {
 public:
  /// \param str    message returned by what()
  /// \param err_no code returned by get_errno()
  error(const std::string& str, int err_no) noexcept
      : what_(str)
      , errno_(err_no) {}

  error(const error& e) noexcept
      : what_(e.what_)
      , errno_(e.errno_) {}

  error& operator=(const error& e) noexcept {
    // Self-assignment leaves the object untouched.
    if (this == &e) {
      return *this;
    }
    what_ = e.what_;
    errno_ = e.errno_;
    return *this;
  }

  /// Message passed at construction.
  const char* what() const noexcept override { return what_.c_str(); }

  /// Code passed at construction.
  int get_errno() const { return errno_; }

 private:
  std::string what_;
  int errno_;
};

struct block_cache_options;
struct dir_entry;
struct directory;

class filesystem_writer;
class progress;

/**
 * Facade over a filesystem implementation.
 *
 * Apart from the constructor and the static helpers `rewrite()` and
 * `identify()` (all defined elsewhere), every member function forwards its
 * arguments to the owned `impl` and returns the implementation's result
 * unchanged. All forwarding functions are const, matching the const
 * interface of `impl`.
 */
class filesystem {
 public:
  filesystem(logger& lgr, std::shared_ptr<mmif> mm,
             const block_cache_options& bc_options,
             const struct ::stat* stat_defaults = nullptr,
             int inode_offset = 0);

  static void rewrite(logger& lgr, progress& prog, std::shared_ptr<mmif> mm,
                      filesystem_writer& writer);

  static void identify(logger& lgr, std::shared_ptr<mmif> mm, std::ostream& os);

  void dump(std::ostream& os) const { impl_->dump(os); }

  /// Invoke `func` through the implementation's walk().
  /// Declared const like every other forwarding member (impl::walk is
  /// const), so it can also be used on a const filesystem.
  void walk(std::function<void(const dir_entry*)> const& func) const {
    impl_->walk(func);
  }

  /// Look up an entry by path.
  const dir_entry* find(const char* path) const { return impl_->find(path); }

  /// Look up an entry by inode number.
  const dir_entry* find(int inode) const { return impl_->find(inode); }

  /// Look up `name` relative to the entry with the given inode number.
  const dir_entry* find(int inode, const char* name) const {
    return impl_->find(inode, name);
  }

  int getattr(const dir_entry* de, struct ::stat* stbuf) const {
    return impl_->getattr(de, stbuf);
  }

  int access(const dir_entry* de, int mode, uid_t uid, gid_t gid) const {
    return impl_->access(de, mode, uid, gid);
  }

  const directory* opendir(const dir_entry* de) const {
    return impl_->opendir(de);
  }

  const dir_entry*
  readdir(const directory* d, size_t offset, std::string* name) const {
    return impl_->readdir(d, offset, name);
  }

  size_t dirsize(const directory* d) const { return impl_->dirsize(d); }

  int readlink(const dir_entry* de, char* buf, size_t size) const {
    return impl_->readlink(de, buf, size);
  }

  int readlink(const dir_entry* de, std::string* buf) const {
    return impl_->readlink(de, buf);
  }

  int statvfs(struct ::statvfs* stbuf) const { return impl_->statvfs(stbuf); }

  int open(const dir_entry* de) const { return impl_->open(de); }

  ssize_t read(uint32_t inode, char* buf, size_t size, off_t offset) const {
    return impl_->read(inode, buf, size, offset);
  }

  ssize_t
  readv(uint32_t inode, iovec_read_buf& buf, size_t size, off_t offset) const {
    return impl_->readv(inode, buf, size, offset);
  }

  /// Interface a concrete filesystem implementation has to provide.
  class impl {
   public:
    virtual ~impl() = default;

    virtual void dump(std::ostream& os) const = 0;
    virtual void
    walk(std::function<void(const dir_entry*)> const& func) const = 0;
    virtual const dir_entry* find(const char* path) const = 0;
    virtual const dir_entry* find(int inode) const = 0;
    virtual const dir_entry* find(int inode, const char* name) const = 0;
    virtual int getattr(const dir_entry* de, struct ::stat* stbuf) const = 0;
    virtual int
    access(const dir_entry* de, int mode, uid_t uid, gid_t gid) const = 0;
    virtual const directory* opendir(const dir_entry* de) const = 0;
    virtual const dir_entry*
    readdir(const directory* d, size_t offset, std::string* name) const = 0;
    virtual size_t dirsize(const directory* d) const = 0;
    virtual int readlink(const dir_entry* de, char* buf, size_t size) const = 0;
    virtual int readlink(const dir_entry* de, std::string* buf) const = 0;
    virtual int statvfs(struct ::statvfs* stbuf) const = 0;
    virtual int open(const dir_entry* de) const = 0;
    virtual ssize_t
    read(uint32_t inode, char* buf, size_t size, off_t offset) const = 0;
    virtual ssize_t readv(uint32_t inode, iovec_read_buf& buf, size_t size,
                          off_t offset) const = 0;
  };

 private:
  std::unique_ptr<impl> impl_;
};
} // namespace dwarfs
87 changes: 87 additions & 0 deletions include/dwarfs/filesystem_writer.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
/* vim:set ts=2 sw=2 sts=2 et: */
/**
* \author Marcus Holland-Moritz (github@mhxnet.de)
* \copyright Copyright (c) Marcus Holland-Moritz
*
* This file is part of dwarfs.
*
* dwarfs is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* dwarfs is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
*/

#pragma once

#include <ostream>
#include <vector>

#include "fstypes.h"
#include "worker_group.h"

namespace dwarfs {

class block_compressor;
class logger;
class progress;

// Placeholder for a filesystem section abstraction. The implementation
// interface is still empty (see the TODO below), and the only operation
// declared here is construction from an implementation object.
class section {
public:
// Polymorphic implementation base; currently declares nothing beyond a
// virtual destructor.
class impl {
public:
virtual ~impl() = default;

// TODO
};

// Constructs a section from an implementation object (defined out of line;
// presumably the pointer is moved into impl_ -- confirm in the .cpp).
section(std::unique_ptr<impl>&& i);

private:
std::unique_ptr<impl> impl_;
};

// Front end for writing a filesystem image. Every operation is forwarded to
// a polymorphic impl object held in impl_; the constructor (defined out of
// line) is what selects and creates that implementation.
class filesystem_writer {
public:
// The constructor arguments are not used in this header; judging by their
// types they are the output stream, logger, worker pool, progress tracker,
// block compressor and a queue bound -- confirm against the implementation.
filesystem_writer(std::ostream& os, logger& lgr, worker_group& wg,
progress& prog, const block_compressor& bc,
size_t max_queue_size);

// Section-based API sketch, not implemented yet:
// section create_block();
// section create_metadata();

// void add_section(section&& section);

// Hands a block buffer to the implementation; the vector is moved from.
void write_block(std::vector<uint8_t>&& data) {
impl_->write_block(std::move(data));
}

// Hands a metadata buffer to the implementation; the vector is moved from.
void write_metadata(std::vector<uint8_t>&& data) {
impl_->write_metadata(std::move(data));
}

// Forwards to impl::flush().
void flush() { impl_->flush(); }

// Forwards to impl::size(); the unit is defined by the implementation.
size_t size() const { return impl_->size(); }

// Interface that concrete writers implement; mirrors the public methods
// above one to one.
class impl {
public:
virtual ~impl() = default;

virtual void write_block(std::vector<uint8_t>&& data) = 0;
virtual void write_metadata(std::vector<uint8_t>&& data) = 0;
virtual void flush() = 0;
virtual size_t size() const = 0;
};

private:
std::unique_ptr<impl> impl_;
};
} // namespace dwarfs
234 changes: 234 additions & 0 deletions include/dwarfs/fstypes.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,234 @@
/* vim:set ts=2 sw=2 sts=2 et: */
/**
* \author Marcus Holland-Moritz (github@mhxnet.de)
* \copyright Copyright (c) Marcus Holland-Moritz
*
* This file is part of dwarfs.
*
* dwarfs is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* dwarfs is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
*/

#pragma once

#include <cstdint>
#include <iostream>
#include <stdexcept>

#include "block_compressor.h" // TODO: or the other way round?

namespace dwarfs {

/*************************
---------------------
file_header
---------------------
section_header [BLOCK]
block 0
---------------------
section_header [BLOCK]
block n
---------------------
section_header [METADATA]
metadata
---------------------
TODO: better description ;-)
metadata:
links_table -> vector<uint8_t> // links first, potential re-use for names
table :-)
names_table -> vector<uint8_t>
inode_table -> vector<chunk> // sizeof(chunk) aligned (64-bit)
directories...
inode_index: inode -> dir_entry offset
chunk_index: (inode - file_inode_offset) -> chunk offset
*************************/

// Filesystem format version. The width matches the major/minor fields of
// file_header; presumably these are the values written there -- confirm in
// the writer.
constexpr uint8_t MAJOR_VERSION = 0;
constexpr uint8_t MINOR_VERSION = 0;

// Kind of payload carried by a section; this is the type of
// section_header::type. The description of each value follows its
// enumerator.
enum class section_type : uint16_t {
BLOCK = 0,
// Optionally compressed block data.

METADATA = 1,
// Optionally compressed metadata. This is just
// another section list.

META_TABLEDATA = 2,
// This is raw data that is indexed from the other
// sections by offset. It contains all names, link
// targets and chunk lists.
// Names are referenced by offset/length. Link targets
// are referenced by offset and actually start with a
// uint16_t storing the length of the remaining string.
// Names are free to share data with link targets.
// Chunk lists are just a vector of chunks, aligned to
// the size of a chunk for efficient access.

META_INODE_INDEX = 3,
// The inode index is a vector of offsets to all inodes
// (i.e. dir_entry* structs). The vector may be offset
// by inode_index_offset if inodes do not start at zero.

META_CHUNK_INDEX = 4,
// The chunk index is a vector of offsets to the start
// of the chunk list for file inodes. As all link and
// directory inodes precede all file inodes, this vector
// is offset by chunk_index_offset. There is one more
// element in the chunk index vector that holds an offset
// to the end of the chunk lists.

META_DIRECTORIES = 5,
// All directory structures, in top-down order. These
// are referenced from within the inode index. The root
// directory also has its dir_entry* struct stored here.

META_CONFIG = 6,
// Configuration data for this filesystem. Defines the
// type of dir_entry* structure being used as well as
// the block size which is needed for working with the
// chunk lists. Also defines inode offsets being used
// and the total inode count (for out-of-bounds checks).
};

// Selects which of the dir_entry* record layouts a filesystem uses. This is
// the type of meta_config::de_type.
enum class dir_entry_type : uint8_t {
DIR_ENTRY = 0, // filesystem uses dir_entry
DIR_ENTRY_UG = 1, // filesystem uses dir_entry_ug
DIR_ENTRY_UG_TIME = 2 // filesystem uses dir_entry_ug_time
};

// Header at the very start of a filesystem image (see the layout diagram in
// the comment block near the top of this header).
struct file_header {
char magic[6]; // "DWARFS"
uint8_t major; // major version
uint8_t minor; // minor version
};

// Header preceding every section in the image (see the layout diagram in
// the comment block near the top of this header).
struct section_header {
section_type type; // what the section contains
compression_type compression; // declared in block_compressor.h
uint8_t unused;
uint32_t length; // presumably the payload size in bytes -- confirm

// Both are defined out of line; presumably they render the header fields
// as text for diagnostics.
std::string to_string() const;
void dump(std::ostream& os) const;
};

// Basic directory entry record. The field order and widths define the
// stored record layout, so they must not be changed casually.
struct dir_entry { // 128 bits (16 bytes) / entry
uint32_t name_offset; // names are referenced by offset/length (see
uint16_t name_size; // the META_TABLEDATA description)
uint16_t mode;
uint32_t inode; // dirs start at 1, then links, then files
// Interpretation of the union depends on the kind of entry.
union {
uint32_t file_size; // for files only
uint32_t offset; // for dirs, offset to directory;
// for links, offset to content in link table
} u;
};

// dir_entry extended by 16-bit owner and group fields. The base record is
// the first member, so a dir_entry_ug starts with a complete dir_entry.
struct dir_entry_ug { // 160 bits (20 bytes) / entry
dir_entry de;
uint16_t owner;
uint16_t group;
};

// dir_entry_ug extended by three 32-bit timestamps. The dir_entry_ug record
// is the first member.
struct dir_entry_ug_time { // 256 bits (32 bytes) / entry
dir_entry_ug ug;
// Timestamps are deliberately 32 bits wide for now; widening them to 64
// bits is a known future change.
uint32_t atime;
uint32_t mtime;
uint32_t ctime;
};

// Header of a stored directory followed by its entries.
struct directory {
uint32_t count; // presumably the number of entries that follow -- confirm
uint32_t self;
uint32_t parent;
// Exactly one of these views applies, depending on the dir_entry_type the
// filesystem was built with. The arrays are declared with a single
// element; NOTE(review): this looks like the usual variable-length
// trailing array idiom with `count` entries -- confirm against the reader.
union {
dir_entry entries[1];
dir_entry_ug entries_ug[1];
dir_entry_ug_time entries_ug_time[1];
} u;
};

// Payload of the META_CONFIG section (see the section_type description):
// the entry layout in use, the block size, the inode offsets and the inode
// count.
struct meta_config {
uint8_t block_size_bits; // log2 of the block size, as the name suggests
dir_entry_type de_type; // which dir_entry* layout the filesystem uses
uint16_t unused;
uint32_t inode_count; // total inode count (for out-of-bounds checks)
uint64_t orig_fs_size; // presumably the size of the source tree -- confirm
uint32_t chunk_index_offset; // offset applied to the chunk index vector
uint32_t inode_index_offset; // offset applied to the inode index vector
};

// A chunk packs a (block, offset, size) triple into one 64-bit word.
using chunk_type = uint64_t;

// Bit-level accessors for chunk_type, parameterised by the number of bits
// needed to address a byte within a block.
//
// Layout, most significant bits first:
//
//   [ block : 64 - 2 * BlockSizeBits ][ offset : BlockSizeBits ][ size - 1 : BlockSizeBits ]
//
// The size is stored minus one so that a chunk may cover an entire block
// (size == 2^BlockSizeBits). A size of zero is therefore not representable.
template <unsigned BlockSizeBits>
struct chunk_access {
  // BlockSizeBits == 0 would make block_bits 64, and computing block_mask
  // would then shift by the full word width, which is undefined.
  static_assert(BlockSizeBits > 0 && BlockSizeBits < 32,
                "invalid value for BlockSizeBits");

  static const unsigned block_bits = 64 - 2 * BlockSizeBits;
  static const unsigned block_shift = 64 - block_bits;
  static const chunk_type block_mask =
      (static_cast<chunk_type>(1) << block_bits) - 1;
  static const unsigned offset_shift = BlockSizeBits;
  static const chunk_type offset_mask =
      (static_cast<chunk_type>(1) << BlockSizeBits) - 1;
  static const unsigned size_shift = 0;
  static const chunk_type size_mask =
      (static_cast<chunk_type>(1) << BlockSizeBits) - 1;
  static const chunk_type max_size = size_mask + 1;

  // Encodes (block, offset, size) into `chunk`.
  //
  // Throws std::runtime_error if block > block_mask, offset > offset_mask,
  // or size is not in [1, max_size]. A diagnostic line naming the offending
  // value, its limit and the width of the affected bit field is written to
  // std::cerr first. `chunk` is left untouched when an exception is thrown.
  static void set(chunk_type& chunk, size_t block, size_t offset, size_t size) {
    if (block > block_mask) {
      std::cerr << "block out of range: " << block << " > " << block_mask
                << " [" << block_bits << "]\n";
      throw std::runtime_error("block out of range");
    }

    if (offset > offset_mask) {
      // The offset field is BlockSizeBits wide, not block_bits.
      std::cerr << "offset out of range: " << offset << " > " << offset_mask
                << " [" << BlockSizeBits << "]\n";
      throw std::runtime_error("offset out of range");
    }

    if (size == 0 || size > max_size) {
      // Report the real bounds; the limit is max_size (not size_mask), and
      // zero is rejected as well.
      std::cerr << "size out of range: " << size << " not in [1, " << max_size
                << "] [" << BlockSizeBits << "]\n";
      throw std::runtime_error("size out of range");
    }

    chunk = (static_cast<chunk_type>(block) << block_shift) |
            (static_cast<chunk_type>(offset) << offset_shift) |
            (static_cast<chunk_type>(size - 1) << size_shift);
  }

  // Extracts the block number.
  static size_t block(chunk_type chunk) {
    return (chunk >> block_shift) & block_mask;
  }

  // Extracts the offset within the block.
  static size_t offset(chunk_type chunk) {
    return (chunk >> offset_shift) & offset_mask;
  }

  // Extracts the size, undoing the minus-one encoding applied by set().
  static size_t size(chunk_type chunk) {
    return ((chunk >> size_shift) & size_mask) + 1;
  }
};

// Returns a name for the given compression type (defined out of line;
// presumably a human-readable label for diagnostics -- confirm).
std::string get_compression_name(compression_type type);
} // namespace dwarfs
42 changes: 42 additions & 0 deletions include/dwarfs/inode.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
/* vim:set ts=2 sw=2 sts=2 et: */
/**
* \author Marcus Holland-Moritz (github@mhxnet.de)
* \copyright Copyright (c) Marcus Holland-Moritz
*
* This file is part of dwarfs.
*
* dwarfs is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* dwarfs is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
*/

#pragma once

#include <cstddef>
#include <cstdint>
#include <ostream>
#include <vector>

#include "file_interface.h"
#include "fstypes.h"

namespace dwarfs {

class file;
class file_interface;

// Abstract inode as seen while building a filesystem: a file_interface that
// additionally carries an inode number, a similarity hash and a list of
// chunks. All members are pure virtual; semantics beyond the signatures are
// defined by the implementation.
class inode : public file_interface {
public:
// Associates a file object with this inode.
virtual void set_file(const file* f) = 0;
// Setter/getter for the inode number.
virtual void set_num(uint32_t num) = 0;
virtual uint32_t num() const = 0;
virtual uint32_t similarity_hash() const = 0;
// Presumably returns some file backing this inode -- confirm; the original
// author marked this accessor as unfinished.
virtual const file_interface* any() const = 0; // TODO
// Appends a chunk described by (block, offset, size); the parameters match
// those of chunk_access::set in fstypes.h.
virtual void add_chunk(size_t block, size_t offset, size_t size) = 0;
virtual const std::vector<chunk_type>& chunks() const = 0;
};
} // namespace dwarfs
83 changes: 83 additions & 0 deletions include/dwarfs/inode_hasher.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
/* vim:set ts=2 sw=2 sts=2 et: */
/**
* \author Marcus Holland-Moritz (github@mhxnet.de)
* \copyright Copyright (c) Marcus Holland-Moritz
*
* This file is part of dwarfs.
*
* dwarfs is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* dwarfs is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
*/

#pragma once

#include <unordered_map>
#include <vector>

#include "cyclic_hash.h"
#include "logger.h"

namespace dwarfs {

// Computes rolling hashes over a byte buffer, once per configured window
// size, using cyclic_hash driven by a caller-supplied byte_hash.
template <typename LoggerPolicy, typename HashType>
class inode_hasher {
 public:
  // Maps a window size to the hash of every window-sized span of the input,
  // in order of increasing start position.
  using result_type = std::unordered_map<size_t, std::vector<HashType>>;

  // `byte_hasher` is held by reference and must outlive this object; the
  // list of window sizes is copied.
  inode_hasher(logger& lgr, byte_hash<HashType>& byte_hasher,
               const std::vector<size_t>& blockhash_window_size)
      : byte_hasher_(byte_hasher)
      , window_(blockhash_window_size)
      , log_(lgr) {}

  // Hashes `size` bytes at `data` for every configured window size that fits
  // into the input and stores the result in m[window_size], replacing any
  // previous content of that entry. Entries for windows larger than `size`
  // are not touched.
  void operator()(result_type& m, const uint8_t* data, size_t size) const {
    auto tt = log_.timed_trace();

    for (size_t wsize : window_) {
      if (size >= wsize) {
        hashit(m[wsize], wsize, data, size);
      }
    }

    tt << "hashed " << size << " bytes";
  }

 private:
  // Fills `vec` with the hash of each `window`-byte span of the input.
  // Precondition (enforced by the caller): size >= window.
  void hashit(std::vector<HashType>& vec, size_t window, const uint8_t* data,
              size_t size) const {
    cyclic_hash<HashType> hasher(window, byte_hasher_);

    vec.clear();
    // One hash for the initial window plus one per remaining byte. Reserving
    // only size - window would make the final push_back reallocate.
    vec.reserve(size - window + 1);

    size_t i = 0;

    // Prime the hasher with the first full window.
    while (i < window) {
      hasher.update(data[i++]);
    }

    vec.push_back(hasher());

    // Slide the window one byte at a time: drop the byte leaving on the
    // left, add the byte entering on the right.
    while (i < size) {
      hasher.update(data[i - window], data[i]);
      vec.push_back(hasher());
      ++i;
    }
  }

  byte_hash<HashType>& byte_hasher_;
  const std::vector<size_t> window_;
  log_proxy<LoggerPolicy> log_;
};
} // namespace dwarfs
56 changes: 56 additions & 0 deletions include/dwarfs/inode_manager.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
/* vim:set ts=2 sw=2 sts=2 et: */
/**
* \author Marcus Holland-Moritz (github@mhxnet.de)
* \copyright Copyright (c) Marcus Holland-Moritz
*
* This file is part of dwarfs.
*
* dwarfs is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* dwarfs is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
*/

#pragma once

#include <functional>
#include <memory>
#include <ostream>

#include "file_interface.h"
#include "inode.h"

namespace dwarfs {

class script;

// Abstract factory and container for inode objects. Concrete managers are
// obtained through the static create() function; everything else is pure
// virtual and defined by the implementation.
class inode_manager {
public:
// Creates a manager for the given block size (expressed as a bit count).
static std::shared_ptr<inode_manager> create(unsigned block_size_bits);

virtual ~inode_manager() = default;
// Creates a new inode object.
virtual std::shared_ptr<inode> create() = 0;
virtual size_t count() const = 0;
virtual size_t block_size() const = 0;
virtual unsigned block_size_bits() const = 0;
virtual size_t chunk_size() const = 0;
// Three ordering strategies: a default order, an order delegated to a
// script object, and an order by similarity. The exact criteria are
// defined by the implementation.
virtual void order_inodes() = 0;
virtual void order_inodes(std::shared_ptr<script> scr) = 0;
virtual void order_inodes_by_similarity() = 0;
// Presumably assigns consecutive inode numbers starting at first_no --
// confirm in the implementation.
virtual void number_inodes(size_t first_no) = 0;
// Invokes fn for inodes held by the manager.
virtual void for_each_inode(
std::function<void(std::shared_ptr<inode> const&)> const& fn) const = 0;

private:
// Helper for create(); it takes the block size both as a template parameter
// and as a runtime argument. NOTE(review): looks like a compile-time
// dispatch over supported block sizes -- confirm in the .cpp.
template <unsigned BlockSizeBits>
static std::shared_ptr<inode_manager> create_(unsigned block_size_bits);
};
} // namespace dwarfs
83 changes: 83 additions & 0 deletions include/dwarfs/inode_reader.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
/* vim:set ts=2 sw=2 sts=2 et: */
/**
* \author Marcus Holland-Moritz (github@mhxnet.de)
* \copyright Copyright (c) Marcus Holland-Moritz
*
* This file is part of dwarfs.
*
* dwarfs is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* dwarfs is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
*/

#pragma once

#include <memory>

#include <sys/uio.h>

#include <folly/small_vector.h>

#include "block_cache.h"
#include "fstypes.h"
#include "logger.h"

namespace dwarfs {

// Result buffer for vectored reads: a list of iovec descriptors plus a
// parallel list of block_range objects. NOTE(review): presumably the ranges
// keep the memory referenced by the iovecs alive -- confirm in block_cache.
struct iovec_read_buf {
// Number of elements each small_vector stores inline before it needs a
// heap allocation.
// This covers more than 95% of reads
static constexpr size_t inline_storage = 16;

folly::small_vector<struct ::iovec, inline_storage> buf;
folly::small_vector<block_range, inline_storage> ranges;
};

// Reads file data described by a list of chunks. All work is forwarded to a
// polymorphic impl object.
//
// A default-constructed reader has no implementation (impl_ is null), and
// none of the forwarding methods below check for that, so it must be
// assigned from a fully constructed reader before use.
class inode_reader {
public:
inode_reader() = default;

// Defined out of line. The block cache is taken by rvalue reference;
// presumably it is moved into the implementation -- confirm.
inode_reader(logger& lgr, block_cache&& bc, unsigned block_size_bits);

inode_reader& operator=(inode_reader&&) = default;

// Forwards to impl::read() with the same arguments: `chunk` points to
// `chunk_count` chunks, and `buf`/`size`/`offset` describe the request.
// The meaning of the return value is defined by the implementation.
ssize_t read(char* buf, size_t size, off_t offset, const chunk_type* chunk,
size_t chunk_count) const {
return impl_->read(buf, size, offset, chunk, chunk_count);
}

// Like read(), but the result is delivered through an iovec_read_buf
// instead of a caller-provided byte buffer.
ssize_t readv(iovec_read_buf& buf, size_t size, off_t offset,
const chunk_type* chunk, size_t chunk_count) const {
return impl_->readv(buf, size, offset, chunk, chunk_count);
}

// Forwards to impl::dump(), which writes a description of the given chunks
// to `os`.
void dump(std::ostream& os, const std::string& indent,
const chunk_type* chunk, size_t chunk_count) const {
impl_->dump(os, indent, chunk, chunk_count);
}

// Interface that concrete readers implement; mirrors the public methods
// above one to one.
class impl {
public:
virtual ~impl() = default;

virtual ssize_t read(char* buf, size_t size, off_t offset,
const chunk_type* chunk, size_t chunk_count) const = 0;
virtual ssize_t
readv(iovec_read_buf& buf, size_t size, off_t offset,
const chunk_type* chunk, size_t chunk_count) const = 0;
virtual void dump(std::ostream& os, const std::string& indent,
const chunk_type* chunk, size_t chunk_count) const = 0;
};

private:
std::unique_ptr<impl> impl_;
};
} // namespace dwarfs
317 changes: 317 additions & 0 deletions include/dwarfs/logger.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,317 @@
/* vim:set ts=2 sw=2 sts=2 et: */
/**
* \author Marcus Holland-Moritz (github@mhxnet.de)
* \copyright Copyright (c) Marcus Holland-Moritz
*
* This file is part of dwarfs.
*
* dwarfs is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* dwarfs is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
*/

#pragma once

#include <atomic>
#include <chrono>
#include <iostream>
#include <memory>
#include <mutex>
#include <sstream>
#include <string>
#include <tuple>
#include <type_traits>

#include "util.h"

namespace dwarfs {

// Abstract log sink. Besides the write() hook it stores the name of a logger
// policy, which the logging_class_factory further down in this header
// compares against the name() of each known policy to pick a template
// instantiation at run time.
class logger {
public:
// Severity levels in order of increasing verbosity; the numeric order
// (ERROR = 0 ... TRACE = 4) is relied upon by MinimumLogLevelPolicy.
enum level_type : unsigned { ERROR, WARN, INFO, DEBUG, TRACE };

virtual ~logger() = default;

// Emits one complete message at the given level.
virtual void write(level_type level, const std::string& output) = 0;

const std::string& policy_name() const { return policy_name_; }

// Sets the policy name from a policy type's static name() function.
template <class Policy>
void set_policy() // TODO: construction time arg?
{
policy_name_ = Policy::name();
}

// Sets the policy name directly.
void set_policy_name(const std::string& name) // TODO: construction time arg?
{
policy_name_ = name;
}

// Converts a textual level to a level_type (defined out of line).
static level_type parse_level(const std::string& level);

private:
std::string policy_name_; // TODO: const?
};

// logger implementation bound to a std::ostream (std::cerr by default) with
// a level threshold (WARN by default). All member functions are defined out
// of line.
class stream_logger : public logger {
public:
stream_logger(std::ostream& os = std::cerr, level_type threshold = WARN);

void write(level_type level, const std::string& output) override;

void set_threshold(level_type threshold);

private:
std::ostream& os_; // held by reference; the stream must outlive the logger
// NOTE(review): presumably mx_ serialises writes to os_, and threshold_ is
// atomic so that set_threshold() can race with write() -- confirm in the
// implementation.
std::mutex mx_;
std::atomic<level_type> threshold_;
};

// Collects one log message through operator<< and hands the assembled text
// to the logger when the object is destroyed. Intended to be used as a
// temporary: `log.info() << "x = " << x;`.
class level_logger {
 public:
  level_logger(logger& lgr, logger::level_type level)
      : data_(std::make_unique<data>(lgr, level)) {}

  // Transfers the pending message. The moved-from object no longer owns any
  // state and emits nothing when it is destroyed.
  level_logger(level_logger&& ll) noexcept
      : data_(std::move(ll.data_)) {}

  // Writes the accumulated message. A moved-from object has a null data_
  // and must be skipped: dereferencing it would be a null-pointer access
  // and, even if it did not crash, would log the message twice.
  ~level_logger() {
    if (data_) {
      data_->lgr.write(data_->level, data_->oss.str());
    }
  }

  // Appends `val` to the pending message using its stream insertion
  // operator. Must not be called on a moved-from object.
  template <typename T>
  level_logger& operator<<(const T& val) {
    data_->oss << val;
    return *this;
  }

 private:
  // State is kept behind a pointer so that moving the logger is cheap and
  // leaves an unambiguous "empty" source.
  struct data {
    data(logger& lgr, logger::level_type level)
        : lgr(lgr)
        , level(level) {}

    logger& lgr;
    std::ostringstream oss;
    const logger::level_type level;
  };

  std::unique_ptr<data> data_;
};

// Like level_logger, but additionally records its construction time and
// appends the elapsed time, formatted by time_with_unit() and enclosed in
// square brackets, to the message when it is destroyed.
class timed_level_logger {
 public:
  timed_level_logger(logger& lgr, logger::level_type level)
      : data_(std::make_unique<data>(lgr, level)) {}

  // Transfers the pending message and its start time. The moved-from object
  // no longer owns any state and emits nothing when it is destroyed.
  timed_level_logger(timed_level_logger&& ll) noexcept
      : data_(std::move(ll.data_)) {}

  // Appends the elapsed time and writes the message. A moved-from object
  // has a null data_ and must be skipped to avoid a null-pointer access.
  ~timed_level_logger() {
    if (!data_) {
      return;
    }

    std::chrono::duration<double> sec =
        std::chrono::high_resolution_clock::now() - data_->start_time;
    data_->oss << " [" << time_with_unit(sec.count()) << "]";
    data_->lgr.write(data_->level, data_->oss.str());
  }

  // Appends `val` to the pending message using its stream insertion
  // operator. Must not be called on a moved-from object.
  template <typename T>
  timed_level_logger& operator<<(const T& val) {
    data_->oss << val;
    return *this;
  }

 private:
  // State is kept behind a pointer so that moving the logger is cheap and
  // leaves an unambiguous "empty" source.
  struct data {
    data(logger& lgr, logger::level_type level)
        : lgr(lgr)
        , level(level)
        , start_time(std::chrono::high_resolution_clock::now()) {}

    logger& lgr;
    std::ostringstream oss;
    const logger::level_type level;
    std::chrono::time_point<std::chrono::high_resolution_clock> start_time;
  };

  std::unique_ptr<data> data_;
};

// Drop-in replacement for level_logger / timed_level_logger that does
// nothing: it accepts the same constructor arguments and the same stream
// insertions but ignores them all. It is selected at compile time for levels
// that a logger policy disables.
class no_logger {
public:
no_logger(logger&, logger::level_type) {}

// Ignores the value; in particular it is never formatted.
template <typename T>
no_logger& operator<<(const T&) {
return *this;
}
};

namespace detail {

// Compile-time switch between a real message logger and the no-op stand-in:
// level_logger when LoggingEnabled is true, no_logger otherwise.
template <bool LoggingEnabled>
using logger_type = std::conditional_t<LoggingEnabled, level_logger, no_logger>;

// Same switch for the variant that reports elapsed time:
// timed_level_logger when LoggingEnabled is true, no_logger otherwise.
template <bool LoggingEnabled>
using timed_logger_type =
    std::conditional_t<LoggingEnabled, timed_level_logger, no_logger>;

} // namespace detail

// Logger policy that enables a level when its numeric value does not exceed
// MinLogLevel. With logger::level_type ordered ERROR(0) .. TRACE(4), this
// makes MinLogLevel the most verbose level that is still compiled in; every
// level above it resolves to no_logger.
template <unsigned MinLogLevel>
class MinimumLogLevelPolicy {
 public:
  // Logger type to use for messages at `Level`.
  template <unsigned Level>
  using logger = detail::logger_type<(MinLogLevel >= Level)>;

  // Timed logger type to use for messages at `Level`.
  template <unsigned Level>
  using timed_logger = detail::timed_logger_type<(MinLogLevel >= Level)>;
};

template <typename LogPolicy>
class log_proxy {
public:
log_proxy(logger& lgr)
: lgr_(lgr) {}

auto error() const {
return
typename LogPolicy::template logger<logger::ERROR>(lgr_, logger::ERROR);
}

auto warn() const {
return
typename LogPolicy::template logger<logger::WARN>(lgr_, logger::WARN);
}

auto info() const {
return
typename LogPolicy::template logger<logger::INFO>(lgr_, logger::INFO);
}

auto debug() const {
return
typename LogPolicy::template logger<logger::DEBUG>(lgr_, logger::DEBUG);
}

auto trace() const {
return
typename LogPolicy::template logger<logger::TRACE>(lgr_, logger::TRACE);
}

auto timed_error() const {
return typename LogPolicy::template timed_logger<logger::ERROR>(
lgr_, logger::ERROR);
}

auto timed_warn() const {
return typename LogPolicy::template timed_logger<logger::WARN>(
lgr_, logger::WARN);
}

auto timed_info() const {
return typename LogPolicy::template timed_logger<logger::INFO>(
lgr_, logger::INFO);
}

auto timed_debug() const {
return typename LogPolicy::template timed_logger<logger::DEBUG>(
lgr_, logger::DEBUG);
}

auto timed_trace() const {
return typename LogPolicy::template timed_logger<logger::TRACE>(
lgr_, logger::TRACE);
}

private:
logger& lgr_;
};

// Policy named "prod": levels up to and including INFO are enabled, DEBUG
// and TRACE resolve to no_logger.
class prod_logger_policy : public MinimumLogLevelPolicy<logger::INFO> {
public:
static std::string name() { return "prod"; }
};

// Policy named "debug": every level up to and including TRACE is enabled.
class debug_logger_policy : public MinimumLogLevelPolicy<logger::TRACE> {
public:
static std::string name() { return "debug"; }
};

// List of all policies that logging_class_factory can choose from by
// comparing logger::policy_name() with each policy's name().
using logger_policies = std::tuple<debug_logger_policy, prod_logger_policy>;

// Creation policy that produces std::unique_ptr<T>.
template <class T>
struct unique_ptr_policy {
  using return_type = std::unique_ptr<T>;

  // Constructs a `Derived` from the forwarded arguments and returns it as a
  // unique_ptr to T. `Derived` must be convertible to T.
  template <class Derived, class... CtorArgs>
  static return_type create(CtorArgs&&... ctor_args) {
    return_type object =
        std::make_unique<Derived>(std::forward<CtorArgs>(ctor_args)...);
    return object;
  }
};

// Creation policy that produces std::shared_ptr<T>.
template <class T>
struct shared_ptr_policy {
  using return_type = std::shared_ptr<T>;

  // Constructs a `Derived` from the forwarded arguments and returns it as a
  // shared_ptr to T. `Derived` must be convertible to T.
  template <class Derived, class... CtorArgs>
  static return_type create(CtorArgs&&... ctor_args) {
    return_type object =
        std::make_shared<Derived>(std::forward<CtorArgs>(ctor_args)...);
    return object;
  }
};

// Run-time dispatch from a logger's policy name to a compile-time policy
// type. The factory walks LoggerPolicyList from element N - 1 down to
// element 0; the first policy whose name() equals lgr.policy_name() is used
// to instantiate T<policy>, which is then created through CreatePolicy with
// `lgr` prepended to the forwarded arguments.
template <template <class> class T, class CreatePolicy, class LoggerPolicyList,
          size_t N>
struct logging_class_factory {
  template <class... Args>
  static typename CreatePolicy::return_type
  create(logger& lgr, Args&&... args) {
    using candidate_policy =
        typename std::tuple_element<N - 1, LoggerPolicyList>::type;

    if (candidate_policy::name() != lgr.policy_name()) {
      // Not this one; try the preceding list element.
      using next_factory =
          logging_class_factory<T, CreatePolicy, LoggerPolicyList, N - 1>;
      return next_factory::create(lgr, std::forward<Args>(args)...);
    }

    using obj_type = T<candidate_policy>;
    return CreatePolicy::template create<obj_type>(lgr,
                                                   std::forward<Args>(args)...);
  }
};

// End of the list: no policy matched the logger's policy name.
template <template <class> class T, class CreatePolicy, class LoggerPolicyList>
struct logging_class_factory<T, CreatePolicy, LoggerPolicyList, 0> {
  // Always throws std::runtime_error naming the unknown policy.
  template <class... Args>
  static typename CreatePolicy::return_type create(logger& lgr, Args&&...) {
    throw std::runtime_error("no such logger policy: " + lgr.policy_name());
  }
};

// Creates a T<policy> object, where `policy` is the member of
// LoggerPolicyList whose name matches lgr.policy_name(), and returns it as
// std::unique_ptr<Base>. `lgr` is passed to the constructor ahead of `args`.
// Throws std::runtime_error if no policy matches.
template <class Base, template <class> class T, class LoggerPolicyList,
          class... Args>
std::unique_ptr<Base> make_unique_logging_object(logger& lgr, Args&&... args) {
  using factory =
      logging_class_factory<T, unique_ptr_policy<Base>, LoggerPolicyList,
                            std::tuple_size<LoggerPolicyList>::value>;

  return factory::create(lgr, std::forward<Args>(args)...);
}

// Creates a T<policy> object, where `policy` is the member of
// LoggerPolicyList whose name matches lgr.policy_name(), and returns it as
// std::shared_ptr<Base>. `lgr` is passed to the constructor ahead of `args`.
// Throws std::runtime_error if no policy matches.
template <class Base, template <class> class T, class LoggerPolicyList,
          class... Args>
std::shared_ptr<Base> make_shared_logging_object(logger& lgr, Args&&... args) {
  using factory =
      logging_class_factory<T, shared_ptr_policy<Base>, LoggerPolicyList,
                            std::tuple_size<LoggerPolicyList>::value>;

  return factory::create(lgr, std::forward<Args>(args)...);
}
} // namespace dwarfs
44 changes: 44 additions & 0 deletions include/dwarfs/lua_script.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
/* vim:set ts=2 sw=2 sts=2 et: */
/**
* \author Marcus Holland-Moritz (github@mhxnet.de)
* \copyright Copyright (c) Marcus Holland-Moritz
*
* This file is part of dwarfs.
*
* dwarfs is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* dwarfs is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
*/

#pragma once

#include <memory>

#include "script.h"

namespace dwarfs {

class logger;

// `script` implementation whose behaviour lives in a private impl class.
// Going by the class name and the `file` argument it is presumably backed by
// a Lua script file -- confirm in the implementation.
class lua_script : public script {
public:
lua_script(logger& lgr, const std::string& file);
// Declared here and defined out of line: impl is an incomplete type in
// this header, so the destructor of std::unique_ptr<impl> can only be
// instantiated where impl is complete.
~lua_script();

// Overrides of the script interface; semantics are defined in script.h.
bool filter(file_interface const& fi) const override;
void order(file_vector& fvi) const override;

private:
class impl;
std::unique_ptr<impl> impl_;
};
} // namespace dwarfs
143 changes: 143 additions & 0 deletions include/dwarfs/metadata.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
/* vim:set ts=2 sw=2 sts=2 et: */
/**
* \author Marcus Holland-Moritz (github@mhxnet.de)
* \copyright Copyright (c) Marcus Holland-Moritz
*
* This file is part of dwarfs.
*
* dwarfs is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* dwarfs is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
*/

#pragma once

#include <cstdint>
#include <functional>
#include <memory>
#include <vector>

#include <sys/stat.h>
#include <sys/statvfs.h>
#include <sys/types.h>

#include "fstypes.h"
#include "logger.h"

namespace dwarfs {

// Read-side view of filesystem metadata. All operations are forwarded to a
// polymorphic impl object.
//
// A default-constructed object has no implementation (impl_ is null).
// empty() is the only member that tolerates this state; every other
// forwarding method dereferences impl_ unconditionally.
class metadata {
public:
metadata() = default;

// Defined out of line. Takes the raw metadata bytes by rvalue reference,
// a set of stat defaults and an inode offset.
metadata(logger& lgr, std::vector<uint8_t>&& data,
const struct ::stat* defaults, int inode_offset = 0);

metadata& operator=(metadata&&) = default;

// Fills `defaults` (defined out of line); presumably with values suitable
// for the `defaults` constructor argument -- confirm.
static void get_stat_defaults(struct ::stat* defaults);

size_t size() const { return impl_->size(); }

// True if there is no implementation or the implementation reports empty.
bool empty() const { return !impl_ || impl_->empty(); }

size_t block_size() const { return impl_->block_size(); }

unsigned block_size_bits() const { return impl_->block_size_bits(); }

// Writes a description of the metadata to `os`; `icb` is a callback that
// receives a string and a 32-bit value. NOTE(review): presumably an
// indent prefix and an inode number -- confirm in the implementation.
void
dump(std::ostream& os,
std::function<void(const std::string&, uint32_t)> const& icb) const {
impl_->dump(os, icb);
}

// Invokes `func` with directory entries as the implementation walks them.
void walk(std::function<void(const dir_entry*)> const& func) const {
impl_->walk(func);
}

// Lookup by path, by inode number, or by name within a given inode.
const dir_entry* find(const char* path) const { return impl_->find(path); }

const dir_entry* find(int inode) const { return impl_->find(inode); }

const dir_entry* find(int inode, const char* name) const {
return impl_->find(inode, name);
}

// The following methods mirror the filesystem operations of the same
// names; the int results are passed through from the implementation
// unchanged.
int getattr(const dir_entry* de, struct ::stat* stbuf) const {
return impl_->getattr(de, stbuf);
}

int access(const dir_entry* de, int mode, uid_t uid, gid_t gid) const {
return impl_->access(de, mode, uid, gid);
}

const directory* opendir(const dir_entry* de) const {
return impl_->opendir(de);
}

const dir_entry*
readdir(const directory* d, size_t offset, std::string* name) const {
return impl_->readdir(d, offset, name);
}

size_t dirsize(const directory* d) const { return impl_->dirsize(d); }

// Two flavours of readlink: into a caller-provided buffer of `size`
// bytes, or into a std::string.
int readlink(const dir_entry* de, char* buf, size_t size) const {
return impl_->readlink(de, buf, size);
}

int readlink(const dir_entry* de, std::string* buf) const {
return impl_->readlink(de, buf);
}

int statvfs(struct ::statvfs* stbuf) const { return impl_->statvfs(stbuf); }

int open(const dir_entry* de) const { return impl_->open(de); }

// Returns a pointer to the chunk list of `inode`; the number of chunks is
// reported through `num`.
const chunk_type* get_chunks(int inode, size_t& num) const {
return impl_->get_chunks(inode, num);
}

// Interface that concrete metadata implementations provide; mirrors the
// public methods above one to one.
class impl {
public:
virtual ~impl() = default;

virtual size_t size() const = 0;
virtual bool empty() const = 0;
virtual size_t block_size() const = 0;
virtual unsigned block_size_bits() const = 0;
virtual void dump(
std::ostream& os,
std::function<void(const std::string&, uint32_t)> const& icb) const = 0;
virtual void
walk(std::function<void(const dir_entry*)> const& func) const = 0;
virtual const dir_entry* find(const char* path) const = 0;
virtual const dir_entry* find(int inode) const = 0;
virtual const dir_entry* find(int inode, const char* name) const = 0;
virtual int getattr(const dir_entry* de, struct ::stat* stbuf) const = 0;
virtual int
access(const dir_entry* de, int mode, uid_t uid, gid_t gid) const = 0;
virtual const directory* opendir(const dir_entry* de) const = 0;
virtual const dir_entry*
readdir(const directory* d, size_t offset, std::string* name) const = 0;
virtual size_t dirsize(const directory* d) const = 0;
virtual int readlink(const dir_entry* de, char* buf, size_t size) const = 0;
virtual int readlink(const dir_entry* de, std::string* buf) const = 0;
virtual int statvfs(struct ::statvfs* stbuf) const = 0;
virtual int open(const dir_entry* de) const = 0;
virtual const chunk_type* get_chunks(int inode, size_t& num) const = 0;
};

private:
std::unique_ptr<impl> impl_;
};
} // namespace dwarfs
97 changes: 97 additions & 0 deletions include/dwarfs/metadata_writer.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
/* vim:set ts=2 sw=2 sts=2 et: */
/**
* \author Marcus Holland-Moritz (github@mhxnet.de)
* \copyright Copyright (c) Marcus Holland-Moritz
*
* This file is part of dwarfs.
*
* dwarfs is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* dwarfs is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
*/

#pragma once

#include <vector>

#include <folly/Range.h>

#include "fstypes.h"
#include "logger.h"

namespace dwarfs {

// Serialises metadata sections into a caller-owned byte vector. The writer
// keeps a reference to that vector, so the vector must outlive the writer.
// It also tracks the offset of the current section's header so that the
// section_* accessors can address the payload that follows it.
class metadata_writer {
public:
using const_iterator = std::vector<uint8_t>::const_iterator;

// The following members are defined out of line.
metadata_writer(logger& lgr, std::vector<uint8_t>& mem,
size_t section_align = 8);
void align(size_t align);
void finish_section();
void start_section(section_type type);
uint8_t* buffer(size_t size);
// Raw write of `size` bytes starting at `data`; all overloads below funnel
// into this one.
void write(const void* data, size_t size);

// Iterator to the start of the whole buffer.
const_iterator begin() const { return mem_.begin(); }

// Iterator / pointer to the first payload byte of the current section.
const_iterator section_begin() const {
return mem_.begin() + section_data_offset();
}

const uint8_t* section_data() const {
return mem_.data() + section_data_offset();
}

// Number of bytes between the current section's payload start and the end
// of the buffer.
size_t section_data_size() const {
return mem_.size() - section_data_offset();
}

// Payload start = header offset plus the size of a section_header.
size_t section_data_offset() const {
return section_header_offset_ + sizeof(section_header);
}

const_iterator end() const { return mem_.end(); }

// Current write position, i.e. the size of the buffer.
size_t offset() const { return mem_.size(); }

// Writes the object representation of `obj` (sizeof(T) raw bytes). Only
// meaningful for types whose bytes can be copied verbatim; note that a
// pointer argument would have the pointer itself written, not the data it
// points to.
template <typename T>
void write(const T& obj) {
write(&obj, sizeof(T));
}

// Writes all elements of `vec` as one contiguous run of raw bytes; an
// empty vector writes nothing.
template <typename T>
void write(const std::vector<T>& vec) {
if (!vec.empty()) {
write(vec.data(), sizeof(T) * vec.size());
}
}

// Writes the characters of `str` without a terminator or length prefix; an
// empty string writes nothing.
void write(const std::string& str) {
if (!str.empty()) {
write(str.data(), str.size());
}
}

// Same as the std::string overload, for folly::StringPiece.
void write(folly::StringPiece str) {
if (!str.empty()) {
write(str.data(), str.size());
}
}

private:
std::vector<uint8_t>& mem_;
size_t section_header_offset_;
const size_t section_align_;
// The log proxy is fixed to debug_logger_policy regardless of the policy
// configured on the logger passed to the constructor.
log_proxy<debug_logger_policy> log_;
};
} // namespace dwarfs
40 changes: 40 additions & 0 deletions include/dwarfs/mmap.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
/* vim:set ts=2 sw=2 sts=2 et: */
/**
* \author Marcus Holland-Moritz (github@mhxnet.de)
* \copyright Copyright (c) Marcus Holland-Moritz
*
* This file is part of dwarfs.
*
* dwarfs is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* dwarfs is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
*/

#pragma once

#include <string>

#include "mmif.h"

namespace dwarfs {

/**
 * mmif implementation declared here; both constructors and the destructor
 * are defined out of line.
 *
 * Presumably maps the file at `path` into memory and publishes the mapping
 * through mmif::assign() -- confirm in the implementation file.
 */
class mmap : public mmif {
public:
// NOTE(review): not marked `explicit`, so a std::string converts implicitly.
mmap(const std::string& path);
// `size` is presumably the number of bytes to map -- confirm.
mmap(const std::string& path, size_t size);

virtual ~mmap() noexcept;

private:
// Presumably the descriptor of the opened file -- confirm in implementation.
int fd_;
};
} // namespace dwarfs
54 changes: 54 additions & 0 deletions include/dwarfs/mmif.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
/* vim:set ts=2 sw=2 sts=2 et: */
/**
* \author Marcus Holland-Moritz (github@mhxnet.de)
* \copyright Copyright (c) Marcus Holland-Moritz
*
* This file is part of dwarfs.
*
* dwarfs is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* dwarfs is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
*/

#pragma once

#include <string>

#include <boost/noncopyable.hpp>

namespace dwarfs {

/**
 * Non-copyable base class giving read-only access to a contiguous memory
 * region described by an address and a size.
 *
 * Derived classes publish the region through assign(). Until assign() has
 * been called the region is empty: get() returns nullptr and size() is 0.
 */
class mmif : public boost::noncopyable {
 public:
  virtual ~mmif() = default;

  /** Start address of the region, or nullptr if none has been assigned. */
  const void* get() const { return addr_; }

  /**
   * Reinterprets the bytes at `offset` from the start of the region as T.
   *
   * \param offset  byte offset from the start of the region (default 0)
   * \return pointer to const T at that offset; no bounds or alignment
   *         checks are performed, so the caller must keep `offset` within
   *         size() and must not use a non-zero offset on an empty region.
   */
  template <typename T>
  const T* as(size_t offset = 0) const {
    const char* base = static_cast<const char*>(addr_);
    return reinterpret_cast<const T*>(base + offset);
  }

  /** Size of the region in bytes; 0 if none has been assigned. */
  size_t size() const { return size_; }

 protected:
  /** Sets the region exposed by get(), as() and size(). */
  void assign(const void* addr, size_t size) {
    addr_ = addr;
    size_ = size;
  }

 private:
  // Default-initialised so the accessors never read indeterminate values
  // when a derived class has not (yet) called assign().
  const void* addr_{nullptr};
  size_t size_{0};
};
} // namespace dwarfs
37 changes: 37 additions & 0 deletions include/dwarfs/options.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
/* vim:set ts=2 sw=2 sts=2 et: */
/**
* \author Marcus Holland-Moritz (github@mhxnet.de)
* \copyright Copyright (c) Marcus Holland-Moritz
*
* This file is part of dwarfs.
*
* dwarfs is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* dwarfs is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
*/

#pragma once

namespace dwarfs {

/**
 * Tunables for the block cache. Every field has a default, so a
 * default-constructed object is fully initialised.
 */
struct block_cache_options {
  // Presumably the cache capacity in bytes -- confirm against the cache code.
  size_t max_bytes = 0;
  // Presumably the number of worker threads -- confirm.
  size_t num_workers = 0;
  // Ratio in the decompression logic; exact meaning is defined by the cache.
  double decompress_ratio = 1.0;
};

/** Selects how the scanner orders files. */
enum class file_order_mode { NONE, PATH, SCRIPT, SIMILARITY };

/**
 * Options passed to the scanner.
 *
 * file_order defaults to file_order_mode::NONE so that a default-constructed
 * object never carries an indeterminate enum value (matching
 * block_cache_options, whose fields are all default-initialised).
 */
struct scanner_options {
  file_order_mode file_order{file_order_mode::NONE};
};
} // namespace dwarfs
52 changes: 52 additions & 0 deletions include/dwarfs/os_access.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
/* vim:set ts=2 sw=2 sts=2 et: */
/**
* \author Marcus Holland-Moritz (github@mhxnet.de)
* \copyright Copyright (c) Marcus Holland-Moritz
*
* This file is part of dwarfs.
*
* dwarfs is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* dwarfs is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
*/

#pragma once

#include <memory>
#include <string>

#include <sys/stat.h>

#include "mmif.h"

namespace dwarfs {

/**
 * Abstract interface for reading directory entries; instances are handed
 * out by os_access::opendir().
 */
class dir_reader {
public:
virtual ~dir_reader() = default;

// Presumably stores the next entry's name in `name` and returns false once
// the directory is exhausted -- confirm against the implementations.
virtual bool read(std::string& name) const = 0;
};

/**
 * Abstract interface over the operating-system calls declared below. All
 * methods are const and pure virtual; semantics are defined by the
 * implementing class.
 *
 * NOTE(review): the method names mirror the POSIX calls opendir, lstat,
 * readlink and access; presumably they behave alike -- confirm.
 */
class os_access {
public:
virtual ~os_access() = default;

// Returns a reader for the directory at `path`.
virtual std::shared_ptr<dir_reader>
opendir(const std::string& path) const = 0;
// Fills `*st` for `path`; errors are not reported via the return value.
virtual void lstat(const std::string& path, struct ::stat* st) const = 0;
// `size` is presumably the expected length of the link target -- confirm.
virtual std::string readlink(const std::string& path, size_t size) const = 0;
// Returns an mmif for `path`; `size` is presumably the mapping length.
virtual std::shared_ptr<mmif>
map_file(const std::string& path, size_t size) const = 0;
// `mode` is presumably an access(2)-style mode mask -- confirm.
virtual int access(const std::string& path, int mode) const = 0;
};
} // namespace dwarfs
40 changes: 40 additions & 0 deletions include/dwarfs/os_access_posix.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
/* vim:set ts=2 sw=2 sts=2 et: */
/**
* \author Marcus Holland-Moritz (github@mhxnet.de)
* \copyright Copyright (c) Marcus Holland-Moritz
*
* This file is part of dwarfs.
*
* dwarfs is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* dwarfs is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
*/

#pragma once

#include <memory>
#include <string>

#include "os_access.h"

namespace dwarfs {

/**
 * os_access implementation that overrides every method of the interface.
 * The bodies are defined out of line; presumably each forwards to the POSIX
 * call of the same name -- confirm in the implementation file.
 */
class os_access_posix : public os_access {
public:
std::shared_ptr<dir_reader> opendir(const std::string& path) const override;
void lstat(const std::string& path, struct ::stat* st) const override;
std::string readlink(const std::string& path, size_t size) const override;
std::shared_ptr<mmif>
map_file(const std::string& path, size_t size) const override;
int access(const std::string& path, int mode) const override;
};
} // namespace dwarfs
64 changes: 64 additions & 0 deletions include/dwarfs/progress.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
/* vim:set ts=2 sw=2 sts=2 et: */
/**
* \author Marcus Holland-Moritz (github@mhxnet.de)
* \copyright Copyright (c) Marcus Holland-Moritz
*
* This file is part of dwarfs.
*
* dwarfs is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* dwarfs is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
*/

#pragma once

#include <atomic>
#include <condition_variable>
#include <cstdint>
#include <mutex>
#include <ostream>
#include <thread>

#include <folly/Function.h>

namespace dwarfs {

/**
 * Collection of public atomic counters plus private thread/synchronisation
 * state. The constructor and destructor are defined out of line.
 *
 * Every counter is a std::atomic initialised to zero, so individual
 * counters can be read and updated from several threads without a lock.
 */
class progress {
public:
// `func` receives this object and a bool whose meaning is defined by the
// implementation. Presumably it is invoked from thread_ to report progress
// -- confirm in the implementation file.
// NOTE(review): not marked `explicit`.
progress(folly::Function<void(const progress&, bool)>&& func);
~progress() noexcept;

// Object counters.
std::atomic<size_t> files_found{0};
std::atomic<size_t> files_scanned{0};
std::atomic<size_t> dirs_found{0};
std::atomic<size_t> dirs_scanned{0};
std::atomic<size_t> links_found{0};
std::atomic<size_t> links_scanned{0};
std::atomic<size_t> duplicate_files{0};
std::atomic<size_t> block_count{0};
std::atomic<size_t> chunk_count{0};
std::atomic<size_t> inodes_written{0};
std::atomic<size_t> blocks_written{0};
std::atomic<size_t> errors{0};
// 64-bit size counters (presumably in bytes -- confirm with the writers).
std::atomic<uint64_t> original_size{0};
std::atomic<uint64_t> saved_by_deduplication{0};
std::atomic<uint64_t> saved_by_segmentation{0};
std::atomic<uint64_t> filesystem_size{0};
std::atomic<uint64_t> compressed_size{0};

private:
// Members are initialised in declaration order, so thread_ is constructed
// after running_, mx_ and cond_ and destroyed before them.
std::atomic<bool> running_;
std::mutex mx_;
std::condition_variable cond_;
std::thread thread_;
};
} // namespace dwarfs
60 changes: 60 additions & 0 deletions include/dwarfs/scanner.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
/* vim:set ts=2 sw=2 sts=2 et: */
/**
* \author Marcus Holland-Moritz (github@mhxnet.de)
* \copyright Copyright (c) Marcus Holland-Moritz
*
* This file is part of dwarfs.
*
* dwarfs is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* dwarfs is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
*/

#pragma once

#include <memory>
#include <string>

#include "block_manager.h"
#include "worker_group.h"

namespace dwarfs {

class entry_factory;
class logger;
class os_access;
class progress;
struct scanner_options;
class script;

/**
 * Front end for scanning a path into a filesystem_writer.
 *
 * The object owns a polymorphic implementation (scanner::impl) and forwards
 * scan() to it. The constructor is defined out of line.
 */
class scanner {
 public:
  /** Interface that the concrete scanner implementation provides. */
  class impl {
   public:
    virtual ~impl() = default;

    virtual void
    scan(filesystem_writer& fsw, const std::string& path, progress& prog) = 0;
  };

  scanner(logger& lgr, worker_group& wg, const block_manager::config& cfg,
          std::shared_ptr<entry_factory> ef, std::shared_ptr<os_access> os,
          std::shared_ptr<script> scr, const scanner_options& options);

  /**
   * Forwards to the implementation's scan().
   *
   * \param fsw   writer passed through to the implementation
   * \param path  path passed through to the implementation
   * \param prog  progress object passed through to the implementation
   */
  void scan(filesystem_writer& fsw, const std::string& path, progress& prog) {
    return impl_->scan(fsw, path, prog);
  }

 private:
  std::unique_ptr<impl> impl_;
};
} // namespace dwarfs
74 changes: 74 additions & 0 deletions include/dwarfs/script.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
/* vim:set ts=2 sw=2 sts=2 et: */
/**
* \author Marcus Holland-Moritz (github@mhxnet.de)
* \copyright Copyright (c) Marcus Holland-Moritz
*
* This file is part of dwarfs.
*
* dwarfs is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* dwarfs is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
*/

#pragma once

#include <algorithm>
#include <memory>
#include <vector>

#include "file_interface.h"
#include "file_vector.h"

namespace dwarfs {

namespace detail {

/**
 * Adapter that exposes a std::vector<std::shared_ptr<T>> through the
 * file_vector interface.
 *
 * Only a reference to the vector is stored, so sort() reorders the caller's
 * vector in place and the vector must outlive the adapter.
 */
template <class T>
class file_vector_ : public file_vector {
 public:
  file_vector_(std::vector<std::shared_ptr<T>>& vec)
      : vec_(vec) {}

  /** Returns the raw pointer held by element `i` (no bounds check). */
  const file_interface* operator[](size_t i) const override {
    const std::shared_ptr<T>& item = vec_[i];
    return item.get();
  }

  /** Number of elements in the wrapped vector. */
  size_t size() const override { return vec_.size(); }

  /** Sorts the wrapped vector using `less` applied to the raw pointers. */
  void
  sort(std::function<bool(const file_interface*, const file_interface*)> const&
           less) override {
    auto by_less = [&less](const std::shared_ptr<T>& lhs,
                           const std::shared_ptr<T>& rhs) {
      return less(lhs.get(), rhs.get());
    };
    std::sort(vec_.begin(), vec_.end(), by_less);
  }

 private:
  std::vector<std::shared_ptr<T>>& vec_;
};
} // namespace detail

class script {
public:
virtual ~script() = default;

virtual bool filter(file_interface const& fi) const = 0;
virtual void order(file_vector& fvi) const = 0;

template <typename T>
void order(std::vector<std::shared_ptr<T>>& vec) const {
detail::file_vector_<T> fv(vec);
order(fv);
}
};
} // namespace dwarfs
29 changes: 29 additions & 0 deletions include/dwarfs/similarity.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
/* vim:set ts=2 sw=2 sts=2 et: */
/**
* \author Marcus Holland-Moritz (github@mhxnet.de)
* \copyright Copyright (c) Marcus Holland-Moritz
*
* This file is part of dwarfs.
*
* dwarfs is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* dwarfs is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
*/

#pragma once

#include <cstdint>

namespace dwarfs {

// Returns a 32-bit hash for the `size` bytes starting at `data`; defined out
// of line. Presumably similar inputs yield similar hash values -- confirm
// in the implementation.
uint32_t get_similarity_hash(const uint8_t* data, size_t size);
}
34 changes: 34 additions & 0 deletions include/dwarfs/util.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
/* vim:set ts=2 sw=2 sts=2 et: */
/**
* \author Marcus Holland-Moritz (github@mhxnet.de)
* \copyright Copyright (c) Marcus Holland-Moritz
*
* This file is part of dwarfs.
*
* dwarfs is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* dwarfs is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with dwarfs. If not, see <https://www.gnu.org/licenses/>.
*/

#pragma once

#include <array>
#include <climits>
#include <string>

namespace dwarfs {

// Utility declarations; all are defined out of line, so the notes below are
// inferred from the signatures and should be confirmed in the implementation.

// Presumably formats a duration given in seconds with a suitable time unit.
std::string time_with_unit(double sec);
// Presumably formats a byte count with a suitable size unit.
std::string size_with_unit(size_t size);
// Presumably the inverse of size_with_unit: parses a size string with a unit.
size_t parse_size_with_unit(const std::string& str);
// Presumably returns the path of the running executable.
std::string get_program_path();
} // namespace dwarfs
Loading