v1 #280

Open — wants to merge 62 commits into base: master

Changes from 19 commits (62 commits in total)
73e1000
Add tl::optional dependency
elshize Oct 22, 2019
58dd9c8
Minimal partial example
elshize Oct 22, 2019
f58a72a
ZipCursor
elshize Oct 23, 2019
f59af77
Additional methods
elshize Oct 25, 2019
e3acf75
Merge remote-tracking branch 'origin/master' into v1
elshize Oct 25, 2019
41e6549
Intersections + union + bigram index
elshize Oct 26, 2019
2223a6e
Type-erase source and reader.
elshize Oct 27, 2019
49712b5
Add tl::expected
elshize Oct 27, 2019
6833d25
Add tl::expected
elshize Oct 27, 2019
b189252
Posting header and builder
elshize Oct 28, 2019
36ec48c
Index runner
elshize Oct 28, 2019
0b03bcc
Update cursor API
elshize Oct 28, 2019
8f9faa8
On-the-fly BM25 scoring
elshize Oct 29, 2019
3c5ad7d
Precomputed scores
elshize Oct 30, 2019
f76fdd7
Index building tool
elshize Oct 31, 2019
d0994ca
Query and postings tools
elshize Nov 4, 2019
27a62ba
Blocked cursor + SIMDBP
elshize Nov 12, 2019
242223b
Precomputed scores
elshize Nov 12, 2019
4a40e13
Quantized scores
elshize Nov 12, 2019
c4e3e35
Add yaml-cpp dependency
elshize Nov 13, 2019
66b3dd4
Creating bigram index from query terms
elshize Nov 13, 2019
9c2442f
Union-lookup query (without precomputed scores and tool)
elshize Nov 15, 2019
024d017
Max scores + maxscore + union-lookup
elshize Nov 20, 2019
008efb7
Add rapidcheck
elshize Nov 23, 2019
945b644
Union-lookup with bigrams
elshize Nov 28, 2019
319895c
Add json library
elshize Nov 28, 2019
ba7f62c
Refactoring
elshize Nov 29, 2019
db28627
Add scripts
elshize Nov 29, 2019
a49dc0d
Two-phase union-lookup
elshize Nov 29, 2019
37edd68
Precomputed scores for bigram index
elshize Dec 2, 2019
c5cedf2
Union-lookup updates
elshize Dec 5, 2019
f44507a
Union-lookup cleanup
elshize Dec 6, 2019
547389c
Update porter2
elshize Dec 6, 2019
9c61991
JSON list queries and improved CLI
elshize Dec 17, 2019
92dec02
Fixes to filtering queries
elshize Dec 18, 2019
baeb423
Update script
elshize Dec 18, 2019
ac633ea
Merge branch 'master' into v1
elshize Dec 18, 2019
ac51df9
Test fixes after merge
elshize Dec 18, 2019
6a457f6
Intersections with JSON
elshize Dec 19, 2019
d428402
Small fixes
elshize Dec 20, 2019
abd480e
Translation units + WAND
elshize Dec 28, 2019
e2b8738
PEF index
elshize Dec 31, 2019
14b7790
Minor fixes
elshize Jan 6, 2020
131b305
Selecting best bigrams
elshize Jan 6, 2020
36e8da9
Add cereal library submodule
elshize Jan 6, 2020
826a772
Fixes to selecting pairs for indexing
elshize Jan 7, 2020
2dc73a2
Support posting stats
elshize Jan 8, 2020
087d51b
Selecting term-pairs and refactoring
elshize Jan 10, 2020
83d4120
Update gitignore
elshize Jan 10, 2020
5007856
Multi-threaded pair index building
elshize Jan 15, 2020
15fe455
Merge branch 'master' into v1
elshize Jan 15, 2020
7f5eb8b
Fix queries test after merge
elshize Jan 15, 2020
72e7ef4
Improved UL
elshize Jan 21, 2020
fde4d98
Scripts and tweaks
elshize Jan 25, 2020
f147046
Script update
elshize Jan 31, 2020
848d6aa
Merge remote-tracking branch 'origin/master' into v1
elshize Feb 3, 2020
43acf12
Expand LookupUnion stats
elshize Feb 3, 2020
1df5869
Merge remote-tracking branch 'origin/master' into v1
elshize Feb 4, 2020
dd28799
Merge remote-tracking branch 'origin/master' into v1
elshize Feb 4, 2020
5e853de
Refactor and test query inspection
elshize Feb 17, 2020
8480dc9
cmake
elshize Feb 21, 2020
c931089
Add counting individual term postings
elshize Feb 21, 2020
6 changes: 6 additions & 0 deletions .gitmodules
@@ -67,3 +67,9 @@
[submodule "external/wapopp"]
path = external/wapopp
url = https://github.com/pisa-engine/wapopp.git
[submodule "external/optional"]
path = external/optional
url = https://github.com/TartanLlama/optional.git
[submodule "external/expected"]
path = external/expected
url = https://github.com/TartanLlama/expected.git
15 changes: 10 additions & 5 deletions CMakeLists.txt
@@ -69,12 +69,15 @@ endif()
set(THREADS_PREFER_PTHREAD_FLAG ON)
find_package(Threads REQUIRED)

file(GLOB_RECURSE PISA_SRC_FILES FOLLOW_SYMLINKS "src/v1/*cpp")
list(SORT PISA_SRC_FILES)

include_directories(include)
add_library(pisa INTERFACE)
target_include_directories(pisa INTERFACE
add_library(pisa ${PISA_SRC_FILES})
target_include_directories(pisa PUBLIC
$<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/include/pisa>
)
target_link_libraries(pisa INTERFACE
target_link_libraries(pisa PUBLIC
Threads::Threads
Boost::boost
QMX
@@ -95,10 +98,12 @@ target_link_libraries(pisa INTERFACE
spdlog
fmt::fmt
range-v3
optional
)
target_include_directories(pisa INTERFACE external)
target_include_directories(pisa PUBLIC external)

add_subdirectory(src)
add_subdirectory(v1)
#add_subdirectory(src)

if (PISA_ENABLE_TESTING AND BUILD_TESTING)
enable_testing()
10 changes: 10 additions & 0 deletions external/CMakeLists.txt
@@ -116,3 +116,13 @@ add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/spdlog)

# Add range-v3
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/range-v3)

# Add tl::optional
set(OPTIONAL_ENABLE_TESTS OFF CACHE BOOL "skip tl::optional testing")
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/optional EXCLUDE_FROM_ALL)

# Add tl::expected
#set(EXPECTED_BUILD_TESTS OFF CACHE BOOL "skip tl::expected testing")
#set(EXPECTED_BUILD_PACKAGE OFF CACHE BOOL "skip tl::expected package")
#set(EXPECTED_BUILD_PACKAGE_DEB OFF CACHE BOOL "skip tl::expected package deb")
#add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/expected EXCLUDE_FROM_ALL)
1 change: 1 addition & 0 deletions external/expected
Submodule expected added at 3d7417
1 change: 1 addition & 0 deletions external/optional
Submodule optional added at 5c4876
22 changes: 11 additions & 11 deletions include/pisa/codec/simdbp.hpp
@@ -1,8 +1,8 @@
#pragma once

#include <vector>
#include "util/util.hpp"
#include "codec/block_codecs.hpp"
#include "util/util.hpp"
#include <vector>

extern "C" {
#include "simdcomp/include/simdbitpacking.h"
@@ -14,7 +14,8 @@ struct simdbp_block {
static void encode(uint32_t const *in,
uint32_t sum_of_values,
size_t n,
std::vector<uint8_t> &out) {
std::vector<uint8_t> &out)
{

assert(n <= block_size);
uint32_t *src = const_cast<uint32_t *>(in);
@@ -23,23 +24,22 @@ struct simdbp_block {
return;
}
uint32_t b = maxbits(in);
thread_local std::vector<uint8_t> buf(8*n);
uint8_t * buf_ptr = buf.data();
thread_local std::vector<uint8_t> buf(8 * n);
uint8_t *buf_ptr = buf.data();
*buf_ptr++ = b;
simdpackwithoutmask(src, (__m128i *)buf_ptr, b);
out.insert(out.end(), buf.data(), buf.data() + b * sizeof(__m128i) + 1);
}
static uint8_t const *decode(uint8_t const *in,
uint32_t *out,
uint32_t sum_of_values,
size_t n) {
static uint8_t const *decode(uint8_t const *in, uint32_t *out, uint32_t sum_of_values, size_t n)
{
assert(n <= block_size);
if (PISA_UNLIKELY(n < block_size)) {
return interpolative_block::decode(in, out, sum_of_values, n);
}
uint32_t b = *in++;
simdunpack((const __m128i *)in, out, b);
return in + b * sizeof(__m128i);
return in + b * sizeof(__m128i);
}
};
} // namespace pisa

} // namespace pisa
14 changes: 14 additions & 0 deletions include/pisa/io.hpp
@@ -61,4 +61,18 @@ void for_each_line(std::istream &is, Function fn)
return data;
}

[[nodiscard]] inline auto load_bytes(std::string const &data_file)
{
std::vector<std::byte> data;
std::basic_ifstream<std::byte> in(data_file.c_str(), std::ios::binary);
in.seekg(0, std::ios::end);
std::streamsize size = in.tellg();
in.seekg(0, std::ios::beg);
data.resize(size);
if (not in.read(data.data(), size)) {
throw std::runtime_error("Failed reading " + data_file);
}
return data;
}

} // namespace pisa::io
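
One caveat with the hunk above: `std::basic_ifstream<std::byte>` requires a `std::char_traits<std::byte>` specialization, which the standard library does not provide (it is specialized only for the character types), so this may fail to compile with common standard libraries. A portable sketch — the name `load_bytes_portable` is hypothetical, not part of the PR — reads through a plain `char` stream instead:

```cpp
#include <cassert>
#include <cstddef>
#include <fstream>
#include <stdexcept>
#include <string>
#include <vector>

// Portable variant of load_bytes: read through a char stream and expose
// the result as std::vector<std::byte>.
[[nodiscard]] inline auto load_bytes_portable(std::string const &data_file)
    -> std::vector<std::byte>
{
    std::ifstream in(data_file, std::ios::binary);
    if (not in) {
        throw std::runtime_error("Failed opening " + data_file);
    }
    in.seekg(0, std::ios::end);
    auto size = static_cast<std::size_t>(in.tellg());
    in.seekg(0, std::ios::beg);
    std::vector<std::byte> data(size);
    // reinterpret_cast between char* and std::byte* is well-defined here,
    // since std::byte may alias any object representation.
    if (not in.read(reinterpret_cast<char *>(data.data()),
                    static_cast<std::streamsize>(size))) {
        throw std::runtime_error("Failed reading " + data_file);
    }
    return data;
}
```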
19 changes: 5 additions & 14 deletions include/pisa/query/queries.hpp
@@ -12,7 +12,7 @@
#include <spdlog/spdlog.h>

#include "index_types.hpp"
#include "query/queries.hpp"
#include "query/query.hpp"
#include "scorer/score_function.hpp"
#include "term_processor.hpp"
#include "tokenizer.hpp"
@@ -24,15 +24,6 @@

namespace pisa {

using term_id_type = uint32_t;
using term_id_vec = std::vector<term_id_type>;

struct Query {
std::optional<std::string> id;
std::vector<term_id_type> terms;
std::vector<float> term_weights;
};

[[nodiscard]] auto split_query_at_colon(std::string const &query_string)
-> std::pair<std::optional<std::string>, std::string_view>
{
@@ -98,10 +89,10 @@ struct Query
{
if (terms_file) {
auto term_processor = TermProcessor(terms_file, stopwords_filename, stemmer_type);
return [&queries, term_processor = std::move(term_processor)](
std::string const &query_line) {
queries.push_back(parse_query_terms(query_line, term_processor));
};
return
[&queries, term_processor = std::move(term_processor)](std::string const &query_line) {
queries.push_back(parse_query_terms(query_line, term_processor));
};
} else {
return [&queries](std::string const &query_line) {
queries.push_back(parse_query_ids(query_line));
19 changes: 19 additions & 0 deletions include/pisa/query/query.hpp
@@ -0,0 +1,19 @@
#pragma once

#include <cstdint>
#include <optional>
#include <string>
#include <vector>

namespace pisa {

using term_id_type = std::uint32_t;
using term_id_vec = std::vector<term_id_type>;

struct Query {
std::optional<std::string> id;
std::vector<term_id_type> terms;
std::vector<float> term_weights;
};

} // namespace pisa
79 changes: 79 additions & 0 deletions include/pisa/v1/README.md
@@ -0,0 +1,79 @@
> This document is a **work in progress**.

# Introduction

As we work toward v1.0 of both PISA and our index format,
we should start a discussion about the shape of things, from the point of view
of both the binary format and how we can use it in our library.

## Index Format specification

This document mainly discusses the binary file format of each index component,
as well as how these components come together to form a cohesive structure.

## Reference Implementation

Along with the format description and discussion, this directory includes a
reference implementation of the discussed structures and some algorithms working on them.

The goal is to show how things work on concrete examples,
and to find out what works and what doesn't and still needs to be thought through.

> Look in `test/test_v1.cpp` for code examples.

# Posting Files

> Example: `v1/raw_cursor.hpp`.

Each _posting file_ contains a header encoding information about the type of payload,
followed by a list of blocks of data, each related to a single term.

> Do we need the header? I would say "yes" because even if we store the information
> somewhere else, then we might want to (1) verify that we are reading what we think
> we are reading, and (2) verify format version compatibility.
> The latter should be further discussed.

```
Posting File := Header, [Posting Block]
```

> **Review comment:** Is there one Header for each Posting Block?
>
> **Reply (author):** No, one header, followed by a list of blocks.

Each posting block encodes a list of homogeneous values, called _postings_.
The encoding is not fixed.

> Note that _block_ here means the entire posting list area.
> We can work on the terminology.
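
To make the layout concrete, here is a minimal parsing sketch. The names (`PostingFileView`, `parse_posting_file`) and the three-byte header are entirely hypothetical — the real header is defined in `v1/posting_format_header.hpp` — but the shape is the one described above: one header, then the block area.

```cpp
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>

// Hypothetical, simplified view of a posting file: a version triple
// followed by the area holding all posting blocks.
struct PostingFileView {
    std::uint8_t major, minor, patch;  // format version (illustrative width)
    std::byte const *blocks;           // start of the posting-block area
    std::size_t blocks_size;           // size of that area in bytes
};

// Parse a posting file: one header, followed by a list of posting blocks.
inline auto parse_posting_file(std::byte const *data, std::size_t size) -> PostingFileView
{
    assert(size >= 3);
    PostingFileView view{};
    std::memcpy(&view.major, data, 1);
    std::memcpy(&view.minor, data + 1, 1);
    std::memcpy(&view.patch, data + 2, 1);
    view.blocks = data + 3;
    view.blocks_size = size - 3;
    return view;
}
```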

## Header

> Example: `v1/posting_format_header.hpp`.

We should store the type of the postings in the file, as well as the encoding used.
**This might be tricky because we'd like it to be an open set of values/encodings.**

```
Header := Version, Type, Encoding
Version := Major, Minor, Patch
Type := ValueId, Count
```

> **Review comment:** I am a bit confused by Type. What are ValueId and Count?
>
> **Reply (author):** ValueId would be the type, and Count would be how many of those, as in one, or a pair, or a tuple, etc. Actually, so far I implemented it like this: https://github.com/pisa-engine/pisa/pull/280/files#diff-2a007c99bc1af07f1fb150c293383559R71
>
> Another approach would be to always have scalars in one file, and join multiple ones for tuples. But then we can't store arrays of undetermined length (say, a positional index). All of this is up for discussion.
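
A hypothetical in-memory rendering of this grammar could look like the structs below. The field widths and the flat `encoding` ID are illustrative assumptions for this sketch, not what `v1/posting_format_header.hpp` actually serializes.

```cpp
#include <cassert>
#include <cstdint>

// Version := Major, Minor, Patch
struct Version {
    std::uint16_t major;
    std::uint16_t minor;
    std::uint16_t patch;
};

// Type := ValueId, Count
struct Type {
    std::uint32_t value_id;  // which primitive (e.g. int32, float32)
    std::uint32_t count;     // arity: 1 = scalar, 2 = pair, ...
};

// Header := Version, Type, Encoding
struct Header {
    Version version;
    Type type;
    std::uint64_t encoding;  // ID/hash of the registered encoding
};
```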

## Posting Types

I think supporting the following types will be sufficient to express just about anything we
would want to, including single-value lists, document-frequency (or score) lists,
positional indexes, etc.

```
Type := Primitive | List[Type] | Tuple[Type]
Primitive := int32 | float32
```
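
The recursive grammar above can be sketched in C++ with `std::variant`. The names below are illustrative only; the point is that `List` and `Tuple` nodes refer back to `Type`, so arbitrarily nested posting types can be described.

```cpp
#include <cassert>
#include <memory>
#include <variant>
#include <vector>

// Primitive := int32 | float32
enum class Primitive { int32, float32 };

struct Type;  // forward declaration so List/Tuple can refer back to Type

struct List {
    std::shared_ptr<Type> element;  // List[Type]: element type, any nesting
};

struct Tuple {
    std::vector<Type> elements;  // Tuple[Type]: fixed number of components
};

// Type := Primitive | List[Type] | Tuple[Type]
struct Type {
    std::variant<Primitive, List, Tuple> node;
};

// Example: a positional posting could be Tuple[int32, List[int32]]
// (a document ID paired with a list of positions).
inline auto positional_posting_type() -> Type
{
    Type docid{Primitive::int32};
    Type positions{List{std::make_shared<Type>(Type{Primitive::int32})}};
    return Type{Tuple{{docid, positions}}};
}
```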

## Encodings

We can identify encodings by a name, an ID/hash, or both.
I can imagine that an index reader could **register** new encodings,
and default to whatever we define in PISA.
We should then also verify that such an encoding implements an `Encoding<Type>` "concept".
This is not the same as our "codecs"; it would be more like a posting-list reader.

> Example: `IndexRunner` in `v1/index.hpp`.
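
A minimal sketch of that registration idea — hypothetical names, not the actual `IndexRunner` mechanism — might map encoding names to decode functions:

```cpp
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <functional>
#include <optional>
#include <string>
#include <unordered_map>

// Illustrative decoder signature: (encoded input, output buffer, count) -> count.
using Decoder =
    std::function<std::size_t(std::uint8_t const *, std::uint32_t *, std::size_t)>;

// A reader-side registry: PISA defaults would be pre-registered, and users
// could add their own encodings under new names.
class EncodingRegistry {
  public:
    void register_encoding(std::string name, Decoder decoder)
    {
        m_decoders.emplace(std::move(name), std::move(decoder));
    }
    [[nodiscard]] auto find(std::string const &name) const -> std::optional<Decoder>
    {
        if (auto pos = m_decoders.find(name); pos != m_decoders.end()) {
            return pos->second;
        }
        return std::nullopt;
    }

  private:
    std::unordered_map<std::string, Decoder> m_decoders;
};
```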
17 changes: 17 additions & 0 deletions include/pisa/v1/bit_cast.hpp
@@ -0,0 +1,17 @@
#pragma once

#include <cstring>

#include <gsl/span>

namespace pisa::v1 {

template <class T>
constexpr auto bit_cast(gsl::span<const std::byte> mem) -> std::remove_const_t<T>
{
std::remove_const_t<T> dst{};
std::memcpy(&dst, mem.data(), sizeof(T));
return dst;
}

} // namespace pisa::v1
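
For illustration, here is a self-contained usage sketch of the same idea, with a raw `std::byte const *` standing in for `gsl::span` so the snippet has no GSL dependency; the name `bit_cast_bytes` is invented for this sketch.

```cpp
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <type_traits>

// Reinterpret a byte buffer as a value of type T via memcpy, which avoids
// the undefined behavior of a direct pointer cast on misaligned data.
template <class T>
auto bit_cast_bytes(std::byte const *mem) -> std::remove_const_t<T>
{
    std::remove_const_t<T> dst{};
    std::memcpy(&dst, mem, sizeof(T));
    return dst;
}
```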