From 3c53e3fd4ed1285ae1937cf849d71a339b82d078 Mon Sep 17 00:00:00 2001 From: "Marcus D. R. Klarqvist" Date: Mon, 9 Sep 2019 09:46:29 +0100 Subject: [PATCH] refactor --- .gitignore | 5 + Makefile | 26 +-- flagstats.cpp => benchmark/flagstats.cpp | 0 generate.cpp => benchmark/generate.cpp | 2 - utility.cpp => benchmark/utility.cpp | 5 + libflagstats.h | 225 ++++++++++++++++------- positional-popcount | 1 - utility | Bin 13552 -> 13632 bytes 8 files changed, 177 insertions(+), 87 deletions(-) rename flagstats.cpp => benchmark/flagstats.cpp (100%) rename generate.cpp => benchmark/generate.cpp (90%) rename utility.cpp => benchmark/utility.cpp (60%) delete mode 160000 positional-popcount diff --git a/.gitignore b/.gitignore index 259148f..67e900a 100644 --- a/.gitignore +++ b/.gitignore @@ -30,3 +30,8 @@ *.exe *.out *.app + +# +bench +utility +generate diff --git a/Makefile b/Makefile index 76ac527..b4742ef 100644 --- a/Makefile +++ b/Makefile @@ -19,14 +19,14 @@ OPTFLAGS := -O3 -march=native CFLAGS = -std=c99 $(OPTFLAGS) $(DEBUG_FLAGS) CPPFLAGS = -std=c++0x $(OPTFLAGS) $(DEBUG_FLAGS) -CPP_SOURCE = flagstats.cpp utility.cpp +CPP_SOURCE = benchmark/flagstats.cpp benchmark/utility.cpp benchmark/generate.cpp C_SOURCE = OBJECTS = $(CPP_SOURCE:.cpp=.o) $(C_SOURCE:.c=.o) -POSPOPCNT_PATH := libalgebra +POSPOPCNT_PATH := ../libalgebra LZ4_PATH := ZSTD_PATH := -INCLUDE_PATHS := +INCLUDE_PATHS := -I$(PWD) LIBRARY_PATHS := ifneq ($(LZ4_PATH),) INCLUDE_PATHS += -I$(LZ4_PATH)/include @@ -42,23 +42,23 @@ INCLUDE_PATHS := $(sort $(INCLUDE_PATHS)) LIBRARY_PATHS := $(sort $(LIBRARY_PATHS)) # Default target -all: flagstats utility +all: benchmark utility generate # Generic rules -utility.o: utility.cpp - $(CXX) $(CPPFLAGS) -c -o $@ $< +utility: benchmark/utility.cpp + $(CXX) $(CPPFLAGS) -o $@ $< -flagstats.o: flagstats.cpp - $(CXX) $(CPPFLAGS) -I$(POSPOPCNT_PATH) $(INCLUDE_PATHS) -c -o $@ $< +generate: benchmark/generate.cpp + $(CXX) $(CPPFLAGS) -o $@ $< -flagstats: flagstats.o - $(CXX) $(CPPFLAGS) flagstats.o -I$(POSPOPCNT_PATH) $(INCLUDE_PATHS) $(LIBRARY_PATHS) -o flagstats -llz4 -lzstd +bench.o: benchmark/flagstats.cpp + $(CXX) $(CPPFLAGS) -I$(POSPOPCNT_PATH) $(INCLUDE_PATHS) -c -o $@ $< -utility: utility.o - $(CXX) $(CPPFLAGS) utility.o -o utility +benchmark: bench.o + $(CXX) $(CPPFLAGS) bench.o -I$(POSPOPCNT_PATH) $(INCLUDE_PATHS) $(LIBRARY_PATHS) -o bench -llz4 -lzstd clean: rm -f $(OBJECTS) - rm -f flagstats utility + rm -f bench bench.o utility generate .PHONY: all clean diff --git a/flagstats.cpp b/benchmark/flagstats.cpp similarity index 100% rename from flagstats.cpp rename to benchmark/flagstats.cpp diff --git a/generate.cpp b/benchmark/generate.cpp similarity index 90% rename from generate.cpp rename to benchmark/generate.cpp index bf9c562..6632ebe 100644 --- a/generate.cpp +++ b/benchmark/generate.cpp @@ -8,8 +8,6 @@ int main(int argc, char** argv) { std::random_device rd; // obtain a random number from hardware std::mt19937 eng(rd()); // seed the generator - std::cerr << strtoull( argv[1], NULL, 10 ) << std::endl; - std::uniform_int_distribution distr(0, 4096-1); // right inclusive for (int i = 0; i < strtoull( argv[1], NULL, 10 ); ++i) { uint16_t x = distr(eng); diff --git a/utility.cpp b/benchmark/utility.cpp similarity index 60% rename from utility.cpp rename to benchmark/utility.cpp index 978bff7..ef6bb84 100644 --- a/utility.cpp +++ b/benchmark/utility.cpp @@ -1,7 +1,12 @@ #include #include #include +#include +// Utility function accepting data from cin stream and converting +// text-based FLAG values into uint16_t words. +// Intended use: +// samtools view FILE | cut -f 2 | utility > DEST_FILE.bin int main(int argc, char** argv) { std::string str; while (std::getline(std::cin, str)) { diff --git a/libflagstats.h b/libflagstats.h index 7169522..626c57f 100644 --- a/libflagstats.h +++ b/libflagstats.h @@ -106,17 +106,20 @@ extern "C" { #endif -void FLAGSTAT_samtools_single_update(uint16_t val, uint32_t* flags) { +void FLAGSTAT_scalar_update(uint16_t val, uint32_t* flags) { // If the FLAGSTAT_FQCFAIL is set the data is shift 16 values to // the right to distinguish between statistics for data // that failed and passed quality control. const int offset = ( (val & FLAGSTAT_FQCFAIL) == 0 ) ? 0 : 16; + // Count only reads that with FLAGSTAT_FQCFAIL set. The other + // reads are implicitly known and computed at the end of + // FLAGSTAT_* functions. if (offset) ++flags[offset + FLAGSTAT_FQCFAIL_OFF]; if (val & FLAGSTAT_FSECONDARY) ++flags[offset + FLAGSTAT_FSECONDARY_OFF]; else if (val & FLAGSTAT_FSUPPLEMENTARY) ++flags[offset + FLAGSTAT_FSUPPLEMENTARY_OFF]; else if (val & FLAGSTAT_FPAIRED) { - // ++(s)->n_pair_all[w]; + // ++(s)->n_pair_all[w]; if ( (val & FLAGSTAT_FPROPER_PAIR) && !(val & FLAGSTAT_FUNMAP) ) ++flags[offset + 12]; if (val & FLAGSTAT_FREAD1) ++flags[offset + FLAGSTAT_FREAD1_OFF]; if (val & FLAGSTAT_FREAD2) ++flags[offset + FLAGSTAT_FREAD2_OFF]; @@ -127,30 +130,26 @@ void FLAGSTAT_samtools_single_update(uint16_t val, uint32_t* flags) { if (val & FLAGSTAT_FDUP) ++flags[offset + FLAGSTAT_FDUP_OFF]; } -#define SAMTOOLS_flagstat_loop(s, c) do { \ - int w = (c & FLAGSTAT_FQCFAIL)? 1 : 0; \ - ++(s)->n_reads[w]; \ - if (c & FLAGSTAT_FSECONDARY ) { \ - ++(s)->n_secondary[w]; \ - } else if (c & FLAGSTAT_FSUPPLEMENTARY ) { \ - ++(s)->n_supp[w]; \ - } else if (c & FLAGSTAT_FPAIRED) { \ - ++(s)->n_pair_all[w]; \ - if ( (c & FLAGSTAT_FPROPER_PAIR) && !(c & FLAGSTAT_FUNMAP) ) ++(s)->n_pair_good[w]; \ - if (c & FLAGSTAT_FREAD1) ++(s)->n_read1[w]; \ - if (c & FLAGSTAT_FREAD2) ++(s)->n_read2[w]; \ - if ((c & FLAGSTAT_FMUNMAP) && !(c & FLAGSTAT_FUNMAP)) ++(s)->n_sgltn[w]; \ - if (!(c & FLAGSTAT_FUNMAP) && !(c & FLAGSTAT_FMUNMAP)) { \ - ++(s)->n_pair_map[w]; \ - } \ - } \ - if (!(c & FLAGSTAT_FUNMAP)) ++(s)->n_mapped[w]; \ - if (c & FLAGSTAT_FDUP) ++(s)->n_dup[w]; \ -} while (0) - -// x = ((x & FLAGSTAT_FSECONDARY) == FLAGSTAT_FSECONDARY) & (FLAGSTAT_FSECONDARY + FLAGSTAT_FUNMAP + FLAGSTAT_FDUP) -// x = ((x & FLAGSTAT_FSUPPLEMENTARY) == FLAGSTAT_FSUPPLEMENTARY) & (FLAGSTAT_FSUPPLEMENTARY + FLAGSTAT_FUNMAP + FLAGSTAT_FDUP) -// x = ((x & FLAGSTAT_FPAIRED) == FLAGSTAT_FPAIRED) & (FLAGSTAT_FUNMAP + FLAGSTAT_FDUP + FLAGSTAT_FPAIRED + FLAGSTAT_FPROPER_PAIR + FLAGSTAT_FREAD1 + FLAGSTAT_FREAD2 + FLAGSTAT_FMUNMAP) +// #define SAMTOOLS_flagstat_loop(s, c) do { \ +// int w = (c & FLAGSTAT_FQCFAIL)? 1 : 0; \ +// ++(s)->n_reads[w]; \ +// if (c & FLAGSTAT_FSECONDARY ) { \ +// ++(s)->n_secondary[w]; \ +// } else if (c & FLAGSTAT_FSUPPLEMENTARY ) { \ +// ++(s)->n_supp[w]; \ +// } else if (c & FLAGSTAT_FPAIRED) { \ +// ++(s)->n_pair_all[w]; \ +// if ( (c & FLAGSTAT_FPROPER_PAIR) && !(c & FLAGSTAT_FUNMAP) ) ++(s)->n_pair_good[w]; \ +// if (c & FLAGSTAT_FREAD1) ++(s)->n_read1[w]; \ +// if (c & FLAGSTAT_FREAD2) ++(s)->n_read2[w]; \ +// if ((c & FLAGSTAT_FMUNMAP) && !(c & FLAGSTAT_FUNMAP)) ++(s)->n_sgltn[w]; \ +// if (!(c & FLAGSTAT_FUNMAP) && !(c & FLAGSTAT_FMUNMAP)) { \ +// ++(s)->n_pair_map[w]; \ +// } \ +// } \ +// if (!(c & FLAGSTAT_FUNMAP)) ++(s)->n_mapped[w]; \ +// if (c & FLAGSTAT_FDUP) ++(s)->n_dup[w]; \ +// } while (0) // FLAGSTAT_FPROPER_PAIR & !FLAGSTAT_FUNMAP // x |= (x & (FLAGSTAT_FPROPER_PAIR + FLAGSTAT_FUNMAP) == FLAGSTAT_FPROPER_PAIR) & 1 << 13 @@ -167,7 +166,7 @@ int FLAGSTAT_sse4(const uint16_t* array, uint32_t len, uint32_t* flags) { const uint32_t start_qc = flags[FLAGSTAT_FQCFAIL_OFF + 16]; for (uint32_t i = len - (len % (16 * 8)); i < len; ++i) { - FLAGSTAT_samtools_single_update(array[i], flags); + FLAGSTAT_scalar_update(array[i], flags); } const __m128i* data = (const __m128i*)array; @@ -214,24 +213,51 @@ int FLAGSTAT_sse4(const uint16_t* array, uint32_t len, uint32_t* flags) { thislimit = i + (1 << 16) - 1; /////////////////////////////////////////////////////////////////////// - // We load a register of data (data + i + j) and then using a the - // resulting mask from a VPCMPEQW instruction comparing equality with - // the mask mask1 (FLAGSTAT_FSECONDARY + FLAGSTAT_FSUPPLEMENTARY). The resulting - // data is either the original data or empty as DATA & (00...0) is a - // zero register and DATA & (11...1) is the data itself. The resulting - // data is combined (bitwise or) with the mask mask2 as this information - // is required. + // We load a register of data (data + i + j) and then construct the + // conditional bits: + // 12: FLAGSTAT_FPROPER_PAIR + FLAGSTAT_FUNMAP == FLAGSTAT_FPROPER_PAIR + // 13: FLAGSTAT_FMUNMAP + FLAGSTAT_FUNMAP == FLAGSTAT_FMUNMAP + // 14: FLAGSTAT_FMUNMAP + FLAGSTAT_FUNMAP == 0 + // + // These construction of these bits can be described for data x as: + // x |= (x & LEFT_MASK == RIGHT_MASK) & 1 << TARGET_BIT + // with the assumption that predicate evaluatons result in the selection + // masks (00...0) or (11...1) for FALSE and TRUE, respectively. These + // construction macros are named O1, O2, and O3. + // + // The original SAMtools method is also heavily branched with three + // main branch points: + // If FLAGSTAT_FSECONDARY then count FLAGSTAT_FSECONDARY + // If FLAGSTAT_FSUPPLEMENTARY then count FLAGSTAT_FSUPPLEMENTARY + // Else then count FLAGSTAT_FREAD1, + // FLAGSTAT_FREAD2, + // Special bit 12, 13, and 14 + // Always count FLAGSTAT_FUNMAP, + // FLAGSTAT_FDUP, + // FLAGSTAT_FQCFAIL + // + // These bits can be selected using a mask-select propagate-carry approach: + // x &= x & ((x == MASK) | CARRY_BITS) + // with the arguments for MASK and CARRY_BITS as follows: + // 1. {FLAGSTAT_FSECONDARY, + // FLAGSTAT_FQCFAIL + FLAGSTAT_FSECONDARY + FLAGSTAT_FUNMAP + FLAGSTAT_FDUP} + // 2. {FLAGSTAT_FSUPPLEMENTARY, + // FLAGSTAT_FQCFAIL + FLAGSTAT_FSUPPLEMENTARY + FLAGSTAT_FSECONDARY + // + FLAGSTAT_FUNMAP + FLAGSTAT_FDUP} + // 3. {FLAGSTAT_FPAIRED, + // FLAGSTAT_FQCFAIL + FLAGSTAT_FSUPPLEMENTARY + FLAGSTAT_FSECONDARY + // + FLAGSTAT_FUNMAP + FLAGSTAT_FDUP} // // FLAGSTATS outputs summary statistics separately for reads that pass // QC and those that do not. Therefore we need to partition the data // into these two classes. For data that pass QC, the L registers, we - // first bit-select the target FLAGSTAT_FQCFAIL bit using the mask mask3. The - // resulting data is used to perform another mask-select using VPCMPEQW - // against the empty vector (00...0). As above, if the data has the - // FLAGSTAT_FQCFAIL bit set then this register will be zeroed out. The exact - // process is performed for reads that fail QC, the LU registers, with - // the difference that mask-selection is based on the one vector - // (00...1). + // first bit-select the target FLAGSTAT_FQCFAIL bit using the mask + // mask3. The resulting data is used to perform another mask-select + // using VPCMPEQW against the empty vector (00...0). As above, if the + // data has the FLAGSTAT_FQCFAIL bit set then this register will be + // zeroed out. The exact process is performed for reads that fail QC, + // the LU registers, with the difference that mask-selection is based on + // the one vector (00...1). #define W(j) __m128i data##j = _mm_loadu_si128(data + i + j); #define O1(j) data##j = data##j | _mm_slli_epi16(data##j & _mm_cmpeq_epi16((data##j & _mm_set1_epi16(FLAGSTAT_FPROPER_PAIR + FLAGSTAT_FUNMAP)), _mm_set1_epi16(FLAGSTAT_FPROPER_PAIR)) & one, 12); @@ -398,7 +424,7 @@ int FLAGSTAT_avx2(const uint16_t* array, uint32_t len, uint32_t* flags) { const uint32_t start_qc = flags[FLAGSTAT_FQCFAIL_OFF + 16]; for (uint32_t i = len - (len % (16 * 16)); i < len; ++i) { - FLAGSTAT_samtools_single_update(array[i], flags); + FLAGSTAT_scalar_update(array[i], flags); } const __m256i* data = (const __m256i*)array; @@ -445,24 +471,51 @@ int FLAGSTAT_avx2(const uint16_t* array, uint32_t len, uint32_t* flags) { thislimit = i + (1 << 16) - 1; /////////////////////////////////////////////////////////////////////// - // We load a register of data (data + i + j) and then using a the - // resulting mask from a VPCMPEQW instruction comparing equality with - // the mask mask1 (FLAGSTAT_FSECONDARY + FLAGSTAT_FSUPPLEMENTARY). The resulting - // data is either the original data or empty as DATA & (00...0) is a - // zero register and DATA & (11...1) is the data itself. The resulting - // data is combined (bitwise or) with the mask mask2 as this information - // is required. + // We load a register of data (data + i + j) and then construct the + // conditional bits: + // 12: FLAGSTAT_FPROPER_PAIR + FLAGSTAT_FUNMAP == FLAGSTAT_FPROPER_PAIR + // 13: FLAGSTAT_FMUNMAP + FLAGSTAT_FUNMAP == FLAGSTAT_FMUNMAP + // 14: FLAGSTAT_FMUNMAP + FLAGSTAT_FUNMAP == 0 + // + // These construction of these bits can be described for data x as: + // x |= (x & LEFT_MASK == RIGHT_MASK) & 1 << TARGET_BIT + // with the assumption that predicate evaluatons result in the selection + // masks (00...0) or (11...1) for FALSE and TRUE, respectively. These + // construction macros are named O1, O2, and O3. + // + // The original SAMtools method is also heavily branched with three + // main branch points: + // If FLAGSTAT_FSECONDARY then count FLAGSTAT_FSECONDARY + // If FLAGSTAT_FSUPPLEMENTARY then count FLAGSTAT_FSUPPLEMENTARY + // Else then count FLAGSTAT_FREAD1, + // FLAGSTAT_FREAD2, + // Special bit 12, 13, and 14 + // Always count FLAGSTAT_FUNMAP, + // FLAGSTAT_FDUP, + // FLAGSTAT_FQCFAIL + // + // These bits can be selected using a mask-select propagate-carry approach: + // x &= x & ((x == MASK) | CARRY_BITS) + // with the arguments for MASK and CARRY_BITS as follows: + // 1. {FLAGSTAT_FSECONDARY, + // FLAGSTAT_FQCFAIL + FLAGSTAT_FSECONDARY + FLAGSTAT_FUNMAP + FLAGSTAT_FDUP} + // 2. {FLAGSTAT_FSUPPLEMENTARY, + // FLAGSTAT_FQCFAIL + FLAGSTAT_FSUPPLEMENTARY + FLAGSTAT_FSECONDARY + // + FLAGSTAT_FUNMAP + FLAGSTAT_FDUP} + // 3. {FLAGSTAT_FPAIRED, + // FLAGSTAT_FQCFAIL + FLAGSTAT_FSUPPLEMENTARY + FLAGSTAT_FSECONDARY + // + FLAGSTAT_FUNMAP + FLAGSTAT_FDUP} // // FLAGSTATS outputs summary statistics separately for reads that pass // QC and those that do not. Therefore we need to partition the data // into these two classes. For data that pass QC, the L registers, we - // first bit-select the target FLAGSTAT_FQCFAIL bit using the mask mask3. The - // resulting data is used to perform another mask-select using VPCMPEQW - // against the empty vector (00...0). As above, if the data has the - // FLAGSTAT_FQCFAIL bit set then this register will be zeroed out. The exact - // process is performed for reads that fail QC, the LU registers, with - // the difference that mask-selection is based on the one vector - // (00...1). + // first bit-select the target FLAGSTAT_FQCFAIL bit using the mask + // mask3. The resulting data is used to perform another mask-select + // using VPCMPEQW against the empty vector (00...0). As above, if the + // data has the FLAGSTAT_FQCFAIL bit set then this register will be + // zeroed out. The exact process is performed for reads that fail QC, + // the LU registers, with the difference that mask-selection is based on + // the one vector (00...1). #define W(j) __m256i data##j = _mm256_loadu_si256(data + i + j); #define O1(j) data##j = data##j | _mm256_slli_epi16(data##j & _mm256_cmpeq_epi16((data##j & _mm256_set1_epi16(FLAGSTAT_FPROPER_PAIR + FLAGSTAT_FUNMAP)), _mm256_set1_epi16(FLAGSTAT_FPROPER_PAIR)) & one, 12); @@ -626,7 +679,7 @@ STORM_TARGET("avx512bw") static int FLAGSTAT_avx512(const uint16_t* array, size_t len, uint32_t* out) { for (uint32_t i = len - (len % (32 * 16)); i < len; ++i) { - FLAGSTAT_samtools_single_update(array[i], out); + FLAGSTAT_scalar_update(array[i], out); } const __m512i* data = (const __m512i*)array; @@ -669,21 +722,51 @@ int FLAGSTAT_avx512(const uint16_t* array, size_t len, uint32_t* out) { thislimit = i + (1 << 16) - 1; /////////////////////////////////////////////////////////////////////// - // We load a register of data (data + i + j) and then using a the resulting mask from - // a VPCMPEQW instruction comparing equality with the mask mask1 - // (FLAGSTAT_FSECONDARY + FLAGSTAT_FSUPPLEMENTARY). The resulting data is either the original data - // or empty as DATA & (00...0) is a zero register and DATA & (11...1) is the data itself. - // The resulting data is combined (bitwise or) with the mask mask2 as this information - // is required. + // We load a register of data (data + i + j) and then construct the + // conditional bits: + // 12: FLAGSTAT_FPROPER_PAIR + FLAGSTAT_FUNMAP == FLAGSTAT_FPROPER_PAIR + // 13: FLAGSTAT_FMUNMAP + FLAGSTAT_FUNMAP == FLAGSTAT_FMUNMAP + // 14: FLAGSTAT_FMUNMAP + FLAGSTAT_FUNMAP == 0 // - // FLAGSTATS outputs summary statistics separately for reads that pass QC and those - // that do not. Therefore we need to partition the data into these two classes. - // For data that pass QC, the L registers, we first bit-select the target FLAGSTAT_FQCFAIL bit - // using the mask mask3. The resulting data is used to perform another mask-select using VPCMPEQW - // against the empty vector (00...0). As above, if the data has the FLAGSTAT_FQCFAIL bit set then - // this register will be zeroed out. - // The exact process is performed for reads that fail QC, the LU registers, with the difference - // that mask-selection is based on the one vector (11...1). + // These construction of these bits can be described for data x as: + // x |= (x & LEFT_MASK == RIGHT_MASK) & 1 << TARGET_BIT + // with the assumption that predicate evaluatons result in the selection + // masks (00...0) or (11...1) for FALSE and TRUE, respectively. These + // construction macros are named O1, O2, and O3. + // + // The original SAMtools method is also heavily branched with three + // main branch points: + // If FLAGSTAT_FSECONDARY then count FLAGSTAT_FSECONDARY + // If FLAGSTAT_FSUPPLEMENTARY then count FLAGSTAT_FSUPPLEMENTARY + // Else then count FLAGSTAT_FREAD1, + // FLAGSTAT_FREAD2, + // Special bit 12, 13, and 14 + // Always count FLAGSTAT_FUNMAP, + // FLAGSTAT_FDUP, + // FLAGSTAT_FQCFAIL + // + // These bits can be selected using a mask-select propagate-carry approach: + // x &= x & ((x == MASK) | CARRY_BITS) + // with the arguments for MASK and CARRY_BITS as follows: + // 1. {FLAGSTAT_FSECONDARY, + // FLAGSTAT_FQCFAIL + FLAGSTAT_FSECONDARY + FLAGSTAT_FUNMAP + FLAGSTAT_FDUP} + // 2. {FLAGSTAT_FSUPPLEMENTARY, + // FLAGSTAT_FQCFAIL + FLAGSTAT_FSUPPLEMENTARY + FLAGSTAT_FSECONDARY + // + FLAGSTAT_FUNMAP + FLAGSTAT_FDUP} + // 3. {FLAGSTAT_FPAIRED, + // FLAGSTAT_FQCFAIL + FLAGSTAT_FSUPPLEMENTARY + FLAGSTAT_FSECONDARY + // + FLAGSTAT_FUNMAP + FLAGSTAT_FDUP} + // + // FLAGSTATS outputs summary statistics separately for reads that pass + // QC and those that do not. Therefore we need to partition the data + // into these two classes. For data that pass QC, the L registers, we + // first bit-select the target FLAGSTAT_FQCFAIL bit using the mask + // mask3. The resulting data is used to perform another mask-select + // using VPCMPEQW against the empty vector (00...0). As above, if the + // data has the FLAGSTAT_FQCFAIL bit set then this register will be + // zeroed out. The exact process is performed for reads that fail QC, + // the LU registers, with the difference that mask-selection is based on + // the one vector (00...1). #define LOAD(j) __m512i data##j = _mm512_loadu_si512(data + i + j) & (_mm512_cmpeq_epi16_mask( _mm512_loadu_si512(data + i + j) & mask1, zero ) | mask2); #define L(j) data##j & _mm512_cmpeq_epi16_mask( data##j & mask3, zero ) diff --git a/positional-popcount b/positional-popcount deleted file mode 160000 index 1921b0c..0000000 --- a/positional-popcount +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 1921b0cdbd44b4b1ac1611ed6dda452025a098d6 diff --git a/utility b/utility index 4a284e33a4c0935d7ebf0cdc980f50d762788b3e..b3a7644f368220a56f856d1807967e5328a62f4e 100755 GIT binary patch delta 3526 zcmZ`*4RBP|6~1qO$ZoRxc3*yYS*OXq4S^(9vRMKoAaSxG=vz!Rl&BH;2`!|6#ECW; zQxyXS9M)xR@Ho}6=?u1=(RR?Hp|#Ks6C|a@;uv72(;pdw6YZ|hHqKB?Ds=nZyYJFW zI=wURp7(wC+;i?b=iYPQmBJSJLWRfj2VJZ(9gkBQz9Uw zAzq|R*({zkMb}JA%e1siTBd9YT~)I>vt?VBW^Hs#xi|FxJd9Z$#;D5n(2_fjzBq7u ztm{qhw0D0=>xJ<53KxHFhn+$^a;;dcO<@kfaTi5&%vPa%N|$UpjoTblXfGU|vehff zKPlqqw{@t)Z#$eeXV=!QZ*Qfa&*@Wav@mC-l0%Q?1S3AU=dro)vxl8L6r~p4JoOUZ zoUF>iJykEb7_+l_I++t(*KD!9w`_MaW@GmN^CrUvRnd-kva;R^nB;fM9L?gpJ+$1_ zqjotwJA=s%IYyn+#=>f}+&^3(g9DEO(OJ29`<d^Lymc*(t`rO!ZD9_dbhwYaLl8W z?iTntj(Hl>9RfeYG4=pUw+j3O$GIFg3j7#w|2CfPbd4Z(fan|q7A<`@7C+co?F;S| z*i1H@pNnW9Fe$!3Gtsaa6U<~0pJyhQ92KLZu5>;|I|n0wi^b1vG~*we0~3?2Z7oUX zE%;b7>HQIc0AoYkXGPH*Oup(1HVZBSaG#lUR&k5Vrp)*!Jpp9l&jsZ1R=7Ve0GnL$ zSplu^25qkv&<5}PTQ4C)tT${9G~A0}%*GrK*7gKY>ivIbGAR>&PCJhk(P3|SfAoRq z{m1#ah~5`%JCVs?#@bbpYi4rcH{i1++3+O3+T3%c9x5%#o#jR>KC#72IObO(2xfe5 zc_juSBQ5cNqz>N5WMawYa=-c8=5ld-P6I&6QGWSh88zRDwJY(lPHsH@X{z6g6`2XY znfQ9@TC?A9`t~Kb;lzsa<2*HyYpGwstnZz%&Vi@ej7)ECXLG!#&4>=1R($V_@UZ0@ z>^R>>v1HFGmf8hvGqDO58dGLs$T`Wa?fXm5uH8@8v)*US_(!SbLd7oZ_d~CyW!aQ( z?{-Ax1a}FQ%$V=QSo#4lz93J{GWbZ1Ugwrql*a}}{VmJ>+T%=}2NM}F69+^PwwSdZ z5tZILbI&M`*Fzhk>!TZ@8>5>}rT#Euwi!Rg4HW%m>hXE0=lK9ubqz9g5FOadfB)|` z>l~ypZ)xaFRqTX?zkpmDjo*mHueDTtB2MMW8}`%(Uy@IK@N2DDKi}a)UuH5xkUL>- z6!Iy^ElA`6$UewlL+%8B7NPJ1@5HBm$n_v!1ir#Cy`j#tnb+yOTJ4?CbjZ_^G>zq z**Q_Qejuk^o!srRS1Y+{eN?T9s+CQu0cA{TQe9CTMs9xsj~{;d;peRw&lDaHxN1(^ zt@<~r*;V1-?v`~@V#rh!iK=I5xWj*3r%w??<4e3fZJqle98-&!A4p=L)tE?MRS0?Bj zke^LL>dIA)YV(;n^Fp|3uYjd^j%GDG?)(~xl<5dgq^!t3m*LBR+UYxGk83i@B6*lu zMan!~si$*gdjCpb8&lc1^_|l2DvLK}UZCLGExvme{|I=61z3BTfw$~G375OXdl752 zE_BZE_G3Kodn=P$L9pnK;8FPJ%$gvU>;6ED=CcW!o>$R-QB2a9{J|0s-K5Y$5N6q! z77B&`vzT1$TK+3F+h+-Qh3QQ0FEJfmEqFgG&S%AjAvqyO=&f=+upGl06eg|h>#%fU z{7)>f6{Goxc()aCdCYc@X4jRcNt>?+63_|RikaWCFSpifTqnY>?R@6Dc|7D2-o>mk zSnja=LQS-b!txiq4~*iBaPRVG_^Xyq9z_=D0e(m3vpgmbqBY1WNLipOwOmKpA~fT$ zqaok&pNiKt-VX6m8d)%BCCY}i`s@f}U%?CaJ!0)0UbJ{~Li%B=!`eGpma*Rpe;?6F zc)cvV`o;PXJL?O#TB-)^snC@uh0t%{?~z_hbKKIDrC}dcn zq?K?f*O3+uYpbm~BxNMx=v6mQsCEDa{{XKy|H=RW delta 3241 zcmZ`*e@xrQ6~FTju!*t1HaK6L6#O-q04=F)2*c6p3=`UZQJmHpWTOdCqcN7Sb;?v| zw5$?nEYU3L&h%BCDkW>swyv8hjf}sVvB+uCHp(^$_M=N$X$eug*)3_wv>~#Fc<;`4 z5u(~j-+TA@+~@Qq>t3T5+Kjhn@6E-=Mvqvr)zhG>O3*^l{(+8{v`|;Sw|$4z@fUdA#_F zru^r#nfLque*Ca$>x&~V8PQJ}elk`a1c6Q~C)wjVdHe zhYa(jvzJX~1H4=G4D2o5O$kgDdqWl+lI7#izy79~O6!@b>g+EwqrK{^$7p$SpCazVeG)?0ToELG!cB@LupwFs8uhmvME-Z_&&j=}G zpF=jijy1#Ykex&J^g6Z}ZX&yc?94jW3Y8@)+*e{N&8%ZfVK1_=65E(Qv1HLF%|CZ* zL_eMxG34jAMM`XrKh$#Hsf?Bmh{+IYS6Ff?{xSpn9|!FevJ(3 zlcw5wM0&Er9}ZA!U|+O1+86DQ zK6xR#M;eYXOFemk8N1(FJ(@MJ9t>s}W!8)pk{^G91HQ@9gF7nQ#%&|}FaSN}KEOrG z&v@@eOYS#o`TS5ibh9(L(61+N=o23-$Gc-G^D*2`daBgOMdTs$k$11Sv>Nhnhqr@u zMSSRKzn(ICn8%%~dh){&4=&4P7u?xYXI*r$;ac8vL8HxUm~z2hTkZV|m{l?w&?h2a z!fv!i8=F2E8AgsGMIj zX6x?2EA}TI)^Q2gc3k=_pMMeM>v!_`Ih1oKEjWcnjK3FUFUmpWzlFp61?96SN047a zIR_6rY7EP3Fzk5H97jhPIOV9JSvc=dX&SCNsvbF`Xk_bFTd^gLHgUA~{7PWkaE=i0KZEZ;)5v3 z9=pDce-TL!XdFwatFzy^x*qDNk%^4c61 z*D9TwE`#1mzc0-8CKkbyZt)Cx3N#7BbfuT-aK17?ufQ#AcY(7i;7o|?O2%M;eN|pp zkezod>{?)DwHUr%^%(7eg(@#?gj-eLpm7LQm%(*v+A<`Den!X>LcUZ2!`@x+nvYGP z5iHsj9C=Yw(Sg(FHd;szN|mm1R6>FfMHGhbV4sb|QL0pdX+(CSp;T!%EFe-y0;Ng^ zIdYL1CbyQ6Fnh!{6eBK07z0&t=Aml0aWncIK$L**Rv&fBWb!3KOCQi0l?Gv_IDHN#|O|w9QSs8!IF?^w5>*;^J2BwJITr%v!d^^+VfM~ z`fObJr}#iYg?z+4-{!rWh&*4SlN2G(qS`o97lgK{bT_XL!2q_$c>5GgZBspKXs+OH zNQm7az5R;mKzdsdtZZ}9%M5uw6>(h`v434C=ea)(_SLEMAvjZ~hPwIe?IbSpiiW%l zBJae}+I$+v6_^cHjxj5`!fQ7yV%yK_UEr*5Ha#_P_}jDb`UstUwc&tiHox6R@f`XB z0p}I5U{ZF$0J4qZlDH>fIuNk-2(|~V23kv21;Tq|OYjR$dA&$!fq`JaDpMSWmxHa& zxVS~6%)lb*C}MLc;0d)l