From 3c53e3fd4ed1285ae1937cf849d71a339b82d078 Mon Sep 17 00:00:00 2001
From: "Marcus D. R. Klarqvist" <mk819@cam.ac.uk>
Date: Mon, 9 Sep 2019 09:46:29 +0100
Subject: [PATCH] refactor

---
 .gitignore                               |   5 +
 Makefile                                 |  26 +--
 flagstats.cpp => benchmark/flagstats.cpp |   0
 generate.cpp => benchmark/generate.cpp   |   2 -
 utility.cpp => benchmark/utility.cpp     |   5 +
 libflagstats.h                           | 225 ++++++++++++++++-------
 positional-popcount                      |   1 -
 utility                                  | Bin 13552 -> 13632 bytes
 8 files changed, 177 insertions(+), 87 deletions(-)
 rename flagstats.cpp => benchmark/flagstats.cpp (100%)
 rename generate.cpp => benchmark/generate.cpp (90%)
 rename utility.cpp => benchmark/utility.cpp (60%)
 delete mode 160000 positional-popcount
diff --git a/.gitignore b/.gitignore
index 259148f..67e900a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -30,3 +30,8 @@
 *.exe
 *.out
 *.app
+
+#
+bench
+utility
+generate
diff --git a/Makefile b/Makefile
index 76ac527..b4742ef 100644
--- a/Makefile
+++ b/Makefile
@@ -19,14 +19,14 @@
 OPTFLAGS  := -O3 -march=native
 CFLAGS     = -std=c99 $(OPTFLAGS) $(DEBUG_FLAGS)
 CPPFLAGS   = -std=c++0x $(OPTFLAGS) $(DEBUG_FLAGS)
-CPP_SOURCE = flagstats.cpp utility.cpp
+CPP_SOURCE = benchmark/flagstats.cpp benchmark/utility.cpp benchmark/generate.cpp
 C_SOURCE   = 
 OBJECTS    = $(CPP_SOURCE:.cpp=.o) $(C_SOURCE:.c=.o)
 
-POSPOPCNT_PATH  := libalgebra
+POSPOPCNT_PATH  := ../libalgebra
 LZ4_PATH :=
 ZSTD_PATH :=
-INCLUDE_PATHS :=
+INCLUDE_PATHS := -I$(PWD)
 LIBRARY_PATHS :=
 ifneq ($(LZ4_PATH),)
 	INCLUDE_PATHS += -I$(LZ4_PATH)/include
@@ -42,23 +42,23 @@ INCLUDE_PATHS := $(sort $(INCLUDE_PATHS))
 LIBRARY_PATHS := $(sort $(LIBRARY_PATHS))
 
 # Default target
-all: flagstats utility
+all: benchmark utility generate
 
 # Generic rules
-utility.o: utility.cpp
-	$(CXX) $(CPPFLAGS) -c -o $@ $<
+utility: benchmark/utility.cpp
+	$(CXX) $(CPPFLAGS) -o $@ $<
 
-flagstats.o: flagstats.cpp
-	$(CXX) $(CPPFLAGS) -I$(POSPOPCNT_PATH) $(INCLUDE_PATHS) -c -o $@ $<
+generate: benchmark/generate.cpp
+	$(CXX) $(CPPFLAGS) -o $@ $<
 
-flagstats: flagstats.o
-	$(CXX) $(CPPFLAGS) flagstats.o -I$(POSPOPCNT_PATH) $(INCLUDE_PATHS) $(LIBRARY_PATHS) -o flagstats -llz4 -lzstd
+bench.o: benchmark/flagstats.cpp
+	$(CXX) $(CPPFLAGS) -I$(POSPOPCNT_PATH) $(INCLUDE_PATHS) -c -o $@ $<
 
-utility: utility.o
-	$(CXX) $(CPPFLAGS) utility.o -o utility
+benchmark: bench.o
+	$(CXX) $(CPPFLAGS) bench.o -I$(POSPOPCNT_PATH) $(INCLUDE_PATHS) $(LIBRARY_PATHS) -o bench -llz4 -lzstd
 
 clean:
 	rm -f $(OBJECTS)
-	rm -f flagstats utility
+	rm -f bench bench.o utility generate
 
 .PHONY: all clean
diff --git a/flagstats.cpp b/benchmark/flagstats.cpp
similarity index 100%
rename from flagstats.cpp
rename to benchmark/flagstats.cpp
diff --git a/generate.cpp b/benchmark/generate.cpp
similarity index 90%
rename from generate.cpp
rename to benchmark/generate.cpp
index bf9c562..6632ebe 100644
--- a/generate.cpp
+++ b/benchmark/generate.cpp
@@ -8,8 +8,6 @@ int main(int argc, char** argv) {
     std::random_device rd; // obtain a random number from hardware
     std::mt19937 eng(rd()); // seed the generator
 
-    std::cerr << strtoull( argv[1], NULL, 10 ) << std::endl;
-
     std::uniform_int_distribution<uint16_t> distr(0, 4096-1); // right inclusive
     for (int i = 0; i < strtoull( argv[1], NULL, 10 ); ++i) {
         uint16_t x = distr(eng);
diff --git a/utility.cpp b/benchmark/utility.cpp
similarity index 60%
rename from utility.cpp
rename to benchmark/utility.cpp
index 978bff7..ef6bb84 100644
--- a/utility.cpp
+++ b/benchmark/utility.cpp
@@ -1,7 +1,12 @@
 #include <string>
 #include <iostream>
 #include <fstream>
+#include <cstring>
 
+// Utility function accepting data from cin stream and converting
+// text-based FLAG values into uint16_t words.
+// Intended use:
+// samtools view FILE | cut -f 2 | utility > DEST_FILE.bin
 int main(int argc, char** argv) {
     std::string str;
     while (std::getline(std::cin, str)) {
diff --git a/libflagstats.h b/libflagstats.h
index 7169522..626c57f 100644
--- a/libflagstats.h
+++ b/libflagstats.h
@@ -106,17 +106,20 @@
 extern "C" {
 #endif
 
-void FLAGSTAT_samtools_single_update(uint16_t val, uint32_t* flags) {
+void FLAGSTAT_scalar_update(uint16_t val, uint32_t* flags) {
     // If the FLAGSTAT_FQCFAIL is set the data is shift 16 values to
     // the right to distinguish between statistics for data
     // that failed and passed quality control.
     const int offset = ( (val & FLAGSTAT_FQCFAIL) == 0 ) ? 0 : 16;
+    // Count only reads that with FLAGSTAT_FQCFAIL set. The other
+    // reads are implicitly known and computed at the end of
+    // FLAGSTAT_* functions.
     if (offset) ++flags[offset + FLAGSTAT_FQCFAIL_OFF];
 
     if (val & FLAGSTAT_FSECONDARY) ++flags[offset + FLAGSTAT_FSECONDARY_OFF];
     else if (val & FLAGSTAT_FSUPPLEMENTARY) ++flags[offset + FLAGSTAT_FSUPPLEMENTARY_OFF];
     else if (val & FLAGSTAT_FPAIRED) {
-        // ++(s)->n_pair_all[w];                
+        // ++(s)->n_pair_all[w];              
         if ( (val & FLAGSTAT_FPROPER_PAIR) && !(val & FLAGSTAT_FUNMAP) ) ++flags[offset + 12];
         if (val & FLAGSTAT_FREAD1) ++flags[offset + FLAGSTAT_FREAD1_OFF];
         if (val & FLAGSTAT_FREAD2) ++flags[offset + FLAGSTAT_FREAD2_OFF];
@@ -127,30 +130,26 @@ void FLAGSTAT_samtools_single_update(uint16_t val, uint32_t* flags) {
     if (val & FLAGSTAT_FDUP)      ++flags[offset + FLAGSTAT_FDUP_OFF];
 }
 
-#define SAMTOOLS_flagstat_loop(s, c) do {                      \
-    int w = (c & FLAGSTAT_FQCFAIL)? 1 : 0;                       \
-    ++(s)->n_reads[w];                                      \
-    if (c & FLAGSTAT_FSECONDARY ) {                              \
-        ++(s)->n_secondary[w];                              \
-    } else if (c & FLAGSTAT_FSUPPLEMENTARY ) {                   \
-        ++(s)->n_supp[w];                                   \
-    } else if (c & FLAGSTAT_FPAIRED) {                           \
-        ++(s)->n_pair_all[w];                               \
-        if ( (c & FLAGSTAT_FPROPER_PAIR) && !(c & FLAGSTAT_FUNMAP) ) ++(s)->n_pair_good[w]; \
-        if (c & FLAGSTAT_FREAD1) ++(s)->n_read1[w];              \
-        if (c & FLAGSTAT_FREAD2) ++(s)->n_read2[w];              \
-        if ((c & FLAGSTAT_FMUNMAP) && !(c & FLAGSTAT_FUNMAP)) ++(s)->n_sgltn[w]; \
-        if (!(c & FLAGSTAT_FUNMAP) && !(c & FLAGSTAT_FMUNMAP)) {      \
-            ++(s)->n_pair_map[w];                           \
-        }                                                   \
-    }                                                       \
-    if (!(c & FLAGSTAT_FUNMAP)) ++(s)->n_mapped[w];              \
-    if (c & FLAGSTAT_FDUP) ++(s)->n_dup[w];                      \
-} while (0)
-
-// x = ((x & FLAGSTAT_FSECONDARY) == FLAGSTAT_FSECONDARY) & (FLAGSTAT_FSECONDARY + FLAGSTAT_FUNMAP + FLAGSTAT_FDUP)
-// x = ((x & FLAGSTAT_FSUPPLEMENTARY) == FLAGSTAT_FSUPPLEMENTARY) & (FLAGSTAT_FSUPPLEMENTARY + FLAGSTAT_FUNMAP + FLAGSTAT_FDUP)
-// x = ((x & FLAGSTAT_FPAIRED) == FLAGSTAT_FPAIRED) & (FLAGSTAT_FUNMAP + FLAGSTAT_FDUP + FLAGSTAT_FPAIRED + FLAGSTAT_FPROPER_PAIR + FLAGSTAT_FREAD1 + FLAGSTAT_FREAD2 + FLAGSTAT_FMUNMAP)
+// #define SAMTOOLS_flagstat_loop(s, c) do {                   \
+//     int w = (c & FLAGSTAT_FQCFAIL)? 1 : 0;                  \
+//     ++(s)->n_reads[w];                                      \
+//     if (c & FLAGSTAT_FSECONDARY ) {                         \
+//         ++(s)->n_secondary[w];                              \
+//     } else if (c & FLAGSTAT_FSUPPLEMENTARY ) {              \
+//         ++(s)->n_supp[w];                                   \
+//     } else if (c & FLAGSTAT_FPAIRED) {                      \
+//         ++(s)->n_pair_all[w];                               \
+//         if ( (c & FLAGSTAT_FPROPER_PAIR) && !(c & FLAGSTAT_FUNMAP) ) ++(s)->n_pair_good[w]; \
+//         if (c & FLAGSTAT_FREAD1) ++(s)->n_read1[w];         \
+//         if (c & FLAGSTAT_FREAD2) ++(s)->n_read2[w];         \
+//         if ((c & FLAGSTAT_FMUNMAP) && !(c & FLAGSTAT_FUNMAP)) ++(s)->n_sgltn[w]; \
+//         if (!(c & FLAGSTAT_FUNMAP) && !(c & FLAGSTAT_FMUNMAP)) {      \
+//             ++(s)->n_pair_map[w];                           \
+//         }                                                   \
+//     }                                                       \
+//     if (!(c & FLAGSTAT_FUNMAP)) ++(s)->n_mapped[w];         \
+//     if (c & FLAGSTAT_FDUP) ++(s)->n_dup[w];                 \
+// } while (0)
 
 // FLAGSTAT_FPROPER_PAIR & !FLAGSTAT_FUNMAP
 // x |= (x & (FLAGSTAT_FPROPER_PAIR + FLAGSTAT_FUNMAP) == FLAGSTAT_FPROPER_PAIR) & 1 << 13
@@ -167,7 +166,7 @@ int FLAGSTAT_sse4(const uint16_t* array, uint32_t len, uint32_t* flags) {
     const uint32_t start_qc = flags[FLAGSTAT_FQCFAIL_OFF + 16];
     
     for (uint32_t i = len - (len % (16 * 8)); i < len; ++i) {
-        FLAGSTAT_samtools_single_update(array[i], flags);
+        FLAGSTAT_scalar_update(array[i], flags);
     }
 
     const __m128i* data = (const __m128i*)array;
@@ -214,24 +213,51 @@ int FLAGSTAT_sse4(const uint16_t* array, uint32_t len, uint32_t* flags) {
             thislimit = i + (1 << 16) - 1;
 
         ///////////////////////////////////////////////////////////////////////
-        // We load a register of data (data + i + j) and then using a the
-        // resulting mask from a VPCMPEQW instruction comparing equality with
-        // the mask mask1 (FLAGSTAT_FSECONDARY + FLAGSTAT_FSUPPLEMENTARY). The resulting
-        // data is either the original data or empty as DATA & (00...0) is a
-        // zero register and DATA & (11...1) is the data itself. The resulting
-        // data is combined (bitwise or) with the mask mask2 as this information
-        // is required.
+        // We load a register of data (data + i + j) and then construct the
+        // conditional bits: 
+        // 12: FLAGSTAT_FPROPER_PAIR + FLAGSTAT_FUNMAP == FLAGSTAT_FPROPER_PAIR
+        // 13: FLAGSTAT_FMUNMAP + FLAGSTAT_FUNMAP == FLAGSTAT_FMUNMAP
+        // 14: FLAGSTAT_FMUNMAP + FLAGSTAT_FUNMAP == 0
+        //
+        // These construction of these bits can be described for data x as:
+        // x |= (x & LEFT_MASK == RIGHT_MASK) & 1 << TARGET_BIT
+        // with the assumption that predicate evaluatons result in the selection
+        // masks (00...0) or (11...1) for FALSE and TRUE, respectively. These
+        // construction macros are named O1, O2, and O3.
+        //
+        // The original SAMtools method is also heavily branched with three
+        // main branch points:
+        // If FLAGSTAT_FSECONDARY then count FLAGSTAT_FSECONDARY
+        // If FLAGSTAT_FSUPPLEMENTARY then count FLAGSTAT_FSUPPLEMENTARY
+        // Else then count FLAGSTAT_FREAD1, 
+        //                 FLAGSTAT_FREAD2,
+        //                 Special bit 12, 13, and 14
+        // Always count FLAGSTAT_FUNMAP, 
+        //              FLAGSTAT_FDUP, 
+        //              FLAGSTAT_FQCFAIL
+        //
+        // These bits can be selected using a mask-select propagate-carry approach:
+        // x &= x & ((x == MASK) | CARRY_BITS)
+        // with the arguments for MASK and CARRY_BITS as follows:
+        //    1. {FLAGSTAT_FSECONDARY, 
+        //        FLAGSTAT_FQCFAIL + FLAGSTAT_FSECONDARY + FLAGSTAT_FUNMAP + FLAGSTAT_FDUP}
+        //    2. {FLAGSTAT_FSUPPLEMENTARY, 
+        //        FLAGSTAT_FQCFAIL + FLAGSTAT_FSUPPLEMENTARY + FLAGSTAT_FSECONDARY 
+        //        + FLAGSTAT_FUNMAP + FLAGSTAT_FDUP}
+        //    3. {FLAGSTAT_FPAIRED, 
+        //        FLAGSTAT_FQCFAIL + FLAGSTAT_FSUPPLEMENTARY + FLAGSTAT_FSECONDARY 
+        //        + FLAGSTAT_FUNMAP + FLAGSTAT_FDUP}
         //
         // FLAGSTATS outputs summary statistics separately for reads that pass
         // QC and those that do not. Therefore we need to partition the data
         // into these two classes. For data that pass QC, the L registers, we
-        // first bit-select the target FLAGSTAT_FQCFAIL bit using the mask mask3. The
-        // resulting data is used to perform another mask-select using VPCMPEQW
-        // against the empty vector (00...0). As above, if the data has the
-        // FLAGSTAT_FQCFAIL bit set then this register will be zeroed out. The exact
-        // process is performed for reads that fail QC, the LU registers, with
-        // the difference that mask-selection is based on the one vector
-        // (00...1).
+        // first bit-select the target FLAGSTAT_FQCFAIL bit using the mask
+        // mask3. The resulting data is used to perform another mask-select
+        // using VPCMPEQW against the empty vector (00...0). As above, if the
+        // data has the FLAGSTAT_FQCFAIL bit set then this register will be
+        // zeroed out. The exact process is performed for reads that fail QC,
+        // the LU registers, with the difference that mask-selection is based on
+        // the one vector (00...1).
 
 #define W(j) __m128i data##j = _mm_loadu_si128(data + i + j);
 #define O1(j) data##j = data##j | _mm_slli_epi16(data##j & _mm_cmpeq_epi16((data##j & _mm_set1_epi16(FLAGSTAT_FPROPER_PAIR + FLAGSTAT_FUNMAP)), _mm_set1_epi16(FLAGSTAT_FPROPER_PAIR)) & one, 12); 
@@ -398,7 +424,7 @@ int FLAGSTAT_avx2(const uint16_t* array, uint32_t len, uint32_t* flags) {
     const uint32_t start_qc = flags[FLAGSTAT_FQCFAIL_OFF + 16];
     
     for (uint32_t i = len - (len % (16 * 16)); i < len; ++i) {
-        FLAGSTAT_samtools_single_update(array[i], flags);
+        FLAGSTAT_scalar_update(array[i], flags);
     }
 
     const __m256i* data = (const __m256i*)array;
@@ -445,24 +471,51 @@ int FLAGSTAT_avx2(const uint16_t* array, uint32_t len, uint32_t* flags) {
             thislimit = i + (1 << 16) - 1;
 
         ///////////////////////////////////////////////////////////////////////
-        // We load a register of data (data + i + j) and then using a the
-        // resulting mask from a VPCMPEQW instruction comparing equality with
-        // the mask mask1 (FLAGSTAT_FSECONDARY + FLAGSTAT_FSUPPLEMENTARY). The resulting
-        // data is either the original data or empty as DATA & (00...0) is a
-        // zero register and DATA & (11...1) is the data itself. The resulting
-        // data is combined (bitwise or) with the mask mask2 as this information
-        // is required.
+        // We load a register of data (data + i + j) and then construct the
+        // conditional bits: 
+        // 12: FLAGSTAT_FPROPER_PAIR + FLAGSTAT_FUNMAP == FLAGSTAT_FPROPER_PAIR
+        // 13: FLAGSTAT_FMUNMAP + FLAGSTAT_FUNMAP == FLAGSTAT_FMUNMAP
+        // 14: FLAGSTAT_FMUNMAP + FLAGSTAT_FUNMAP == 0
+        //
+        // These construction of these bits can be described for data x as:
+        // x |= (x & LEFT_MASK == RIGHT_MASK) & 1 << TARGET_BIT
+        // with the assumption that predicate evaluatons result in the selection
+        // masks (00...0) or (11...1) for FALSE and TRUE, respectively. These
+        // construction macros are named O1, O2, and O3.
+        //
+        // The original SAMtools method is also heavily branched with three
+        // main branch points:
+        // If FLAGSTAT_FSECONDARY then count FLAGSTAT_FSECONDARY
+        // If FLAGSTAT_FSUPPLEMENTARY then count FLAGSTAT_FSUPPLEMENTARY
+        // Else then count FLAGSTAT_FREAD1, 
+        //                 FLAGSTAT_FREAD2,
+        //                 Special bit 12, 13, and 14
+        // Always count FLAGSTAT_FUNMAP, 
+        //              FLAGSTAT_FDUP, 
+        //              FLAGSTAT_FQCFAIL
+        //
+        // These bits can be selected using a mask-select propagate-carry approach:
+        // x &= x & ((x == MASK) | CARRY_BITS)
+        // with the arguments for MASK and CARRY_BITS as follows:
+        //    1. {FLAGSTAT_FSECONDARY, 
+        //        FLAGSTAT_FQCFAIL + FLAGSTAT_FSECONDARY + FLAGSTAT_FUNMAP + FLAGSTAT_FDUP}
+        //    2. {FLAGSTAT_FSUPPLEMENTARY, 
+        //        FLAGSTAT_FQCFAIL + FLAGSTAT_FSUPPLEMENTARY + FLAGSTAT_FSECONDARY 
+        //        + FLAGSTAT_FUNMAP + FLAGSTAT_FDUP}
+        //    3. {FLAGSTAT_FPAIRED, 
+        //        FLAGSTAT_FQCFAIL + FLAGSTAT_FSUPPLEMENTARY + FLAGSTAT_FSECONDARY 
+        //        + FLAGSTAT_FUNMAP + FLAGSTAT_FDUP}
         //
         // FLAGSTATS outputs summary statistics separately for reads that pass
         // QC and those that do not. Therefore we need to partition the data
         // into these two classes. For data that pass QC, the L registers, we
-        // first bit-select the target FLAGSTAT_FQCFAIL bit using the mask mask3. The
-        // resulting data is used to perform another mask-select using VPCMPEQW
-        // against the empty vector (00...0). As above, if the data has the
-        // FLAGSTAT_FQCFAIL bit set then this register will be zeroed out. The exact
-        // process is performed for reads that fail QC, the LU registers, with
-        // the difference that mask-selection is based on the one vector
-        // (00...1).
+        // first bit-select the target FLAGSTAT_FQCFAIL bit using the mask
+        // mask3. The resulting data is used to perform another mask-select
+        // using VPCMPEQW against the empty vector (00...0). As above, if the
+        // data has the FLAGSTAT_FQCFAIL bit set then this register will be
+        // zeroed out. The exact process is performed for reads that fail QC,
+        // the LU registers, with the difference that mask-selection is based on
+        // the one vector (00...1).
 
 #define W(j) __m256i data##j = _mm256_loadu_si256(data + i + j);
 #define O1(j) data##j = data##j | _mm256_slli_epi16(data##j & _mm256_cmpeq_epi16((data##j & _mm256_set1_epi16(FLAGSTAT_FPROPER_PAIR + FLAGSTAT_FUNMAP)), _mm256_set1_epi16(FLAGSTAT_FPROPER_PAIR)) & one, 12); 
@@ -626,7 +679,7 @@ STORM_TARGET("avx512bw")
 static
 int FLAGSTAT_avx512(const uint16_t* array, size_t len, uint32_t* out) {
     for (uint32_t i = len - (len % (32 * 16)); i < len; ++i) {
-        FLAGSTAT_samtools_single_update(array[i], out);
+        FLAGSTAT_scalar_update(array[i], out);
     }
 
     const __m512i* data = (const __m512i*)array;
@@ -669,21 +722,51 @@ int FLAGSTAT_avx512(const uint16_t* array, size_t len, uint32_t* out) {
             thislimit = i + (1 << 16) - 1;
 
         ///////////////////////////////////////////////////////////////////////
-        // We load a register of data (data + i + j) and then using a the resulting mask from
-        // a VPCMPEQW instruction comparing equality with the mask mask1 
-        // (FLAGSTAT_FSECONDARY + FLAGSTAT_FSUPPLEMENTARY). The resulting data is either the original data
-        // or empty as DATA & (00...0) is a zero register and DATA & (11...1) is the data itself. 
-        // The resulting data is combined (bitwise or) with the mask mask2 as this information
-        // is required.
+        // We load a register of data (data + i + j) and then construct the
+        // conditional bits: 
+        // 12: FLAGSTAT_FPROPER_PAIR + FLAGSTAT_FUNMAP == FLAGSTAT_FPROPER_PAIR
+        // 13: FLAGSTAT_FMUNMAP + FLAGSTAT_FUNMAP == FLAGSTAT_FMUNMAP
+        // 14: FLAGSTAT_FMUNMAP + FLAGSTAT_FUNMAP == 0
         //
-        // FLAGSTATS outputs summary statistics separately for reads that pass QC and those
-        // that do not. Therefore we need to partition the data into these two classes.
-        // For data that pass QC, the L registers, we first bit-select the target FLAGSTAT_FQCFAIL bit
-        // using the mask mask3. The resulting data is used to perform another mask-select using VPCMPEQW 
-        // against the empty vector (00...0). As above, if the data has the FLAGSTAT_FQCFAIL bit set then
-        // this register will be zeroed out.
-        // The exact process is performed for reads that fail QC, the LU registers, with the difference
-        // that mask-selection is based on the one vector (11...1).
+        // These construction of these bits can be described for data x as:
+        // x |= (x & LEFT_MASK == RIGHT_MASK) & 1 << TARGET_BIT
+        // with the assumption that predicate evaluatons result in the selection
+        // masks (00...0) or (11...1) for FALSE and TRUE, respectively. These
+        // construction macros are named O1, O2, and O3.
+        //
+        // The original SAMtools method is also heavily branched with three
+        // main branch points:
+        // If FLAGSTAT_FSECONDARY then count FLAGSTAT_FSECONDARY
+        // If FLAGSTAT_FSUPPLEMENTARY then count FLAGSTAT_FSUPPLEMENTARY
+        // Else then count FLAGSTAT_FREAD1, 
+        //                 FLAGSTAT_FREAD2,
+        //                 Special bit 12, 13, and 14
+        // Always count FLAGSTAT_FUNMAP, 
+        //              FLAGSTAT_FDUP, 
+        //              FLAGSTAT_FQCFAIL
+        //
+        // These bits can be selected using a mask-select propagate-carry approach:
+        // x &= x & ((x == MASK) | CARRY_BITS)
+        // with the arguments for MASK and CARRY_BITS as follows:
+        //    1. {FLAGSTAT_FSECONDARY, 
+        //        FLAGSTAT_FQCFAIL + FLAGSTAT_FSECONDARY + FLAGSTAT_FUNMAP + FLAGSTAT_FDUP}
+        //    2. {FLAGSTAT_FSUPPLEMENTARY, 
+        //        FLAGSTAT_FQCFAIL + FLAGSTAT_FSUPPLEMENTARY + FLAGSTAT_FSECONDARY 
+        //        + FLAGSTAT_FUNMAP + FLAGSTAT_FDUP}
+        //    3. {FLAGSTAT_FPAIRED, 
+        //        FLAGSTAT_FQCFAIL + FLAGSTAT_FSUPPLEMENTARY + FLAGSTAT_FSECONDARY 
+        //        + FLAGSTAT_FUNMAP + FLAGSTAT_FDUP}
+        //
+        // FLAGSTATS outputs summary statistics separately for reads that pass
+        // QC and those that do not. Therefore we need to partition the data
+        // into these two classes. For data that pass QC, the L registers, we
+        // first bit-select the target FLAGSTAT_FQCFAIL bit using the mask
+        // mask3. The resulting data is used to perform another mask-select
+        // using VPCMPEQW against the empty vector (00...0). As above, if the
+        // data has the FLAGSTAT_FQCFAIL bit set then this register will be
+        // zeroed out. The exact process is performed for reads that fail QC,
+        // the LU registers, with the difference that mask-selection is based on
+        // the one vector (00...1).
 
 #define LOAD(j) __m512i data##j = _mm512_loadu_si512(data + i + j) & (_mm512_cmpeq_epi16_mask( _mm512_loadu_si512(data + i + j) & mask1, zero ) | mask2);
 #define L(j)  data##j & _mm512_cmpeq_epi16_mask( data##j & mask3, zero )
diff --git a/positional-popcount b/positional-popcount
deleted file mode 160000
index 1921b0c..0000000
--- a/positional-popcount
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 1921b0cdbd44b4b1ac1611ed6dda452025a098d6
diff --git a/utility b/utility
index 4a284e33a4c0935d7ebf0cdc980f50d762788b3e..b3a7644f368220a56f856d1807967e5328a62f4e 100755
GIT binary patch
delta 3526
zcmZ`*4RBP|6~1qO$ZoRxc3*yYS*OXq4S^(9vRMKoAaSxG=vz!Rl&BH;2`!|6#ECW;
zQxyXS9M)xR@Ho}6=?u1=(RR?Hp|#Ks6C|a@;uv72(;pdw6YZ|hHqKB?Ds=nZyYJFW
zI=wURp7(wC+;i?b=iYPQ<o1yrgT>mBJSJLWR<MzSRj)<A@Mn>fj2VJZ(9gkBQz9Uw
zAzq|R*({zkMb}JA%e1siTBd9YT~)I>vt?VBW^Hs#xi|FxJd9Z$#;D5n(2_fjzBq7u
ztm{qhw0D0=>xJ<53KxHFhn+$^a;;dcO<@kfaTi5&%vPa%N|$UpjoTblXfGU|vehff
zKPlqqw{@t)Z#$eeXV=!QZ*Qfa&*@Wav@mC-l0%Q?1S3AU=dro)vxl8L6r~p4JoOUZ
zoUF>iJykEb7_+l_I++t(*KD!9w`_MaW@GmN^CrUvRnd-kva;R^nB;fM9L?gpJ+$1_
zq<CqUOQ*vwk2e)Ggaw`>jotwJA=s%IYyn+#=>f}+&^3(g9DEO(OJ29`<<XO^n<8NP
zEDJfH?bP7bX{X!c-9LkMQWw}EurJSGbLa<P&w_n@2J5Ex!CnXZ%NcAQ-2i)AuBZQZ
zGgzNo2&6f1#bqX(T}1{n_l@+}GGpezsO!{Cjm(nw3JsQ-e-^zj*oZfdxAF1xcm{3$
zyBzb4kscHHILAC0=@Eh7=D45lp!5+z{FD<8jt>d^Lymc*(t`rO!ZD9_dbhwYaLl8W
z?iTntj(Hl>9RfeYG4=pUw+j3O$GIFg3j7#w|2CfPbd4Z(fan|q7A<`@7C+co?F;S|
z*i1H@pNnW9Fe$!3Gtsaa6U<~0pJyhQ92KLZu5>;|I|n0wi^b1vG~*we0~3?2Z7oUX
zE%;b7>HQIc0AoYkXGPH*Oup(1HVZBSaG#lUR&k5Vrp)*!Jpp9l&jsZ1R=7Ve0GnL$
zSplu^25qkv&<5}PTQ4C)tT${9G~A0}%*GrK*7gKY>ivIbGAR>&PCJhk(P3|SfAoRq
z{m1#ah~5`%JCVs?#@bbpYi4rcH{i1++3+O3+T3%c9x5%#o#jR>KC#72IObO(2xfe5
zc_juSBQ5cNqz>N5WMawYa=-c8=5ld-P6I&6QGWSh88zRDwJY(lPHsH@X{z6g6`2XY
znfQ9@TC?A9`t~Kb;lzsa<2*HyYpGwstnZz%&Vi@ej7)ECXLG!#&4>=1R($V_@UZ0@
z>^R>>v1HFGmf8hvGqDO58dGLs$T`Wa?fXm5uH8@8v)*US_(!SbLd7oZ_d~CyW!aQ(
z?{-Ax1a}FQ%$V=QSo#4lz93J{GWbZ1Ugwrql*a}}{VmJ>+T%=}2NM}F69+^PwwSdZ
z5tZILbI&M`*Fzhk>!TZ@8>5>}rT#Euwi!Rg4HW%m>hXE0=lK9ubqz9g5FOadfB)|`
z>l~ypZ)xaFRqTX?zkpmDjo*mHueDTtB2MMW8}`%(Uy@IK@N2DDKi}a)UuH5xkUL>-
z6!Iy^ElA`6$UewlL+%8B7NPJ1@5HBm$n_v!1ir#Cy`j#tnb+yOTJ4<rS0*z?PT#!1
zA($4)g*tD%BCp4@pQ64}B|sg%!v3Q$&&@b4V4z9$f7jNe7C&#lOEr>?CbjZ_^G>zq
z**Q_Qejuk^o!srRS1Y+{eN?T9s+CQu0cA{TQe9CTMs9xsj~{;d;peRw&lDaHxN1(^
zt@<~r*;V1-<gh|GAHP8#`<g=Q@#?}Rv&uf#W|@qV_w*IzGaK7ycMh-d-|oqm<zJS2
zl=syocZ9n6jzIHf1)H!g(13+&=qDvXrIF5*gq2B}#``)7l!mo-tBgvo20d6B%x~qn
zLE}Y=w_+DPS9*`qM}H~}DywO_^jnHan}a&N9IOw`wJx3omRxGdJ1u!Xy&5W@4*zZB
zGX4nXmkb>?v`~@V#rh!iK=I5xWj*3r%w??<4e3fZJqle98-&!A4p=L)tE?MRS0?Bj
zke^LL>dIA)YV(;n^Fp|3uYjd^j%GDG?)(~xl<5dgq^!t3m*LBR+UYxGk83i@B6*lu
zMan!~si$*gdjCpb8&lc1^_|l2DvLK}UZCLGExvme{|I=61z3BTfw$~G375OXdl752
zE_BZE_G3Kodn=P$L9pnK;8FPJ%$gvU>;6ED=CcW!o>$R-QB2a9{J|0s-K5Y$5N6q!
z77B&`vzT1$TK+3F+h+-Qh3QQ0FEJfmEqFgG&S%AjAvqyO=&f=+upGl06eg|h>#%fU
z{7)>f6{Goxc()aCdCYc@X4jRcNt>?+63_|RikaWCFSpifTqnY>?R@6Dc|7D2-o>mk
zSnja=LQS-b!txiq4~*iBaPRVG_^Xyq9z_=D0e(m3vpgmbqBY1WNLipOwOmKpA~fT$
zqaok&pNiKt-VX6m8d)%BCCY}i`s@f}U%?CaJ!0)0UbJ{~Li%B=!`eGpma*Rpe;?6F
zc)cvV`o;PXJL?O#TB-)^snC@uh0t%{?~z_hbKKIDrC}d<zV615A?6laYxusfpwlH-
zSW0i9UnTlYU}`;^)-CDUy#0|y+gn?|Ib2-XpyYAL#GiA|@Lg5g9K%Nz8j4a$=c>cn
zq?K?f*O3+uYpbm~BxN<N35VT%7TZVha9ysfAt+Vvg_mlwf=OwRRugv1<aAT4rcN8Q
zYNnK9bPzfn*5>M<GtiM+P0Ghei-g^>x=v6mQs<VNb&}$dI!&G^Dck5=B&^AjB;_H}
z7KPoiAf+g_sLn0VPz}X_<grK129oE|BzJAy&8poV%x3X*T2q@*#%aysI(KDdmfK4Q
N7uP9=>CEDa{{XKy|H=RW

delta 3241
zcmZ`*e@xrQ6~FTju!*t1HaK6L6#O-q04=F)2*c6p3=`UZQJmHpWTOdCqcN7Sb;?v|
zw5$?nEYU3L&h%BCDkW>swyv8hjf}sVvB+uCHp(^$_M=N$X$eug*)3_wv>~#Fc<;`4
z5u(~j-+TA@+<Widz5Dge3@;x}S2!~^!b@90kRwwKzlc7n6p)mJXq=!}7vUGK<PfFC
z8&Y036!>~@Qq>t3T5+Kjhn@6E-=Mvqvr)zhG>O3*^l{(+8{v`|;Sw|$4z@fUdA#_F
zru^r#nfLque*Ca$>x&~V8PQJ}elk`<f(A-VoMS_j!yAS=dJPs0>a1c6Q~C)wjVdHe
zhYa(jvzJX~1H4=G4D2o5O$kgDdqWl+lI7#izy79~O<xswVD-n?GLve7M;czR5Mm@D
zSS<D)hzNFc%V8@a2J$FkR%EJBm6uFuLAHq`Nj@#sMc=@Wm%;%{JGH?ni|TO*Cro6$
z8b@q>6!@b>g+EwqrK{^$7p$SpCazVeG)?0ToELG!cB@LupwFs8uhmvME-Z_&&j=}G
zpF=jijy1#Ykex&J^g6Z}ZX&yc?94jW3Y8@)+*e{N&8%ZfVK1_=65E(Qv1HLF%|CZ*
zL_eMxG34j<iHnvCB;VpfDqmZUwZq$nErt)VsB_ozSZYfcW{aJh=h(n7OLi{9@qfz^
zvrUnk;rL$+n;1UL@kb1gu?*zWoVdXVmc`sS$L}%BvXDE%@t+uG%aluSe3@ZfW0H$=
z{3gRC47YK77V%ITF)z`tM!yo>AMM`XrKh$#Hsf?Bmh{+IYS6Ff?{xSpn9|!FevJ(3
zlcw5wM0&Er9}Z<YQyr*+Z3Xl6H=LwC*_PEOPnwsR&hhshKe&O7SSqpyC*PU88B2}$
z?H~809p}1s$CCeuB|ps$U_AOH+J8B_(xGVP(WmvvXZ_0A5~2|jTFJUte=2ei13t?#
z7)!1U;ehsR?k-G~ZXO#M|6Z7k{zOk+%f5P-xiX?D4nElyjO|<<NuJ{<c`ZNhI5(d=
zirySvf(~z&X(YRYTi|e=6Bm`(&VP=Wvr(i%8GSP54-e|iHa^?Y7F>A!U|+O1+86DQ
zK6xR#M;eYXOFemk8N1(FJ(@MJ9t>s}W!8)pk{^G91HQ@9gF7nQ#%&|}FaSN}KEOrG
z&v@@eOYS#o`TS5ibh9(L(61+N=o23-$Gc-G^D*2`daBgOMdTs$k$11Sv>Nhnhqr@u
zMSSRKzn(ICn8%%~dh){&4=&4P7u?xYXI*r$;ac8vL8HxUm~z2hTkZV|m{l?w&?h2a
z!fv!i8=F2E8Ag<S3M0wtIL_PVUzWp3n;#PPD&HdxKFapTP<BO=Yn{oJSi^_>sGMIj
zX6x?2EA}TI)^Q2gc3k=_pMMeM>v!_`Ih1oKEjWcnjK3FUFUmpWzlFp61?96SN047a
zIR_6rY7EP3Fzk5H97jhPIOV9JSvc=dX&SCNsvbF`Xk_bFTd^gLHgUA~<M%qM9zksS
zErNEt@*_jLz2bReyIo6}+U@nvnWOgLiK3`IJW;&IK2D7q>{7PWkaE=i0KZEZ;)5v3
z9=p<Sw-8J&xx?^XctL6Rb)e^ZjGFC5S@L<h>Dce-TL!XdFwatFzy^x*qDNk%^4c61
z*D9TwE`#1mzc0-8CKkbyZt)Cx3N#7BbfuT-aK17?ufQ#AcY(7i;7o|?O2%M;eN|pp
zkezod>{?)DwHUr%^%(7eg(@#?gj-eLpm7LQm%(*v+A<`Den!X>LcUZ2!`@x+nvYGP
z5iHsj9C=Yw(Sg(FHd;szN|mm1R6>FfMHGhbV4sb|QL0pdX+(CSp;T!%EFe-y0;Ng^
zIdYL1CbyQ6Fnh!{6eBK07z0&t=Aml0aWncIK$L**Rv&fBWb!3KOCQi0l?Gv<S{?f$
zVgs?0W${i)#}UB?Wuip~+c8XEE{xP4!UiQC(o?N*3YVOhQLHI(6Z!rLnw-Q2`njLT
z6_*{aaRZ!=crYc;!?nJ{dtGD)+^(q``z!Bs68R4zb+cRxld!?ZEv``hxnS_e0?X&e
zbR$oRO62C@=2$&|yh18mq(ajq$K@+<#;<ysu-l0W39&a#LMO_8Rzmk7a$yEF@q)8+
z$4VcwGE@-D2A{(atySp>_IDHN#|O|w9QSs8!IF?^w5>*;^J2BwJITr%v!d^^+VfM~
z`fObJr}#iYg?z+4-{!rWh&*4SlN2G(qS`o97lgK{bT_XL!2q_$c>5GgZBspKXs+OH
zNQm7az5R;mKzdsdtZZ}9%M5uw6>(h`v434C=ea)(_SLEMAvjZ~hPwIe?IbSpiiW%l
zBJae}+I$+v6_^cHjxj5`!fQ7yV%yK_UEr*5Ha#_P_}jDb`UstUwc&tiHox6R@f`XB
z0p}I5U{ZF$0J4qZlDH>fIuNk-2(|~V23kv21;Tq|OYjR$dA&$!fq`JaDpMSWmxHa&
zxVS~6%)lb*C}MLc;0d)l<yMih2?jy|tGxQU;pI@PRc^KfTty3cBBYFguPNZ15s8!X
zhcM8D@rVlr<7sNO%5xWntBB;W)xp!;YL(|H4o@_<-n&v0-a@)h!*p|=&cO7JR_khA
Qf%_UP?r5b4!P8RrKOKm^bpQYW