Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

Speed up decompression by making the fast path for literals faster.

We do the fast-path step as soon as possible; in fact, as soon as we know the
literal length. Since we usually hit the fast path, we can then skip the checks
for long literals and available input space (beyond what the fast path check
already does).

Note that this changes the decompression Writer API; however, it does not
change the ABI, since writers are always templatized and as such never
cross compilation units. The new API is slightly more general, in that it
doesn't hard-code the value 16. Note that we also take care to check
for len <= 16 first, since the other two checks almost always succeed
(so we don't want to waste time checking for them until we have to).

The improvements are most marked on Nehalem, but are generally positive
on other platforms as well. All microbenchmarks are 64-bit, opt.

Clovertown (Core 2):

  Benchmark     Time(ns)    CPU(ns) Iterations
  --------------------------------------------
  BM_UFlat/0      110226     110224     100000 886.0MB/s  html    [ +1.5%]
  BM_UFlat/1     1036523    1036508      10000 646.0MB/s  urls    [ -0.8%]
  BM_UFlat/2       26775      26775     522570 4.4GB/s  jpg       [ +0.0%]
  BM_UFlat/3       49738      49737     280974 1.8GB/s  pdf       [ +0.3%]
  BM_UFlat/4      446790     446792      31334 874.3MB/s  html4   [ +0.8%]
  BM_UFlat/5       40561      40562     350424 578.5MB/s  cp      [ +1.3%]
  BM_UFlat/6       18722      18722     746903 568.0MB/s  c       [ +1.4%]
  BM_UFlat/7        5373       5373    2608632 660.5MB/s  lsp     [ +8.3%]
  BM_UFlat/8     1615716    1615718       8670 607.8MB/s  xls     [ +2.0%]
  BM_UFlat/9      345278     345281      40481 420.1MB/s  txt1    [ +1.4%]
  BM_UFlat/10     294855     294855      47452 404.9MB/s  txt2    [ +1.6%]
  BM_UFlat/11     914263     914263      15316 445.2MB/s  txt3    [ +1.1%]
  BM_UFlat/12    1222694    1222691      10000 375.8MB/s  txt4    [ +1.4%]
  BM_UFlat/13     584495     584489      23954 837.4MB/s  bin     [ -0.6%]
  BM_UFlat/14      66662      66662     210123 547.1MB/s  sum     [ +1.2%]
  BM_UFlat/15       7368       7368    1881856 547.1MB/s  man     [ +4.0%]
  BM_UFlat/16     110727     110726     100000 1021.4MB/s  pb     [ +2.3%]
  BM_UFlat/17     382138     382141      36616 460.0MB/s  gaviota [ -0.7%]

Westmere (Core i7):

  Benchmark     Time(ns)    CPU(ns) Iterations
  --------------------------------------------
  BM_UFlat/0       78861      78853     177703 1.2GB/s  html      [ +2.1%]
  BM_UFlat/1      739560     739491      18912 905.4MB/s  urls    [ +3.4%]
  BM_UFlat/2        9867       9866    1419014 12.0GB/s  jpg      [ +3.4%]
  BM_UFlat/3       31989      31986     438385 2.7GB/s  pdf       [ +0.2%]
  BM_UFlat/4      319406     319380      43771 1.2GB/s  html4     [ +1.9%]
  BM_UFlat/5       29639      29636     472862 791.7MB/s  cp      [ +5.2%]
  BM_UFlat/6       13478      13477    1000000 789.0MB/s  c       [ +2.3%]
  BM_UFlat/7        4030       4029    3475364 880.7MB/s  lsp     [ +8.7%]
  BM_UFlat/8     1036585    1036492      10000 947.5MB/s  xls     [ +6.9%]
  BM_UFlat/9      242127     242105      57838 599.1MB/s  txt1    [ +3.0%]
  BM_UFlat/10     206499     206480      67595 578.2MB/s  txt2    [ +3.4%]
  BM_UFlat/11     641635     641570      21811 634.4MB/s  txt3    [ +2.4%]
  BM_UFlat/12     848847     848769      16443 541.4MB/s  txt4    [ +3.1%]
  BM_UFlat/13     384968     384938      36366 1.2GB/s  bin       [ +0.3%]
  BM_UFlat/14      47106      47101     297770 774.3MB/s  sum     [ +4.4%]
  BM_UFlat/15       5063       5063    2772202 796.2MB/s  man     [ +7.7%]
  BM_UFlat/16      83663      83656     167697 1.3GB/s  pb        [ +1.8%]
  BM_UFlat/17     260224     260198      53823 675.6MB/s  gaviota [ -0.5%]

Barcelona (Opteron):

  Benchmark     Time(ns)    CPU(ns) Iterations
  --------------------------------------------
  BM_UFlat/0      112490     112457     100000 868.4MB/s  html    [ -0.4%]
  BM_UFlat/1     1066719    1066339      10000 627.9MB/s  urls    [ +1.0%]
  BM_UFlat/2       24679      24672     563802 4.8GB/s  jpg       [ +0.7%]
  BM_UFlat/3       50603      50589     277285 1.7GB/s  pdf       [ +2.6%]
  BM_UFlat/4      452982     452849      30900 862.6MB/s  html4   [ -0.2%]
  BM_UFlat/5       43860      43848     319554 535.1MB/s  cp      [ +1.2%]
  BM_UFlat/6       21419      21413     653573 496.6MB/s  c       [ +1.0%]
  BM_UFlat/7        6646       6645    2105405 534.1MB/s  lsp     [ +0.3%]
  BM_UFlat/8     1828487    1827886       7658 537.3MB/s  xls     [ +2.6%]
  BM_UFlat/9      391824     391714      35708 370.3MB/s  txt1    [ +2.2%]
  BM_UFlat/10     334913     334816      41885 356.6MB/s  txt2    [ +1.7%]
  BM_UFlat/11    1042062    1041674      10000 390.7MB/s  txt3    [ +1.1%]
  BM_UFlat/12    1398902    1398456      10000 328.6MB/s  txt4    [ +1.7%]
  BM_UFlat/13     545706     545530      25669 897.2MB/s  bin     [ -0.4%]
  BM_UFlat/14      71512      71505     196035 510.0MB/s  sum     [ +1.4%]
  BM_UFlat/15       8422       8421    1665036 478.7MB/s  man     [ +2.6%]
  BM_UFlat/16     112053     112048     100000 1009.3MB/s  pb     [ -0.4%]
  BM_UFlat/17     416723     416713      33612 421.8MB/s  gaviota [ -2.0%]

R=sanjay
  • Loading branch information...
commit 62377a4dd62d628e64b018fdfcb142c284c64364 1 parent e11aea4
snappy authored
Showing with 50 additions and 22 deletions.
  1. +50 −22 snappy.cc
View
72 snappy.cc
@@ -435,12 +435,26 @@ char* CompressFragment(const char* input,
// bool CheckLength() const;
//
// // Called repeatedly during decompression
-// bool Append(const char* ip, uint32 length, bool allow_fast_path);
+// bool Append(const char* ip, uint32 length);
// bool AppendFromSelf(uint32 offset, uint32 length);
-// };
//
-// "allow_fast_path" is a parameter that says if there is at least 16
-// readable bytes in "ip". It is currently only used by SnappyArrayWriter.
+// // The difference between TryFastAppend and Append is that TryFastAppend
+// // is allowed to read up to <available> bytes from the input buffer,
+// // whereas Append is allowed to read <length>.
+// //
+// // Also, TryFastAppend is allowed to return false, declining the append,
+// // without it being a fatal error -- just "return false" would be
+// // a perfectly legal implementation of TryFastAppend. The intention
+// // is for TryFastAppend to allow a fast path in the common case of
+// // a small append.
+// //
+// // NOTE(user): TryFastAppend must always return decline (return false)
+// // if <length> is 61 or more, as in this case the literal length is not
+// // decoded fully. In practice, this should not be a big problem,
+// // as it is unlikely that one would implement a fast path accepting
+// // this much data.
+// bool TryFastAppend(const char* ip, uint32 available, uint32 length);
+// };
// -----------------------------------------------------------------------
// Lookup table for decompression code. Generated by ComputeTable() below.
@@ -665,19 +679,23 @@ class SnappyDecompressor {
const unsigned char c = *(reinterpret_cast<const unsigned char*>(ip++));
if ((c & 0x3) == LITERAL) {
- uint32 literal_length = c >> 2;
- if (PREDICT_FALSE(literal_length >= 60)) {
+ uint32 literal_length = (c >> 2) + 1;
+ if (writer->TryFastAppend(ip, ip_limit_ - ip, literal_length)) {
+ DCHECK_LT(literal_length, 61);
+ ip += literal_length;
+ continue;
+ }
+ if (PREDICT_FALSE(literal_length >= 61)) {
// Long literal.
- const uint32 literal_length_length = literal_length - 59;
+ const uint32 literal_length_length = literal_length - 60;
literal_length =
- LittleEndian::Load32(ip) & wordmask[literal_length_length];
+ (LittleEndian::Load32(ip) & wordmask[literal_length_length]) + 1;
ip += literal_length_length;
}
- ++literal_length;
uint32 avail = ip_limit_ - ip;
while (avail < literal_length) {
- if (!writer->Append(ip, avail, false)) return;
+ if (!writer->Append(ip, avail)) return;
literal_length -= avail;
reader_->Skip(peeked_);
size_t n;
@@ -687,8 +705,7 @@ class SnappyDecompressor {
if (avail == 0) return; // Premature end of input
ip_limit_ = ip + avail;
}
- bool allow_fast_path = (avail >= 16);
- if (!writer->Append(ip, literal_length, allow_fast_path)) {
+ if (!writer->Append(ip, literal_length)) {
return;
}
ip += literal_length;
@@ -902,21 +919,29 @@ class SnappyArrayWriter {
return op_ == op_limit_;
}
- inline bool Append(const char* ip, uint32 len, bool allow_fast_path) {
+ inline bool Append(const char* ip, uint32 len) {
char* op = op_;
const int space_left = op_limit_ - op;
- if (allow_fast_path && len <= 16 && space_left >= 16) {
- // Fast path, used for the majority (about 90%) of dynamic invocations.
+ if (space_left < len) {
+ return false;
+ }
+ memcpy(op, ip, len);
+ op_ = op + len;
+ return true;
+ }
+
+ inline bool TryFastAppend(const char* ip, uint32 available, uint32 len) {
+ char* op = op_;
+ const int space_left = op_limit_ - op;
+ if (len <= 16 && available >= 16 && space_left >= 16) {
+ // Fast path, used for the majority (about 95%) of invocations.
UNALIGNED_STORE64(op, UNALIGNED_LOAD64(ip));
UNALIGNED_STORE64(op + 8, UNALIGNED_LOAD64(ip + 8));
+ op_ = op + len;
+ return true;
} else {
- if (space_left < len) {
- return false;
- }
- memcpy(op, ip, len);
+ return false;
}
- op_ = op + len;
- return true;
}
inline bool AppendFromSelf(uint32 offset, uint32 len) {
@@ -985,10 +1010,13 @@ class SnappyDecompressionValidator {
inline bool CheckLength() const {
return expected_ == produced_;
}
- inline bool Append(const char* ip, uint32 len, bool allow_fast_path) {
+ inline bool Append(const char* ip, uint32 len) {
produced_ += len;
return produced_ <= expected_;
}
+ inline bool TryFastAppend(const char* ip, uint32 available, uint32 length) {
+ return false;
+ }
inline bool AppendFromSelf(uint32 offset, uint32 len) {
if (produced_ <= offset - 1u) return false; // -1u catches offset==0
produced_ += len;
Please sign in to comment.
Something went wrong with that request. Please try again.