Skip to content
This repository has been archived by the owner on Sep 9, 2024. It is now read-only.

Commit

Permalink
Initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
pierrec committed Jul 10, 2012
0 parents commit eb73a28
Show file tree
Hide file tree
Showing 25 changed files with 3,164 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
node_modules/
3 changes: 3 additions & 0 deletions .npmignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
.gitignore
node_modules/
data/
77 changes: 77 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
# LZ4

[LZ4](http://fastcompression.blogspot.fr/) is a very fast compression and decompression algorithm. This nodejs module provides a Javascript implementation of it, currently limited to decompression. Direct bindings may be provided in the future.

This is very much a __work in progress__.


## Install

npm install lz4


## Usage

### Decoding

There are 2 ways to decode:

* __asynchronous__ using nodejs Streams - slowest but can handle very large data sets (no memory limitations)
* __synchronous__ by feeding the whole LZ4 data - faster but is limited by the amount of memory

Either way, there are 2 options that the decoder takes:

* `chunkSize` (_Number_): size in bytes of the chunks that were used to compress the data (default=8Mb)
* `incrementSize` (_Number_): number of bytes by which to increment the output buffer if it becomes full and there is still data to decode. Setting it to the right value has a significant impact on performance. If the output size is known, use it as the incrementSize value for maximum performance.


#### Asynchronous decoding

First, create an LZ4 decoding stream with `LZ4#createDecoderStream()`.
The stream can then decode any data piped to it. It will emit a `data` event on each decoded sequence, which can be saved into an output stream.

The following example shows how to decode an LZ4 compressed file `test.lz4` into `test`.


```javascript
var fs = require('fs')
var lz4 = require('lz4')

var decoder = lz4.createDecoderStream()

var input = fs.createReadStream('test.lz4')
var output = fs.createWriteStream('test')

input.pipe(decoder).pipe(output)

```

#### Synchronous decoding

Read the data into memory and feed it to `LZ4#decode()`.

```javascript
var fs = require('fs')
var lz4 = require('lz4')

var input = fs.readFileSync('test.lz4')
var output = lz4.decode(input)

fs.writeFileSync('test', output)

```


## How it works

* [LZ4 stream format](http://fastcompression.blogspot.fr/2011/05/lz4-explained.html)

## Restrictions

Currently, the decoder handles pure LZ4 streams, without additional data. For instance, to compress data you can use `bin/lz4demo32`, which adds a header to the created file. In order to properly decode it with lz4-js, you need to strip it out. You can use `bin/lz4strip` for that task.

LZ4 streams have only been tested using `bin/lz4demo32`, not `bin/lz4demo64`.

## License

MIT
1 change: 1 addition & 0 deletions TODO.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
* shim Buffer.concat for node versions < 0.8.0
1 change: 1 addition & 0 deletions data/empty.lz4
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
!L
1 change: 1 addition & 0 deletions data/package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
[object Uint8Array]
Binary file added data/package.json.lz4
Binary file not shown.
11 changes: 11 additions & 0 deletions doc/format.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
lz4 format

sequence = token(1) + literalslen(i) + literals(token >> 4 + i) + [match copy: offset(2) + length(token >> 4 << 4)]

match copy:
position = current position - offset (0 is invalid)
length = 4 + length

last 5 bytes = literals
last match starts 12 bytes before end of stream
last sequence is incomplete and stops after the literals
25 changes: 25 additions & 0 deletions examples/file_uncompress.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
/**
 * Uncompress an LZ4 stream
 *
 * Usage: node file_uncompress.js [input] [output]
 */
// Modules
var path = require('path')
var fs = require('fs')
var lz4 = require('..')

// Input defaults to test.lz4; output defaults to the input file name
// with the lz4 extension stripped off
var inputFile = process.argv[2] || 'test.lz4'
var outputFile = process.argv[3] || path.basename(inputFile, lz4.extension)

// Streaming decoder: handles arbitrarily large inputs
var decoder = lz4.createDecoderStream()
// var decoder = lz4.createDecoderStream({ incrementSize: (128 << 20), chunkSize: (128 << 20) })

var input = fs.createReadStream(inputFile)
var output = fs.createWriteStream(outputFile)

console.log('Uncompressing', inputFile, 'to', outputFile, '...')

// Time the run; the timer stops once the decoder has emitted all its data
console.time('lz4')
decoder.on('end', function () {
  console.timeEnd('lz4')
})

// Wire the pipeline: input file -> LZ4 decoder -> output file
input.pipe(decoder).pipe(output)
26 changes: 26 additions & 0 deletions examples/file_uncompressSync.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
/**
 * Uncompress a Buffer containing LZ4 compressed data
 *
 * Usage: node file_uncompressSync.js [input] [output]
 */
// Modules
var path = require('path')
var fs = require('fs')
var lz4 = require('..')

// Input defaults to test.lz4; output defaults to the input file name
// with the lz4 extension stripped off
var inputFile = process.argv[2] || 'test.lz4'
var outputFile = process.argv[3] || path.basename(inputFile, lz4.extension)

// Read the whole compressed file into memory
var input = fs.readFileSync(inputFile)

// If the final uncompressed size is known, set the incrementSize with it
// for faster decoding (no time spent resizing the output buffer)
var incrementSize = (128 << 20) // 128Mb

console.log('Uncompressing', inputFile, 'to', outputFile, '...')

// Decode synchronously, timing the call
console.time('lz4')
var decoded = lz4.decode(input, incrementSize)
console.timeEnd('lz4')

// Save the uncompressed data
fs.writeFileSync(outputFile, decoded)
161 changes: 161 additions & 0 deletions lib/decoder.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
/**
Sequence definition: name (bytes length)
token (1)
literals length (0-n)
literals (0-l)
offset (2)
match copy length (0-c)

Chunk definition:
size (4) = n [unsigned 32 bits little endian integer]
sequences (n)

lz4demo32 and lz4demo64 output:
magic number (4) [unsigned 32 bits little endian integer]
chunks (n)
*/

;(function (exports) {

if (!Buffer) {
var Buffer = Uint8Array

if (!Buffer.prototype.concat)
Buffer.prototype.concat = function (list, size) {
if (arguments.length < 1)
for (var i = 0, n = list.length; i < n; i++)
size += list[i].length

var res = new Buffer(size)
var pos = 0

for (i = 0; i < n; i++) {
var item = list[i]
for (var j = 0, m = item.length; j < m; j++)
res[pos++] = item[j]
}

return res
}
}

/**
 * Decode a single encoded chunk. Assumptions: input contains all sequences
 * of a chunk, output is large enough to receive the decoded data.
 * If the output buffer is too small, an error will be thrown.
 * If the returned value is negative, an error occured at the returned offset.
 *
 * @param input {Buffer} compressed sequences
 * @param output {Buffer} buffer receiving the decoded bytes
 * @return {Number} number of decoded bytes (negated error offset on failure)
 * @private
 */
function LZ4_uncompressChunk (input, output) {
  var srcLen = input.length
  var src = 0  // read cursor in input
  var dst = 0  // write cursor in output
  var ext

  // Process each sequence in the incoming data
  while (src < srcLen) {
    var token = input[src++]

    // Literals length: high nibble of the token; a nibble of 15 is
    // extended by following bytes, each 255-valued byte adding more
    var litLen = token >> 4
    if (litLen === 15) {
      do {
        ext = input[src++]
        litLen += ext
      } while (ext === 255)
    }

    // Copy the literals straight to the output
    var litEnd = src + litLen
    while (src < litEnd) output[dst++] = input[src++]

    // A chunk legitimately ends right after its last literals
    if (src === srcLen) return dst

    // Match copy: 2-byte little endian offset back into the output
    var offset = input[src++]
    offset |= input[src++] << 8

    // 0 is an invalid offset: report the position of its first byte, negated
    if (offset === 0) return -(src - 2)

    // Match length: low nibble of the token, extended the same way as the
    // literals length, plus the implicit minimum match of 4
    var matchLen = token & 0xf
    if (matchLen === 15) {
      do {
        ext = input[src++]
        matchLen += ext
      } while (ext === 255)
    }
    matchLen += 4

    // Byte-by-byte copy on purpose: source and destination may overlap
    // (offset < matchLen), which repeats the bytes just written
    var from = dst - offset
    var matchEnd = dst + matchLen
    while (dst < matchEnd) output[dst++] = output[from++]
  }

  return dst
}

// Abort decoding by throwing, reporting the input offset of the bad data.
function decodeError (offset) {
  var message = 'Invalid data at ' + offset
  throw new Error(message)
}

/**
 * Decode an encoded data set in the lz4demo32/lz4demo64 layout: a 4-byte
 * magic number followed by chunks, each chunk being a 4-byte little endian
 * compressed size followed by that many bytes of LZ4 sequences.
 * If the output size is known beforehand, set it to increase performance.
 *
 * @param input {Buffer} input data, starting with ARCHIVE_MAGICNUMBER
 * @param chunkSize {Number} size of the chunk (default=8Mb) (optional)
 * @param outputSize {Number} size of the output (optional)
 * @return {Buffer} decoded data
 * @throws {Error} via decodeError() on a bad magic number or corrupt chunk
 * @public
 */
function LZ4_uncompress (input, chunkSize, outputSize) {
  chunkSize = chunkSize || (8 << 20) // default chunk size: 8Mb

  // Magic number check
  // (readUInt32LE's second argument skips node's bounds assertion)
  if (input.length < 4
    || input.readUInt32LE(0, true) !== exports.ARCHIVE_MAGICNUMBER )
    decodeError(0)

  // Output size is known, allocate all of it in one call
  if (outputSize) {
    var output = new Buffer(outputSize)

    // Current index in the output buffer
    var pos = 0

    for (var i = 4, n = input.length; i < n;) {
      // Each chunk is prefixed with its compressed size (4 bytes LE)
      var size = input.readUInt32LE(i, true)
      i += 4
      // NOTE(review): this relies on Buffer#slice returning a view that
      // shares memory with `output`, so the chunk decodes in place.
      // Uint8Array#slice copies instead — verify the non-node/shim path.
      var decodedSize = LZ4_uncompressChunk( input.slice(i, i + size), output.slice(pos, pos + chunkSize) )
      // A negative result is an error offset, relative to this chunk's start
      if (decodedSize < 0) decodeError(-decodedSize)
      i += size
      pos += decodedSize
    }

    return output
  }

  // Unknown output size, allocate on each pass
  var output = []
  for (var i = 4, n = input.length; i < n;) {
    var size = input.readUInt32LE(i, true)
    i += 4
    // A chunk never decodes to more than chunkSize bytes
    var buf = new Buffer(chunkSize)
    var decodedSize = LZ4_uncompressChunk( input.slice(i, i + size), buf )
    if (decodedSize < 0) decodeError(-decodedSize)
    // Trim the buffer to the decoded length before collecting it
    output.push( decodedSize < chunkSize ? buf.slice(0, decodedSize) : buf )
    i += size
  }

  // Stitch all decoded chunks into a single Buffer
  return Buffer.concat(output)
}

exports.LZ4_uncompressChunk = LZ4_uncompressChunk
exports.LZ4_uncompress = LZ4_uncompress
exports.ARCHIVE_MAGICNUMBER = 0x184C2102

})( (module && module.exports) || this )
Loading

0 comments on commit eb73a28

Please sign in to comment.