Skip to content

Commit 8c2a627

Browse files
committed
complete lz77 compress
1 parent 6ee7833 commit 8c2a627

File tree

5 files changed

+147
-62
lines changed

5 files changed

+147
-62
lines changed

readme.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,12 @@ Reference to: [MersenneTwister.pdf](./docs/MersenneTwister.pdf)
3838

3939
### AhoCorasickSensitiveWordFilter
4040

41+
### LZ77Compressor
42+
Reference to: [LZ77.pdf](./docs/LZ77.pdf)
43+
44+
### GolombEncoder
45+
Reference to: [HoffmanAndGolombCoding.pdf](./docs/HoffmanAndGolombCoding.pdf)
46+
4147
## Installation
4248
```bash
4349
<dependency>

src/main/java/com/github/myibu/algorithm/compress/LZ77Compressor.java

Lines changed: 74 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,8 @@
33
import com.github.myibu.algorithm.data.Bits;
44
import com.github.myibu.algorithm.endode.GolombEncoder;
55

6-
import java.util.ArrayList;
7-
import java.util.Arrays;
8-
import java.util.List;
6+
import java.util.*;
7+
import java.util.stream.Collectors;
98

109
/**
1110
* LZ77 compression algorithm
@@ -60,19 +59,18 @@ public int compress(byte[] in_data, int in_len, byte[] out_data) {
6059
byte[] lWindow = new byte[l];
6160
int sp = 0, lp = l, ip = 0, op = 0;
6261
while (lWindow.length > 0 && ip < in_len) {
63-
// abracadabrad
6462
// update search buffer
6563
int sStart = 0, sEnd = sp < s ? sp : s;
6664
for (int i = sStart; i < sEnd; i++) {
67-
System.out.println("ip=" + ip + ", i=" + i + ", sEnd=" + sEnd + ", sp=" + sp);
6865
sBuf[i] = in_data[ip - i - 1];
6966
}
7067
// update look ahead window
7168
int lStart = 0, lEnd = ip + l < in_len ? l : in_len - ip;
7269
for (int i = lStart; i < lEnd; i++) {
7370
lWindow[i] = in_data[ip + i];
7471
}
75-
System.out.println("all=abracadabrad, sBuf=" + new StringBuilder(new String(sBuf)).reverse().toString() + ", lWindow=" + new String(lWindow));
72+
//System.out.println("txt=" + new String() + new String(in_data) + ", sBuf="
73+
// + new StringBuilder(new String(sBuf)).reverse().toString() + ", lWindow=" + new String(lWindow));
7674

7775
int llStart = sEnd - 1, rrStart = 0, llEnd = 0, rrEnd = (lp = lEnd);
7876
int minMatched = 1, minIndex = 0;
@@ -87,64 +85,41 @@ public int compress(byte[] in_data, int in_len, byte[] out_data) {
8785
}
8886
llStart--;
8987
}
90-
System.out.println("minIndex=" + minIndex + ", all=abracadabrad, sBuf=" + new StringBuilder(new String(sBuf)).reverse().toString() + ", lWindow=" + new String(lWindow));
9188
// matched
9289
if (minIndex > 0) {
93-
// byte[] tuple = String.format("(%d,%d,%s)", minIndex + 1, minMatched, new String(new byte[]{lWindow[minMatched]})).getBytes();
94-
// System.arraycopy(tuple, 0, out_data, (op++) * tuple.length, tuple.length);
95-
System.out.println(String.format("(%d, %d, %s)", minIndex + 1, minMatched, new String(new byte[]{lWindow[minMatched]})));
9690
tuples.add(Arrays.asList( minIndex + 1, minMatched, (int)lWindow[minMatched]));
9791
sp += (minMatched + 1);
98-
// if (sp > s) {
99-
// sp = s-1;
100-
// }
10192
ip += (minMatched + 1);
10293
} else {
10394
sp++;
104-
// if (sp > s) {
105-
// sp = s-1;
106-
// }
10795
ip++;
108-
// byte[] tuple = String.format("(%d,%d,%s)", 0, 0, new String(new byte[]{lWindow[0]})).getBytes();
109-
// System.arraycopy(tuple, 0, out_data, (op++) * tuple.length, tuple.length);
110-
System.out.println(String.format("(%d, %d, %s)", 0, 0, new String(new byte[]{lWindow[0]})));
11196
tuples.add(Arrays.asList(0, 0, (int)lWindow[0]));
11297
}
11398
}
114-
System.out.println(tuples);
115-
int sum = 0;
99+
// System.out.println(tuples);
100+
return doEncode(tuples, out_data);
101+
}
102+
103+
private int doEncode(List<List<Integer>> tuples, byte[] out_data) {
104+
Bits finalRes = new Bits();
116105
GolombEncoder encoder = new GolombEncoder();
117106
for (List<Integer> tuple: tuples) {
118107
Bits bits = new Bits();
119-
bits.append(encoder.encodeToBinary(tuple.get(0), (int)(Math.ceil(Math.log(s) / Math.log(2)))));
120-
System.out.println("1" + bits);
121-
bits.append(encoder.encode(tuple.get(1), 5));
122-
System.out.println("2" + bits);
123-
bits.append(Bits.ofByte((byte)tuple.get(2).intValue()));
124-
System.out.println("3" + bits);
125-
sum += bits.length();
108+
Bits bits1 = encoder.encodeToBinary(tuple.get(0), (int)(Math.ceil(Math.log(s) / Math.log(2))));
109+
bits.append(bits1);
110+
Bits bits2 = encoder.encode(tuple.get(1), l);
111+
bits.append(bits2);
112+
Bits bits3 = Bits.ofByte((byte)tuple.get(2).intValue());
113+
bits.append(bits3);
114+
// System.out.println("("+ bits1 + ", "+ bits2 + ", "+ bits3 + ")");
115+
finalRes.append(bits);
126116
}
127-
System.out.println("compressed length: " + sum);
128-
return 0;
117+
byte[] fr = finalRes.toByteArray();
118+
System.arraycopy(fr, 0, out_data, 0, fr.length);
119+
// System.out.println("bits: " + finalRes);
120+
return fr.length;
129121
}
130122

131-
// private int indexOf(int llStart, int rrStart, int llEnd, int rrEnd, byte[] sBuf, byte[] lWindow) {
132-
// int minMatched = 1, minIndex = 0;
133-
// while (llStart >= 0) {
134-
// int matched = 0, left = llStart, right = rrStart;
135-
// while (left >= 0 && right < rrEnd && sBuf[left--] == lWindow[right++]) {
136-
// matched++;
137-
// }
138-
// if (matched >= minMatched) {
139-
// minIndex = llStart;
140-
// minMatched = matched;
141-
// }
142-
// llStart--;
143-
// }
144-
// System.out.println("minIndex=" + minIndex + ", all=abracadabrad, sBuf=" + new StringBuilder(new String(sBuf)).reverse().toString() + ", lWindow=" + new String(lWindow));
145-
// return minIndex;
146-
// }
147-
148123
/**
149124
* for each token (offset, length, symbol)
150125
* if offset = 0 then
@@ -162,6 +137,57 @@ public int compress(byte[] in_data, int in_len, byte[] out_data) {
162137
*/
163138
@Override
164139
public int decompress(byte[] in_data, int in_len, byte[] out_data) {
165-
return 0;
140+
int e1 = (int)(Math.ceil(Math.log(s) / Math.log(2)));
141+
GolombEncoder encoder = new GolombEncoder();
142+
Set<Bits> allEncodeSeq = new HashSet<>();
143+
for (int i = 0; i <= l; i++) {
144+
allEncodeSeq.add(encoder.encode(i, l));
145+
}
146+
List<Bits> sortedEncodeSeq = allEncodeSeq.stream().sorted(Comparator.comparingInt(Bits::length)).collect(Collectors.toList());
147+
Bits bits = Bits.ofByte(in_data);
148+
int ip = 0;
149+
List<List<Integer>> tuples = new ArrayList<>();
150+
while (ip < bits.length()) {
151+
Bits b1 = bits.subBits(ip, ip + e1);
152+
ip = ip + e1;
153+
int offset = encoder.encodeToBinary(b1);
154+
int length = -1;
155+
for (Bits sortedEncode: sortedEncodeSeq) {
156+
if (ip + sortedEncode.length() < bits.length()) {
157+
if (sortedEncode.equals(bits.subBits(ip, ip+sortedEncode.length()))) {
158+
length = encoder.decode(sortedEncode, l);
159+
ip += sortedEncode.length();
160+
break;
161+
}
162+
}
163+
}
164+
if (length == -1 || ip+8 > bits.length()) {
165+
break;
166+
}
167+
int symbol = (int)bits.subBits(ip, ip+8).toByte();
168+
tuples.add(Arrays.asList(offset, length, symbol));
169+
ip += 8;
170+
}
171+
// System.out.println(tuples);
172+
return doDecode(tuples, out_data);
173+
}
174+
175+
private int doDecode(List<List<Integer>> tuples, byte[] out_data) {
176+
Bits seq = new Bits();
177+
for (List<Integer> tuple: tuples) {
178+
int offset = tuple.get(0), length = tuple.get(1), symbol = tuple.get(2);
179+
Bits sb = Bits.ofByte((byte) symbol);
180+
if (offset == 0) {
181+
seq.append(sb);
182+
} else {
183+
int start = seq.byteLength() < s ? seq.byteLength() - offset: s - offset;
184+
seq.append(seq.subBits(start * 8, (start + length) * 8)).append(sb);
185+
}
186+
}
187+
int len = seq.byteLength();
188+
for (int i = 0; i < len; i++) {
189+
out_data[i] = seq.getByte(i).toByte();
190+
}
191+
return len;
166192
}
167193
}

src/main/java/com/github/myibu/algorithm/data/Bits.java

Lines changed: 55 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
package com.github.myibu.algorithm.data;
22

3+
import java.util.Arrays;
34
import java.util.Iterator;
45
import java.util.NoSuchElementException;
56

@@ -8,7 +9,7 @@
89
* @author myibu
910
* Created on 2021/9/14
1011
*/
11-
public class Bits implements Iterable<Bit> {
12+
public class Bits implements Iterable<Bit>, Cloneable {
1213
public static final int BYTE_SIZE = 8;
1314
public static final int SHORT_SIZE = 16;
1415
public static final int INT_SIZE = 32;
@@ -139,10 +140,18 @@ public Bits rrShift(Bits offset) {
139140
}
140141

141142
public byte[] toByteArray() {
142-
int len = byteLength();
143+
Bits bits;
144+
if (this.used % BYTE_SIZE != 0) {
145+
Bits completed = this.clone();
146+
completed.append(Bits.ofZero(BYTE_SIZE - this.used % BYTE_SIZE));
147+
bits = completed;
148+
} else {
149+
bits = this;
150+
}
151+
int len = bits.byteLength();
143152
byte[] data = new byte[len];
144153
for (int i = 0; i < len; i++) {
145-
data[i] = getByte(i).toByte();
154+
data[i] = bits.getByte(i).toByte();
146155
}
147156
return data;
148157
}
@@ -207,6 +216,23 @@ private static int pow(int m, int n){
207216
return res;
208217
}
209218

219+
public static Bits ofString(String txt) {
220+
if (txt == null || txt.length() == 0) {
221+
return new Bits();
222+
}
223+
Bits bits = new Bits();
224+
for (int i = 0; i < txt.length(); i++) {
225+
char ch = txt.charAt(i);
226+
if (ch == '0') {
227+
bits.append(Bit.ZERO);
228+
} else if (ch == '1') {
229+
bits.append(Bit.ONE);
230+
} else {
231+
throw new IllegalArgumentException("illegal character " + (ch-'0') + " in index " + i);
232+
}
233+
}
234+
return bits;
235+
}
210236
public static Bits ofByte(byte val) {
211237
return ofByte(val, BYTE_SIZE);
212238
}
@@ -453,12 +479,9 @@ public Bit[] table(){
453479
@Override
454480
public String toString() {
455481
StringBuilder builder = new StringBuilder();
456-
builder.append("Bits{table=");
457482
for (int i = 0; i < used; i++) {
458483
builder.append(table[i].value());
459484
}
460-
builder.append(", size=").append(size)
461-
.append(", used=").append(used).append("}");
462485
return builder.toString();
463486
}
464487

@@ -495,4 +518,30 @@ public Bit get(int index) {
495518
throw new IndexOutOfBoundsException();
496519
return table[index];
497520
}
521+
522+
@Override
523+
public Bits clone() {
524+
Bits dest = null;
525+
try{
526+
dest = (Bits) super.clone();
527+
dest.table = new Bit[used];
528+
System.arraycopy(table, 0, dest.table, 0, used);
529+
} catch (CloneNotSupportedException e){
530+
e.printStackTrace();
531+
}
532+
return dest;
533+
}
534+
535+
@Override
536+
public boolean equals(Object o) {
537+
if (this == o) return true;
538+
if (o == null || getClass() != o.getClass()) return false;
539+
Bits bits = (Bits) o;
540+
return Arrays.equals(table, bits.table);
541+
}
542+
543+
@Override
544+
public int hashCode() {
545+
return Arrays.hashCode(table);
546+
}
498547
}

src/main/java/com/github/myibu/algorithm/endode/GolombEncoder.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
/**
77
* Golomb code
8-
* see <a herf="https://en.wikipedia.org/wiki/Golomb_coding#Simple_algorithm">https://en.wikipedia.org/wiki/Golomb_coding#Simple_algorithm</a>
8+
* see <a href="https://en.wikipedia.org/wiki/Golomb_coding#Simple_algorithm">https://en.wikipedia.org/wiki/Golomb_coding#Simple_algorithm</a>
99
* @author myibu
1010
* Created on 2021/10/12
1111
*/

src/test/java/com/github/myibu/algorithm/AlgorithmTest.java

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
import com.github.myibu.algorithm.compress.LZFCompressor;
66
import com.github.myibu.algorithm.data.Bits;
77
import com.github.myibu.algorithm.data.Bytes;
8-
import com.github.myibu.algorithm.endode.Encoder;
98
import com.github.myibu.algorithm.endode.GolombEncoder;
109
import com.github.myibu.algorithm.filter.*;
1110
import com.github.myibu.algorithm.hash.MurmurHash2;
@@ -20,10 +19,8 @@
2019
import org.junit.Test;
2120

2221
import java.nio.charset.StandardCharsets;
23-
import java.util.ArrayList;
24-
import java.util.Arrays;
25-
import java.util.List;
26-
import java.util.Set;
22+
import java.util.*;
23+
import java.util.stream.Collectors;
2724

2825
public class AlgorithmTest {
2926
@Test
@@ -171,10 +168,17 @@ public void testLZFCompressor() {
171168

172169
@Test
173170
public void testLZ77Compressor() {
174-
byte[] in_data = "abracadabrad".getBytes(StandardCharsets.UTF_8);
171+
// todo
172+
byte[] in_data = "com.github.myibu.algorithm.AlgorithmTest.testLZ77Compressor".getBytes(StandardCharsets.UTF_8);
175173
byte[] out_data = new byte[in_data.length*2];
176174
Compressor compressor = new LZ77Compressor();
177-
compressor.compress(in_data, in_data.length, out_data);
175+
int compressed = compressor.compress(in_data, in_data.length, out_data);
176+
byte[] compressed_data = Arrays.copyOf(out_data, compressed);
177+
// System.out.println(new String(compressed_data));
178+
byte[] decompressed_data = new byte[compressed * 2];
179+
int decompressed = compressor.decompress(compressed_data, compressed, decompressed_data);
180+
Assert.assertEquals("com.github.myibu.algorithm.AlgorithmTest.testLZ77Compressor",
181+
new String(Arrays.copyOf(decompressed_data, decompressed), StandardCharsets.UTF_8));
178182
}
179183

180184
@Test

0 commit comments

Comments
 (0)