-
Notifications
You must be signed in to change notification settings - Fork 2
/
Crc32Slicing.pas
456 lines (355 loc) · 13.1 KB
/
Crc32Slicing.pas
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
(*
CRC32 Slicing-By-8 Assembler IA-32 & x86-64 / Pascal implementation
Copyright (C) 2020 Ritlabs, SRL. All rights reserved.
Copyright (C) 2020-2021 Maxim Masiutin. All rights reserved.
This code is released under GNU Lesser General Public License (LGPL) v3.
The Slicing-By-8 x86-64 Assembler version and Pascal version is written by
Maxim Masiutin <maxim@masiutin.com>
Based on code written by Aleksandr Sharahov http://guildalfa.ru/alsha/node/2 ;
Based on code from "Synopse mORMot framework" https://synopse.info/ ;
Based on code by Intel Corp. https://sourceforge.net/projects/slicing-by-8 .
IA-32 or x86-64 assembler code has 1,20 CPU clock cycles (on Skylake) per byte of data.
Define PUREPASCAL if you wish to compile Pascal implementation.
Otherwise, the Assembler implementation will be compiled.
Can be compiled by Delphi or Free Pascal Compiler (FPC).
If you define SLICING_BY_4, the Pascal implementation of Slicing-By-4
will be compiled instead (there is no assembler implementation
of Slicing-By-4).
The "crc32_slicing" function that has the following inputs:
buffer (pointer);
length (unsigned integer, 32 or 64 bits depending on platform, highest bit must be zero);
crc (32-bit unsigned integer) - initial value;
table (or 4K for Slicing-by-4) (pointer) - the pre-computed look-up table of 8K, see InitCrc32SlicingByNTable in CrcTest.dpr on how to fill this table.
output: crc (32-bit unsigned integer).
Assembler implementation of "crc32_slicing" takes parameters in the following registers.
For IA-32: eax - buffer, edx - length, ecx - crc, 32-bits in stack - table; on return eax will contain output crc;
For x86-64: rcx - buffer, rdx - length, r8 - crc, r9 - table; on return rax will contain output crc.
*)
unit Crc32Slicing;
interface
{$IFDEF FPC}
{$ASMMODE INTEL}
{$ENDIF}
{$IFDEF SLICING_BY_4}
{$DEFINE PUREPASCAL}
{$ENDIF}
{$IFDEF FPC}
type
Crc32NativeUInt = NativeUInt;
{$ELSE}
{$IFDEF DELPHI_XE}
type
Crc32NativeUInt = NativeUInt;
{$ELSE}
type
Crc32NativeUInt = Cardinal;
{$ENDIF}
{$ENDIF}
type
TCrc32SlicingByNTable = packed array[0..{$IFDEF SLICING_BY_4}4{$ELSE}8{$ENDIF}-1,Byte] of Cardinal;
PCrc32SlicingByNTable = ^TCrc32SlicingByNTable;
function crc32_slicing(const Abuf; const ALength: Crc32NativeUInt; crc: Cardinal; const Acrc32FastTable: PCrc32SlicingByNTable): Cardinal; register;
implementation
function crc32_slicing(const Abuf; const ALength: Crc32NativeUInt; crc: Cardinal; const Acrc32FastTable: PCrc32SlicingByNTable): Cardinal; register;
{$IFDEF PUREPASCAL}
const
CAlignmentMask =
{$IFDEF SLICING_BY_4}
3
{$ELSE}
{$IFDEF WIN64}
7 // under Win64 and Slicing-by-8 we align to 8-byte boundary; otherwise to 4-byte boundary
{$ELSE}
3
{$ENDIF}
{$ENDIF}
;
var
buf: PAnsiChar;
len: Crc32NativeUInt;
{$IFNDEF SLICING_BY_4}
Term2: Cardinal;
{$IFDEF WIN64}
U64: UInt64;
{$ENDIF}
{$ENDIF SLICING_BY_4}
begin
len := ALength;
buf := @ABuf;
Result := crc;
if (buf <> nil) and (len > 0) then
begin
// align source buffer to 4 or 8 bytes boundary
repeat
if (Crc32NativeUInt(buf) and CAlignmentMask) = 0 then
Break;
Result := Acrc32FastTable^[0, Byte(Result xor Ord(buf^))] xor (Result shr 8);
Dec(len);
Inc(buf);
until len=0;
{$IFDEF SLICING_BY_4}
(*
Slicing-by-4 source code taken from "Synopse mORMot framework" at https://synopse.info/
SynCommons.pas, crc32cfast
Slicing-by-4 code is given here only to be able to compare its speed with Slicing-by-8
Here are the results for a Skylake CPU, clock cycles per byte (smaller is better):
Slicing-by-8, assembler (64-bit): 1,20
Slicing-by-8, assembler (32-bit): 1,20
Slicing-by-8, Delphi (64-bit): 1,84
Slicing-by-8, Delphi (32-bit): 1,82
Slicing-by-4, Delphi (64-bit): 2,68
Slicing-by-4, Delphi (32-bit): 2,69
The clock/per/byte ratio of binary produced by Free Pascal Compiler (FPC) 3.0.4 was
worse then of that produced by Delphi 10.3.3, so I didn't include the results.
*)
while len>=4 do
begin
Result := Result xor PCardinal(buf)^;
Inc(buf,4);
Result := Acrc32FastTable^[3, Byte(Result)] xor
Acrc32FastTable^[2, Byte(Result shr 8)] xor
Acrc32FastTable^[1, Byte(Result shr 16)] xor
Acrc32FastTable^[0, Result shr 24];
Dec(len,4);
end;
{$ELSE}
// Slicing-by-8 idea is taken from the original Intel Slicing-by-8 code (released in 2006)
// available on sourceforge.net/projects/slicing-by-8
while len >= 8 do
begin
{$IFDEF WIN64}
U64 := PUint64(buf)^;
Result := Result xor Cardinal(U64);
Term2 := U64 shr 32;
Inc(buf, 8);
{$ELSE}
Result := Result xor PCardinal(buf)^;
Inc(buf,4);
Term2 := PCardinal(buf)^;
Inc(buf,4);
{$ENDIF}
Dec(len,8);
Result := Acrc32FastTable^[7, Byte(Result)] xor
Acrc32FastTable^[6, Byte(Result shr 8)] xor
Acrc32FastTable^[5, Byte(Result shr 16)] xor
Acrc32FastTable^[4, Result shr 24] xor
Acrc32FastTable^[3, Byte(Term2)] xor
Acrc32FastTable^[2, Byte(Term2 shr 8)] xor
Acrc32FastTable^[1, Byte(Term2 shr 16)] xor
Acrc32FastTable^[0, Term2 shr 24];
end;
{$ENDIF}
while len>0 do
begin
Result := Acrc32FastTable^[0, Byte(Result xor Ord(buf^))] xor (Result shr 8);
dec(Len);
inc(Buf);
end;
end;
end;
{$else}
assembler;
// Slicing-by-8 assembler implementation
// adapted from fast Aleksandr Sharahov version ( http://guildalfa.ru/alsha/node/2 ) released in 2009
// Of that version, second iteration of loop unrolling was removed because testing demonstrated it had no benefit,
// or even made code slower because of a branch in the middle that could cause branch misprediction penalty.
// See also the "High Octane CRC Generation with the Intel Slicing-by-8 Algorithm" white paper published by Intel in 2006
{$IFDEF WIN64}
// under Win64, function arguments come in the following registers:
// first - RCX, second - RDX, third R8, fourth - R9
// return - RAX
// the following registers may be destroyed after the call: RAX,RCX,RDX,R8,R9,R10:R11
// https://msdn.microsoft.com/en-us/library/ms235286.aspx
// Under Win32, a call passes first parameter in EAX, second in EDX, third in ECX
// 64 32
// --- ---
// 1) rcx eax
// 2) rdx edx
// 3) r8 ecx
// 4) r9 [stack]
asm
// rcx has buf, rdx las len, r8 has crc, r9 has table
test rcx, rcx // null pointer comparison
jz @exit
cmp rdx, 8
jge @big
test rdx, rdx
jle @exit // negative value or zero // ZF = 1 or SF <> OF
mov r8d, r8d // clear higher bits of r8 that keeps current CRC32, see http://stackoverflow.com/questions/43964922/is-mov-r8d-r8d-a-legitimate-long-term-way-to-clear-higher-bits-32-63-of-a-64
// if we have 7 bytes or less, calculate them old-fashioned way
@calc_crc32:
xor r8b,byte ptr [rcx] // get next byte from the source buffer
inc rcx
movzx r10d, r8b
shr r8d, 8
xor r8d, dword ptr [r9+r10*4 + 1024 * 0] // use just "classic" 1024-bytes table of 256 elements of 32 bits
dec rdx
jnz @calc_crc32
mov eax, r8d
jmp @exit
@big:
mov eax, r8d // now rax has crc, higher bits are again cleared by this move
neg rdx // now we have negative length
// align source buffer by 8 bytes under 64-bit
test cl, 7
jz @aligned
@unaligned:
xor al,byte ptr [rcx] // get next byte from the source buffer
inc rcx
movzx r10d, al
shr eax, 8
xor eax, dword ptr [r9+r10*4 + 1024 * 0] // use just "classic" 1024-bytes table of 256 elements of 32 bits
inc rdx
test cl, 7
jnz @unaligned
@aligned:
sub rcx, rdx
add rdx, 8
mov r10, rdx // now we have negative length in r10
jg @check_tail
@block_loop:
mov edx, eax // this also sets the rest of rdx (bits 32-63) to zero
mov r11, qword ptr[rcx + r10 - 8] // get next 8 bytes (as QWORD) from the input buffer
xor edx, r11d
mov r8, r11
shr r8, 32
movzx r11d, r8b
shr r8d, 8
mov eax, dword ptr[r11 * 4 + r9 + 1024 * 3]
movzx r11d, r8b
shr r8d, 8
xor eax, dword ptr[r11 * 4 + r9 + 1024 * 2]
movzx r11d, r8b
shr r8d, 8
xor eax, dword ptr[r11 * 4 + r9 + 1024 * 1]
xor eax, dword ptr[r8 * 4 + r9 + 1024 * 0]
movzx r11d, dl
shr edx, 8
xor eax, dword ptr[r11 * 4 + r9 + 1024 * 7]
movzx r11d, dl
shr edx, 8
xor eax, dword ptr[r11 * 4 + r9 + 1024 * 6]
movzx r11d, dl
shr edx, 8
xor eax, dword ptr[r11 * 4 + r9 + 1024 * 5]
xor eax, dword ptr[rdx * 4 + r9 + 1024 * 4]
add r10, 8
jle @block_loop
@check_tail:
sub r10, 8
jnl @exit
@tail_loop:
movzx r8, byte[rcx + r10]
xor r8b, al
shr eax, 8
xor eax, dword ptr[r8 * 4 + r9 + 1024 * 0]
inc r10
jnz @tail_loop
@exit:
end;
{$ELSE !WIN64}
// Under Win32, a call passes first parameter in EAX, second in EDX, third in ECX
// result is returned in EAX
// procedures and functions must preserve the EBX, ESI, EDI, and EBP registers, but can modify the EAX, EDX, and ECX registers.
// http://docwiki.embarcadero.com/RADStudio/Seattle/en/Program_Control#Register_Convention
asm
// buf = eax, len = edx, crc = ecx
test eax, eax // null pointer comparison
jz @@exit
cmp edx, 8
jge @big
test edx, edx
jle @@exit // exit if length is negative value or zero // ZF = 1 or SF <> OF
// length is 7 bytes or less - calculate byte-by-byte and exit
push edi
push ebx
mov edi, ss:[ebp + 8] // 8 - stack offset of fourth argument
@calc_crc32:
xor cl,[eax]
inc eax
movzx ebx,cl
shr ecx,8
xor ecx,dword ptr [edi+ebx*4]
dec edx
jnz @calc_crc32
mov eax, ecx
pop ebx
pop edi
jmp @@exit
@big:
neg edx
push ebx
push esi
mov esi, edx // save "length" (negative) to esi
mov edx, eax // now edx has "buf"
mov eax, ecx // now eax has "crc"
mov ecx, esi // now ecx has "length" (negative)
mov esi, ss:[ebp + 8] // 8 - stack offset of fourth argument
// align by 4 bytes under Win32
test dl, 3
jz @aligned
@unaligned:
movzx ebx, byte[edx]
inc edx
xor bl, al
shr eax, 8
xor eax, dword ptr[ebx * 4 + esi]
inc ecx
test dl, 3
jnz @unaligned
@aligned:
sub edx, ecx
add ecx, 8
jg @check_tail
push ebp
push edi
mov ebp, esi
mov edi, edx
@block_loop:
mov edx, eax
mov ebx, [edi + ecx - 4]
xor edx, [edi + ecx - 8]
movzx esi, bl
shr ebx, 8
mov eax, dword ptr ds:[esi * 4 + ebp + 1024 * 3]
movzx esi, bl
shr ebx, 8
xor eax, dword ptr ds:[esi * 4 + ebp + 1024 * 2]
movzx esi, bl
shr ebx, 8
xor eax, dword ptr ds:[esi * 4 + ebp + 1024 * 1]
xor eax, dword ptr ds:[ebx * 4 + ebp + 1024 * 0]
movzx esi, dl
shr edx, 8
xor eax, dword ptr ds:[esi * 4 + ebp + 1024 * 7]
movzx esi, dl
shr edx, 8
xor eax, dword ptr ds:[esi * 4 + ebp + 1024 * 6]
movzx esi, dl
shr edx, 8
xor eax, dword ptr ds:[esi * 4 + ebp + 1024 * 5]
xor eax, dword ptr ds:[edx * 4 + ebp + 1024 * 4]
add ecx, 8
jle @block_loop
mov edx, edi
pop edi
pop ebp
@check_tail:
sub ecx, 8
jnl @@pop_exit
mov esi, ss:[ebp + 8] // 8 - stack offset of fourth argument
@tail_loop:
movzx ebx, byte[edx + ecx]
xor bl, al
shr eax, 8
xor eax, dword ptr ds:[ebx * 4 + esi]
inc ecx
jnz @tail_loop
@@pop_exit:
pop esi
pop ebx
@@exit:
end;
{$ENDIF WIN64}
{$endif PUREPASCAL}
end.