-
Notifications
You must be signed in to change notification settings - Fork 18
/
x86.rs
198 lines (160 loc) Β· 5.73 KB
/
x86.rs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
#[cfg(not(any(all(target_feature = "aes", target_feature = "sse2"), docsrs)))] // docs.rs bypasses the target_feature check
compile_error!{"Gxhash requires aes and sse2 intrinsics. Make sure the processor supports it and build with RUSTFLAGS=\"-C target-cpu=native\" or RUSTFLAGS=\"-C target-feature=+aes,+sse2\"."}
#[cfg(all(feature = "hybrid", not(any(target_feature = "aes", target_feature = "vaes", target_feature = "avx2"))))]
compile_error!{"Hybrid feature is only available on x86 processors with avx2 and vaes intrinsics."}
#[cfg(target_arch = "x86")]
use core::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;
use super::*;
pub type State = __m128i;
#[inline(always)]
pub unsafe fn create_empty() -> State {
_mm_setzero_si128()
}
#[inline(always)]
pub unsafe fn create_seed(seed: i64) -> State {
_mm_set1_epi64x(seed)
}
#[inline(always)]
pub unsafe fn load_unaligned(p: *const State) -> State {
_mm_loadu_si128(p)
}
#[inline(always)]
pub unsafe fn get_partial_safe(data: *const State, len: usize) -> State {
// Temporary buffer filled with zeros
let mut buffer = [0i8; VECTOR_SIZE];
// Copy data into the buffer
core::ptr::copy(data as *const i8, buffer.as_mut_ptr(), len);
// Load the buffer into a __m256i vector
let partial_vector = _mm_loadu_si128(buffer.as_ptr() as *const State);
_mm_add_epi8(partial_vector, _mm_set1_epi8(len as i8))
}
#[inline(always)]
pub unsafe fn get_partial_unsafe(data: *const State, len: usize) -> State {
let indices = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
let mask = _mm_cmpgt_epi8(_mm_set1_epi8(len as i8), indices);
let partial_vector = _mm_and_si128(_mm_loadu_si128(data), mask);
_mm_add_epi8(partial_vector, _mm_set1_epi8(len as i8))
}
#[inline(always)]
#[allow(dead_code)]
pub unsafe fn aes_encrypt(data: State, keys: State) -> State {
_mm_aesenc_si128(data, keys)
}
#[inline(always)]
#[allow(dead_code)]
pub unsafe fn aes_encrypt_last(data: State, keys: State) -> State {
_mm_aesenclast_si128(data, keys)
}
#[inline(always)]
#[allow(dead_code)]
pub unsafe fn ld(array: *const u32) -> State {
_mm_loadu_si128(array as *const State)
}
#[cfg(not(feature = "hybrid"))]
#[inline(always)]
pub unsafe fn compress_8(mut ptr: *const State, end_address: usize, hash_vector: State, len: usize) -> State {
// Disambiguation vectors
let mut t1: State = create_empty();
let mut t2: State = create_empty();
// Hash is processed in two separate 128-bit parallel lanes
// This allows the same processing to be applied using 256-bit V-AES instrinsics
// so that hashes are stable in both cases.
let mut lane1 = hash_vector;
let mut lane2 = hash_vector;
while (ptr as usize) < end_address {
crate::gxhash::load_unaligned!(ptr, v0, v1, v2, v3, v4, v5, v6, v7);
let mut tmp1 = aes_encrypt(v0, v2);
let mut tmp2 = aes_encrypt(v1, v3);
tmp1 = aes_encrypt(tmp1, v4);
tmp2 = aes_encrypt(tmp2, v5);
tmp1 = aes_encrypt(tmp1, v6);
tmp2 = aes_encrypt(tmp2, v7);
t1 = _mm_add_epi8(t1, ld(KEYS.as_ptr()));
t2 = _mm_add_epi8(t2, ld(KEYS.as_ptr().offset(4)));
lane1 = aes_encrypt_last(aes_encrypt(tmp1, t1), lane1);
lane2 = aes_encrypt_last(aes_encrypt(tmp2, t2), lane2);
}
// For 'Zeroes' test
let len_vec = _mm_set1_epi32(len as i32);
lane1 = _mm_add_epi8(lane1, len_vec);
lane2 = _mm_add_epi8(lane2, len_vec);
// Merge lanes
aes_encrypt(lane1, lane2)
}
#[cfg(feature = "hybrid")]
#[inline(always)]
pub unsafe fn compress_8(ptr: *const State, end_address: usize, hash_vector: State, len: usize) -> State {
macro_rules! load_unaligned_x2 {
($ptr:ident, $($var:ident),+) => {
$(
#[allow(unused_mut)]
let mut $var = _mm256_loadu_si256($ptr);
$ptr = ($ptr).offset(1);
)+
};
}
let mut ptr = ptr as *const __m256i;
let mut t = _mm256_setzero_si256();
let mut lane = _mm256_set_m128i(hash_vector, hash_vector);
while (ptr as usize) < end_address {
load_unaligned_x2!(ptr, v0, v1, v2, v3);
let mut tmp = _mm256_aesenc_epi128(v0, v1);
tmp = _mm256_aesenc_epi128(tmp, v2);
tmp = _mm256_aesenc_epi128(tmp, v3);
t = _mm256_add_epi8(t, _mm256_loadu_si256(KEYS.as_ptr() as *const __m256i));
lane = _mm256_aesenclast_epi128(_mm256_aesenc_epi128(tmp, t), lane);
}
// Extract the two 128-bit lanes
let mut lane1 = _mm256_castsi256_si128(lane);
let mut lane2 = _mm256_extracti128_si256(lane, 1);
// For 'Zeroes' test
let len_vec = _mm_set1_epi32(len as i32);
lane1 = _mm_add_epi8(lane1, len_vec);
lane2 = _mm_add_epi8(lane2, len_vec);
// Merge lanes
aes_encrypt(lane1, lane2)
}
#[inline(always)]
pub unsafe fn load_u8(x: u8) -> State {
_mm_set1_epi8(x as i8)
}
#[inline(always)]
pub unsafe fn load_u16(x: u16) -> State {
_mm_set1_epi16(x as i16)
}
#[inline(always)]
pub unsafe fn load_u32(x: u32) -> State {
_mm_set1_epi32(x as i32)
}
#[inline(always)]
pub unsafe fn load_u64(x: u64) -> State {
_mm_set1_epi64x(x as i64)
}
#[inline(always)]
pub unsafe fn load_u128(x: u128) -> State {
let ptr = &x as *const u128 as *const State;
_mm_loadu_si128(ptr)
}
#[inline(always)]
pub unsafe fn load_i8(x: i8) -> State {
_mm_set1_epi8(x)
}
#[inline(always)]
pub unsafe fn load_i16(x: i16) -> State {
_mm_set1_epi16(x)
}
#[inline(always)]
pub unsafe fn load_i32(x: i32) -> State {
_mm_set1_epi32(x)
}
#[inline(always)]
pub unsafe fn load_i64(x: i64) -> State {
_mm_set1_epi64x(x)
}
#[inline(always)]
pub unsafe fn load_i128(x: i128) -> State {
let ptr = &x as *const i128 as *const State;
_mm_loadu_si128(ptr)
}